Imported Upstream version 0.1.6~dfsg
[samtools.git] / bam_rmdupse.c
1 #include <math.h>
2 #include "sam.h"
3 #include "khash.h"
4
5 typedef struct {
6         int n, m;
7         int *a;
8 } listelem_t;
9
10 KHASH_MAP_INIT_INT(32, listelem_t)
11
12 #define BLOCK_SIZE 65536
13
14 typedef struct {
15         bam1_t *b;
16         int rpos, score;
17 } elem_t;
18
19 typedef struct {
20         int n, max, x;
21         elem_t *buf;
22 } buffer_t;
23
24 static int fill_buf(samfile_t *in, buffer_t *buf)
25 {
26         int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
27         bam1_t *b = bam_init1();
28         bam1_core_t *c = &b->core;
29         // squeeze out the empty cells at the beginning
30         for (i = 0; i < buf->n; ++i)
31                 if (buf->buf[i].b) break;
32         if (i < buf->n) { // squeeze
33                 if (i > 0) {
34                         memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
35                         buf->n = buf->n - i;
36                 }
37         } else buf->n = 0;
38         // calculate min_rpos
39         for (i = 0; i < buf->n; ++i) {
40                 elem_t *e = buf->buf + i;
41                 if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
42                         min_rpos = buf->buf[i].rpos;
43         }
44         // fill the buffer
45         buf->x = -1;
46         last_tid = buf->n? buf->buf[0].b->core.tid : -1;
47         capacity = buf->n + BLOCK_SIZE;
48         while ((ret = samread(in, b)) >= 0) {
49                 elem_t *e;
50                 uint8_t *qual = bam1_qual(b);
51                 int is_mapped;
52                 if (last_tid < 0) last_tid = c->tid;
53                 if (c->tid != last_tid) {
54                         if (buf->x < 0) buf->x = buf->n;
55                 }
56                 if (buf->n >= buf->max) { // enlarge
57                         buf->max = buf->max? buf->max<<1 : 8;
58                         buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
59                 }
60                 e = &buf->buf[buf->n++];
61                 e->b = bam_dup1(b);
62                 e->rpos = -1; e->score = 0;
63                 for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
64                 e->score = (double)e->score / sqrt(c->l_qseq + 1);
65                 is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
66                 if (!is_mapped) e->score = -1;
67                 if (is_mapped && (c->flag & BAM_FREVERSE)) {
68                         e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
69                         if (min_rpos > e->rpos) min_rpos = e->rpos;
70                 }
71                 if (buf->n >= capacity) {
72                         if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
73                         else break;
74                 }
75         }
76         if (ret >= 0 && buf->x < 0) buf->x = buf->n;
77         bam_destroy1(b);
78         return buf->n;
79 }
80
81 static void rmdupse_buf(buffer_t *buf)
82 {
83         khash_t(32) *h;
84         uint32_t key;
85         khint_t k;
86         int mpos, i, upper;
87         listelem_t *p;
88         mpos = 0x7fffffff;
89         mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
90         upper = (buf->x < 0)? buf->n : buf->x;
91         // fill the hash table
92         h = kh_init(32);
93         for (i = 0; i < upper; ++i) {
94                 elem_t *e = buf->buf + i;
95                 int ret;
96                 if (e->score < 0) continue;
97                 if (e->rpos >= 0) {
98                         if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
99                         else continue;
100                 } else {
101                         if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
102                         else continue;
103                 }
104                 k = kh_put(32, h, key, &ret);
105                 p = &kh_val(h, k);
106                 if (ret == 0) { // present in the hash table
107                         if (p->n == p->m) {
108                                 p->m <<= 1;
109                                 p->a = (int*)realloc(p->a, p->m * sizeof(int));
110                         }
111                         p->a[p->n++] = i;
112                 } else {
113                         p->m = p->n = 1;
114                         p->a = (int*)calloc(p->m, sizeof(int));
115                         p->a[0] = i;
116                 }
117         }
118         // rmdup
119         for (k = kh_begin(h); k < kh_end(h); ++k) {
120                 if (kh_exist(h, k)) {
121                         int max, maxi;
122                         p = &kh_val(h, k);
123                         // get the max
124                         for (i = max = 0, maxi = -1; i < p->n; ++i) {
125                                 if (buf->buf[p->a[i]].score > max) {
126                                         max = buf->buf[p->a[i]].score;
127                                         maxi = i;
128                                 }
129                         }
130                         // mark the elements
131                         for (i = 0; i < p->n; ++i) {
132                                 buf->buf[p->a[i]].score = -1;
133                                 if (i != maxi) {
134                                         bam_destroy1(buf->buf[p->a[i]].b);
135                                         buf->buf[p->a[i]].b = 0;
136                                 }
137                         }
138                         // free
139                         free(p->a);
140                 }
141         }
142         kh_destroy(32, h);
143 }
144
145 static void dump_buf(buffer_t *buf, samfile_t *out)
146 {
147         int i;
148         for (i = 0; i < buf->n; ++i) {
149                 elem_t *e = buf->buf + i;
150                 if (e->score != -1) break;
151                 if (e->b) {
152                         samwrite(out, e->b);
153                         bam_destroy1(e->b);
154                         e->b = 0;
155                 }
156         }
157 }
158
159 int bam_rmdupse(int argc, char *argv[])
160 {
161         samfile_t *in, *out;
162         buffer_t *buf;
163         if (argc < 3) {
164                 fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n\n");
165                 fprintf(stderr, "Note: Picard is recommended for this task.\n");
166                 return 1;
167         }
168         buf = calloc(1, sizeof(buffer_t));
169         in = samopen(argv[1], "rb", 0);
170         out = samopen(argv[2], "wb", in->header);
171         while (fill_buf(in, buf)) {
172                 rmdupse_buf(buf);
173                 dump_buf(buf, out);
174         }
175         samclose(in); samclose(out);
176         free(buf->buf); free(buf);
177         return 0;
178 }