Update debian changelog
[pysam.git] / samtools / bam_rmdupse.c.pysam.c
1 #include "pysam.h"
2
3 #include <math.h>
4 #include "sam.h"
5 #include "khash.h"
6 #include "klist.h"
7
8 #define QUEUE_CLEAR_SIZE 0x100000
9 #define MAX_POS 0x7fffffff
10
11 typedef struct {
12         int endpos;
13         uint32_t score:31, discarded:1;
14         bam1_t *b;
15 } elem_t, *elem_p;
16 #define __free_elem(p) bam_destroy1((p)->data.b)
17 KLIST_INIT(q, elem_t, __free_elem)
18 typedef klist_t(q) queue_t;
19
20 KHASH_MAP_INIT_INT(best, elem_p)
21 typedef khash_t(best) besthash_t;
22
23 typedef struct {
24         uint64_t n_checked, n_removed;
25         besthash_t *left, *rght;
26 } lib_aux_t;
27 KHASH_MAP_INIT_STR(lib, lib_aux_t)
28
29 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
30 {
31         khint_t k = kh_get(lib, aux, lib);
32         if (k == kh_end(aux)) {
33                 int ret;
34                 char *p = strdup(lib);
35                 lib_aux_t *q;
36                 k = kh_put(lib, aux, p, &ret);
37                 q = &kh_val(aux, k);
38                 q->left = kh_init(best);
39                 q->rght = kh_init(best);
40                 q->n_checked = q->n_removed = 0;
41                 return q;
42         } else return &kh_val(aux, k);
43 }
44
45 static inline int sum_qual(const bam1_t *b)
46 {
47         int i, q;
48         uint8_t *qual = bam1_qual(b);
49         for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
50         return q;
51 }
52
53 static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
54 {
55         elem_t *p = kl_pushp(q, queue);
56         p->discarded = 0;
57         p->endpos = endpos; p->score = score;
58         if (p->b == 0) p->b = bam_init1();
59         bam_copy1(p->b, b);
60         return p;
61 }
62
63 static void clear_besthash(besthash_t *h, int32_t pos)
64 {
65         khint_t k;
66         for (k = kh_begin(h); k != kh_end(h); ++k)
67                 if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
68                         kh_del(best, h, k);
69 }
70
71 static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
72 {
73         if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
74                 khint_t k;
75                 while (1) {
76                         elem_t *q;
77                         if (queue->head == queue->tail) break;
78                         q = &kl_val(queue->head);
79                         if (q->discarded) {
80                                 q->b->data_len = 0;
81                                 kl_shift(q, queue, 0);
82                                 continue;
83                         }
84                         if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
85                         samwrite(out, q->b);
86                         q->b->data_len = 0;
87                         kl_shift(q, queue, 0);
88                 }
89                 for (k = kh_begin(h); k != kh_end(h); ++k) {
90                         if (kh_exist(h, k)) {
91                                 clear_besthash(kh_val(h, k).left, pos);
92                                 clear_besthash(kh_val(h, k).rght, pos);
93                         }
94                 }
95         }
96 }
97
98 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
99 {
100         bam1_t *b;
101         queue_t *queue;
102         khint_t k;
103         int last_tid = -2;
104         khash_t(lib) *aux;
105
106         aux = kh_init(lib);
107         b = bam_init1();
108         queue = kl_init(q);
109         while (samread(in, b) >= 0) {
110                 bam1_core_t *c = &b->core;
111                 int endpos = bam_calend(c, bam1_cigar(b));
112                 int score = sum_qual(b);
113                 
114                 if (last_tid != c->tid) {
115                         if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
116                         last_tid = c->tid;
117                 } else dump_alignment(out, queue, c->pos, aux);
118                 if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
119                         push_queue(queue, b, endpos, score);
120                 } else {
121                         const char *lib;
122                         lib_aux_t *q;
123                         besthash_t *h;
124                         uint32_t key;
125                         int ret;
126                         lib = bam_get_library(in->header, b);
127                         q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
128                         ++q->n_checked;
129                         h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
130                         key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
131                         k = kh_put(best, h, key, &ret);
132                         if (ret == 0) { // in the hash table
133                                 elem_t *p = kh_val(h, k);
134                                 ++q->n_removed;
135                                 if (p->score < score) {
136                                         if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
137                                                 p->discarded = 1;
138                                                 kh_val(h, k) = push_queue(queue, b, endpos, score);
139                                         } else { // replace
140                                                 p->score = score; p->endpos = endpos;
141                                                 bam_copy1(p->b, b);
142                                         }
143                                 } // otherwise, discard the alignment
144                         } else kh_val(h, k) = push_queue(queue, b, endpos, score);
145                 }
146         }
147         dump_alignment(out, queue, MAX_POS, aux);
148
149         for (k = kh_begin(aux); k != kh_end(aux); ++k) {
150                 if (kh_exist(aux, k)) {
151                         lib_aux_t *q = &kh_val(aux, k);
152                         fprintf(pysamerr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
153                                         (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
154                         kh_destroy(best, q->left); kh_destroy(best, q->rght);
155                         free((char*)kh_key(aux, k));
156                 }
157         }
158         kh_destroy(lib, aux);
159         bam_destroy1(b);
160         kl_destroy(q, queue);
161 }