Update debian changelog
[pysam.git] / samtools / bam_rmdup.c.pysam.c
1 #include "pysam.h"
2
3 #include <stdlib.h>
4 #include <string.h>
5 #include <stdio.h>
6 #include <zlib.h>
7 #include <unistd.h>
8 #include "sam.h"
9
10 typedef bam1_t *bam1_p;
11
12 #include "khash.h"
13 KHASH_SET_INIT_STR(name)
14 KHASH_MAP_INIT_INT64(pos, bam1_p)
15
16 #define BUFFER_SIZE 0x40000
17
18 typedef struct {
19         uint64_t n_checked, n_removed;
20         khash_t(pos) *best_hash;
21 } lib_aux_t;
22 KHASH_MAP_INIT_STR(lib, lib_aux_t)
23
24 typedef struct {
25         int n, max;
26         bam1_t **a;
27 } tmp_stack_t;
28
29 static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
30 {
31         if (stack->n == stack->max) {
32                 stack->max = stack->max? stack->max<<1 : 0x10000;
33                 stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
34         }
35         stack->a[stack->n++] = b;
36 }
37
38 static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
39 {
40         int i;
41         for (i = 0; i != stack->n; ++i) {
42                 samwrite(out, stack->a[i]);
43                 bam_destroy1(stack->a[i]);
44         }
45         stack->n = 0;
46 }
47
48 static void clear_del_set(khash_t(name) *del_set)
49 {
50         khint_t k;
51         for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
52                 if (kh_exist(del_set, k))
53                         free((char*)kh_key(del_set, k));
54         kh_clear(name, del_set);
55 }
56
57 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
58 {
59         khint_t k = kh_get(lib, aux, lib);
60         if (k == kh_end(aux)) {
61                 int ret;
62                 char *p = strdup(lib);
63                 lib_aux_t *q;
64                 k = kh_put(lib, aux, p, &ret);
65                 q = &kh_val(aux, k);
66                 q->n_checked = q->n_removed = 0;
67                 q->best_hash = kh_init(pos);
68                 return q;
69         } else return &kh_val(aux, k);
70 }
71
72 static void clear_best(khash_t(lib) *aux, int max)
73 {
74         khint_t k;
75         for (k = kh_begin(aux); k != kh_end(aux); ++k) {
76                 if (kh_exist(aux, k)) {
77                         lib_aux_t *q = &kh_val(aux, k);
78                         if (kh_size(q->best_hash) >= max)
79                                 kh_clear(pos, q->best_hash);
80                 }
81         }
82 }
83
84 static inline int sum_qual(const bam1_t *b)
85 {
86         int i, q;
87         uint8_t *qual = bam1_qual(b);
88         for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
89         return q;
90 }
91
92 void bam_rmdup_core(samfile_t *in, samfile_t *out)
93 {
94         bam1_t *b;
95         int last_tid = -1, last_pos = -1;
96         tmp_stack_t stack;
97         khint_t k;
98         khash_t(lib) *aux;
99         khash_t(name) *del_set;
100         
101         aux = kh_init(lib);
102         del_set = kh_init(name);
103         b = bam_init1();
104         memset(&stack, 0, sizeof(tmp_stack_t));
105
106         kh_resize(name, del_set, 4 * BUFFER_SIZE);
107         while (samread(in, b) >= 0) {
108                 bam1_core_t *c = &b->core;
109                 if (c->tid != last_tid || last_pos != c->pos) {
110                         dump_best(&stack, out); // write the result
111                         clear_best(aux, BUFFER_SIZE);
112                         if (c->tid != last_tid) {
113                                 clear_best(aux, 0);
114                                 if (kh_size(del_set)) { // check
115                                         fprintf(pysamerr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
116                                         clear_del_set(del_set);
117                                 }
118                                 if ((int)c->tid == -1) { // append unmapped reads
119                                         samwrite(out, b);
120                                         while (samread(in, b) >= 0) samwrite(out, b);
121                                         break;
122                                 }
123                                 last_tid = c->tid;
124                                 fprintf(pysamerr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
125                         }
126                 }
127                 if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
128                         samwrite(out, b);
129                 } else if (c->isize > 0) { // paired, head
130                         uint64_t key = (uint64_t)c->pos<<32 | c->isize;
131                         const char *lib;
132                         lib_aux_t *q;
133                         int ret;
134                         lib = bam_get_library(in->header, b);
135                         q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
136                         ++q->n_checked;
137                         k = kh_put(pos, q->best_hash, key, &ret);
138                         if (ret == 0) { // found in best_hash
139                                 bam1_t *p = kh_val(q->best_hash, k);
140                                 ++q->n_removed;
141                                 if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
142                                         kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
143                                         bam_copy1(p, b); // replaced as b
144                                 } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
145                                 if (ret == 0)
146                                         fprintf(pysamerr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
147                         } else { // not found in best_hash
148                                 kh_val(q->best_hash, k) = bam_dup1(b);
149                                 stack_insert(&stack, kh_val(q->best_hash, k));
150                         }
151                 } else { // paired, tail
152                         k = kh_get(name, del_set, bam1_qname(b));
153                         if (k != kh_end(del_set)) {
154                                 free((char*)kh_key(del_set, k));
155                                 kh_del(name, del_set, k);
156                         } else samwrite(out, b);
157                 }
158                 last_pos = c->pos;
159         }
160
161         for (k = kh_begin(aux); k != kh_end(aux); ++k) {
162                 if (kh_exist(aux, k)) {
163                         lib_aux_t *q = &kh_val(aux, k);                 
164                         dump_best(&stack, out);
165                         fprintf(pysamerr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
166                                         (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
167                         kh_destroy(pos, q->best_hash);
168                         free((char*)kh_key(aux, k));
169                 }
170         }
171         kh_destroy(lib, aux);
172
173         clear_del_set(del_set);
174         kh_destroy(name, del_set);
175         free(stack.a);
176         bam_destroy1(b);
177 }
178
179 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
180
181 int bam_rmdup(int argc, char *argv[])
182 {
183         int c, is_se = 0, force_se = 0;
184         samfile_t *in, *out;
185         while ((c = getopt(argc, argv, "sS")) >= 0) {
186                 switch (c) {
187                 case 's': is_se = 1; break;
188                 case 'S': force_se = is_se = 1; break;
189                 }
190         }
191         if (optind + 2 > argc) {
192                 fprintf(pysamerr, "\n");
193                 fprintf(pysamerr, "Usage:  samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
194                 fprintf(pysamerr, "Option: -s    rmdup for SE reads\n");
195                 fprintf(pysamerr, "        -S    treat PE reads as SE in rmdup (force -s)\n\n");
196                 return 1;
197         }
198         in = samopen(argv[optind], "rb", 0);
199         out = samopen(argv[optind+1], "wb", in->header);
200         if (in == 0 || out == 0) {
201                 fprintf(pysamerr, "[bam_rmdup] fail to read/write input files\n");
202                 return 1;
203         }
204         if (is_se) bam_rmdupse_core(in, out, force_se);
205         else bam_rmdup_core(in, out);
206         samclose(in); samclose(out);
207         return 0;
208 }