5da946011f10ec72758c18cc10af75180a7d8f4e
[samtools.git] / bam_rmdup.c
1 #include <stdlib.h>
2 #include <string.h>
3 #include <stdio.h>
4 #include <zlib.h>
5 #include "bam.h"
6
7 typedef bam1_t *bam1_p;
8 #include "khash.h"
9 KHASH_SET_INIT_STR(name)
10 KHASH_MAP_INIT_INT64(pos, bam1_p)
11
12 #define BUFFER_SIZE 0x40000
13
14 typedef struct {
15         int n, max;
16         bam1_t **a;
17 } tmp_stack_t;
18
19 static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
20 {
21         if (stack->n == stack->max) {
22                 stack->max = stack->max? stack->max<<1 : 0x10000;
23                 stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
24         }
25         stack->a[stack->n++] = b;
26 }
27
28 static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
29 {
30         int i;
31         for (i = 0; i != stack->n; ++i) {
32                 bam_write1(out, stack->a[i]);
33                 bam_destroy1(stack->a[i]);
34         }
35         stack->n = 0;
36         if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash);
37 }
38
39 static void clear_del_set(khash_t(name) *del_set)
40 {
41         khint_t k;
42         for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
43                 if (kh_exist(del_set, k))
44                         free((char*)kh_key(del_set, k));
45         kh_clear(name, del_set);
46 }
47
48 void bam_rmdup_core(bamFile in, bamFile out)
49 {
50         bam_header_t *header;
51         bam1_t *b;
52         int last_tid = -1, last_pos = -1;
53         uint64_t n_checked = 0, n_removed = 0;
54         tmp_stack_t stack;
55         khint_t k;
56         khash_t(pos) *best_hash;
57         khash_t(name) *del_set;
58
59         best_hash = kh_init(pos);
60         del_set = kh_init(name);
61         b = bam_init1();
62         memset(&stack, 0, sizeof(tmp_stack_t));
63         header = bam_header_read(in);
64         bam_header_write(out, header);
65
66         kh_resize(name, del_set, 4 * BUFFER_SIZE);
67         kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
68         while (bam_read1(in, b) >= 0) {
69                 bam1_core_t *c = &b->core;
70                 if (c->tid != last_tid || last_pos != c->pos) {
71                         dump_best(&stack, best_hash, out); // write the result
72                         if (c->tid != last_tid) {
73                                 kh_clear(pos, best_hash);
74                                 if (kh_size(del_set)) { // check
75                                         fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
76                                         clear_del_set(del_set);
77                                 }
78                                 if ((int)c->tid == -1) { // append unmapped reads
79                                         bam_write1(out, b);
80                                         while (bam_read1(in, b) >= 0) bam_write1(out, b);
81                                         break;
82                                 }
83                                 last_tid = c->tid;
84                                 fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
85                         }
86                 }
87                 if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
88                         bam_write1(out, b);
89                 } else if (c->isize > 0) { // paired, head
90                         uint64_t key = (uint64_t)c->pos<<32 | c->isize;
91                         int ret;
92                         ++n_checked;
93                         k = kh_put(pos, best_hash, key, &ret);
94                         if (ret == 0) { // found in best_hash
95                                 bam1_t *p = kh_val(best_hash, k);
96                                 ++n_removed;
97                                 if (p->core.qual < c->qual) { // the current alignment is better
98                                         kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
99                                         bam_copy1(p, b); // replaced as b
100                                 } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
101                                 if (ret == 0)
102                                         fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
103                         } else { // not found in best_hash
104                                 kh_val(best_hash, k) = bam_dup1(b);
105                                 stack_insert(&stack, kh_val(best_hash, k));
106                         }
107                 } else { // paired, tail
108                         k = kh_get(name, del_set, bam1_qname(b));
109                         if (k != kh_end(del_set)) {
110                                 free((char*)kh_key(del_set, k));
111                                 kh_del(name, del_set, k);
112                         } else bam_write1(out, b);
113                 }
114                 last_pos = c->pos;
115         }
116         dump_best(&stack, best_hash, out);
117
118         bam_header_destroy(header);
119         clear_del_set(del_set);
120         kh_destroy(name, del_set);
121         kh_destroy(pos, best_hash);
122         free(stack.a);
123         bam_destroy1(b);
124         fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
125                         (double)n_removed/n_checked);
126 }
127 int bam_rmdup(int argc, char *argv[])
128 {
129         bamFile in, out;
130         if (argc < 3) {
131                 fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n\n");
132                 fprintf(stderr, "Note: Picard is recommended for this task.\n");
133                 return 1;
134         }
135         in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
136         out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
137         if (in == 0 || out == 0) {
138                 fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
139                 return 1;
140         }
141         bam_rmdup_core(in, out);
142         bam_close(in);
143         bam_close(out);
144         return 0;
145 }