X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bam_rmdupse.c;h=e7dbdc77d05f05d9c9622b484557df3504d6f4c1;hp=df03717232b10f9e23abe8ae0e7eefbc5a6c7d02;hb=016b50ab60e879a0b8f81cb76ce11ea360a03d4a;hpb=b27e00385f41769d03a8cca4dbd71275fc9fa906 diff --git a/bam_rmdupse.c b/bam_rmdupse.c index df03717..e7dbdc7 100644 --- a/bam_rmdupse.c +++ b/bam_rmdupse.c @@ -1,177 +1,159 @@ #include #include "sam.h" #include "khash.h" +#include "klist.h" -typedef struct { - int n, m; - int *a; -} listelem_t; - -KHASH_MAP_INIT_INT(32, listelem_t) - -#define BLOCK_SIZE 65536 +#define QUEUE_CLEAR_SIZE 0x100000 +#define MAX_POS 0x7fffffff typedef struct { + int endpos; + uint32_t score:31, discarded:1; bam1_t *b; - int rpos, score; -} elem_t; +} elem_t, *elem_p; +#define __free_elem(p) bam_destroy1((p)->data.b) +KLIST_INIT(q, elem_t, __free_elem) +typedef klist_t(q) queue_t; + +KHASH_MAP_INIT_INT(best, elem_p) +typedef khash_t(best) besthash_t; typedef struct { - int n, max, x; - elem_t *buf; -} buffer_t; + uint64_t n_checked, n_removed; + besthash_t *left, *rght; +} lib_aux_t; +KHASH_MAP_INIT_STR(lib, lib_aux_t) -static int fill_buf(samfile_t *in, buffer_t *buf) +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) { - int i, ret, last_tid, min_rpos = 0x7fffffff, capacity; - bam1_t *b = bam_init1(); - bam1_core_t *c = &b->core; - // squeeze out the empty cells at the beginning - for (i = 0; i < buf->n; ++i) - if (buf->buf[i].b) break; - if (i < buf->n) { // squeeze - if (i > 0) { - memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i)); - buf->n = buf->n - i; - } - } else buf->n = 0; - // calculate min_rpos - for (i = 0; i < buf->n; ++i) { - elem_t *e = buf->buf + i; - if (e->b && e->rpos >= 0 && e->rpos < min_rpos) - min_rpos = buf->buf[i].rpos; - } - // fill the buffer - buf->x = -1; - last_tid = buf->n? buf->buf[0].b->core.tid : -1; - capacity = buf->n + BLOCK_SIZE; - while ((ret = samread(in, b)) >= 0) { - elem_t *e; - uint8_t *qual = bam1_qual(b); - int is_mapped; - if (last_tid < 0) last_tid = c->tid; - if (c->tid != last_tid) { - if (buf->x < 0) buf->x = buf->n; - } - if (buf->n >= buf->max) { // enlarge - buf->max = buf->max? buf->max<<1 : 8; - buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max); - } - e = &buf->buf[buf->n++]; - e->b = bam_dup1(b); - e->rpos = -1; e->score = 0; - for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1; - e->score = (double)e->score / sqrt(c->l_qseq + 1); - is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1; - if (!is_mapped) e->score = -1; - if (is_mapped && (c->flag & BAM_FREVERSE)) { - e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b)); - if (min_rpos > e->rpos) min_rpos = e->rpos; - } - if (buf->n >= capacity) { - if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE; - else break; - } - } - if (ret >= 0 && buf->x < 0) buf->x = buf->n; - bam_destroy1(b); - return buf->n; + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->left = kh_init(best); + q->rght = kh_init(best); + q->n_checked = q->n_removed = 0; + return q; + } else return &kh_val(aux, k); +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; + return q; +} + +static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) +{ + elem_t *p = kl_pushp(q, queue); + p->discarded = 0; + p->endpos = endpos; p->score = score; + if (p->b == 0) p->b = bam_init1(); + bam_copy1(p->b, b); + return p; } -static void rmdupse_buf(buffer_t *buf) +static void clear_besthash(besthash_t *h, int32_t pos) { - khash_t(32) *h; - uint32_t key; khint_t k; - int mpos, i, upper; - listelem_t *p; - mpos = 0x7fffffff; - mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff; - upper = (buf->x < 0)? buf->n : buf->x; - // fill the hash table - h = kh_init(32); - for (i = 0; i < upper; ++i) { - elem_t *e = buf->buf + i; - int ret; - if (e->score < 0) continue; - if (e->rpos >= 0) { - if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1; - else continue; - } else { - if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1; - else continue; - } - k = kh_put(32, h, key, &ret); - p = &kh_val(h, k); - if (ret == 0) { // present in the hash table - if (p->n == p->m) { - p->m <<= 1; - p->a = (int*)realloc(p->a, p->m * sizeof(int)); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) + kh_del(best, h, k); +} + +static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) +{ + if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { + khint_t k; + while (1) { + elem_t *q; + if (queue->head == queue->tail) break; + q = &kl_val(queue->head); + if (q->discarded) { + q->b->data_len = 0; + kl_shift(q, queue, 0); + continue; } - p->a[p->n++] = i; - } else { - p->m = p->n = 1; - p->a = (int*)calloc(p->m, sizeof(int)); - p->a[0] = i; + if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; + samwrite(out, q->b); + q->b->data_len = 0; + kl_shift(q, queue, 0); } - } - // rmdup - for (k = kh_begin(h); k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - int max, maxi; - p = &kh_val(h, k); - // get the max - for (i = max = 0, maxi = -1; i < p->n; ++i) { - if (buf->buf[p->a[i]].score > max) { - max = buf->buf[p->a[i]].score; - maxi = i; - } - } - // mark the elements - for (i = 0; i < p->n; ++i) { - buf->buf[p->a[i]].score = -1; - if (i != maxi) { - bam_destroy1(buf->buf[p->a[i]].b); - buf->buf[p->a[i]].b = 0; - } + for (k = kh_begin(h); k != kh_end(h); ++k) { + if (kh_exist(h, k)) { + clear_besthash(kh_val(h, k).left, pos); + clear_besthash(kh_val(h, k).rght, pos); } - // free - free(p->a); } } - kh_destroy(32, h); } -static void dump_buf(buffer_t *buf, samfile_t *out) +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) { - int i; - for (i = 0; i < buf->n; ++i) { - elem_t *e = buf->buf + i; - if (e->score != -1) break; - if (e->b) { - samwrite(out, e->b); - bam_destroy1(e->b); - e->b = 0; + bam1_t *b; + queue_t *queue; + khint_t k; + int last_tid = -2; + khash_t(lib) *aux; + + aux = kh_init(lib); + b = bam_init1(); + queue = kl_init(q); + while (samread(in, b) >= 0) { + bam1_core_t *c = &b->core; + int endpos = bam_calend(c, bam1_cigar(b)); + int score = sum_qual(b); + + if (last_tid != c->tid) { + if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); + last_tid = c->tid; + } else dump_alignment(out, queue, c->pos, aux); + if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { + push_queue(queue, b, endpos, score); + } else { + const char *lib; + lib_aux_t *q; + besthash_t *h; + uint32_t key; + int ret; + lib = bam_get_library(in->header, b); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + h = (c->flag&BAM_FREVERSE)? q->rght : q->left; + key = (c->flag&BAM_FREVERSE)? endpos : c->pos; + k = kh_put(best, h, key, &ret); + if (ret == 0) { // in the hash table + elem_t *p = kh_val(h, k); + ++q->n_removed; + if (p->score < score) { + if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue + p->discarded = 1; + kh_val(h, k) = push_queue(queue, b, endpos, score); + } else { // replace + p->score = score; p->endpos = endpos; + bam_copy1(p->b, b); + } + } // otherwise, discard the alignment + } else kh_val(h, k) = push_queue(queue, b, endpos, score); } } -} + dump_alignment(out, queue, MAX_POS, aux); -int bam_rmdupse(int argc, char *argv[]) -{ - samfile_t *in, *out; - buffer_t *buf; - if (argc < 3) { - fprintf(stderr, "Usage: samtools rmdupse \n"); - return 1; - } - buf = calloc(1, sizeof(buffer_t)); - in = samopen(argv[1], "rb", 0); - out = samopen(argv[2], "wb", in->header); - while (fill_buf(in, buf)) { - rmdupse_buf(buf); - dump_buf(buf, out); + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(best, q->left); kh_destroy(best, q->rght); + free((char*)kh_key(aux, k)); + } } - samclose(in); samclose(out); - free(buf->buf); free(buf); - return 0; + kh_destroy(lib, aux); + bam_destroy1(b); + kl_destroy(q, queue); }