X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=samtools%2Fbam_rmdupse.c;fp=samtools%2Fbam_rmdupse.c;h=0000000000000000000000000000000000000000;hp=e7dbdc77d05f05d9c9622b484557df3504d6f4c1;hb=d02fe5283ed7a93a2f76a5d6dc6e37b40c11b9b1;hpb=d828f9c9aa78e3d1687265b52de841f3f3852089 diff --git a/samtools/bam_rmdupse.c b/samtools/bam_rmdupse.c deleted file mode 100644 index e7dbdc7..0000000 --- a/samtools/bam_rmdupse.c +++ /dev/null @@ -1,159 +0,0 @@ -#include -#include "sam.h" -#include "khash.h" -#include "klist.h" - -#define QUEUE_CLEAR_SIZE 0x100000 -#define MAX_POS 0x7fffffff - -typedef struct { - int endpos; - uint32_t score:31, discarded:1; - bam1_t *b; -} elem_t, *elem_p; -#define __free_elem(p) bam_destroy1((p)->data.b) -KLIST_INIT(q, elem_t, __free_elem) -typedef klist_t(q) queue_t; - -KHASH_MAP_INIT_INT(best, elem_p) -typedef khash_t(best) besthash_t; - -typedef struct { - uint64_t n_checked, n_removed; - besthash_t *left, *rght; -} lib_aux_t; -KHASH_MAP_INIT_STR(lib, lib_aux_t) - -static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) -{ - khint_t k = kh_get(lib, aux, lib); - if (k == kh_end(aux)) { - int ret; - char *p = strdup(lib); - lib_aux_t *q; - k = kh_put(lib, aux, p, &ret); - q = &kh_val(aux, k); - q->left = kh_init(best); - q->rght = kh_init(best); - q->n_checked = q->n_removed = 0; - return q; - } else return &kh_val(aux, k); -} - -static inline int sum_qual(const bam1_t *b) -{ - int i, q; - uint8_t *qual = bam1_qual(b); - for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; - return q; -} - -static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) -{ - elem_t *p = kl_pushp(q, queue); - p->discarded = 0; - p->endpos = endpos; p->score = score; - if (p->b == 0) p->b = bam_init1(); - bam_copy1(p->b, b); - return p; -} - -static void clear_besthash(besthash_t *h, int32_t pos) -{ - khint_t k; - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) - kh_del(best, h, k); -} - -static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) -{ - if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { - khint_t k; - while (1) { - elem_t *q; - if (queue->head == queue->tail) break; - q = &kl_val(queue->head); - if (q->discarded) { - q->b->data_len = 0; - kl_shift(q, queue, 0); - continue; - } - if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; - samwrite(out, q->b); - q->b->data_len = 0; - kl_shift(q, queue, 0); - } - for (k = kh_begin(h); k != kh_end(h); ++k) { - if (kh_exist(h, k)) { - clear_besthash(kh_val(h, k).left, pos); - clear_besthash(kh_val(h, k).rght, pos); - } - } - } -} - -void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) -{ - bam1_t *b; - queue_t *queue; - khint_t k; - int last_tid = -2; - khash_t(lib) *aux; - - aux = kh_init(lib); - b = bam_init1(); - queue = kl_init(q); - while (samread(in, b) >= 0) { - bam1_core_t *c = &b->core; - int endpos = bam_calend(c, bam1_cigar(b)); - int score = sum_qual(b); - - if (last_tid != c->tid) { - if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); - last_tid = c->tid; - } else dump_alignment(out, queue, c->pos, aux); - if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { - push_queue(queue, b, endpos, score); - } else { - const char *lib; - lib_aux_t *q; - besthash_t *h; - uint32_t key; - int ret; - lib = bam_get_library(in->header, b); - q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); - ++q->n_checked; - h = (c->flag&BAM_FREVERSE)? q->rght : q->left; - key = (c->flag&BAM_FREVERSE)? endpos : c->pos; - k = kh_put(best, h, key, &ret); - if (ret == 0) { // in the hash table - elem_t *p = kh_val(h, k); - ++q->n_removed; - if (p->score < score) { - if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue - p->discarded = 1; - kh_val(h, k) = push_queue(queue, b, endpos, score); - } else { // replace - p->score = score; p->endpos = endpos; - bam_copy1(p->b, b); - } - } // otherwise, discard the alignment - } else kh_val(h, k) = push_queue(queue, b, endpos, score); - } - } - dump_alignment(out, queue, MAX_POS, aux); - - for (k = kh_begin(aux); k != kh_end(aux); ++k) { - if (kh_exist(aux, k)) { - lib_aux_t *q = &kh_val(aux, k); - fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, - (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); - kh_destroy(best, q->left); kh_destroy(best, q->rght); - free((char*)kh_key(aux, k)); - } - } - kh_destroy(lib, aux); - bam_destroy1(b); - kl_destroy(q, queue); -}