#include "bam_endian.h"
#include "knetfile.h"
#include "pysam_util.h"
+#include "errmod.h" // for pysam_dump
+
+#ifndef inline
+#define inline __inline
+#endif
+
+// Definition of pysamerr
+#include "stdio.h"
+FILE * pysamerr = NULL;
+
+FILE * pysam_set_stderr( FILE * f )
+{
+ pysamerr = f;
+ return f;
+}
// #######################################################
// utility routines to avoid using callbacks in bam_fetch
// taken from bam_index.c
// The order of the following declarations is important.
// #######################################################
+#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
+// initialize hashes
typedef struct
{
uint64_t u, v;
#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(myoff, pair64_t, pair64_lt);
+
typedef struct {
uint32_t m, n;
pair64_t *list;
uint64_t *offset;
} bam_lidx_t;
-KSORT_INIT(my_off, pair64_t, pair64_lt);
-KHASH_MAP_INIT_INT(my_i, bam_binlist_t);
+
+// initialize hashes ('i' and 's' are identifiers)
+KHASH_MAP_INIT_INT(i, bam_binlist_t);
+KHASH_MAP_INIT_STR(s, int)
struct __bam_index_t
{
int32_t n;
- khash_t(my_i) **index;
+ uint64_t n_no_coor; // unmapped reads without coordinate
+ khash_t(i) **index;
bam_lidx_t *index2;
};
bam_pileup1_t *pu;
int flag_mask;
};
-
+
static mempool_t *mp_init()
{
mempool_t *mp;
}
assert(x > pos); // otherwise a bug
return ret;
-}
-
-
-
+}
// the following code has been taken from bam_plbuf_push
// and modified such that instead of a function call
// the function returns and will continue (if cont is true).
// 1: if buf is full and can be emitted
// 0: if b has been added
// -1: if there was an error
-int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont)
+// Advance the pileup iterator held in buf->iter by one step.
+//
+// The pointer returned by bam_plp_next() is stored in *plp; tid, pos
+// and n_plp are handed through to bam_plp_next() unchanged and are
+// filled in by it. The argument b is not used by this function.
+//
+// Returns:
+//  1: bam_plp_next() returned a pileup (*plp is not NULL)
+//  0: bam_plp_next() returned NULL
+int pysam_pileup_next(const bam1_t *b,
+ bam_plbuf_t *buf,
+ bam_pileup1_t ** plp,
+ int * tid,
+ int * pos,
+ int * n_plp )
{
- if (!cont)
- {
- if (b) { // fill buffer
- if (b->core.tid < 0) return 0;
- if (b->core.flag & buf->flag_mask) return 0;
- bam_copy1(&buf->tail->b, b);
- buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
- if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) {
- fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n");
- abort();
- }
- buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
- if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) {
- buf->tail->next = mp_alloc(buf->mp);
- buf->tail = buf->tail->next;
- }
- } else buf->is_eof = 1;
- }
- else
- // continue end of loop
- {
- // update tid and pos
- if (buf->head->next) {
- if (buf->tid > buf->head->b.core.tid) {
- fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
- return -1;
- }
- }
- if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
- buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
- } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
- buf->pos = buf->head->beg; // jump to the next position
- } else ++buf->pos; // scan contiguously
- if (buf->is_eof && buf->head->next == 0) return 0;
- }
+ *plp = bam_plp_next(buf->iter, tid, pos, n_plp);
+ // Test the returned pileup, not the out-pointer itself: plp has
+ // already been dereferenced on the line above, so comparing plp
+ // against NULL could never signal the end of the iteration.
+ if (*plp == NULL) return 0;
+ return 1;
+}
+
+typedef struct __bmc_aux_t {
+ int max;
+ uint32_t *info;
+ uint16_t *info16;
+ errmod_t *em;
+} bmc_aux_t;
- // enter yield loop
- while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos))
+// Return the number of mapped reads recorded in the index for
+// reference tid.
+//
+// The count is read from list[1].u of the bin stored under the key
+// BAM_MAX_BIN in idx->index[tid].
+//
+// Returns 0 if tid is negative (reads without a coordinate are never
+// mapped), if tid is not smaller than idx->n, or if the index holds
+// no BAM_MAX_BIN entry for tid.
+//
+// NOTE: the stored counter is 64 bit wide; it is narrowed to the
+// uint32_t return type.
+uint32_t pysam_get_mapped( const bam_index_t *idx, const int tid )
+{
+
+ // the upper bound keeps idx->index[tid] inside the array
+ if (tid >= 0 && tid < idx->n)
{
- int n_pu = 0;
- lbnode_t *p, *q;
- buf->dummy->next = buf->head;
- for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) {
- if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
- q->next = p->next; mp_free(buf->mp, p); p = q;
- } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
- if (n_pu == buf->max_pu) { // then double the capacity
- buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
- buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
- }
- buf->pu[n_pu].b = &p->b;
- if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
- }
- }
- buf->head = buf->dummy->next; // dummy->next may be changed
+ khint_t k;
+ khash_t(i) *h = idx->index[tid];
+ k = kh_get(i, h, BAM_MAX_BIN);
- // exit if alignments need to be emitted
- if (n_pu) { return n_pu; }
-
- // update tid and pos
- if (buf->head->next) {
- if (buf->tid > buf->head->b.core.tid) {
- fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
- return -2;
- }
- }
- if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
- buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
- } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
- buf->pos = buf->head->beg; // jump to the next position
- } else ++buf->pos; // scan contiguously
- if (buf->is_eof && buf->head->next == 0) break;
+ // kh_end(h) means the index has no BAM_MAX_BIN entry for tid
+ if (k != kh_end(h))
+ return kh_val(h, k).list[1].u;
+ else
+ return 0;
}
+
return 0;
}
-int pysam_get_pos( const bam_plbuf_t *buf)
+// Return the number of unmapped reads recorded in the index for
+// reference tid.
+//
+// For tid >= 0 the count is read from list[1].v of the bin stored
+// under the key BAM_MAX_BIN in idx->index[tid]; 0 is returned if tid
+// is not smaller than idx->n or if the index holds no such bin.
+// For tid < 0 the number of unmapped reads without a coordinate
+// (idx->n_no_coor) is returned.
+//
+// NOTE: the stored counters are 64 bit wide; they are narrowed to the
+// uint32_t return type.
+uint32_t pysam_get_unmapped( const bam_index_t *idx, const int tid )
{
- return buf->pos;
+
+ if (tid >= 0)
+ {
+ khint_t k;
+ khash_t(i) *h;
+
+ // keep idx->index[tid] inside the array
+ if (tid >= idx->n) return 0;
+
+ h = idx->index[tid];
+ k = kh_get(i, h, BAM_MAX_BIN);
+
+ // kh_end(h) means the index has no BAM_MAX_BIN entry for tid
+ if (k != kh_end(h))
+ return kh_val(h, k).list[1].v;
+ else
+ return 0;
+ }
+
+ return idx->n_no_coor;
}
+/* uint32_t pysam_glf_depth( glf1_t * g ) */
+/* { */
+/* return g->depth; */
+/* } */
+
+
+/* void pysam_dump_glf( glf1_t * g, bam_maqcns_t * c ) */
+/* { */
+/* int x = 0; */
+/* fprintf(stderr, */
+/* "glf: ref_base=%i, max_mapQ=%i, min_lk=%i, depth=%i", */
+/* g->ref_base, */
+/* g->max_mapQ, */
+/* g->min_lk, */
+/* g->depth ); */
+
+/* for (x = 0; x < 10; ++x) */
+/* fprintf(stderr, ", lk%x=%i, ", x, g->lk[x]); */
+
+/* fprintf(stderr, */
+/* "maqcns: het_rate=%f, theta=%f, n_hap=%i, cap_mapQ=%i, errmod=%i, min_baseQ=%i, eta=%f, q_r=%f, aux_max=%i", */
+/* c->het_rate, */
+/* c->theta, */
+/* c->n_hap, */
+/* c->cap_mapQ, */
+/* c->errmod, */
+/* c->min_baseQ, */
+/* c->eta, */
+/* c->q_r, */
+/* c->aux->max); */
-int pysam_get_tid( const bam_plbuf_t *buf)
-{
- return buf->tid;
-}
+/* for (x = 0; x < c->aux->max; ++x) */
+/* { */
+/* fprintf(stderr, ", info-%i=%i ", x, c->aux->info[x]); */
+/* if (c->aux->info[x] == 0) break; */
+/* } */
+
+/* for (x = 0; x < c->aux->max; ++x) */
+/* { */
+/* fprintf(stderr, ", info16-%i=%i ", x, c->aux->info16[x]); */
+/* if (c->aux->info16[x] == 0) break; */
+/* } */
+/* } */
-bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf)
-{
- return buf->pu;
-}
+
+
// pysam dispatch function to emulate the samtools
// command line within python.
// taken from the main function in bamtk.c
// added code to reset getopt
-extern int main_samview(int argc, char *argv[]);
-extern int main_import(int argc, char *argv[]);
-extern int bam_pileup(int argc, char *argv[]);
-extern int bam_merge(int argc, char *argv[]);
-extern int bam_sort(int argc, char *argv[]);
-extern int bam_index(int argc, char *argv[]);
-extern int faidx_main(int argc, char *argv[]);
-extern int bam_mating(int argc, char *argv[]);
-extern int bam_rmdup(int argc, char *argv[]);
-extern int glf3_view_main(int argc, char *argv[]);
-extern int bam_flagstat(int argc, char *argv[]);
-extern int bam_fillmd(int argc, char *argv[]);
+int bam_taf2baf(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
+int bam_merge(int argc, char *argv[]);
+int bam_index(int argc, char *argv[]);
+int bam_sort(int argc, char *argv[]);
+int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
+int bam_idxstats(int argc, char *argv[]);
+int main_samview(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
+int main_cut_target(int argc, char *argv[]);
+int main_phase(int argc, char *argv[]);
+int main_cat(int argc, char *argv[]);
+int main_depth(int argc, char *argv[]);
+int main_bam2fq(int argc, char *argv[]);
+int faidx_main(int argc, char *argv[]);
int pysam_dispatch(int argc, char *argv[] )
{
-
+ extern int optind;
#ifdef _WIN32
setmode(fileno(stdout), O_BINARY);
setmode(fileno(stdin), O_BINARY);
knet_win32_init();
#endif
#endif
-
- extern int optind;
- // reset getop
+ // reset getopt
optind = 1;
if (argc < 2) return 1;
if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1);
else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1);
- else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1);
+ else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1);
else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1);
else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);
else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "idxstats") == 0) return bam_idxstats(argc-1, argv+1);
else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
- else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
-
+ else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1);
+ else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1);
+ else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1);
+ else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1);
+ else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1);
+ else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1);
+
#if _CURSES_LIB != 0
else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
#endif
return 0;
}
-// standin for bam_destroy1 in bam.h
-// deletes all variable length data
-void pysam_bam_destroy1( bam1_t * b )
-{
- if (b == NULL) return;
- if (b->data != NULL) free(b->data);
- free(b);
-}
-
// taken from samtools/bam_import.c
static inline uint8_t *alloc_data(bam1_t *b, size_t size)
{
uint8_t * pos )
{
int d = nbytes_new-nbytes_old;
+ int new_size;
+ size_t offset;
// no change
if (d == 0) return b;
- int new_size = d + b->data_len;
- size_t offset = pos - b->data;
+ new_size = d + b->data_len;
+ offset = pos - b->data;
//printf("d=%i, old=%i, new=%i, old_size=%i, new_size=%i\n",
// d, nbytes_old, nbytes_new, b->data_len, new_size);
return bam_nt16_table[s];
}
-// stand-ins for samtools macros in bam.h
-char * pysam_bam1_qname( const bam1_t * b)
-{
- return (char*)b->data;
-}
-
-uint32_t * pysam_bam1_cigar( const bam1_t * b)
-{
- return (uint32_t*)(b->data + b->core.l_qname);
-}
-uint8_t * pysam_bam1_seq( const bam1_t * b)
-{
- return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname);
-}
-
-uint8_t * pysam_bam1_qual( const bam1_t * b)
-{
- return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + (b->core.l_qseq + 1)/2);
-}
+void bam_init_header_hash(bam_header_t *header);
-uint8_t * pysam_bam1_aux( const bam1_t * b)
+// translate a reference string *s* to a tid
+// code taken from bam_parse_region
+int pysam_reference2tid( bam_header_t *header, const char * s )
{
- return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + b->core.l_qseq + (b->core.l_qseq + 1)/2);
-}
-
-// #######################################################
-// Iterator implementation
-// #######################################################
+
+ khiter_t iter;
+ khash_t(s) *h;
+
+ bam_init_header_hash(header);
+ h = (khash_t(s)*)header->hash;
-// functions defined in bam_index.c
-extern pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off);
+ iter = kh_get(s, h, s); /* get the ref_id */
+ if (iter == kh_end(h)) { // name not found
+ return -1;
+ }
-static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
-{
- uint32_t rbeg = b->core.pos;
- uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
- return (rend > beg && rbeg < end);
+ return kh_value(h, iter);
}
-struct __bam_fetch_iterator_t
-{
- bam1_t * b;
- pair64_t * off;
- int n_off;
- uint64_t curr_off;
- int curr_chunk;
- bamFile fp;
- int tid;
- int beg;
- int end;
- int n_seeks;
-};
-
-bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end)
-{
- // iterator contains current alignment position
- // and will contain actual alignment during iterations
- bam_fetch_iterator_t* iter = (bam_fetch_iterator_t*)calloc(1, sizeof(bam_fetch_iterator_t));
- iter->b = (bam1_t*)calloc(1, sizeof(bam1_t));
-
- // list of chunks containing our alignments
- iter->off = get_chunk_coordinates(idx, tid, beg, end, &iter->n_off);
-
- // initialise other state variables in iterator
- iter->fp = fp;
- iter->curr_chunk = -1;
- iter->curr_off = 0;
- iter->n_seeks = 0;
- iter->tid = tid;
- iter->beg = beg;
- iter->end = end;
- return iter;
-}
-bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter)
-{
- if (!iter->off) {
- return 0;
- }
+
- int ret;
- // iterate through all alignments in chunks
- for (;;) {
- if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->curr_chunk].v) { // then jump to the next chunk
- if (iter->curr_chunk == iter->n_off - 1) break; // no more chunks
- if (iter->curr_chunk >= 0) assert(iter->curr_off == iter->off[iter->curr_chunk].v); // otherwise bug
- if (iter->curr_chunk < 0 || iter->off[iter->curr_chunk].v != iter->off[iter->curr_chunk+1].u) { // not adjacent chunks; then seek
- bam_seek(iter->fp, iter->off[iter->curr_chunk+1].u, SEEK_SET);
- iter->curr_off = bam_tell(iter->fp);
- ++iter->n_seeks;
- }
- ++iter->curr_chunk;
- }
- if ((ret = bam_read1(iter->fp, iter->b)) > 0) {
- iter->curr_off = bam_tell(iter->fp);
- if (iter->b->core.tid != iter->tid || iter->b->core.pos >= iter->end) break; // no need to proceed
- else if (is_overlap(iter->beg, iter->end, iter->b))
- //
- //func(iter->b, data);
- //
- return iter->b;
- } else
- return 0; // end of file
- }
- return 0;
-}
-void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter)
-{
- // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", iter->n_seeks);
- bam_destroy1(iter->b);
- free(iter->off);
-}
-