Imported Upstream version 0.5
[pysam.git] / pysam / pysam_util.c
index 5360626dd45a1fd767562b68c95c75cb1085d0dd..c9d4543eea72bd3aa193dfff3c7c0f9f5edfac91 100644 (file)
@@ -6,6 +6,21 @@
 #include "bam_endian.h"
 #include "knetfile.h"
 #include "pysam_util.h"
+#include "errmod.h" // for pysam_dump 
+
+#ifndef inline
+#define inline __inline
+#endif
+
+// Definition of pysamerr
+#include "stdio.h"
+FILE * pysamerr = NULL;
+
+FILE * pysam_set_stderr( FILE * f )
+{
+  pysamerr = f;
+  return f;
+}
 
 // #######################################################
 // utility routines to avoid using callbacks in bam_fetch
@@ -32,6 +47,7 @@ typedef struct {
 
 KSORT_INIT(my_off, pair64_t, pair64_lt);
 KHASH_MAP_INIT_INT(my_i, bam_binlist_t);
+KHASH_MAP_INIT_STR(s, int)
 
 struct __bam_index_t
 {
@@ -141,11 +157,8 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
        }
        assert(x > pos); // otherwise a bug
        return ret;
-}
-
-
-
 
+}
 // the following code has been taken from bam_plbuf_push
 // and modified such that instead of a function call
 // the function returns and will continue (if cont is true).
@@ -155,100 +168,72 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
 // 1: if buf is full and can be emitted
 // 0: if b has been added
 // -1: if there was an error
-int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont)
+int pysam_pileup_next(const bam1_t *b,
+                     bam_plbuf_t *buf,
+                     bam_pileup1_t ** plp,
+                     int * tid,
+                     int * pos,
+                     int * n_plp )
 {
-  if (!cont)
-    {
-      if (b) { // fill buffer
-       if (b->core.tid < 0) return 0;
-       if (b->core.flag & buf->flag_mask) return 0;
-       bam_copy1(&buf->tail->b, b);
-       buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
-       if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) {
-         fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n");
-         abort();
-       }
-       buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
-       if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) {
-         buf->tail->next = mp_alloc(buf->mp);
-         buf->tail = buf->tail->next;
-       }
-      } else buf->is_eof = 1;
-    }
-  else
-    // continue end of loop
-    {
-      // update tid and pos
-      if (buf->head->next) {
-       if (buf->tid > buf->head->b.core.tid) {
-         fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
-         return -1;
-       }
-      }
-      if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
-       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
-      } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
-       buf->pos = buf->head->beg; // jump to the next position
-      } else ++buf->pos; // scan contiguously
-      if (buf->is_eof && buf->head->next == 0) return 0;
-    }
-
-  // enter yield loop
-  while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos))
-    {
-      int n_pu = 0;
-      lbnode_t *p, *q;
-      buf->dummy->next = buf->head;
-      for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) {
-       if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
-         q->next = p->next; mp_free(buf->mp, p); p = q;
-       } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
-         if (n_pu == buf->max_pu) { // then double the capacity
-           buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
-           buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
-         }
-         buf->pu[n_pu].b = &p->b;
-         if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
-       }
-      }
-      buf->head = buf->dummy->next; // dummy->next may be changed
-
-      // exit if alignments need to be emitted
-      if (n_pu) { return n_pu; }
-      
-      // update tid and pos
-      if (buf->head->next) {
-       if (buf->tid > buf->head->b.core.tid) {
-         fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
-         return -2;
-       }
-      }
-      if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
-       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
-      } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
-       buf->pos = buf->head->beg; // jump to the next position
-      } else ++buf->pos; // scan contiguously
-      if (buf->is_eof && buf->head->next == 0) break;
-    }
-  return 0;
+  *plp = bam_plp_next(buf->iter, tid, pos, n_plp);
+  if (plp == NULL) return 0;
+  return 1;
 }
 
-int pysam_get_pos( const bam_plbuf_t *buf) 
-{
-  return buf->pos;
-}
+typedef struct __bmc_aux_t {
+       int max;
+       uint32_t *info;
+       uint16_t *info16;
+       errmod_t *em;
+} bmc_aux_t;
 
-  
-int pysam_get_tid( const bam_plbuf_t *buf)
+uint32_t pysam_glf_depth( glf1_t * g )
 {
-  return buf->tid;
+  return g->depth;
 }
 
-bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf)
+
+void pysam_dump_glf( glf1_t * g, bam_maqcns_t * c )
 {
-  return buf->pu;
+  int x = 0;
+  fprintf(stderr,
+         "glf: ref_base=%i, max_mapQ=%i, min_lk=%i, depth=%i",
+         g->ref_base,
+         g->max_mapQ,
+         g->min_lk,
+         g->depth );
+
+  for (x = 0; x < 10; ++x) 
+    fprintf(stderr, ", lk%x=%i, ", x, g->lk[x]);
+
+  fprintf(stderr,
+         "maqcns: het_rate=%f, theta=%f, n_hap=%i, cap_mapQ=%i, errmod=%i, min_baseQ=%i, eta=%f, q_r=%f, aux_max=%i",
+         c->het_rate,
+         c->theta,
+         c->n_hap,
+         c->cap_mapQ,
+         c->errmod,
+         c->min_baseQ,
+         c->eta,
+         c->q_r,
+         c->aux->max);
+  
+  for (x = 0; x < c->aux->max; ++x)
+    {
+      fprintf(stderr, ", info-%i=%i ", x, c->aux->info[x]);
+      if (c->aux->info[x] == 0) break;
+    }
+  
+  for (x = 0; x < c->aux->max; ++x)
+    {
+      fprintf(stderr, ", info16-%i=%i ", x, c->aux->info16[x]);
+      if (c->aux->info16[x] == 0) break;
+    }
 }
 
+
+  
+
 // pysam dispatch function to emulate the samtools
 // command line within python.
 // taken from the main function in bamtk.c
@@ -268,7 +253,7 @@ extern int bam_fillmd(int argc, char *argv[]);
 
 int pysam_dispatch(int argc, char *argv[] )
 {
-
+  extern int optind;
 #ifdef _WIN32
   setmode(fileno(stdout), O_BINARY);
   setmode(fileno(stdin),  O_BINARY);
@@ -276,8 +261,6 @@ int pysam_dispatch(int argc, char *argv[] )
   knet_win32_init();
 #endif
 #endif
-
-  extern int optind;
   
   // reset getop
   optind = 1;
@@ -289,7 +272,7 @@ int pysam_dispatch(int argc, char *argv[] )
   else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1);
   else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1);
   else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);
-  else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);
+  else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); 
   else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
   else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
   else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
@@ -309,15 +292,6 @@ int pysam_dispatch(int argc, char *argv[] )
   return 0;
 }
 
-// standin for bam_destroy1 in bam.h
-// deletes all variable length data
-void pysam_bam_destroy1( bam1_t * b )
-{
-  if (b == NULL) return;
-  if (b->data != NULL) free(b->data);
-  free(b);
-}
-
 // taken from samtools/bam_import.c
 static inline uint8_t *alloc_data(bam1_t *b, size_t size)
 {
@@ -340,12 +314,14 @@ bam1_t * pysam_bam_update( bam1_t * b,
                           uint8_t * pos )
 {
   int d = nbytes_new-nbytes_old;
+  int new_size;
+  size_t offset;
 
   // no change
   if (d == 0) return b;
 
-  int new_size = d + b->data_len;
-  size_t offset = pos - b->data;
+  new_size = d + b->data_len;
+  offset = pos - b->data;
 
   //printf("d=%i, old=%i, new=%i, old_size=%i, new_size=%i\n",
   // d, nbytes_old, nbytes_new, b->data_len, new_size);
@@ -379,121 +355,32 @@ unsigned char pysam_translate_sequence( const unsigned char s )
   return bam_nt16_table[s];
 }
 
-// stand-ins for samtools macros in bam.h
-char * pysam_bam1_qname( const bam1_t * b)
-{
-  return (char*)b->data;
-}
 
-uint32_t * pysam_bam1_cigar( const bam1_t * b) 
-{
-  return (uint32_t*)(b->data + b->core.l_qname);
-}
-
-uint8_t * pysam_bam1_seq( const bam1_t * b) 
-{
-  return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname);
-}
+void bam_init_header_hash(bam_header_t *header);
 
-uint8_t * pysam_bam1_qual( const bam1_t * b)
+// translate a reference string *s* to a tid
+// code taken from bam_parse_region
+int pysam_reference2tid( bam_header_t *header, const char * s )
 {
-  return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + (b->core.l_qseq + 1)/2);
-}
-
-uint8_t * pysam_bam1_aux( const bam1_t * b)
-{
-  return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + b->core.l_qseq + (b->core.l_qseq + 1)/2);
-}
-
-// #######################################################
-// Iterator implementation
-// #######################################################
+  
+  khiter_t iter;
+  khash_t(s) *h;
+  
+  bam_init_header_hash(header);
+  h = (khash_t(s)*)header->hash;
 
-// functions defined in bam_index.c
-extern pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off);
+  iter = kh_get(s, h, s); /* get the ref_id */
+  if (iter == kh_end(h)) { // name not found
+    return -1;
+  }
 
-static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
-{
-       uint32_t rbeg = b->core.pos;
-       uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
-       return (rend > beg && rbeg < end);
+  return kh_value(h, iter);
 }
 
-struct __bam_fetch_iterator_t
-{
-  bam1_t *        b;
-  pair64_t *      off;
-  int             n_off;
-  uint64_t        curr_off;
-  int             curr_chunk;
-  bamFile              fp;
-  int                          tid;
-  int                          beg;
-  int                          end;
-  int             n_seeks;
-};
-bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end)
-{
-       // iterator contains current alignment position
-       //      and will contain actual alignment during iterations
-       bam_fetch_iterator_t* iter  = (bam_fetch_iterator_t*)calloc(1, sizeof(bam_fetch_iterator_t));
-       iter->b                     = (bam1_t*)calloc(1, sizeof(bam1_t));
-               
-       // list of chunks containing our alignments
-       iter->off = get_chunk_coordinates(idx, tid, beg, end, &iter->n_off);
-       
-       // initialise other state variables in iterator
-       iter->fp                = fp;
-       iter->curr_chunk        = -1;   
-       iter->curr_off          =  0;
-       iter->n_seeks           =  0;    
-       iter->tid                               = tid;
-       iter->beg                               = beg;
-       iter->end                               = end;
-       return iter;
-}
 
-bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter)
-{
-       if (!iter->off) {
-               return 0;
-       }
+  
 
-       int ret;
-       // iterate through all alignments in chunks
-       for (;;) {
-               if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->curr_chunk].v) { // then jump to the next chunk
-                       if (iter->curr_chunk == iter->n_off - 1) break; // no more chunks
-                       if (iter->curr_chunk >= 0) assert(iter->curr_off == iter->off[iter->curr_chunk].v); // otherwise bug
-                       if (iter->curr_chunk < 0 || iter->off[iter->curr_chunk].v != iter->off[iter->curr_chunk+1].u) { // not adjacent chunks; then seek
-                               bam_seek(iter->fp, iter->off[iter->curr_chunk+1].u, SEEK_SET);
-                               iter->curr_off = bam_tell(iter->fp);
-                               ++iter->n_seeks;
-                       }
-                       ++iter->curr_chunk;
-               }
-               if ((ret = bam_read1(iter->fp, iter->b)) > 0) {
-                       iter->curr_off = bam_tell(iter->fp);
-                       if (iter->b->core.tid != iter->tid || iter->b->core.pos >= iter->end) break; // no need to proceed
-                       else if (is_overlap(iter->beg, iter->end, iter->b)) 
-                               //
-                               //func(iter->b, data);
-                               //
-                               return iter->b;
-               } else 
-                       return 0; // end of file
-       }
-       return 0;
-}
 
-void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter)
-{
-  //  fprintf(stderr, "[bam_fetch] # seek calls: %d\n", iter->n_seeks);
-  bam_destroy1(iter->b);
-  free(iter->off);
-}
 
-