+ property genotype:
+ '''the genotype called.'''
+ def __get__(self): return PyString_FromStringAndSize( &self._genotype, 1 )
+
+ property consensus_quality:
+ '''the genotype quality (Phred-scaled).'''
+ def __get__(self): return self._consensus_quality
+
+ property snp_quality:
+ '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+ def __get__(self): return self._snp_quality
+
+ property mapping_quality:
+ '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+ def __get__(self): return self._rms_mapping_quality
+
+ property coverage:
+ '''coverage or read depth - the number of reads involved in the call.'''
+ def __get__(self): return self._coverage
+
+ def __str__(self):
+
+ return "\t".join( map(str, (
+ self.tid,
+ self.pos,
+ self.reference_base,
+ self.genotype,
+ self.consensus_quality,
+ self.snp_quality,
+ self.mapping_quality,
+ self.coverage ) ) )
+
+
+cdef class SNPCallerBase:
+ '''Base class for SNP callers.
+
+ *min_baseQ*
+ minimum base quality (possibly capped by BAQ)
+ *capQ_threshold*
+ coefficient for adjusting mapQ of poor mappings
+ *theta*
+ theta in maq consensus calling model
+ *n_haplotypes*
+ number of haplotypes in the sample
+ *het_rate*
+ prior of a difference between two haplotypes
+ '''
+
+ cdef bam_maqcns_t * c
+ cdef IteratorColumn iter
+
+ def __cinit__(self,
+ IteratorColumn iterator_column,
+ **kwargs ):
+
+ self.iter = iterator_column
+ self.c = bam_maqcns_init()
+
+ # set the default parameterization according to
+ # samtools
+
+ # new default mode for samtools >0.1.10
+ self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
+
+ self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
+ # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
+ self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
+ self.c.het_rate = kwargs.get( "het_rate", 0.001 )
+ self.c.theta = kwargs.get( "theta", 0.83 )
+
+ if self.c.errmod != BAM_ERRMOD_MAQ2:
+ self.c.theta += 0.02
+
+ # call prepare AFTER setting parameters
+ bam_maqcns_prepare( self.c )
+
+ def __dealloc__(self):
+ bam_maqcns_destroy( self.c )
+
+ cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
+ '''debugging output.'''
+
+ pysam_dump_glf( g, self.c );
+ print ""
+ for x in range(self.iter.n_plp):
+ print "--> read %i %s %i" % (x,
+ bam1_qname(self.iter.plp[x].b),
+ self.iter.plp[x].qpos,
+ )
+
+ print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
+ % (self.iter.pos,
+ cns,
+ self.c.q_r,
+ self.iter.n_plp,
+ self.iter.n_plp,
+ rb,
+ cns >> 8 & 0xff,
+ cns >> 16 & 0xff,
+ cns & 0xff,
+ cns >> 28,
+ )
+
+ printf("-------------------------------------\n");
+ sys.stdout.flush()
+
+cdef class IteratorSNPCalls( SNPCallerBase ):
+ """*(IteratorColumn iterator)*
+
+ call SNPs within a region.
+
+ *iterator* is a pileup iterator. SNPs will be called
+ on all positions returned by this iterator.
+
+ This caller is fast if SNPs are called over large continuous
+ regions. It is slow, if instantiated frequently and in random
+ order as the sequence will have to be reloaded.
+
+ """
+
+ def __cinit__(self,
+ IteratorColumn iterator_column,
+ **kwargs ):
+
+ assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ """python version of next().
+ """
+
+ # the following code was adapted from bam_plcmd.c:pileup_func()
+ self.iter.cnext()
+
+ if self.iter.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.iter.plp == NULL:
+ raise StopIteration
+
+ cdef char * seq = self.iter.getSequence()
+ cdef int seq_len = self.iter.seq_len
+
+ assert seq != NULL
+
+ # reference base
+ if self.iter.pos >= seq_len:
+ raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+ cdef int rb = seq[self.iter.pos]
+ cdef uint32_t cns
+ cdef glf1_t * g
+
+ g = bam_maqcns_glfgen( self.iter.n_plp,
+ self.iter.plp,
+ bam_nt16_table[rb],
+ self.c )
+
+ if pysam_glf_depth( g ) == 0:
+ cns = 0xfu << 28 | 0xf << 24
+ else:
+ cns = glf2cns(g, <int>(self.c.q_r + .499))
+
+ free(g)
+
+ cdef SNPCall call
+
+ call = SNPCall()
+ call._tid = self.iter.tid
+ call._pos = self.iter.pos
+ call._reference_base = rb
+ call._genotype = bam_nt16_rev_table[cns>>28]
+ call._consensus_quality = cns >> 8 & 0xff
+ call._snp_quality = cns & 0xff
+ call._rms_mapping_quality = cns >> 16&0xff
+ call._coverage = self.iter.n_plp
+
+ return call
+
+cdef class SNPCaller( SNPCallerBase ):
+ '''*(IteratorColumn iterator_column )*
+
+ The samtools SNP caller.
+
+ This object will call SNPs in *samfile* against the reference
+ sequence in *fasta*.
+
+ This caller is fast for calling few SNPs in selected regions.
+
+ It is slow, if called over large genomic regions.
+ '''
+
+
+ def __cinit__(self,
+ IteratorColumn iterator_column,
+ **kwargs ):
+
+ pass
+
+ def call(self, reference, int pos ):
+ """call a snp on chromosome *reference*
+ and position *pos*.
+
+ returns a :class:`SNPCall` object.
+ """
+
+ cdef int tid = self.iter.samfile.gettid( reference )
+
+ self.iter.reset( tid, pos, pos + 1 )
+
+ while 1:
+ self.iter.cnext()
+
+ if self.iter.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.iter.plp == NULL:
+ raise ValueError( "no reads in region - no call" )
+
+ if self.iter.pos == pos: break
+
+ cdef char * seq = self.iter.getSequence()
+ cdef int seq_len = self.iter.seq_len
+
+ assert seq != NULL
+
+ # reference base
+ if self.iter.pos >= seq_len:
+ raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+ cdef int rb = seq[self.iter.pos]
+ cdef uint32_t cns
+ cdef glf1_t * g
+
+ g = bam_maqcns_glfgen( self.iter.n_plp,
+ self.iter.plp,
+ bam_nt16_table[rb],
+ self.c )
+
+
+ if pysam_glf_depth( g ) == 0:
+ cns = 0xfu << 28 | 0xf << 24
+ else:
+ cns = glf2cns(g, <int>(self.c.q_r + .499))
+
+ free(g)
+
+ cdef SNPCall call
+
+ call = SNPCall()
+ call._tid = self.iter.tid
+ call._pos = self.iter.pos
+ call._reference_base = rb
+ call._genotype = bam_nt16_rev_table[cns>>28]
+ call._consensus_quality = cns >> 8 & 0xff
+ call._snp_quality = cns & 0xff
+ call._rms_mapping_quality = cns >> 16&0xff
+ call._coverage = self.iter.n_plp
+
+ return call
+
+cdef class IndelCall:
+ '''the results of an indel call.'''
+ cdef int _tid
+ cdef int _pos
+ cdef int _coverage
+ cdef int _rms_mapping_quality
+ cdef bam_maqindel_ret_t * _r
+
+ def __cinit__(self):
+ #assert r != NULL
+ #self._r = r
+ pass
+
+ property tid:
+ '''the chromosome ID as is defined in the header'''
+ def __get__(self):
+ return self._tid
+
+ property pos:
+ '''nucleotide position of SNP.'''
+ def __get__(self): return self._pos
+
+ property genotype:
+ '''the genotype called.'''
+ def __get__(self):
+ if self._r.gt == 0:
+ s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+ return "%s/%s" % (s,s)
+ elif self._r.gt == 1:
+ s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+ return "%s/%s" % (s,s)
+ else:
+ return "%s/%s" % (self.first_allele, self.second_allele )
+
+ property consensus_quality:
+ '''the genotype quality (Phred-scaled).'''
+ def __get__(self): return self._r.q_cns
+
+ property snp_quality:
+ '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+ def __get__(self): return self._r.q_ref
+
+ property mapping_quality:
+ '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+ def __get__(self): return self._rms_mapping_quality
+
+ property coverage:
+ '''coverage or read depth - the number of reads involved in the call.'''
+ def __get__(self): return self._coverage
+
+ property first_allele:
+ '''sequence of first allele.'''
+ def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+
+ property second_allele:
+ '''sequence of second allele.'''
+ def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+
+ property reads_first:
+ '''reads supporting first allele.'''
+ def __get__(self): return self._r.cnt1
+
+ property reads_second:
+ '''reads supporting first allele.'''
+ def __get__(self): return self._r.cnt2
+
+ property reads_diff:
+ '''reads supporting first allele.'''
+ def __get__(self): return self._r.cnt_anti
+
+ def __str__(self):
+
+ return "\t".join( map(str, (
+ self.tid,
+ self.pos,
+ self.genotype,
+ self.consensus_quality,
+ self.snp_quality,
+ self.mapping_quality,
+ self.coverage,
+ self.first_allele,
+ self.second_allele,
+ self.reads_first,
+ self.reads_second,
+ self.reads_diff ) ) )
+
+ def __dealloc__(self ):
+ bam_maqindel_ret_destroy(self._r)
+
+cdef class IndelCallerBase:
+ '''Base class for SNP callers.
+
+ *min_baseQ*
+ minimum base quality (possibly capped by BAQ)
+ *capQ_threshold*
+ coefficient for adjusting mapQ of poor mappings
+ *theta*
+ theta in maq consensus calling model
+ *n_haplotypes*
+ number of haplotypes in the sample
+ *het_rate*
+ prior of a difference between two haplotypes
+ '''
+
+ cdef bam_maqindel_opt_t * options
+ cdef IteratorColumn iter
+ cdef int cap_mapQ
+ cdef int max_depth
+
+ def __cinit__(self,
+ IteratorColumn iterator_column,
+ **kwargs ):
+
+
+ self.iter = iterator_column
+
+ assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
+
+ self.options = bam_maqindel_opt_init()
+
+ # set the default parameterization according to
+ # samtools
+
+ self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
+ self.options.q_indel = kwargs.get( "q_indel", 40 )
+ self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
+ self.max_depth = kwargs.get( "max_depth", 1024 )
+
+ def __dealloc__(self):
+ free( self.options )
+
+ def _call( self ):
+
+ cdef char * seq = self.iter.getSequence()
+ cdef int seq_len = self.iter.seq_len
+
+ assert seq != NULL
+
+ # reference base
+ if self.iter.pos >= seq_len:
+ raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+ cdef bam_maqindel_ret_t * r
+
+ cdef int m = min( self.max_depth, self.iter.n_plp )
+
+ # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
+ # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
+ # self.options.indel_err, self.options.ambi_thres );
+
+ r = bam_maqindel(m,
+ self.iter.pos,
+ self.options,
+ self.iter.plp,
+ seq,
+ 0,
+ NULL)
+
+ if r == NULL: return None
+
+ cdef IndelCall call
+ call = IndelCall()
+ call._r = r
+ call._tid = self.iter.tid
+ call._pos = self.iter.pos
+ call._coverage = self.iter.n_plp
+
+ cdef uint64_t rms_aux = 0
+ cdef int i = 0
+ cdef bam_pileup1_t * p
+ cdef int tmp
+
+ for i from 0 <= i < self.iter.n_plp:
+ p = self.iter.plp + i
+ if p.b.core.qual < self.cap_mapQ:
+ tmp = p.b.core.qual
+ else:
+ tmp = self.cap_mapQ
+ rms_aux += tmp * tmp
+
+ call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
+
+ return call
+
+cdef class IndelCaller( IndelCallerBase ):
+ '''*(IteratorColumn iterator_column )*
+
+ The samtools SNP caller.
+
+ This object will call SNPs in *samfile* against the reference
+ sequence in *fasta*.
+
+ This caller is fast for calling few SNPs in selected regions.
+
+ It is slow, if called over large genomic regions.
+ '''
+
+ def __cinit__(self,
+ IteratorColumn iterator_column,
+ **kwargs ):
+
+ pass
+
+ def call(self, reference, int pos ):
+ """call a snp on chromosome *reference*
+ and position *pos*.
+
+ returns a :class:`SNPCall` object or None, if no indel call could be made.
+ """
+
+ cdef int tid = self.iter.samfile.gettid( reference )
+
+ self.iter.reset( tid, pos, pos + 1 )
+
+ while 1:
+ self.iter.cnext()
+
+ if self.iter.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.iter.plp == NULL:
+ raise ValueError( "no reads in region - no call" )
+
+ if self.iter.pos == pos: break
+
+ return self._call()
+
+cdef class IteratorIndelCalls( IndelCallerBase ):
+ """*(IteratorColumn iterator)*
+
+ call indels within a region.
+
+ *iterator* is a pileup iterator. SNPs will be called
+ on all positions returned by this iterator.
+
+ This caller is fast if SNPs are called over large continuous
+ regions. It is slow, if instantiated frequently and in random
+ order as the sequence will have to be reloaded.
+
+ """
+
+ def __cinit__(self,
+ IteratorColumn iterator_column,
+ **kwargs ):
+ pass
+
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ """python version of next().
+ """
+
+ # the following code was adapted from bam_plcmd.c:pileup_func()
+ self.iter.cnext()
+
+ if self.iter.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.iter.plp == NULL:
+ raise StopIteration
+
+ return self._call()
+
+
+
+cdef class IndexedReads:
+ """index a bamfile by read.
+
+ The index is kept in memory.
+
+ By default, the file is re-openend to avoid conflicts if
+ multiple operators work on the same file. Set *reopen* = False
+ to not re-open *samfile*.
+ """
+
+ cdef Samfile samfile
+ cdef samfile_t * fp
+ cdef index
+ # true if samfile belongs to this object
+ cdef int owns_samfile
+
+ def __init__(self, Samfile samfile, int reopen = True ):
+ self.samfile = samfile
+
+ if samfile.isbam: mode = "rb"
+ else: mode = "r"
+
+ # reopen the file - note that this makes the iterator
+ # slow and causes pileup to slow down significantly.
+ if reopen:
+ store = StderrStore()
+ self.fp = samopen( samfile._filename, mode, NULL )
+ store.release()
+ assert self.fp != NULL
+ self.owns_samfile = True
+ else:
+ self.fp = samfile.samfile
+ self.owns_samfile = False
+
+ assert samfile.isbam, "can only IndexReads on bam files"
+
+ def build( self ):
+ '''build index.'''
+
+ self.index = collections.defaultdict( list )
+
+ # this method will start indexing from the current file position
+ # if you decide
+ cdef int ret = 1
+ cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
+
+ cdef uint64_t pos
+
+ while ret > 0:
+ pos = bam_tell( self.fp.x.bam )
+ ret = samread( self.fp, b)
+ if ret > 0:
+ qname = bam1_qname( b )
+ self.index[qname].append( pos )
+
+ bam_destroy1( b )
+
+ def find( self, qname ):
+ if qname in self.index:
+ return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
+ else:
+ raise KeyError( "read %s not found" % qname )
+
+ def __dealloc__(self):
+ if self.owns_samfile: samclose( self.fp )