X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2Fcsamtools.pyx;h=471a445bed09a53bb137f9f2023ef56879d5b5fc;hp=ddd599889b4525548417a43246de4c408d8faaea;hb=e1756c41e7a1d7cc01fb95e42df9dd04da2d2991;hpb=ca46ef4ba4a883c57cea62d5bf1bc021f1185109 diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx index ddd5998..471a445 100644 --- a/pysam/csamtools.pyx +++ b/pysam/csamtools.pyx @@ -413,8 +413,7 @@ cdef int mate_callback( bam1_t *alignment, void *f): cdef class Samfile: - '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None, - add_sq_text = False )* + '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None)* A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened. @@ -428,11 +427,11 @@ cdef class Samfile: f = pysam.Samfile('ex1.bam','rb') - If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following + If mode is not specified, we will try to auto-detect in the order 'r', 'rb', thus both the following should work:: f1 = pysam.Samfile('ex1.bam' ) - f2 = pysam.Samfile('ex1.sam' ) + f2 = pysam.Samfile('ex1.bam' ) If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random access to reads via :meth:`fetch` and :meth:`pileup` is disabled. @@ -450,8 +449,6 @@ cdef class Samfile: 3. If *text* is given, new header text is copied from raw text. 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists. - By default, 'SQ' and 'LN' tags will be added to the header text. This option can be - changed by unsetting the flag *add_sq_text*. ''' @@ -459,7 +456,6 @@ cdef class Samfile: self.samfile = NULL self._filename = NULL self.isbam = False - self.isstream = False self._open( *args, **kwargs ) # allocate memory for iterator @@ -482,7 +478,6 @@ cdef class Samfile: text = None, header = None, port = None, - add_sq_text = True, ): '''open a sam/bam file. @@ -493,21 +488,20 @@ cdef class Samfile: # read mode autodetection if mode is None: try: - self._open(filename, 'rb', template=template, + self._open(filename, 'r', template=template, referencenames=referencenames, referencelengths=referencelengths, text=text, header=header, port=port) return except ValueError, msg: pass - - self._open(filename, 'r', template=template, + self._open(filename, 'rb', template=template, referencenames=referencenames, referencelengths=referencelengths, text=text, header=header, port=port) return - assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode + assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode assert filename != NULL # close a previously opened file @@ -519,7 +513,6 @@ cdef class Samfile: if self._filename != NULL: free(self._filename ) self._filename = strdup( filename ) - self.isstream = strcmp( filename, "-" ) == 0 self.isbam = len(mode) > 1 and mode[1] == 'b' @@ -558,13 +551,6 @@ cdef class Samfile: header_to_write.target_name[x] = calloc(len(name)+1, sizeof(char)) strncpy( header_to_write.target_name[x], name, len(name) ) - # Optionally, if there is no text, add a SAM compatible header to output - # file. - if text is None and add_sq_text: - text = '' - for x from 0 <= x < header_to_write.n_targets: - text += "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] ) - if text != None: # copy without \0 ctext = text @@ -601,9 +587,8 @@ cdef class Samfile: raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode ) #disabled for autodetection to work - # needs to be disabled so that reading from sam-files without headers works - #if self.samfile.header.n_targets == 0: - # raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode) + if self.samfile.header.n_targets == 0: + raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode) if self.samfile == NULL: raise IOError("could not open file `%s`" % filename ) @@ -623,10 +608,7 @@ cdef class Samfile: self.index = bam_index_load(filename) if self.index == NULL: raise IOError("error while opening index `%s` " % filename ) - - if not self.isstream: - self.start_offset = bam_tell( self.samfile.x.bam ) - + def gettid( self, reference ): ''' convert :term:`reference` name into numerical :term:`tid` @@ -706,10 +688,6 @@ cdef class Samfile: return 1, rtid, rstart, rend - def reset( self ): - '''reset file position to beginning of read section.''' - return self.seek( self.start_offset, 0 ) - def seek( self, uint64_t offset, int where = 0): ''' move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`. @@ -719,9 +697,6 @@ cdef class Samfile: raise ValueError( "I/O operation on closed file" ) if not self.isbam: raise NotImplementedError("seek only available in bam files") - if self.isstream: - raise OSError("seek no available in streams") - return bam_seek( self.samfile.x.bam, offset, where ) def tell( self ): @@ -747,11 +722,10 @@ cdef class Samfile: :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. - Without *reference* or *region* all mapped reads will be fetched. The reads will be returned + Without *reference* or *region* all reads will be fetched. The reads will be returned ordered by reference sequence, which will not necessarily be the order within the file. - If *until_eof* is given, all reads from the current file position will be returned - in order as they are within the file. Using this option will also fetch unmapped reads. + *in order as they are within the file*. If only *reference* is set, all reads aligned to *reference* will be fetched. @@ -770,9 +744,6 @@ cdef class Samfile: has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) - if self.isstream: reopen = False - else: reopen = True - if self.isbam: if not until_eof and not self._hasIndex() and not self.isremote: raise ValueError( "fetch called on bamfile without index" ) @@ -789,26 +760,24 @@ cdef class Samfile: fetch_callback ) else: if has_coord: - return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen ) + return IteratorRowRegion( self, rtid, rstart, rend ) else: if until_eof: - return IteratorRowAll( self, reopen=reopen ) + return IteratorRowAll( self ) else: - # AH: check - reason why no reopen for AllRefs? - return IteratorRowAllRefs(self ) # , reopen=reopen ) + return IteratorRowAllRefs(self) else: # check if header is present - otherwise sam_read1 aborts # this happens if a bamfile is opened with mode 'r' - if has_coord: - raise ValueError ("fetching by region is not available for sam files" ) - if self.samfile.header.n_targets == 0: raise ValueError( "fetch called for samfile without header") - + + if region != None: + raise ValueError ("fetch for a region is not available for sam files" ) if callback: raise NotImplementedError( "callback not implemented yet" ) else: - return IteratorRowAll( self, reopen=reopen ) + return IteratorRowAll( self ) def mate( self, AlignedRead read ): @@ -936,14 +905,15 @@ cdef class Samfile: mask Skip all reads with bits set in mask. - max_depth - Maximum read depth permitted. The default limit is *8000*. .. note:: *all* reads which overlap the region are returned. The first base returned will be the first base of the first read *not* necessarily the first base of the region used in the query. + The maximum number of reads considered for pileup is *8000*. This limit is set by + :term:`csamtools`. + ''' cdef int rtid, rstart, rend, has_coord cdef bam_plbuf_t *buf @@ -1051,33 +1021,6 @@ cdef class Samfile: t.append( self.samfile.header.target_len[x] ) return tuple(t) - property mapped: - """total number of mapped reads in file. - """ - def __get__(self): - if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) - if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" ) - - cdef int tid - cdef uint32_t total = 0 - for tid from 0 <= tid < self.samfile.header.n_targets: - total += pysam_get_mapped( self.index, tid ) - return total - - property unmapped: - """total number of unmapped reads in file. - """ - def __get__(self): - if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) - if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" ) - cdef int tid - cdef uint32_t total = 0 - for tid from 0 <= tid < self.samfile.header.n_targets: - total += pysam_get_unmapped( self.index, tid ) - # get unmapped reads without coordinates - total += pysam_get_unmapped( self.index, -1 ) - return total - property text: '''full contents of the :term:`sam file` header as a string.''' def __get__(self): @@ -1215,8 +1158,6 @@ cdef class Samfile: ############################################################### def __iter__(self): if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) - if not self.isbam and self.samfile.header.n_targets == 0: - raise NotImplementedError( "can not iterate over samfile without header") return self cdef bam1_t * getCurrent( self ): @@ -1358,10 +1299,10 @@ cdef class IteratorRowAll(IteratorRow): to not re-open *samfile*. """ - # cdef bam1_t * b - # cdef samfile_t * fp - # # true if samfile belongs to this object - # cdef int owns_samfile + cdef bam1_t * b + cdef samfile_t * fp + # true if samfile belongs to this object + cdef int owns_samfile def __cinit__(self, Samfile samfile, int reopen = True ): @@ -1638,8 +1579,8 @@ cdef class IteratorColumn: A :class:`FastaFile` object mask Skip all reads with bits set in mask. - max_depth - maximum read depth. The default is 8000. + + ''' # result of the last plbuf_push @@ -1654,14 +1595,12 @@ cdef class IteratorColumn: cdef Samfile samfile cdef Fastafile fastafile cdef stepper - cdef int max_depth def __cinit__( self, Samfile samfile, **kwargs ): self.samfile = samfile self.mask = kwargs.get("mask", BAM_DEF_MASK ) self.fastafile = kwargs.get( "fastafile", None ) self.stepper = kwargs.get( "stepper", None ) - self.max_depth = kwargs.get( "max_depth", 8000 ) self.iterdata.seq = NULL self.tid = 0 self.pos = 0 @@ -1669,7 +1608,6 @@ cdef class IteratorColumn: self.plp = NULL self.pileup_iter = NULL - def __iter__(self): return self @@ -1739,9 +1677,6 @@ cdef class IteratorColumn: else: raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper) - if self.max_depth: - bam_plp_set_maxcnt( self.pileup_iter, self.max_depth ) - bam_plp_set_mask( self.pileup_iter, self.mask ) cdef reset( self, tid, start, end ): @@ -2265,9 +2200,6 @@ cdef class AlignedRead: read.tags = read.tags + [("RG",0)] - - This method will happily write the same tag - multiple times. """ def __get__(self): cdef char * ctag @@ -2277,7 +2209,7 @@ cdef class AlignedRead: cdef char auxtype src = self._delegate - if src.l_aux == 0: return [] + if src.l_aux == 0: return None s = bam1_aux( src ) result = [] @@ -2449,7 +2381,7 @@ cdef class AlignedRead: '''length of the read (read only). Returns 0 if not given.''' def __get__(self): return self._delegate.core.l_qseq property aend: - '''aligned end position of the read on the reference genome. Returns + '''aligned end position of the read (read only). Returns None if not available.''' def __get__(self): cdef bam1_t * src @@ -2458,7 +2390,7 @@ cdef class AlignedRead: return None return bam_calend(&src.core, bam1_cigar(src)) property alen: - '''aligned length of the read on the reference genome. Returns None if + '''aligned length of the read (read only). Returns None if not available.''' def __get__(self): cdef bam1_t * src @@ -2474,30 +2406,14 @@ cdef class AlignedRead: def __get__(self): return self._delegate.core.qual def __set__(self, qual): self._delegate.core.qual = qual property mrnm: - """the :term:`reference` id of the mate - deprecated, use RNEXT instead. - """ - def __get__(self): return self._delegate.core.mtid - def __set__(self, mtid): self._delegate.core.mtid = mtid - property rnext: """the :term:`reference` id of the mate """ def __get__(self): return self._delegate.core.mtid def __set__(self, mtid): self._delegate.core.mtid = mtid property mpos: - """the position of the mate - deprecated, use PNEXT instead.""" - def __get__(self): return self._delegate.core.mpos - def __set__(self, mpos): self._delegate.core.mpos = mpos - property pnext: """the position of the mate""" def __get__(self): return self._delegate.core.mpos def __set__(self, mpos): self._delegate.core.mpos = mpos property isize: - """the insert size - deprecated: use tlen instead""" - def __get__(self): return self._delegate.core.isize - def __set__(self, isize): self._delegate.core.isize = isize - property tlen: """the insert size""" def __get__(self): return self._delegate.core.isize def __set__(self, isize): self._delegate.core.isize = isize @@ -2593,10 +2509,8 @@ cdef class AlignedRead: pos += l return result - def overlap( self, uint32_t start, uint32_t end ): - """return number of aligned bases of read overlapping the interval *start* and *end* - on the reference sequence. + """return number of bases on reference overlapping *start* and *end* """ cdef uint32_t k, i, pos, overlap cdef int op, o @@ -2825,7 +2739,7 @@ def _samtools_dispatch( method, '''call ``method`` in samtools providing arguments in args. .. note:: - This method redirects stdout and (optionally) stderr to capture it + This method redirects stdout and stderr to capture it from samtools. If for some reason stdout/stderr disappears the reason might be in this method. @@ -2842,8 +2756,8 @@ def _samtools_dispatch( method, ''' # note that debugging this module can be a problem - # as stdout/stderr will not appear on the terminal - + # as stdout/stderr will not appear + # some special cases if method == "index": if not os.path.exists( args[0] ): @@ -2867,6 +2781,7 @@ def _samtools_dispatch( method, if "-o" in args: raise ValueError("option -o is forbidden in samtools view") args = ( "-o", stdout_f ) + args + # do the function call to samtools cdef char ** cargs cdef int i, n, retval @@ -2877,7 +2792,6 @@ def _samtools_dispatch( method, cargs[0] = "samtools" cargs[1] = method for i from 0 <= i < n: cargs[i+2] = args[i] - retval = pysam_dispatch(n+2, cargs) free( cargs ) @@ -2956,500 +2870,500 @@ cdef class SNPCall: self.coverage ) ) ) -# cdef class SNPCallerBase: -# '''Base class for SNP callers. - -# *min_baseQ* -# minimum base quality (possibly capped by BAQ) -# *capQ_threshold* -# coefficient for adjusting mapQ of poor mappings -# *theta* -# theta in maq consensus calling model -# *n_haplotypes* -# number of haplotypes in the sample -# *het_rate* -# prior of a difference between two haplotypes -# ''' - -# cdef bam_maqcns_t * c -# cdef IteratorColumn iter - -# def __cinit__(self, -# IteratorColumn iterator_column, -# **kwargs ): - -# self.iter = iterator_column -# self.c = bam_maqcns_init() - -# # set the default parameterization according to -# # samtools - -# # new default mode for samtools >0.1.10 -# self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 ) - -# self.c.min_baseQ = kwargs.get( "min_baseQ", 13 ) -# # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 ) -# self.c.n_hap = kwargs.get( "n_haplotypes", 2 ) -# self.c.het_rate = kwargs.get( "het_rate", 0.001 ) -# self.c.theta = kwargs.get( "theta", 0.83 ) - -# if self.c.errmod != BAM_ERRMOD_MAQ2: -# self.c.theta += 0.02 - -# # call prepare AFTER setting parameters -# bam_maqcns_prepare( self.c ) - -# def __dealloc__(self): -# bam_maqcns_destroy( self.c ) - - # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ): - # '''debugging output.''' - - # pysam_dump_glf( g, self.c ); - # print "" - # for x in range(self.iter.n_plp): - # print "--> read %i %s %i" % (x, - # bam1_qname(self.iter.plp[x].b), - # self.iter.plp[x].qpos, - # ) - - # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \ - # % (self.iter.pos, - # cns, - # self.c.q_r, - # self.iter.n_plp, - # self.iter.n_plp, - # rb, - # cns >> 8 & 0xff, - # cns >> 16 & 0xff, - # cns & 0xff, - # cns >> 28, - # ) - - # printf("-------------------------------------\n"); - # sys.stdout.flush() - -# cdef class IteratorSNPCalls( SNPCallerBase ): -# """*(IteratorColumn iterator)* - -# call SNPs within a region. - -# *iterator* is a pileup iterator. SNPs will be called -# on all positions returned by this iterator. - -# This caller is fast if SNPs are called over large continuous -# regions. It is slow, if instantiated frequently and in random -# order as the sequence will have to be reloaded. - -# """ +cdef class SNPCallerBase: + '''Base class for SNP callers. + + *min_baseQ* + minimum base quality (possibly capped by BAQ) + *capQ_threshold* + coefficient for adjusting mapQ of poor mappings + *theta* + theta in maq consensus calling model + *n_haplotypes* + number of haplotypes in the sample + *het_rate* + prior of a difference between two haplotypes + ''' + + cdef bam_maqcns_t * c + cdef IteratorColumn iter + + def __cinit__(self, + IteratorColumn iterator_column, + **kwargs ): + + self.iter = iterator_column + self.c = bam_maqcns_init() + + # set the default parameterization according to + # samtools + + # new default mode for samtools >0.1.10 + self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 ) + + self.c.min_baseQ = kwargs.get( "min_baseQ", 13 ) + # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 ) + self.c.n_hap = kwargs.get( "n_haplotypes", 2 ) + self.c.het_rate = kwargs.get( "het_rate", 0.001 ) + self.c.theta = kwargs.get( "theta", 0.83 ) + + if self.c.errmod != BAM_ERRMOD_MAQ2: + self.c.theta += 0.02 + + # call prepare AFTER setting parameters + bam_maqcns_prepare( self.c ) + + def __dealloc__(self): + bam_maqcns_destroy( self.c ) + + cdef __dump( self, glf1_t * g, uint32_t cns, int rb ): + '''debugging output.''' + + pysam_dump_glf( g, self.c ); + print "" + for x in range(self.iter.n_plp): + print "--> read %i %s %i" % (x, + bam1_qname(self.iter.plp[x].b), + self.iter.plp[x].qpos, + ) + + print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \ + % (self.iter.pos, + cns, + self.c.q_r, + self.iter.n_plp, + self.iter.n_plp, + rb, + cns >> 8 & 0xff, + cns >> 16 & 0xff, + cns & 0xff, + cns >> 28, + ) + + printf("-------------------------------------\n"); + sys.stdout.flush() + +cdef class IteratorSNPCalls( SNPCallerBase ): + """*(IteratorColumn iterator)* + + call SNPs within a region. + + *iterator* is a pileup iterator. SNPs will be called + on all positions returned by this iterator. + + This caller is fast if SNPs are called over large continuous + regions. It is slow, if instantiated frequently and in random + order as the sequence will have to be reloaded. + + """ -# def __cinit__(self, -# IteratorColumn iterator_column, -# **kwargs ): + def __cinit__(self, + IteratorColumn iterator_column, + **kwargs ): -# assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence" + assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence" -# def __iter__(self): -# return self + def __iter__(self): + return self -# def __next__(self): -# """python version of next(). -# """ + def __next__(self): + """python version of next(). + """ -# # the following code was adapted from bam_plcmd.c:pileup_func() -# self.iter.cnext() + # the following code was adapted from bam_plcmd.c:pileup_func() + self.iter.cnext() -# if self.iter.n_plp < 0: -# raise ValueError("error during iteration" ) + if self.iter.n_plp < 0: + raise ValueError("error during iteration" ) -# if self.iter.plp == NULL: -# raise StopIteration + if self.iter.plp == NULL: + raise StopIteration -# cdef char * seq = self.iter.getSequence() -# cdef int seq_len = self.iter.seq_len + cdef char * seq = self.iter.getSequence() + cdef int seq_len = self.iter.seq_len -# assert seq != NULL + assert seq != NULL -# # reference base -# if self.iter.pos >= seq_len: -# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) + # reference base + if self.iter.pos >= seq_len: + raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) -# cdef int rb = seq[self.iter.pos] -# cdef uint32_t cns -# cdef glf1_t * g + cdef int rb = seq[self.iter.pos] + cdef uint32_t cns + cdef glf1_t * g -# g = bam_maqcns_glfgen( self.iter.n_plp, -# self.iter.plp, -# bam_nt16_table[rb], -# self.c ) + g = bam_maqcns_glfgen( self.iter.n_plp, + self.iter.plp, + bam_nt16_table[rb], + self.c ) -# if pysam_glf_depth( g ) == 0: -# cns = 0xfu << 28 | 0xf << 24 -# else: -# cns = glf2cns(g, (self.c.q_r + .499)) - -# free(g) + if pysam_glf_depth( g ) == 0: + cns = 0xfu << 28 | 0xf << 24 + else: + cns = glf2cns(g, (self.c.q_r + .499)) + + free(g) -# cdef SNPCall call + cdef SNPCall call -# call = SNPCall() -# call._tid = self.iter.tid -# call._pos = self.iter.pos -# call._reference_base = rb -# call._genotype = bam_nt16_rev_table[cns>>28] -# call._consensus_quality = cns >> 8 & 0xff -# call._snp_quality = cns & 0xff -# call._rms_mapping_quality = cns >> 16&0xff -# call._coverage = self.iter.n_plp + call = SNPCall() + call._tid = self.iter.tid + call._pos = self.iter.pos + call._reference_base = rb + call._genotype = bam_nt16_rev_table[cns>>28] + call._consensus_quality = cns >> 8 & 0xff + call._snp_quality = cns & 0xff + call._rms_mapping_quality = cns >> 16&0xff + call._coverage = self.iter.n_plp -# return call + return call -# cdef class SNPCaller( SNPCallerBase ): -# '''*(IteratorColumn iterator_column )* +cdef class SNPCaller( SNPCallerBase ): + '''*(IteratorColumn iterator_column )* -# The samtools SNP caller. + The samtools SNP caller. -# This object will call SNPs in *samfile* against the reference -# sequence in *fasta*. + This object will call SNPs in *samfile* against the reference + sequence in *fasta*. -# This caller is fast for calling few SNPs in selected regions. + This caller is fast for calling few SNPs in selected regions. -# It is slow, if called over large genomic regions. -# ''' + It is slow, if called over large genomic regions. + ''' -# def __cinit__(self, -# IteratorColumn iterator_column, -# **kwargs ): + def __cinit__(self, + IteratorColumn iterator_column, + **kwargs ): -# pass + pass -# def call(self, reference, int pos ): -# """call a snp on chromosome *reference* -# and position *pos*. + def call(self, reference, int pos ): + """call a snp on chromosome *reference* + and position *pos*. -# returns a :class:`SNPCall` object. -# """ + returns a :class:`SNPCall` object. + """ -# cdef int tid = self.iter.samfile.gettid( reference ) + cdef int tid = self.iter.samfile.gettid( reference ) -# self.iter.reset( tid, pos, pos + 1 ) + self.iter.reset( tid, pos, pos + 1 ) -# while 1: -# self.iter.cnext() + while 1: + self.iter.cnext() -# if self.iter.n_plp < 0: -# raise ValueError("error during iteration" ) + if self.iter.n_plp < 0: + raise ValueError("error during iteration" ) -# if self.iter.plp == NULL: -# raise ValueError( "no reads in region - no call" ) + if self.iter.plp == NULL: + raise ValueError( "no reads in region - no call" ) -# if self.iter.pos == pos: break - -# cdef char * seq = self.iter.getSequence() -# cdef int seq_len = self.iter.seq_len - -# assert seq != NULL - -# # reference base -# if self.iter.pos >= seq_len: -# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) - -# cdef int rb = seq[self.iter.pos] -# cdef uint32_t cns -# # cdef glf1_t * g -# # -# # g = bam_maqcns_glfgen( self.iter.n_plp, -# # self.iter.plp, -# # bam_nt16_table[rb], -# # self.c ) -# ## -# # -# # if pysam_glf_depth( g ) == 0: -# # cns = 0xfu << 28 | 0xf << 24 -# # else: -# # cns = glf2cns(g, (self.c.q_r + .499)) -# # -# # free(g) + if self.iter.pos == pos: break + + cdef char * seq = self.iter.getSequence() + cdef int seq_len = self.iter.seq_len + + assert seq != NULL + + # reference base + if self.iter.pos >= seq_len: + raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) + + cdef int rb = seq[self.iter.pos] + cdef uint32_t cns + cdef glf1_t * g + + g = bam_maqcns_glfgen( self.iter.n_plp, + self.iter.plp, + bam_nt16_table[rb], + self.c ) + + + if pysam_glf_depth( g ) == 0: + cns = 0xfu << 28 | 0xf << 24 + else: + cns = glf2cns(g, (self.c.q_r + .499)) + + free(g) -# cdef SNPCall call - -# call = SNPCall() -# call._tid = self.iter.tid -# call._pos = self.iter.pos -# call._reference_base = rb -# call._genotype = bam_nt16_rev_table[cns>>28] -# call._consensus_quality = cns >> 8 & 0xff -# call._snp_quality = cns & 0xff -# call._rms_mapping_quality = cns >> 16&0xff -# call._coverage = self.iter.n_plp - -# return call - -# cdef class IndelCall: -# '''the results of an indel call.''' -# cdef int _tid -# cdef int _pos -# cdef int _coverage -# cdef int _rms_mapping_quality -# cdef bam_maqindel_ret_t * _r - -# def __cinit__(self): -# #assert r != NULL -# #self._r = r -# pass - -# property tid: -# '''the chromosome ID as is defined in the header''' -# def __get__(self): -# return self._tid + cdef SNPCall call + + call = SNPCall() + call._tid = self.iter.tid + call._pos = self.iter.pos + call._reference_base = rb + call._genotype = bam_nt16_rev_table[cns>>28] + call._consensus_quality = cns >> 8 & 0xff + call._snp_quality = cns & 0xff + call._rms_mapping_quality = cns >> 16&0xff + call._coverage = self.iter.n_plp + + return call + +cdef class IndelCall: + '''the results of an indel call.''' + cdef int _tid + cdef int _pos + cdef int _coverage + cdef int _rms_mapping_quality + cdef bam_maqindel_ret_t * _r + + def __cinit__(self): + #assert r != NULL + #self._r = r + pass + + property tid: + '''the chromosome ID as is defined in the header''' + def __get__(self): + return self._tid -# property pos: -# '''nucleotide position of SNP.''' -# def __get__(self): return self._pos - -# property genotype: -# '''the genotype called.''' -# def __get__(self): -# if self._r.gt == 0: -# s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) -# return "%s/%s" % (s,s) -# elif self._r.gt == 1: -# s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) -# return "%s/%s" % (s,s) -# else: -# return "%s/%s" % (self.first_allele, self.second_allele ) - -# property consensus_quality: -# '''the genotype quality (Phred-scaled).''' -# def __get__(self): return self._r.q_cns - -# property snp_quality: -# '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.''' -# def __get__(self): return self._r.q_ref - -# property mapping_quality: -# '''the root mean square (rms) of the mapping quality of all reads involved in the call.''' -# def __get__(self): return self._rms_mapping_quality - -# property coverage: -# '''coverage or read depth - the number of reads involved in the call.''' -# def __get__(self): return self._coverage - -# property first_allele: -# '''sequence of first allele.''' -# def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) - -# property second_allele: -# '''sequence of second allele.''' -# def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) - -# property reads_first: -# '''reads supporting first allele.''' -# def __get__(self): return self._r.cnt1 - -# property reads_second: -# '''reads supporting first allele.''' -# def __get__(self): return self._r.cnt2 - -# property reads_diff: -# '''reads supporting first allele.''' -# def __get__(self): return self._r.cnt_anti - -# def __str__(self): - -# return "\t".join( map(str, ( -# self.tid, -# self.pos, -# self.genotype, -# self.consensus_quality, -# self.snp_quality, -# self.mapping_quality, -# self.coverage, -# self.first_allele, -# self.second_allele, -# self.reads_first, -# self.reads_second, -# self.reads_diff ) ) ) - -# def __dealloc__(self ): -# bam_maqindel_ret_destroy(self._r) - -# cdef class IndelCallerBase: -# '''Base class for SNP callers. - -# *min_baseQ* -# minimum base quality (possibly capped by BAQ) -# *capQ_threshold* -# coefficient for adjusting mapQ of poor mappings -# *theta* -# theta in maq consensus calling model -# *n_haplotypes* -# number of haplotypes in the sample -# *het_rate* -# prior of a difference between two haplotypes -# ''' - -# cdef bam_maqindel_opt_t * options -# cdef IteratorColumn iter -# cdef int cap_mapQ -# cdef int max_depth - -# def __cinit__(self, -# IteratorColumn iterator_column, -# **kwargs ): - - -# self.iter = iterator_column - -# assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence" - -# self.options = bam_maqindel_opt_init() - -# # set the default parameterization according to -# # samtools - -# self.options.r_indel = kwargs.get( "r_indel", 0.00015 ) -# self.options.q_indel = kwargs.get( "q_indel", 40 ) -# self.cap_mapQ = kwargs.get( "cap_mapQ", 60 ) -# self.max_depth = kwargs.get( "max_depth", 1024 ) - -# def __dealloc__(self): -# free( self.options ) + property pos: + '''nucleotide position of SNP.''' + def __get__(self): return self._pos -# def _call( self ): + property genotype: + '''the genotype called.''' + def __get__(self): + if self._r.gt == 0: + s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) + return "%s/%s" % (s,s) + elif self._r.gt == 1: + s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) + return "%s/%s" % (s,s) + else: + return "%s/%s" % (self.first_allele, self.second_allele ) -# cdef char * seq = self.iter.getSequence() -# cdef int seq_len = self.iter.seq_len + property consensus_quality: + '''the genotype quality (Phred-scaled).''' + def __get__(self): return self._r.q_cns -# assert seq != NULL + property snp_quality: + '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.''' + def __get__(self): return self._r.q_ref -# # reference base -# if self.iter.pos >= seq_len: -# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) + property mapping_quality: + '''the root mean square (rms) of the mapping quality of all reads involved in the call.''' + def __get__(self): return self._rms_mapping_quality -# cdef bam_maqindel_ret_t * r - -# cdef int m = min( self.max_depth, self.iter.n_plp ) - -# # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n", -# # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty, -# # self.options.indel_err, self.options.ambi_thres ); - -# r = bam_maqindel(m, -# self.iter.pos, -# self.options, -# self.iter.plp, -# seq, -# 0, -# NULL) - -# if r == NULL: return None + property coverage: + '''coverage or read depth - the number of reads involved in the call.''' + def __get__(self): return self._coverage + + property first_allele: + '''sequence of first allele.''' + def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) + + property second_allele: + '''sequence of second allele.''' + def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) + + property reads_first: + '''reads supporting first allele.''' + def __get__(self): return self._r.cnt1 + + property reads_second: + '''reads supporting first allele.''' + def __get__(self): return self._r.cnt2 + + property reads_diff: + '''reads supporting first allele.''' + def __get__(self): return self._r.cnt_anti + + def __str__(self): + + return "\t".join( map(str, ( + self.tid, + self.pos, + self.genotype, + self.consensus_quality, + self.snp_quality, + self.mapping_quality, + self.coverage, + self.first_allele, + self.second_allele, + self.reads_first, + self.reads_second, + self.reads_diff ) ) ) + + def __dealloc__(self ): + bam_maqindel_ret_destroy(self._r) + +cdef class IndelCallerBase: + '''Base class for SNP callers. + + *min_baseQ* + minimum base quality (possibly capped by BAQ) + *capQ_threshold* + coefficient for adjusting mapQ of poor mappings + *theta* + theta in maq consensus calling model + *n_haplotypes* + number of haplotypes in the sample + *het_rate* + prior of a difference between two haplotypes + ''' + + cdef bam_maqindel_opt_t * options + cdef IteratorColumn iter + cdef int cap_mapQ + cdef int max_depth + + def __cinit__(self, + IteratorColumn iterator_column, + **kwargs ): + + + self.iter = iterator_column + + assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence" -# cdef IndelCall call -# call = IndelCall() -# call._r = r -# call._tid = self.iter.tid -# call._pos = self.iter.pos -# call._coverage = self.iter.n_plp + self.options = bam_maqindel_opt_init() -# cdef uint64_t rms_aux = 0 -# cdef int i = 0 -# cdef bam_pileup1_t * p -# cdef int tmp + # set the default parameterization according to + # samtools -# for i from 0 <= i < self.iter.n_plp: -# p = self.iter.plp + i -# if p.b.core.qual < self.cap_mapQ: -# tmp = p.b.core.qual -# else: -# tmp = self.cap_mapQ -# rms_aux += tmp * tmp + self.options.r_indel = kwargs.get( "r_indel", 0.00015 ) + self.options.q_indel = kwargs.get( "q_indel", 40 ) + self.cap_mapQ = kwargs.get( "cap_mapQ", 60 ) + self.max_depth = kwargs.get( "max_depth", 1024 ) -# call._rms_mapping_quality = (sqrt(rms_aux / self.iter.n_plp) + .499) + def __dealloc__(self): + free( self.options ) -# return call + def _call( self ): -# cdef class IndelCaller( IndelCallerBase ): -# '''*(IteratorColumn iterator_column )* + cdef char * seq = self.iter.getSequence() + cdef int seq_len = self.iter.seq_len -# The samtools SNP caller. + assert seq != NULL -# This object will call SNPs in *samfile* against the reference -# sequence in *fasta*. + # reference base + if self.iter.pos >= seq_len: + raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) -# This caller is fast for calling few SNPs in selected regions. + cdef bam_maqindel_ret_t * r + + cdef int m = min( self.max_depth, self.iter.n_plp ) + + # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n", + # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty, + # self.options.indel_err, self.options.ambi_thres ); + + r = bam_maqindel(m, + self.iter.pos, + self.options, + self.iter.plp, + seq, + 0, + NULL) + + if r == NULL: return None + + cdef IndelCall call + call = IndelCall() + call._r = r + call._tid = self.iter.tid + call._pos = self.iter.pos + call._coverage = self.iter.n_plp + + cdef uint64_t rms_aux = 0 + cdef int i = 0 + cdef bam_pileup1_t * p + cdef int tmp + + for i from 0 <= i < self.iter.n_plp: + p = self.iter.plp + i + if p.b.core.qual < self.cap_mapQ: + tmp = p.b.core.qual + else: + tmp = self.cap_mapQ + rms_aux += tmp * tmp -# It is slow, if called over large genomic regions. -# ''' + call._rms_mapping_quality = (sqrt(rms_aux / self.iter.n_plp) + .499) -# def __cinit__(self, -# IteratorColumn iterator_column, -# **kwargs ): + return call -# pass +cdef class IndelCaller( IndelCallerBase ): + '''*(IteratorColumn iterator_column )* -# def call(self, reference, int pos ): -# """call a snp on chromosome *reference* -# and position *pos*. + The samtools SNP caller. -# returns a :class:`SNPCall` object or None, if no indel call could be made. -# """ + This object will call SNPs in *samfile* against the reference + sequence in *fasta*. -# cdef int tid = self.iter.samfile.gettid( reference ) + This caller is fast for calling few SNPs in selected regions. -# self.iter.reset( tid, pos, pos + 1 ) + It is slow, if called over large genomic regions. + ''' -# while 1: -# self.iter.cnext() + def __cinit__(self, + IteratorColumn iterator_column, + **kwargs ): + + pass + + def call(self, reference, int pos ): + """call a snp on chromosome *reference* + and position *pos*. + + returns a :class:`SNPCall` object or None, if no indel call could be made. + """ + + cdef int tid = self.iter.samfile.gettid( reference ) + + self.iter.reset( tid, pos, pos + 1 ) + + while 1: + self.iter.cnext() -# if self.iter.n_plp < 0: -# raise ValueError("error during iteration" ) + if self.iter.n_plp < 0: + raise ValueError("error during iteration" ) -# if self.iter.plp == NULL: -# raise ValueError( "no reads in region - no call" ) + if self.iter.plp == NULL: + raise ValueError( "no reads in region - no call" ) -# if self.iter.pos == pos: break + if self.iter.pos == pos: break -# return self._call() + return self._call() -# cdef class IteratorIndelCalls( IndelCallerBase ): -# """*(IteratorColumn iterator)* +cdef class IteratorIndelCalls( IndelCallerBase ): + """*(IteratorColumn iterator)* -# call indels within a region. + call indels within a region. -# *iterator* is a pileup iterator. SNPs will be called -# on all positions returned by this iterator. + *iterator* is a pileup iterator. SNPs will be called + on all positions returned by this iterator. -# This caller is fast if SNPs are called over large continuous -# regions. It is slow, if instantiated frequently and in random -# order as the sequence will have to be reloaded. + This caller is fast if SNPs are called over large continuous + regions. It is slow, if instantiated frequently and in random + order as the sequence will have to be reloaded. -# """ + """ -# def __cinit__(self, -# IteratorColumn iterator_column, -# **kwargs ): -# pass + def __cinit__(self, + IteratorColumn iterator_column, + **kwargs ): + pass -# def __iter__(self): -# return self + def __iter__(self): + return self -# def __next__(self): -# """python version of next(). -# """ + def __next__(self): + """python version of next(). + """ -# # the following code was adapted from bam_plcmd.c:pileup_func() -# self.iter.cnext() + # the following code was adapted from bam_plcmd.c:pileup_func() + self.iter.cnext() -# if self.iter.n_plp < 0: -# raise ValueError("error during iteration" ) + if self.iter.n_plp < 0: + raise ValueError("error during iteration" ) -# if self.iter.plp == NULL: -# raise StopIteration + if self.iter.plp == NULL: + raise StopIteration -# return self._call() + return self._call() @@ -3527,10 +3441,10 @@ __all__ = ["Samfile", "PileupColumn", "PileupProxy", "PileupRead", - # "IteratorSNPCalls", - # "SNPCaller", - # "IndelCaller", - # "IteratorIndelCalls", + "IteratorSNPCalls", + "SNPCaller", + "IndelCaller", + "IteratorIndelCalls", "IndexedReads" ]