X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2Fcsamtools.pyx;h=ddd599889b4525548417a43246de4c408d8faaea;hp=471a445bed09a53bb137f9f2023ef56879d5b5fc;hb=ca46ef4ba4a883c57cea62d5bf1bc021f1185109;hpb=d02fe5283ed7a93a2f76a5d6dc6e37b40c11b9b1 diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx index 471a445..ddd5998 100644 --- a/pysam/csamtools.pyx +++ b/pysam/csamtools.pyx @@ -413,7 +413,8 @@ cdef int mate_callback( bam1_t *alignment, void *f): cdef class Samfile: - '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None)* + '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None, + add_sq_text = False )* A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened. @@ -427,11 +428,11 @@ cdef class Samfile: f = pysam.Samfile('ex1.bam','rb') - If mode is not specified, we will try to auto-detect in the order 'r', 'rb', thus both the following + If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following should work:: f1 = pysam.Samfile('ex1.bam' ) - f2 = pysam.Samfile('ex1.bam' ) + f2 = pysam.Samfile('ex1.sam' ) If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random access to reads via :meth:`fetch` and :meth:`pileup` is disabled. @@ -449,6 +450,8 @@ cdef class Samfile: 3. If *text* is given, new header text is copied from raw text. 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists. + By default, 'SQ' and 'LN' tags will be added to the header text. This option can be + changed by unsetting the flag *add_sq_text*. ''' @@ -456,6 +459,7 @@ cdef class Samfile: self.samfile = NULL self._filename = NULL self.isbam = False + self.isstream = False self._open( *args, **kwargs ) # allocate memory for iterator @@ -478,6 +482,7 @@ cdef class Samfile: text = None, header = None, port = None, + add_sq_text = True, ): '''open a sam/bam file. @@ -488,20 +493,21 @@ cdef class Samfile: # read mode autodetection if mode is None: try: - self._open(filename, 'r', template=template, + self._open(filename, 'rb', template=template, referencenames=referencenames, referencelengths=referencelengths, text=text, header=header, port=port) return except ValueError, msg: pass - self._open(filename, 'rb', template=template, + + self._open(filename, 'r', template=template, referencenames=referencenames, referencelengths=referencelengths, text=text, header=header, port=port) return - assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode + assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode assert filename != NULL # close a previously opened file @@ -513,6 +519,7 @@ cdef class Samfile: if self._filename != NULL: free(self._filename ) self._filename = strdup( filename ) + self.isstream = strcmp( filename, "-" ) == 0 self.isbam = len(mode) > 1 and mode[1] == 'b' @@ -551,6 +558,13 @@ cdef class Samfile: header_to_write.target_name[x] = calloc(len(name)+1, sizeof(char)) strncpy( header_to_write.target_name[x], name, len(name) ) + # Optionally, if there is no text, add a SAM compatible header to output + # file. + if text is None and add_sq_text: + text = '' + for x from 0 <= x < header_to_write.n_targets: + text += "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] ) + if text != None: # copy without \0 ctext = text @@ -587,8 +601,9 @@ cdef class Samfile: raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode ) #disabled for autodetection to work - if self.samfile.header.n_targets == 0: - raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode) + # needs to be disabled so that reading from sam-files without headers works + #if self.samfile.header.n_targets == 0: + # raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode) if self.samfile == NULL: raise IOError("could not open file `%s`" % filename ) @@ -608,7 +623,10 @@ cdef class Samfile: self.index = bam_index_load(filename) if self.index == NULL: raise IOError("error while opening index `%s` " % filename ) - + + if not self.isstream: + self.start_offset = bam_tell( self.samfile.x.bam ) + def gettid( self, reference ): ''' convert :term:`reference` name into numerical :term:`tid` @@ -688,6 +706,10 @@ cdef class Samfile: return 1, rtid, rstart, rend + def reset( self ): + '''reset file position to beginning of read section.''' + return self.seek( self.start_offset, 0 ) + def seek( self, uint64_t offset, int where = 0): ''' move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`. @@ -697,6 +719,9 @@ cdef class Samfile: raise ValueError( "I/O operation on closed file" ) if not self.isbam: raise NotImplementedError("seek only available in bam files") + if self.isstream: + raise OSError("seek no available in streams") + return bam_seek( self.samfile.x.bam, offset, where ) def tell( self ): @@ -722,10 +747,11 @@ cdef class Samfile: :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. - Without *reference* or *region* all reads will be fetched. The reads will be returned + Without *reference* or *region* all mapped reads will be fetched. The reads will be returned ordered by reference sequence, which will not necessarily be the order within the file. + If *until_eof* is given, all reads from the current file position will be returned - *in order as they are within the file*. + in order as they are within the file. Using this option will also fetch unmapped reads. If only *reference* is set, all reads aligned to *reference* will be fetched. @@ -744,6 +770,9 @@ cdef class Samfile: has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) + if self.isstream: reopen = False + else: reopen = True + if self.isbam: if not until_eof and not self._hasIndex() and not self.isremote: raise ValueError( "fetch called on bamfile without index" ) @@ -760,24 +789,26 @@ cdef class Samfile: fetch_callback ) else: if has_coord: - return IteratorRowRegion( self, rtid, rstart, rend ) + return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen ) else: if until_eof: - return IteratorRowAll( self ) + return IteratorRowAll( self, reopen=reopen ) else: - return IteratorRowAllRefs(self) + # AH: check - reason why no reopen for AllRefs? + return IteratorRowAllRefs(self ) # , reopen=reopen ) else: # check if header is present - otherwise sam_read1 aborts # this happens if a bamfile is opened with mode 'r' + if has_coord: + raise ValueError ("fetching by region is not available for sam files" ) + if self.samfile.header.n_targets == 0: raise ValueError( "fetch called for samfile without header") - - if region != None: - raise ValueError ("fetch for a region is not available for sam files" ) + if callback: raise NotImplementedError( "callback not implemented yet" ) else: - return IteratorRowAll( self ) + return IteratorRowAll( self, reopen=reopen ) def mate( self, AlignedRead read ): @@ -905,15 +936,14 @@ cdef class Samfile: mask Skip all reads with bits set in mask. + max_depth + Maximum read depth permitted. The default limit is *8000*. .. note:: *all* reads which overlap the region are returned. The first base returned will be the first base of the first read *not* necessarily the first base of the region used in the query. - The maximum number of reads considered for pileup is *8000*. This limit is set by - :term:`csamtools`. - ''' cdef int rtid, rstart, rend, has_coord cdef bam_plbuf_t *buf @@ -1021,6 +1051,33 @@ cdef class Samfile: t.append( self.samfile.header.target_len[x] ) return tuple(t) + property mapped: + """total number of mapped reads in file. + """ + def __get__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) + if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" ) + + cdef int tid + cdef uint32_t total = 0 + for tid from 0 <= tid < self.samfile.header.n_targets: + total += pysam_get_mapped( self.index, tid ) + return total + + property unmapped: + """total number of unmapped reads in file. + """ + def __get__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) + if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" ) + cdef int tid + cdef uint32_t total = 0 + for tid from 0 <= tid < self.samfile.header.n_targets: + total += pysam_get_unmapped( self.index, tid ) + # get unmapped reads without coordinates + total += pysam_get_unmapped( self.index, -1 ) + return total + property text: '''full contents of the :term:`sam file` header as a string.''' def __get__(self): @@ -1158,6 +1215,8 @@ cdef class Samfile: ############################################################### def __iter__(self): if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) + if not self.isbam and self.samfile.header.n_targets == 0: + raise NotImplementedError( "can not iterate over samfile without header") return self cdef bam1_t * getCurrent( self ): @@ -1299,10 +1358,10 @@ cdef class IteratorRowAll(IteratorRow): to not re-open *samfile*. """ - cdef bam1_t * b - cdef samfile_t * fp - # true if samfile belongs to this object - cdef int owns_samfile + # cdef bam1_t * b + # cdef samfile_t * fp + # # true if samfile belongs to this object + # cdef int owns_samfile def __cinit__(self, Samfile samfile, int reopen = True ): @@ -1579,8 +1638,8 @@ cdef class IteratorColumn: A :class:`FastaFile` object mask Skip all reads with bits set in mask. - - + max_depth + maximum read depth. The default is 8000. ''' # result of the last plbuf_push @@ -1595,12 +1654,14 @@ cdef class IteratorColumn: cdef Samfile samfile cdef Fastafile fastafile cdef stepper + cdef int max_depth def __cinit__( self, Samfile samfile, **kwargs ): self.samfile = samfile self.mask = kwargs.get("mask", BAM_DEF_MASK ) self.fastafile = kwargs.get( "fastafile", None ) self.stepper = kwargs.get( "stepper", None ) + self.max_depth = kwargs.get( "max_depth", 8000 ) self.iterdata.seq = NULL self.tid = 0 self.pos = 0 @@ -1608,6 +1669,7 @@ cdef class IteratorColumn: self.plp = NULL self.pileup_iter = NULL + def __iter__(self): return self @@ -1677,6 +1739,9 @@ cdef class IteratorColumn: else: raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper) + if self.max_depth: + bam_plp_set_maxcnt( self.pileup_iter, self.max_depth ) + bam_plp_set_mask( self.pileup_iter, self.mask ) cdef reset( self, tid, start, end ): @@ -2200,6 +2265,9 @@ cdef class AlignedRead: read.tags = read.tags + [("RG",0)] + + This method will happily write the same tag + multiple times. """ def __get__(self): cdef char * ctag @@ -2209,7 +2277,7 @@ cdef class AlignedRead: cdef char auxtype src = self._delegate - if src.l_aux == 0: return None + if src.l_aux == 0: return [] s = bam1_aux( src ) result = [] @@ -2381,7 +2449,7 @@ cdef class AlignedRead: '''length of the read (read only). Returns 0 if not given.''' def __get__(self): return self._delegate.core.l_qseq property aend: - '''aligned end position of the read (read only). Returns + '''aligned end position of the read on the reference genome. Returns None if not available.''' def __get__(self): cdef bam1_t * src @@ -2390,7 +2458,7 @@ cdef class AlignedRead: return None return bam_calend(&src.core, bam1_cigar(src)) property alen: - '''aligned length of the read (read only). Returns None if + '''aligned length of the read on the reference genome. Returns None if not available.''' def __get__(self): cdef bam1_t * src @@ -2406,14 +2474,30 @@ cdef class AlignedRead: def __get__(self): return self._delegate.core.qual def __set__(self, qual): self._delegate.core.qual = qual property mrnm: + """the :term:`reference` id of the mate + deprecated, use RNEXT instead. + """ + def __get__(self): return self._delegate.core.mtid + def __set__(self, mtid): self._delegate.core.mtid = mtid + property rnext: """the :term:`reference` id of the mate """ def __get__(self): return self._delegate.core.mtid def __set__(self, mtid): self._delegate.core.mtid = mtid property mpos: + """the position of the mate + deprecated, use PNEXT instead.""" + def __get__(self): return self._delegate.core.mpos + def __set__(self, mpos): self._delegate.core.mpos = mpos + property pnext: """the position of the mate""" def __get__(self): return self._delegate.core.mpos def __set__(self, mpos): self._delegate.core.mpos = mpos property isize: + """the insert size + deprecated: use tlen instead""" + def __get__(self): return self._delegate.core.isize + def __set__(self, isize): self._delegate.core.isize = isize + property tlen: """the insert size""" def __get__(self): return self._delegate.core.isize def __set__(self, isize): self._delegate.core.isize = isize @@ -2509,8 +2593,10 @@ cdef class AlignedRead: pos += l return result + def overlap( self, uint32_t start, uint32_t end ): - """return number of bases on reference overlapping *start* and *end* + """return number of aligned bases of read overlapping the interval *start* and *end* + on the reference sequence. """ cdef uint32_t k, i, pos, overlap cdef int op, o @@ -2739,7 +2825,7 @@ def _samtools_dispatch( method, '''call ``method`` in samtools providing arguments in args. .. note:: - This method redirects stdout and stderr to capture it + This method redirects stdout and (optionally) stderr to capture it from samtools. If for some reason stdout/stderr disappears the reason might be in this method. @@ -2756,8 +2842,8 @@ def _samtools_dispatch( method, ''' # note that debugging this module can be a problem - # as stdout/stderr will not appear - + # as stdout/stderr will not appear on the terminal + # some special cases if method == "index": if not os.path.exists( args[0] ): @@ -2781,7 +2867,6 @@ def _samtools_dispatch( method, if "-o" in args: raise ValueError("option -o is forbidden in samtools view") args = ( "-o", stdout_f ) + args - # do the function call to samtools cdef char ** cargs cdef int i, n, retval @@ -2792,6 +2877,7 @@ def _samtools_dispatch( method, cargs[0] = "samtools" cargs[1] = method for i from 0 <= i < n: cargs[i+2] = args[i] + retval = pysam_dispatch(n+2, cargs) free( cargs ) @@ -2870,500 +2956,500 @@ cdef class SNPCall: self.coverage ) ) ) -cdef class SNPCallerBase: - '''Base class for SNP callers. - - *min_baseQ* - minimum base quality (possibly capped by BAQ) - *capQ_threshold* - coefficient for adjusting mapQ of poor mappings - *theta* - theta in maq consensus calling model - *n_haplotypes* - number of haplotypes in the sample - *het_rate* - prior of a difference between two haplotypes - ''' - - cdef bam_maqcns_t * c - cdef IteratorColumn iter - - def __cinit__(self, - IteratorColumn iterator_column, - **kwargs ): - - self.iter = iterator_column - self.c = bam_maqcns_init() - - # set the default parameterization according to - # samtools - - # new default mode for samtools >0.1.10 - self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 ) - - self.c.min_baseQ = kwargs.get( "min_baseQ", 13 ) - # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 ) - self.c.n_hap = kwargs.get( "n_haplotypes", 2 ) - self.c.het_rate = kwargs.get( "het_rate", 0.001 ) - self.c.theta = kwargs.get( "theta", 0.83 ) - - if self.c.errmod != BAM_ERRMOD_MAQ2: - self.c.theta += 0.02 - - # call prepare AFTER setting parameters - bam_maqcns_prepare( self.c ) - - def __dealloc__(self): - bam_maqcns_destroy( self.c ) - - cdef __dump( self, glf1_t * g, uint32_t cns, int rb ): - '''debugging output.''' - - pysam_dump_glf( g, self.c ); - print "" - for x in range(self.iter.n_plp): - print "--> read %i %s %i" % (x, - bam1_qname(self.iter.plp[x].b), - self.iter.plp[x].qpos, - ) - - print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \ - % (self.iter.pos, - cns, - self.c.q_r, - self.iter.n_plp, - self.iter.n_plp, - rb, - cns >> 8 & 0xff, - cns >> 16 & 0xff, - cns & 0xff, - cns >> 28, - ) - - printf("-------------------------------------\n"); - sys.stdout.flush() - -cdef class IteratorSNPCalls( SNPCallerBase ): - """*(IteratorColumn iterator)* - - call SNPs within a region. - - *iterator* is a pileup iterator. SNPs will be called - on all positions returned by this iterator. - - This caller is fast if SNPs are called over large continuous - regions. It is slow, if instantiated frequently and in random - order as the sequence will have to be reloaded. - - """ +# cdef class SNPCallerBase: +# '''Base class for SNP callers. + +# *min_baseQ* +# minimum base quality (possibly capped by BAQ) +# *capQ_threshold* +# coefficient for adjusting mapQ of poor mappings +# *theta* +# theta in maq consensus calling model +# *n_haplotypes* +# number of haplotypes in the sample +# *het_rate* +# prior of a difference between two haplotypes +# ''' + +# cdef bam_maqcns_t * c +# cdef IteratorColumn iter + +# def __cinit__(self, +# IteratorColumn iterator_column, +# **kwargs ): + +# self.iter = iterator_column +# self.c = bam_maqcns_init() + +# # set the default parameterization according to +# # samtools + +# # new default mode for samtools >0.1.10 +# self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 ) + +# self.c.min_baseQ = kwargs.get( "min_baseQ", 13 ) +# # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 ) +# self.c.n_hap = kwargs.get( "n_haplotypes", 2 ) +# self.c.het_rate = kwargs.get( "het_rate", 0.001 ) +# self.c.theta = kwargs.get( "theta", 0.83 ) + +# if self.c.errmod != BAM_ERRMOD_MAQ2: +# self.c.theta += 0.02 + +# # call prepare AFTER setting parameters +# bam_maqcns_prepare( self.c ) + +# def __dealloc__(self): +# bam_maqcns_destroy( self.c ) + + # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ): + # '''debugging output.''' + + # pysam_dump_glf( g, self.c ); + # print "" + # for x in range(self.iter.n_plp): + # print "--> read %i %s %i" % (x, + # bam1_qname(self.iter.plp[x].b), + # self.iter.plp[x].qpos, + # ) + + # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \ + # % (self.iter.pos, + # cns, + # self.c.q_r, + # self.iter.n_plp, + # self.iter.n_plp, + # rb, + # cns >> 8 & 0xff, + # cns >> 16 & 0xff, + # cns & 0xff, + # cns >> 28, + # ) + + # printf("-------------------------------------\n"); + # sys.stdout.flush() + +# cdef class IteratorSNPCalls( SNPCallerBase ): +# """*(IteratorColumn iterator)* + +# call SNPs within a region. + +# *iterator* is a pileup iterator. SNPs will be called +# on all positions returned by this iterator. + +# This caller is fast if SNPs are called over large continuous +# regions. It is slow, if instantiated frequently and in random +# order as the sequence will have to be reloaded. + +# """ - def __cinit__(self, - IteratorColumn iterator_column, - **kwargs ): +# def __cinit__(self, +# IteratorColumn iterator_column, +# **kwargs ): - assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence" +# assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence" - def __iter__(self): - return self +# def __iter__(self): +# return self - def __next__(self): - """python version of next(). - """ +# def __next__(self): +# """python version of next(). +# """ - # the following code was adapted from bam_plcmd.c:pileup_func() - self.iter.cnext() +# # the following code was adapted from bam_plcmd.c:pileup_func() +# self.iter.cnext() - if self.iter.n_plp < 0: - raise ValueError("error during iteration" ) +# if self.iter.n_plp < 0: +# raise ValueError("error during iteration" ) - if self.iter.plp == NULL: - raise StopIteration +# if self.iter.plp == NULL: +# raise StopIteration - cdef char * seq = self.iter.getSequence() - cdef int seq_len = self.iter.seq_len +# cdef char * seq = self.iter.getSequence() +# cdef int seq_len = self.iter.seq_len - assert seq != NULL +# assert seq != NULL - # reference base - if self.iter.pos >= seq_len: - raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) +# # reference base +# if self.iter.pos >= seq_len: +# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) - cdef int rb = seq[self.iter.pos] - cdef uint32_t cns - cdef glf1_t * g +# cdef int rb = seq[self.iter.pos] +# cdef uint32_t cns +# cdef glf1_t * g - g = bam_maqcns_glfgen( self.iter.n_plp, - self.iter.plp, - bam_nt16_table[rb], - self.c ) +# g = bam_maqcns_glfgen( self.iter.n_plp, +# self.iter.plp, +# bam_nt16_table[rb], +# self.c ) - if pysam_glf_depth( g ) == 0: - cns = 0xfu << 28 | 0xf << 24 - else: - cns = glf2cns(g, (self.c.q_r + .499)) - - free(g) +# if pysam_glf_depth( g ) == 0: +# cns = 0xfu << 28 | 0xf << 24 +# else: +# cns = glf2cns(g, (self.c.q_r + .499)) + +# free(g) - cdef SNPCall call +# cdef SNPCall call - call = SNPCall() - call._tid = self.iter.tid - call._pos = self.iter.pos - call._reference_base = rb - call._genotype = bam_nt16_rev_table[cns>>28] - call._consensus_quality = cns >> 8 & 0xff - call._snp_quality = cns & 0xff - call._rms_mapping_quality = cns >> 16&0xff - call._coverage = self.iter.n_plp +# call = SNPCall() +# call._tid = self.iter.tid +# call._pos = self.iter.pos +# call._reference_base = rb +# call._genotype = bam_nt16_rev_table[cns>>28] +# call._consensus_quality = cns >> 8 & 0xff +# call._snp_quality = cns & 0xff +# call._rms_mapping_quality = cns >> 16&0xff +# call._coverage = self.iter.n_plp - return call +# return call -cdef class SNPCaller( SNPCallerBase ): - '''*(IteratorColumn iterator_column )* +# cdef class SNPCaller( SNPCallerBase ): +# '''*(IteratorColumn iterator_column )* - The samtools SNP caller. +# The samtools SNP caller. - This object will call SNPs in *samfile* against the reference - sequence in *fasta*. +# This object will call SNPs in *samfile* against the reference +# sequence in *fasta*. - This caller is fast for calling few SNPs in selected regions. +# This caller is fast for calling few SNPs in selected regions. - It is slow, if called over large genomic regions. - ''' +# It is slow, if called over large genomic regions. +# ''' - def __cinit__(self, - IteratorColumn iterator_column, - **kwargs ): +# def __cinit__(self, +# IteratorColumn iterator_column, +# **kwargs ): - pass +# pass - def call(self, reference, int pos ): - """call a snp on chromosome *reference* - and position *pos*. +# def call(self, reference, int pos ): +# """call a snp on chromosome *reference* +# and position *pos*. - returns a :class:`SNPCall` object. - """ +# returns a :class:`SNPCall` object. +# """ - cdef int tid = self.iter.samfile.gettid( reference ) +# cdef int tid = self.iter.samfile.gettid( reference ) - self.iter.reset( tid, pos, pos + 1 ) +# self.iter.reset( tid, pos, pos + 1 ) - while 1: - self.iter.cnext() +# while 1: +# self.iter.cnext() - if self.iter.n_plp < 0: - raise ValueError("error during iteration" ) +# if self.iter.n_plp < 0: +# raise ValueError("error during iteration" ) - if self.iter.plp == NULL: - raise ValueError( "no reads in region - no call" ) +# if self.iter.plp == NULL: +# raise ValueError( "no reads in region - no call" ) - if self.iter.pos == pos: break - - cdef char * seq = self.iter.getSequence() - cdef int seq_len = self.iter.seq_len - - assert seq != NULL - - # reference base - if self.iter.pos >= seq_len: - raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) - - cdef int rb = seq[self.iter.pos] - cdef uint32_t cns - cdef glf1_t * g - - g = bam_maqcns_glfgen( self.iter.n_plp, - self.iter.plp, - bam_nt16_table[rb], - self.c ) - - - if pysam_glf_depth( g ) == 0: - cns = 0xfu << 28 | 0xf << 24 - else: - cns = glf2cns(g, (self.c.q_r + .499)) - - free(g) +# if self.iter.pos == pos: break + +# cdef char * seq = self.iter.getSequence() +# cdef int seq_len = self.iter.seq_len + +# assert seq != NULL + +# # reference base +# if self.iter.pos >= seq_len: +# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) + +# cdef int rb = seq[self.iter.pos] +# cdef uint32_t cns +# # cdef glf1_t * g +# # +# # g = bam_maqcns_glfgen( self.iter.n_plp, +# # self.iter.plp, +# # bam_nt16_table[rb], +# # self.c ) +# ## +# # +# # if pysam_glf_depth( g ) == 0: +# # cns = 0xfu << 28 | 0xf << 24 +# # else: +# # cns = glf2cns(g, (self.c.q_r + .499)) +# # +# # free(g) - cdef SNPCall call - - call = SNPCall() - call._tid = self.iter.tid - call._pos = self.iter.pos - call._reference_base = rb - call._genotype = bam_nt16_rev_table[cns>>28] - call._consensus_quality = cns >> 8 & 0xff - call._snp_quality = cns & 0xff - call._rms_mapping_quality = cns >> 16&0xff - call._coverage = self.iter.n_plp - - return call - -cdef class IndelCall: - '''the results of an indel call.''' - cdef int _tid - cdef int _pos - cdef int _coverage - cdef int _rms_mapping_quality - cdef bam_maqindel_ret_t * _r - - def __cinit__(self): - #assert r != NULL - #self._r = r - pass - - property tid: - '''the chromosome ID as is defined in the header''' - def __get__(self): - return self._tid +# cdef SNPCall call + +# call = SNPCall() +# call._tid = self.iter.tid +# call._pos = self.iter.pos +# call._reference_base = rb +# call._genotype = bam_nt16_rev_table[cns>>28] +# call._consensus_quality = cns >> 8 & 0xff +# call._snp_quality = cns & 0xff +# call._rms_mapping_quality = cns >> 16&0xff +# call._coverage = self.iter.n_plp + +# return call + +# cdef class IndelCall: +# '''the results of an indel call.''' +# cdef int _tid +# cdef int _pos +# cdef int _coverage +# cdef int _rms_mapping_quality +# cdef bam_maqindel_ret_t * _r + +# def __cinit__(self): +# #assert r != NULL +# #self._r = r +# pass + +# property tid: +# '''the chromosome ID as is defined in the header''' +# def __get__(self): +# return self._tid - property pos: - '''nucleotide position of SNP.''' - def __get__(self): return self._pos - - property genotype: - '''the genotype called.''' - def __get__(self): - if self._r.gt == 0: - s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) - return "%s/%s" % (s,s) - elif self._r.gt == 1: - s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) - return "%s/%s" % (s,s) - else: - return "%s/%s" % (self.first_allele, self.second_allele ) - - property consensus_quality: - '''the genotype quality (Phred-scaled).''' - def __get__(self): return self._r.q_cns - - property snp_quality: - '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.''' - def __get__(self): return self._r.q_ref - - property mapping_quality: - '''the root mean square (rms) of the mapping quality of all reads involved in the call.''' - def __get__(self): return self._rms_mapping_quality - - property coverage: - '''coverage or read depth - the number of reads involved in the call.''' - def __get__(self): return self._coverage - - property first_allele: - '''sequence of first allele.''' - def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) - - property second_allele: - '''sequence of second allele.''' - def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) - - property reads_first: - '''reads supporting first allele.''' - def __get__(self): return self._r.cnt1 - - property reads_second: - '''reads supporting first allele.''' - def __get__(self): return self._r.cnt2 - - property reads_diff: - '''reads supporting first allele.''' - def __get__(self): return self._r.cnt_anti - - def __str__(self): - - return "\t".join( map(str, ( - self.tid, - self.pos, - self.genotype, - self.consensus_quality, - self.snp_quality, - self.mapping_quality, - self.coverage, - self.first_allele, - self.second_allele, - self.reads_first, - self.reads_second, - self.reads_diff ) ) ) - - def __dealloc__(self ): - bam_maqindel_ret_destroy(self._r) - -cdef class IndelCallerBase: - '''Base class for SNP callers. - - *min_baseQ* - minimum base quality (possibly capped by BAQ) - *capQ_threshold* - coefficient for adjusting mapQ of poor mappings - *theta* - theta in maq consensus calling model - *n_haplotypes* - number of haplotypes in the sample - *het_rate* - prior of a difference between two haplotypes - ''' - - cdef bam_maqindel_opt_t * options - cdef IteratorColumn iter - cdef int cap_mapQ - cdef int max_depth - - def __cinit__(self, - IteratorColumn iterator_column, - **kwargs ): +# property pos: +# '''nucleotide position of SNP.''' +# def __get__(self): return self._pos + +# property genotype: +# '''the genotype called.''' +# def __get__(self): +# if self._r.gt == 0: +# s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) +# return "%s/%s" % (s,s) +# elif self._r.gt == 1: +# s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) +# return "%s/%s" % (s,s) +# else: +# return "%s/%s" % (self.first_allele, self.second_allele ) + +# property consensus_quality: +# '''the genotype quality (Phred-scaled).''' +# def __get__(self): return self._r.q_cns + +# property snp_quality: +# '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.''' +# def __get__(self): return self._r.q_ref + +# property mapping_quality: +# '''the root mean square (rms) of the mapping quality of all reads involved in the call.''' +# def __get__(self): return self._rms_mapping_quality + +# property coverage: +# '''coverage or read depth - the number of reads involved in the call.''' +# def __get__(self): return self._coverage + +# property first_allele: +# '''sequence of first allele.''' +# def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1) + +# property second_allele: +# '''sequence of second allele.''' +# def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1) + +# property reads_first: +# '''reads supporting first allele.''' +# def __get__(self): return self._r.cnt1 + +# property reads_second: +# '''reads supporting first allele.''' +# def __get__(self): return self._r.cnt2 + +# property reads_diff: +# '''reads supporting first allele.''' +# def __get__(self): return self._r.cnt_anti + +# def __str__(self): + +# return "\t".join( map(str, ( +# self.tid, +# self.pos, +# self.genotype, +# self.consensus_quality, +# self.snp_quality, +# self.mapping_quality, +# self.coverage, +# self.first_allele, +# self.second_allele, +# self.reads_first, +# self.reads_second, +# self.reads_diff ) ) ) + +# def __dealloc__(self ): +# bam_maqindel_ret_destroy(self._r) + +# cdef class IndelCallerBase: +# '''Base class for SNP callers. + +# *min_baseQ* +# minimum base quality (possibly capped by BAQ) +# *capQ_threshold* +# coefficient for adjusting mapQ of poor mappings +# *theta* +# theta in maq consensus calling model +# *n_haplotypes* +# number of haplotypes in the sample +# *het_rate* +# prior of a difference between two haplotypes +# ''' + +# cdef bam_maqindel_opt_t * options +# cdef IteratorColumn iter +# cdef int cap_mapQ +# cdef int max_depth + +# def __cinit__(self, +# IteratorColumn iterator_column, +# **kwargs ): + + +# self.iter = iterator_column + +# assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence" + +# self.options = bam_maqindel_opt_init() + +# # set the default parameterization according to +# # samtools + +# self.options.r_indel = kwargs.get( "r_indel", 0.00015 ) +# self.options.q_indel = kwargs.get( "q_indel", 40 ) +# self.cap_mapQ = kwargs.get( "cap_mapQ", 60 ) +# self.max_depth = kwargs.get( "max_depth", 1024 ) + +# def __dealloc__(self): +# free( self.options ) +# def _call( self ): - self.iter = iterator_column +# cdef char * seq = self.iter.getSequence() +# cdef int seq_len = self.iter.seq_len - assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence" - - self.options = bam_maqindel_opt_init() - - # set the default parameterization according to - # samtools - - self.options.r_indel = kwargs.get( "r_indel", 0.00015 ) - self.options.q_indel = kwargs.get( "q_indel", 40 ) - self.cap_mapQ = kwargs.get( "cap_mapQ", 60 ) - self.max_depth = kwargs.get( "max_depth", 1024 ) - - def __dealloc__(self): - free( self.options ) +# assert seq != NULL - def _call( self ): +# # reference base +# if self.iter.pos >= seq_len: +# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) - cdef char * seq = self.iter.getSequence() - cdef int seq_len = self.iter.seq_len +# cdef bam_maqindel_ret_t * r + +# cdef int m = min( self.max_depth, self.iter.n_plp ) + +# # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n", +# # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty, +# # self.options.indel_err, self.options.ambi_thres ); + +# r = bam_maqindel(m, +# self.iter.pos, +# self.options, +# self.iter.plp, +# seq, +# 0, +# NULL) + +# if r == NULL: return None - assert seq != NULL +# cdef IndelCall call +# call = IndelCall() +# call._r = r +# call._tid = self.iter.tid +# call._pos = self.iter.pos +# call._coverage = self.iter.n_plp - # reference base - if self.iter.pos >= seq_len: - raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) ) +# cdef uint64_t rms_aux = 0 +# cdef int i = 0 +# cdef bam_pileup1_t * p +# cdef int tmp - cdef bam_maqindel_ret_t * r - - cdef int m = min( self.max_depth, self.iter.n_plp ) - - # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n", - # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty, - # self.options.indel_err, self.options.ambi_thres ); - - r = bam_maqindel(m, - self.iter.pos, - self.options, - self.iter.plp, - seq, - 0, - NULL) - - if r == NULL: return None - - cdef IndelCall call - call = IndelCall() - call._r = r - call._tid = self.iter.tid - call._pos = self.iter.pos - call._coverage = self.iter.n_plp - - cdef uint64_t rms_aux = 0 - cdef int i = 0 - cdef bam_pileup1_t * p - cdef int tmp - - for i from 0 <= i < self.iter.n_plp: - p = self.iter.plp + i - if p.b.core.qual < self.cap_mapQ: - tmp = p.b.core.qual - else: - tmp = self.cap_mapQ - rms_aux += tmp * tmp +# for i from 0 <= i < self.iter.n_plp: +# p = self.iter.plp + i +# if p.b.core.qual < self.cap_mapQ: +# tmp = p.b.core.qual +# else: +# tmp = self.cap_mapQ +# rms_aux += tmp * tmp - call._rms_mapping_quality = (sqrt(rms_aux / self.iter.n_plp) + .499) +# call._rms_mapping_quality = (sqrt(rms_aux / self.iter.n_plp) + .499) - return call +# return call -cdef class IndelCaller( IndelCallerBase ): - '''*(IteratorColumn iterator_column )* +# cdef class IndelCaller( IndelCallerBase ): +# '''*(IteratorColumn iterator_column )* - The samtools SNP caller. +# The samtools SNP caller. - This object will call SNPs in *samfile* against the reference - sequence in *fasta*. +# This object will call SNPs in *samfile* against the reference +# sequence in *fasta*. - This caller is fast for calling few SNPs in selected regions. +# This caller is fast for calling few SNPs in selected regions. - It is slow, if called over large genomic regions. - ''' +# It is slow, if called over large genomic regions. +# ''' - def __cinit__(self, - IteratorColumn iterator_column, - **kwargs ): +# def __cinit__(self, +# IteratorColumn iterator_column, +# **kwargs ): - pass +# pass - def call(self, reference, int pos ): - """call a snp on chromosome *reference* - and position *pos*. +# def call(self, reference, int pos ): +# """call a snp on chromosome *reference* +# and position *pos*. - returns a :class:`SNPCall` object or None, if no indel call could be made. - """ +# returns a :class:`SNPCall` object or None, if no indel call could be made. +# """ - cdef int tid = self.iter.samfile.gettid( reference ) +# cdef int tid = self.iter.samfile.gettid( reference ) - self.iter.reset( tid, pos, pos + 1 ) +# self.iter.reset( tid, pos, pos + 1 ) - while 1: - self.iter.cnext() +# while 1: +# self.iter.cnext() - if self.iter.n_plp < 0: - raise ValueError("error during iteration" ) +# if self.iter.n_plp < 0: +# raise ValueError("error during iteration" ) - if self.iter.plp == NULL: - raise ValueError( "no reads in region - no call" ) +# if self.iter.plp == NULL: +# raise ValueError( "no reads in region - no call" ) - if self.iter.pos == pos: break +# if self.iter.pos == pos: break - return self._call() +# return self._call() -cdef class IteratorIndelCalls( IndelCallerBase ): - """*(IteratorColumn iterator)* +# cdef class IteratorIndelCalls( IndelCallerBase ): +# """*(IteratorColumn iterator)* - call indels within a region. +# call indels within a region. - *iterator* is a pileup iterator. SNPs will be called - on all positions returned by this iterator. +# *iterator* is a pileup iterator. SNPs will be called +# on all positions returned by this iterator. - This caller is fast if SNPs are called over large continuous - regions. It is slow, if instantiated frequently and in random - order as the sequence will have to be reloaded. +# This caller is fast if SNPs are called over large continuous +# regions. It is slow, if instantiated frequently and in random +# order as the sequence will have to be reloaded. - """ +# """ - def __cinit__(self, - IteratorColumn iterator_column, - **kwargs ): - pass +# def __cinit__(self, +# IteratorColumn iterator_column, +# **kwargs ): +# pass - def __iter__(self): - return self +# def __iter__(self): +# return self - def __next__(self): - """python version of next(). - """ +# def __next__(self): +# """python version of next(). +# """ - # the following code was adapted from bam_plcmd.c:pileup_func() - self.iter.cnext() +# # the following code was adapted from bam_plcmd.c:pileup_func() +# self.iter.cnext() - if self.iter.n_plp < 0: - raise ValueError("error during iteration" ) +# if self.iter.n_plp < 0: +# raise ValueError("error during iteration" ) - if self.iter.plp == NULL: - raise StopIteration +# if self.iter.plp == NULL: +# raise StopIteration - return self._call() +# return self._call() @@ -3441,10 +3527,10 @@ __all__ = ["Samfile", "PileupColumn", "PileupProxy", "PileupRead", - "IteratorSNPCalls", - "SNPCaller", - "IndelCaller", - "IteratorIndelCalls", + # "IteratorSNPCalls", + # "SNPCaller", + # "IndelCaller", + # "IteratorIndelCalls", "IndexedReads" ]