X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2Fcsamtools.pyx;fp=pysam%2Fcsamtools.pyx;h=242e68a402474774b414561186de6cdb6050144e;hp=0da8d9eba23c1270f9739ea56c59af51950c934c;hb=6a7c3f175b210cc16d09a5e8e4c1d47333dbe1c6;hpb=aa8ecff068edbb09a03bd874fce716e93e22e53c diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx index 0da8d9e..242e68a 100644 --- a/pysam/csamtools.pyx +++ b/pysam/csamtools.pyx @@ -1,8 +1,12 @@ # cython: embedsignature=True +# cython: profile=True # adds doc-strings for sphinx import tempfile, os, sys, types, itertools, struct, ctypes +from python_string cimport PyString_FromStringAndSize, PyString_AS_STRING +from python_exc cimport PyErr_SetString + # defines imported from samtools DEF SEEK_SET = 0 DEF SEEK_CUR = 1 @@ -36,6 +40,14 @@ DEF BAM_FDUP =1024 DEF BAM_CIGAR_SHIFT=4 DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1) +DEF BAM_CMATCH = 0 +DEF BAM_CINS = 1 +DEF BAM_CDEL = 2 +DEF BAM_CREF_SKIP = 3 +DEF BAM_CSOFT_CLIP = 4 +DEF BAM_CHARD_CLIP = 5 +DEF BAM_CPAD = 6 + ##################################################################### ##################################################################### ##################################################################### @@ -48,15 +60,17 @@ cdef makeAlignedRead( bam1_t * src): dest = AlignedRead() # destroy dummy delegate created in constructor # to prevent memory leak. - pysam_bam_destroy1(dest._delegate) + bam_destroy1(dest._delegate) dest._delegate = bam_dup1(src) return dest cdef class PileupProxy -cdef makePileupProxy( bam_plbuf_t * buf, int n ): +cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ): cdef PileupProxy dest dest = PileupProxy() - dest.buf = buf + dest.plp = plp + dest.tid = tid + dest.pos = pos dest.n = n return dest @@ -127,6 +141,7 @@ cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, p.n = n pileups = [] + cdef int x for x from 0 <= x < n: pileups.append( makePileupRead( &(pl[x]) ) ) p.pileups = pileups @@ -185,6 +200,7 @@ VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ), "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ), "PG" : ( "ID", "VN", "CL" ), } + ###################################################################### ###################################################################### ###################################################################### @@ -229,9 +245,12 @@ cdef class Samfile: cdef bam_index_t *index # true if file is a bam file cdef int isbam - + # true if file is not on the local filesystem + cdef int isremote # current read within iteration cdef bam1_t * b + # file opening mode + cdef char * mode def __cinit__(self, *args, **kwargs ): self.samfile = NULL @@ -251,12 +270,13 @@ cdef class Samfile: def _open( self, char * filename, - mode ='r', + mode = 'r', Samfile template = None, referencenames = None, referencelengths = None, - char * text = NULL, + text = None, header = None, + port = None, ): '''open a sam/bam file. @@ -277,6 +297,12 @@ cdef class Samfile: self.isbam = len(mode) > 1 and mode[1] == 'b' + self.isremote = strncmp(filename,"http:",5) == 0 or \ + strncmp(filename,"ftp:",4) == 0 + + cdef char * ctext + ctext = NULL + if mode[0] == 'w': # open file for writing @@ -306,11 +332,12 @@ cdef class Samfile: header_to_write.target_name[x] = calloc(len(name)+1, sizeof(char)) strncpy( header_to_write.target_name[x], name, len(name) ) - if text != NULL: + if text != None: # copy without \0 - header_to_write.l_text = strlen(text) - header_to_write.text = calloc( strlen(text), sizeof(char) ) - memcpy( header_to_write.text, text, strlen(text) ) + ctext = text + header_to_write.l_text = strlen(ctext) + header_to_write.text = calloc( strlen(ctext), sizeof(char) ) + memcpy( header_to_write.text, ctext, strlen(ctext) ) header_to_write.hash = NULL header_to_write.rg2lib = NULL @@ -327,7 +354,9 @@ cdef class Samfile: elif mode[0] == "r": # open file for reading - if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ): + if strncmp( filename, "-", 1) != 0 and \ + not self.isremote and \ + not os.path.exists( filename ): raise IOError( "file `%s` not found" % filename) store = StderrStore() @@ -337,15 +366,22 @@ cdef class Samfile: if self.samfile == NULL: raise IOError("could not open file `%s`" % filename ) + # check for index and open if present if mode[0] == "r" and self.isbam: - if not os.path.exists(filename + ".bai"): - self.index = NULL + + if not self.isremote: + if not os.path.exists(filename +".bai"): + self.index = NULL + else: + # returns NULL if there is no index or index could not be opened + self.index = bam_index_load(filename) + if self.index == NULL: + raise IOError("error while opening index `%s` " % filename ) else: - # returns NULL if there is no index or index could not be opened self.index = bam_index_load(filename) if self.index == NULL: raise IOError("error while opening index `%s` " % filename ) - + def getrname( self, tid ): '''(tid ) convert numerical :term:`tid` into :ref:`reference` name.''' @@ -394,6 +430,22 @@ cdef class Samfile: if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend ) return region, rtid, rstart, rend + + def seek( self, uint64_t offset, int where = 0): + '''move to current file to position *offset*''' + + if not self._isOpen(): + raise ValueError( "I/O operation on closed file" ) + if not self.isbam: + raise NotImplementedError("seek only available in bam files") + return bam_seek( self.samfile.x.bam, offset, where ) + + def tell( self ): + '''return current file position''' + if not self.isbam: + raise NotImplementedError("seek only available in bam files") + + return bam_tell( self.samfile.x.bam ) def fetch( self, reference = None, @@ -428,16 +480,24 @@ cdef class Samfile: if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) - + region, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) if self.isbam: + if not until_eof and not self._hasIndex() and not self.isremote: + raise ValueError( "fetch called on bamfile without index" ) + if callback: if not region: raise ValueError( "callback functionality requires a region/reference" ) if not self._hasIndex(): raise ValueError( "no index available for fetch" ) return bam_fetch(self.samfile.x.bam, - self.index, rtid, rstart, rend, callback, fetch_callback ) + self.index, + rtid, + rstart, + rend, + callback, + fetch_callback ) else: if region: return IteratorRow( self, rtid, rstart, rend ) @@ -453,7 +513,12 @@ cdef class Samfile: for rtid from 0 <= rtid < self.nreferences: i.append( IteratorRow( self, rtid, rstart, rend)) return itertools.chain( *i ) - else: + else: + # check if header is present - otherwise sam_read1 aborts + # this happens if a bamfile is opened with mode 'r' + if self.samfile.header.n_targets == 0: + raise ValueError( "fetch called for samfile without header") + if region != None: raise ValueError ("fetch for a region is not available for sam files" ) if callback: @@ -528,11 +593,11 @@ cdef class Samfile: self.samfile = NULL def __dealloc__( self ): - '''clean up.''' # remember: dealloc cannot call other methods - # Note that __del__ is not called. + # note: no doc string + # note: __del__ is not called. self.close() - pysam_bam_destroy1(self.b) + bam_destroy1(self.b) def write( self, AlignedRead read ): '''(AlignedRead read ) @@ -542,6 +607,13 @@ cdef class Samfile: ''' return samwrite( self.samfile, read._delegate ) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + property nreferences: '''number of :term:`reference` sequences in the file.''' def __get__(self): @@ -567,13 +639,7 @@ cdef class Samfile: property text: '''full contents of the :term:`sam file` header as a string.''' def __get__(self): - # create a temporary 0-terminated copy - cdef char * t - t = calloc( self.samfile.header.l_text + 1, sizeof(char) ) - memcpy( t, self.samfile.header.text, self.samfile.header.l_text ) - result = t - free(t) - return result + return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text) property header: '''header information within the :term:`sam file`. The records and fields are returned as @@ -730,6 +796,10 @@ cdef class Fastafile: '''return true if samfile has been opened.''' return self.fastafile != NULL + def __len__(self): + assert self.fastafile != NULL + return faidx_fetch_nseq(self.fastafile) + def _open( self, char * filename ): '''open an indexed fasta file. @@ -758,8 +828,14 @@ cdef class Fastafile: '''*(reference = None, start = None, end = None, region = None)* - fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by - :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. + fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. + The region is specified by :term:`reference`, *start* and *end*. + + If *reference* is given and *start* is None, the sequence from the + first base is returned. Similarly, if *end* is None, the sequence + until the last base is returned. + + Alternatively, a samtools :term:`region` string can be supplied. ''' if not self._isOpen(): @@ -770,59 +846,85 @@ cdef class Fastafile: max_pos = 2 << 29 if not region: - if reference == None: raise ValueError( 'no sequence/region supplied.' ) - if start == None and end == None: - region = "%s" % str(reference) - elif start == None or end == None: - raise ValueError( 'only start or only end of region supplied' ) - else: - if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) ) - # valid ranges are from 0 to 2^29-1 - if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start ) - if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end ) - region = "%s:%i-%i" % (reference, start+1, end ) - - # samtools adds a '\0' at the end - seq = fai_fetch( self.fastafile, region, &len ) + if reference is None: raise ValueError( 'no sequence/region supplied.' ) + if start is None: start = 0 + if end is None: end = max_pos -1 + + if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) ) + if start == end: return "" + # valid ranges are from 0 to 2^29-1 + if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start ) + if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end ) + + seq = faidx_fetch_seq(self.fastafile, reference, + start, + end-1, &len) + else: + # samtools adds a '\0' at the end + seq = fai_fetch( self.fastafile, region, &len ) + # copy to python - result = seq - # clean up - free(seq) + if seq == NULL: + return "" + else: + result = seq + # clean up + free(seq) return result +########################################################################### +########################################################################### +########################################################################### ## turning callbacks elegantly into iterators is an unsolved problem, see the following threads: ## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1 ## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html ## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code ## changes, the changes have to be manually entered. - cdef class IteratorRow: """iterates over mapped reads in a region. + + The samtools iterators assume that the file + position between iterations do not change. + As a consequence, no two iterators can work + on the same file. To permit this, each iterator + creates its own file handle by re-opening the + file. + + Note that the index will be shared between + samfile and the iterator. """ - cdef bam_fetch_iterator_t* bam_iter # iterator state object + cdef bam_iter_t iter # iterator state object cdef bam1_t * b - cdef error_msg - cdef int error_state + cdef int retval cdef Samfile samfile + cdef samfile_t * fp + def __cinit__(self, Samfile samfile, int tid, int beg, int end ): - self.bam_iter = NULL assert samfile._isOpen() assert samfile._hasIndex() # makes sure that samfile stays alive as long as the - # iterator is alive. + # iterator is alive self.samfile = samfile - # parse the region - self.error_state = 0 - self.error_msg = None + if samfile.isbam: mode = "rb" + else: mode = "r" + + # reopen the file + store = StderrStore() + self.fp = samopen( samfile.filename, mode, NULL ) + store.release() - cdef bamFile fp - fp = samfile.samfile.x.bam - self.bam_iter = bam_init_fetch_iterator(fp, samfile.index, tid, beg, end) + self.retval = 0 + + self.iter = bam_iter_query(self.samfile.index, + tid, + beg, + end) + self.b = bam_init1() def __iter__(self): return self @@ -832,29 +934,21 @@ cdef class IteratorRow: cdef int cnext(self): '''cversion of iterator. Used by IteratorColumn''' - self.b = bam_fetch_iterate(self.bam_iter) - if self.b == NULL: return 0 - return 1 - + self.retval = bam_iter_read( self.fp.x.bam, + self.iter, + self.b) + def __next__(self): """python version of next(). - - pyrex uses this non-standard name instead of next() """ - if self.error_state: - raise ValueError( self.error_msg) - - self.b = bam_fetch_iterate(self.bam_iter) - if self.b != NULL: - return makeAlignedRead( self.b ) - else: - raise StopIteration + self.cnext() + if self.retval < 0: raise StopIteration + return makeAlignedRead( self.b ) def __dealloc__(self): - '''remember: dealloc cannot call other methods!''' - if self.bam_iter: - bam_cleanup_fetch_iterator(self.bam_iter) - + bam_destroy1(self.b) + samclose( self.fp ) + cdef class IteratorRowAll: """iterates over all mapped reads """ @@ -866,7 +960,13 @@ cdef class IteratorRowAll: assert samfile._isOpen() - self.fp = samfile.samfile + if samfile.isbam: mode = "rb" + else: mode = "r" + + # reopen the file to avoid iterator conflict + store = StderrStore() + self.fp = samopen( samfile.filename, mode, NULL ) + store.release() # allocate memory for alignment self.b = calloc(1, sizeof(bam1_t)) @@ -895,9 +995,18 @@ cdef class IteratorRowAll: raise StopIteration def __dealloc__(self): - '''remember: dealloc cannot call other methods!''' - pysam_bam_destroy1(self.b) - + bam_destroy1(self.b) + samclose( self.fp ) + +ctypedef struct __iterdata: + bamFile fp + bam_iter_t iter + +cdef int __advance( void * data, bam1_t * b ): + cdef __iterdata * d + d = <__iterdata*>data + return bam_iter_read( d.fp, d.iter, b ) + cdef class IteratorColumn: '''iterates over columns. @@ -922,79 +1031,138 @@ cdef class IteratorColumn: Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`. ''' - cdef bam_plbuf_t *buf - # check if first iteration - cdef int notfirst # result of the last plbuf_push - cdef int n_pu - cdef int eof cdef IteratorRow iter - + cdef int tid + cdef int pos + cdef int n_plp + cdef bam_pileup1_t * plp + cdef bam_plp_t pileup_iter + cdef __iterdata iterdata def __cinit__(self, Samfile samfile, int tid, int start, int end ): self.iter = IteratorRow( samfile, tid, start, end ) - self.buf = bam_plbuf_init(NULL, NULL ) - self.n_pu = 0 - self.eof = 0 + self.iterdata.fp = samfile.samfile.x.bam + self.iterdata.iter = self.iter.iter + + self.pileup_iter = bam_plp_init( &__advance, &self.iterdata ) + self.n_plp = 0 + self.tid = 0 + self.pos = 0 + self.plp = NULL def __iter__(self): return self cdef int cnext(self): '''perform next iteration. - - return 1 if there is a buffer to emit. Return 0 for end of iteration. ''' + self.plp = bam_plp_auto( self.pileup_iter, + &self.tid, + &self.pos, + &self.n_plp ) - cdef int retval1, retval2 + def __next__(self): + """python version of next(). - # pysam bam_plbuf_push returns: - # 1: if buf is full and can be emitted - # 0: if b has been added - # -1: if there was an error + pyrex uses this non-standard name instead of next() + """ + self.cnext() + if self.n_plp < 0: + raise ValueError("error during iteration" ) + + if self.plp == NULL: + raise StopIteration - # check if previous plbuf was incomplete. If so, continue within - # the loop and yield if necessary - if self.n_pu > 0: - self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1) - if self.n_pu > 0: return 1 + return makePileupProxy( self.plp, self.tid, self.pos, self.n_plp ) - if self.eof: return 0 + def __dealloc__(self): + bam_plp_destroy(self.pileup_iter) + +cdef inline int32_t query_start(bam1_t *src) except -1: + cdef uint32_t * cigar_p, op + cdef uint32_t k + cdef uint32_t start_offset = 0 + + if src.core.n_cigar: + cigar_p = bam1_cigar(src); + for k from 0 <= k < src.core.n_cigar: + op = cigar_p[k] & BAM_CIGAR_MASK + if op==BAM_CHARD_CLIP: + if start_offset!=0 and start_offset!=src.core.l_qseq: + PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string') + return -1 + elif op==BAM_CSOFT_CLIP: + start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT + else: + break + + return start_offset + + +cdef inline int32_t query_end(bam1_t *src) except -1: + cdef uint32_t * cigar_p, op + cdef uint32_t k + cdef uint32_t end_offset = src.core.l_qseq + + if src.core.n_cigar>1: + cigar_p = bam1_cigar(src); + for k from src.core.n_cigar > k >= 1: + op = cigar_p[k] & BAM_CIGAR_MASK + if op==BAM_CHARD_CLIP: + if end_offset!=0 and end_offset!=src.core.l_qseq: + PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string') + return -1 + elif op==BAM_CSOFT_CLIP: + end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT + else: + break - # get next alignments and submit until plbuf indicates that - # an new column has finished - while self.n_pu == 0: - retval1 = self.iter.cnext() - # wrap up if no more input - if retval1 == 0: - self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0) - self.eof = 1 - return self.n_pu + if end_offset==0: + end_offset = src.core.l_qseq - # submit to plbuf - self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0) - if self.n_pu < 0: raise ValueError( "error while iterating" ) + return end_offset - # plbuf has yielded - return 1 - def __next__(self): - """python version of next(). +cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end): + cdef uint8_t * p + cdef uint32_t k + cdef char * s + cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN" - pyrex uses this non-standard name instead of next() - """ - cdef int ret - ret = self.cnext() - cdef bam_pileup1_t * pl + if not src.core.l_qseq: + return None - if ret > 0 : - return makePileupProxy( self.buf, self.n_pu ) - else: - raise StopIteration + seq = PyString_FromStringAndSize(NULL, end-start) + s = PyString_AS_STRING(seq) + p = bam1_seq(src) - def __dealloc__(self): - bam_plbuf_destroy(self.buf); + for k from start <= k < end: + # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c) + # note: do not use string literal as it will be a python string + s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf] + + return seq + + +cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end): + cdef uint8_t * p + cdef uint32_t k + cdef char * q + + p = bam1_qual(src) + if p[0] == 0xff: + return None + + qual = PyString_FromStringAndSize(NULL, end-start) + q = PyString_AS_STRING(qual) + + for k from start <= k < end: + ## equivalent to t[i] + 33 (see bam.c) + q[k-start] = p[k] + 33 + + return qual cdef class AlignedRead: ''' @@ -1030,8 +1198,7 @@ cdef class AlignedRead: self._delegate.data_len = 0 def __dealloc__(self): - '''clear up memory.''' - pysam_bam_destroy1(self._delegate) + bam_destroy1(self._delegate) def __str__(self): """todo""" @@ -1046,10 +1213,12 @@ cdef class AlignedRead: self.tags))) - def __cmp__(self, AlignedRead other): - '''return true, if contents in this are binary equal to ``other``.''' + def compare(self, AlignedRead other): + '''return -1,0,1, if contents in this are binary <,=,> to *other*''' + cdef int retval, x cdef bam1_t *t, *o + t = self._delegate o = other._delegate @@ -1062,16 +1231,20 @@ cdef class AlignedRead: # oo = (o.data) # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x]) - retval = memcmp( &t.core, - &o.core, - sizeof( bam1_core_t )) + # Fast-path test for object identity + if t==o: + return 0 + + retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t)) if retval: return retval - retval = cmp( t.data_len, o.data_len) + retval = cmp(t.data_len, o.data_len) if retval: return retval - return memcmp( t.data, - o.data, - sizeof( t.data_len )) + return memcmp(t.data, o.data, t.data_len) + + # Disabled so long as __cmp__ is a special method + def __hash__(self): + return _Py_HashPointer(self) property qname: """the query name (None if not present)""" @@ -1079,7 +1252,7 @@ cdef class AlignedRead: cdef bam1_t * src src = self._delegate if src.core.l_qname == 0: return None - return pysam_bam1_qname( src ) + return bam1_qname( src ) def __set__(self, qname ): if qname == None or len(qname) == 0: return @@ -1088,7 +1261,7 @@ cdef class AlignedRead: cdef char * p src = self._delegate - p = pysam_bam1_qname( src ) + p = bam1_qname( src ) # the qname is \0 terminated l = len(qname) + 1 @@ -1101,7 +1274,7 @@ cdef class AlignedRead: # re-acquire pointer to location in memory # as it might have moved - p = pysam_bam1_qname(src) + p = bam1_qname(src) strncpy( p, qname, l ) @@ -1112,11 +1285,13 @@ cdef class AlignedRead: cdef uint32_t * cigar_p cdef bam1_t * src cdef op, l, cigar + cdef int k + src = self._delegate if src.core.n_cigar == 0: return None cigar = [] - cigar_p = pysam_bam1_cigar(src); + cigar_p = bam1_cigar(src); for k from 0 <= k < src.core.n_cigar: op = cigar_p[k] & BAM_CIGAR_MASK l = cigar_p[k] >> BAM_CIGAR_SHIFT @@ -1135,7 +1310,7 @@ cdef class AlignedRead: src = self._delegate # get location of cigar string - p = pysam_bam1_cigar(src) + p = bam1_cigar(src) # create space for cigar data within src.data pysam_bam_update( src, @@ -1148,7 +1323,7 @@ cdef class AlignedRead: # re-acquire pointer to location in memory # as it might have moved - p = pysam_bam1_cigar(src) + p = bam1_cigar(src) # insert cigar operations for op, l in values: @@ -1159,24 +1334,16 @@ cdef class AlignedRead: src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p)) property seq: - """the query sequence (None if not present)""" + """read sequence bases, including :term:`soft clipped` bases (None if not present)""" def __get__(self): cdef bam1_t * src - cdef uint8_t * p cdef char * s + src = self._delegate - bam_nt16_rev_table = "=ACMGRSVTWYHKDBN" - ## parse qseq (bam1_seq) + if src.core.l_qseq == 0: return None - s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char)) - p = pysam_bam1_seq( src ) - for k from 0 <= k < src.core.l_qseq: - ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c) - s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)] - retval=s - free(s) - return retval + return get_seq_range(src, 0, src.core.l_qseq) def __set__(self,seq): # samtools manages sequence and quality length memory together @@ -1186,9 +1353,10 @@ cdef class AlignedRead: cdef bam1_t * src cdef uint8_t * p cdef char * s - src = self._delegate cdef int l, k, nbytes_new, nbytes_old + src = self._delegate + l = len(seq) # as the sequence is stored in half-bytes, the total length (sequence @@ -1196,7 +1364,7 @@ cdef class AlignedRead: nbytes_new = (l+1)/2 + l nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq # acquire pointer to location in memory - p = pysam_bam1_seq( src ) + p = bam1_seq( src ) src.core.l_qseq = l pysam_bam_update( src, @@ -1205,7 +1373,7 @@ cdef class AlignedRead: p) # re-acquire pointer to location in memory # as it might have moved - p = pysam_bam1_seq( src ) + p = bam1_seq( src ) for k from 0 <= k < nbytes_new: p[k] = 0 # convert to C string s = seq @@ -1213,38 +1381,32 @@ cdef class AlignedRead: p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2) # erase qualities - p = pysam_bam1_qual( src ) + p = bam1_qual( src ) p[0] = 0xff + property qual: - """the base quality (None if not present)""" + """read sequence base qualities, including :term:`soft clipped` bases (None if not present)""" def __get__(self): - cdef bam1_t * src - cdef uint8_t * p + + cdef bam1_t * src cdef char * q + src = self._delegate - if src.core.l_qseq == 0: return None - p = pysam_bam1_qual( src ) - if p[0] == 0xff: return None + if src.core.l_qseq == 0: return None - q = < char *>calloc(src.core.l_qseq + 1 , sizeof(char)) - for k from 0 <= k < src.core.l_qseq: - ## equivalent to t[i] + 33 (see bam.c) - q[k] = p[k] + 33 - # convert to python string - retval=q - # clean up - free(q) - return retval + return get_qual_range(src, 0, src.core.l_qseq) def __set__(self,qual): # note that space is already allocated via the sequences cdef bam1_t * src cdef uint8_t * p cdef char * q + cdef int k + src = self._delegate - p = pysam_bam1_qual( src ) + p = bam1_qual( src ) if qual == None or len(qual) == 0: # if absent - set to 0xff p[0] = 0xff @@ -1259,8 +1421,74 @@ cdef class AlignedRead: for k from 0 <= k < l: p[k] = q[k] - 33 + property query: + """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present) + + SAM/BAM files may included extra flanking bases sequences that were + not part of the alignment. These bases may be the result of the + Smith-Waterman or other algorithms, which may not require alignments + that begin at the first residue or end at the last. In addition, + extra sequencing adapters, multiplex identifiers, and low-quality bases that + were not considered for alignment may have been retained.""" + + def __get__(self): + cdef bam1_t * src + cdef uint32_t start, end + cdef char * s + + src = self._delegate + + if src.core.l_qseq == 0: return None + + start = query_start(src) + end = query_end(src) + + return get_seq_range(src, start, end) + + property qqual: + """aligned query sequence quality values (None if not present)""" + def __get__(self): + cdef bam1_t * src + cdef uint32_t start, end + cdef char * q + + src = self._delegate + + if src.core.l_qseq == 0: return None + + start = query_start(src) + end = query_end(src) + + return get_qual_range(src, start, end) + + property qstart: + """start index of the aligned query portion of the sequence (0-based, inclusive)""" + def __get__(self): + return query_start(self._delegate) + + property qend: + """end index of the aligned query portion of the sequence (0-based, exclusive)""" + def __get__(self): + return query_end(self._delegate) + + property qlen: + """Length of the aligned query sequence""" + def __get__(self): + cdef bam1_t * src + src = self._delegate + return query_end(src)-query_start(src) + property tags: - """the tags in the AUX field.""" + """the tags in the AUX field. + This property permits convenience access to + the tags. Changes it the returned list will + not update the tags automatically. Instead, + the following is required for adding a + new tag:: + + read.tags = read.tags + [("RG",0)] + + """ def __get__(self): cdef char * ctag cdef bam1_t * src @@ -1270,7 +1498,7 @@ cdef class AlignedRead: src = self._delegate if src.l_aux == 0: return None - s = pysam_bam1_aux( src ) + s = bam1_aux( src ) result = [] ctag = calloc( 3, sizeof(char) ) cdef int x @@ -1290,27 +1518,27 @@ cdef class AlignedRead: # how do I do char literal comparison in cython? # the code below works (i.e, is C comparison) tpe = toupper(s[0]) - if tpe == 'S'[0]: + if tpe == 'S': value = bam_aux2i(s) s += 2 - elif tpe == 'I'[0]: + elif tpe == 'I': value = bam_aux2i(s) s += 4 - elif tpe == 'F'[0]: + elif tpe == 'F': value = bam_aux2f(s) s += 4 - elif tpe == 'D'[0]: + elif tpe == 'D': value = bam_aux2d(s) s += 8 - elif tpe == 'C'[0]: + elif tpe == 'C': value = bam_aux2i(s) s += 1 - elif tpe == 'A'[0]: + elif tpe == 'A': # there might a more efficient way # to convert a char into a string value = "%c" % bam_aux2A(s) s += 1 - elif tpe == 'Z'[0]: + elif tpe == 'Z': value = bam_aux2Z(s) # +1 for NULL terminated string s += len(value) + 1 @@ -1377,14 +1605,14 @@ cdef class AlignedRead: pysam_bam_update( src, src.l_aux, offset, - pysam_bam1_aux( src ) ) + bam1_aux( src ) ) src.l_aux = offset if offset == 0: return # get location of new data - s = pysam_bam1_aux( src ) + s = bam1_aux( src ) # check if there is direct path from buffer.raw to tmp cdef char * temp @@ -1416,7 +1644,7 @@ cdef class AlignedRead: cdef bam1_t * src src = self._delegate if src.core.n_cigar: - src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) ) + src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) ) else: src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1) self._delegate.core.pos = pos @@ -1427,6 +1655,27 @@ cdef class AlignedRead: property rlen: '''length of the read (read only). Returns 0 if not given.''' def __get__(self): return self._delegate.core.l_qseq + property aend: + '''aligned end position of the read (read only). Returns + None if not available.''' + def __get__(self): + cdef bam1_t * src + src = self._delegate + if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0: + return None + return bam_calend(&src.core, bam1_cigar(src)) + property alen: + '''aligned length of the read (read only). Returns None if + not available.''' + def __get__(self): + cdef bam1_t * src + src = self._delegate + if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0: + return None + return bam_calend(&src.core, + bam1_cigar(src)) - \ + self._delegate.core.pos + property mapq: """mapping quality""" def __get__(self): return self._delegate.core.qual @@ -1585,9 +1834,11 @@ cdef class PileupProxy: If the underlying engine iterator advances, the results of this column will change. ''' - cdef bam_plbuf_t * buf + cdef bam_pileup1_t * plp + cdef int tid + cdef int pos cdef int n_pu - + def __cinit__(self ): pass @@ -1598,7 +1849,7 @@ cdef class PileupProxy: property tid: '''the chromosome ID as is defined in the header''' - def __get__(self): return pysam_get_tid( self.buf ) + def __get__(self): return self.tid property n: '''number of reads mapping to this column.''' @@ -1606,18 +1857,17 @@ cdef class PileupProxy: def __set__(self, n): self.n_pu = n property pos: - def __get__(self): return pysam_get_pos( self.buf ) + def __get__(self): return self.pos property pileups: '''list of reads (:class:`pysam.PileupRead`) aligned to this column''' def __get__(self): - cdef bam_pileup1_t * pl - pl = pysam_get_pileup( self.buf ) + cdef int x pileups = [] # warning: there could be problems if self.n and self.buf are # out of sync. for x from 0 <= x < self.n_pu: - pileups.append( makePileupRead( &pl[x]) ) + pileups.append( makePileupRead( &(self.plp[x])) ) return pileups cdef class PileupRead: