# cython: embedsignature=True
+# cython: profile=True
# adds doc-strings for sphinx
import tempfile, os, sys, types, itertools, struct, ctypes
+from python_string cimport PyString_FromStringAndSize, PyString_AS_STRING
+from python_exc cimport PyErr_SetString
+
# defines imported from samtools
DEF SEEK_SET = 0
DEF SEEK_CUR = 1
DEF BAM_CIGAR_SHIFT=4
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
+DEF BAM_CMATCH = 0
+DEF BAM_CINS = 1
+DEF BAM_CDEL = 2
+DEF BAM_CREF_SKIP = 3
+DEF BAM_CSOFT_CLIP = 4
+DEF BAM_CHARD_CLIP = 5
+DEF BAM_CPAD = 6
+
#####################################################################
#####################################################################
#####################################################################
dest = AlignedRead()
# destroy dummy delegate created in constructor
# to prevent memory leak.
- pysam_bam_destroy1(dest._delegate)
+ bam_destroy1(dest._delegate)
dest._delegate = bam_dup1(src)
return dest
cdef class PileupProxy
-cdef makePileupProxy( bam_plbuf_t * buf, int n ):
+cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
cdef PileupProxy dest
dest = PileupProxy()
- dest.buf = buf
+ dest.plp = plp
+ dest.tid = tid
+ dest.pos = pos
dest.n = n
return dest
p.n = n
pileups = []
+ cdef int x
for x from 0 <= x < n:
pileups.append( makePileupRead( &(pl[x]) ) )
p.pileups = pileups
"RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
"PG" : ( "ID", "VN", "CL" ), }
+
######################################################################
######################################################################
######################################################################
cdef bam_index_t *index
# true if file is a bam file
cdef int isbam
-
+ # true if file is not on the local filesystem
+ cdef int isremote
# current read within iteration
cdef bam1_t * b
+ # file opening mode
+ cdef char * mode
def __cinit__(self, *args, **kwargs ):
self.samfile = NULL
def _open( self,
char * filename,
- mode ='r',
+ mode = 'r',
Samfile template = None,
referencenames = None,
referencelengths = None,
- char * text = NULL,
+ text = None,
header = None,
+ port = None,
):
'''open a sam/bam file.
self.isbam = len(mode) > 1 and mode[1] == 'b'
+ self.isremote = strncmp(filename,"http:",5) == 0 or \
+ strncmp(filename,"ftp:",4) == 0
+
+ cdef char * ctext
+ ctext = NULL
+
if mode[0] == 'w':
# open file for writing
header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
strncpy( header_to_write.target_name[x], name, len(name) )
- if text != NULL:
+ if text != None:
# copy without \0
- header_to_write.l_text = strlen(text)
- header_to_write.text = <char*>calloc( strlen(text), sizeof(char) )
- memcpy( header_to_write.text, text, strlen(text) )
+ ctext = text
+ header_to_write.l_text = strlen(ctext)
+ header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
+ memcpy( header_to_write.text, ctext, strlen(ctext) )
header_to_write.hash = NULL
header_to_write.rg2lib = NULL
elif mode[0] == "r":
# open file for reading
- if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ):
+ if strncmp( filename, "-", 1) != 0 and \
+ not self.isremote and \
+ not os.path.exists( filename ):
raise IOError( "file `%s` not found" % filename)
store = StderrStore()
if self.samfile == NULL:
raise IOError("could not open file `%s`" % filename )
+ # check for index and open if present
if mode[0] == "r" and self.isbam:
- if not os.path.exists(filename + ".bai"):
- self.index = NULL
+
+ if not self.isremote:
+ if not os.path.exists(filename +".bai"):
+ self.index = NULL
+ else:
+ # returns NULL if there is no index or index could not be opened
+ self.index = bam_index_load(filename)
+ if self.index == NULL:
+ raise IOError("error while opening index `%s` " % filename )
else:
- # returns NULL if there is no index or index could not be opened
self.index = bam_index_load(filename)
if self.index == NULL:
raise IOError("error while opening index `%s` " % filename )
-
+
def getrname( self, tid ):
'''(tid )
convert numerical :term:`tid` into :ref:`reference` name.'''
if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
return region, rtid, rstart, rend
+
+ def seek( self, uint64_t offset, int where = 0):
+ '''move to current file to position *offset*'''
+
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+ if not self.isbam:
+ raise NotImplementedError("seek only available in bam files")
+ return bam_seek( self.samfile.x.bam, offset, where )
+
+ def tell( self ):
+ '''return current file position'''
+ if not self.isbam:
+ raise NotImplementedError("seek only available in bam files")
+
+ return bam_tell( self.samfile.x.bam )
def fetch( self,
reference = None,
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
-
+
region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
if self.isbam:
+ if not until_eof and not self._hasIndex() and not self.isremote:
+ raise ValueError( "fetch called on bamfile without index" )
+
if callback:
if not region:
raise ValueError( "callback functionality requires a region/reference" )
if not self._hasIndex(): raise ValueError( "no index available for fetch" )
return bam_fetch(self.samfile.x.bam,
- self.index, rtid, rstart, rend, <void*>callback, fetch_callback )
+ self.index,
+ rtid,
+ rstart,
+ rend,
+ <void*>callback,
+ fetch_callback )
else:
if region:
return IteratorRow( self, rtid, rstart, rend )
for rtid from 0 <= rtid < self.nreferences:
i.append( IteratorRow( self, rtid, rstart, rend))
return itertools.chain( *i )
- else:
+ else:
+ # check if header is present - otherwise sam_read1 aborts
+ # this happens if a bamfile is opened with mode 'r'
+ if self.samfile.header.n_targets == 0:
+ raise ValueError( "fetch called for samfile without header")
+
if region != None:
raise ValueError ("fetch for a region is not available for sam files" )
if callback:
self.samfile = NULL
def __dealloc__( self ):
- '''clean up.'''
# remember: dealloc cannot call other methods
- # Note that __del__ is not called.
+ # note: no doc string
+ # note: __del__ is not called.
self.close()
- pysam_bam_destroy1(self.b)
+ bam_destroy1(self.b)
def write( self, AlignedRead read ):
'''(AlignedRead read )
'''
return samwrite( self.samfile, read._delegate )
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
property nreferences:
'''number of :term:`reference` sequences in the file.'''
def __get__(self):
property text:
'''full contents of the :term:`sam file` header as a string.'''
def __get__(self):
- # create a temporary 0-terminated copy
- cdef char * t
- t = <char*>calloc( self.samfile.header.l_text + 1, sizeof(char) )
- memcpy( t, self.samfile.header.text, self.samfile.header.l_text )
- result = t
- free(t)
- return result
+ return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
property header:
'''header information within the :term:`sam file`. The records and fields are returned as
'''return true if samfile has been opened.'''
return self.fastafile != NULL
+ def __len__(self):
+ assert self.fastafile != NULL
+ return faidx_fetch_nseq(self.fastafile)
+
def _open( self,
char * filename ):
'''open an indexed fasta file.
'''*(reference = None, start = None, end = None, region = None)*
- fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
+ fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.
+ The region is specified by :term:`reference`, *start* and *end*.
+
+ If *reference* is given and *start* is None, the sequence from the
+ first base is returned. Similarly, if *end* is None, the sequence
+ until the last base is returned.
+
+ Alternatively, a samtools :term:`region` string can be supplied.
'''
if not self._isOpen():
max_pos = 2 << 29
if not region:
- if reference == None: raise ValueError( 'no sequence/region supplied.' )
- if start == None and end == None:
- region = "%s" % str(reference)
- elif start == None or end == None:
- raise ValueError( 'only start or only end of region supplied' )
- else:
- if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
- # valid ranges are from 0 to 2^29-1
- if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
- if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
- region = "%s:%i-%i" % (reference, start+1, end )
-
- # samtools adds a '\0' at the end
- seq = fai_fetch( self.fastafile, region, &len )
+ if reference is None: raise ValueError( 'no sequence/region supplied.' )
+ if start is None: start = 0
+ if end is None: end = max_pos -1
+
+ if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
+ if start == end: return ""
+ # valid ranges are from 0 to 2^29-1
+ if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
+ if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
+
+ seq = faidx_fetch_seq(self.fastafile, reference,
+ start,
+ end-1, &len)
+ else:
+ # samtools adds a '\0' at the end
+ seq = fai_fetch( self.fastafile, region, &len )
+
# copy to python
- result = seq
- # clean up
- free(seq)
+ if seq == NULL:
+ return ""
+ else:
+ result = seq
+ # clean up
+ free(seq)
return result
+###########################################################################
+###########################################################################
+###########################################################################
## turning callbacks elegantly into iterators is an unsolved problem, see the following threads:
## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1
## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html
## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code
## changes, the changes have to be manually entered.
-
cdef class IteratorRow:
"""iterates over mapped reads in a region.
+
+ The samtools iterators assume that the file
+ position between iterations do not change.
+ As a consequence, no two iterators can work
+ on the same file. To permit this, each iterator
+ creates its own file handle by re-opening the
+ file.
+
+ Note that the index will be shared between
+ samfile and the iterator.
"""
- cdef bam_fetch_iterator_t* bam_iter # iterator state object
+ cdef bam_iter_t iter # iterator state object
cdef bam1_t * b
- cdef error_msg
- cdef int error_state
+ cdef int retval
cdef Samfile samfile
+ cdef samfile_t * fp
+
def __cinit__(self, Samfile samfile, int tid, int beg, int end ):
- self.bam_iter = NULL
assert samfile._isOpen()
assert samfile._hasIndex()
# makes sure that samfile stays alive as long as the
- # iterator is alive.
+ # iterator is alive
self.samfile = samfile
- # parse the region
- self.error_state = 0
- self.error_msg = None
+ if samfile.isbam: mode = "rb"
+ else: mode = "r"
+
+ # reopen the file
+ store = StderrStore()
+ self.fp = samopen( samfile.filename, mode, NULL )
+ store.release()
- cdef bamFile fp
- fp = samfile.samfile.x.bam
- self.bam_iter = bam_init_fetch_iterator(fp, samfile.index, tid, beg, end)
+ self.retval = 0
+
+ self.iter = bam_iter_query(self.samfile.index,
+ tid,
+ beg,
+ end)
+ self.b = bam_init1()
def __iter__(self):
return self
cdef int cnext(self):
'''cversion of iterator. Used by IteratorColumn'''
- self.b = bam_fetch_iterate(self.bam_iter)
- if self.b == NULL: return 0
- return 1
-
+ self.retval = bam_iter_read( self.fp.x.bam,
+ self.iter,
+ self.b)
+
def __next__(self):
"""python version of next().
-
- pyrex uses this non-standard name instead of next()
"""
- if self.error_state:
- raise ValueError( self.error_msg)
-
- self.b = bam_fetch_iterate(self.bam_iter)
- if self.b != NULL:
- return makeAlignedRead( self.b )
- else:
- raise StopIteration
+ self.cnext()
+ if self.retval < 0: raise StopIteration
+ return makeAlignedRead( self.b )
def __dealloc__(self):
- '''remember: dealloc cannot call other methods!'''
- if self.bam_iter:
- bam_cleanup_fetch_iterator(self.bam_iter)
-
+ bam_destroy1(self.b)
+ samclose( self.fp )
+
cdef class IteratorRowAll:
"""iterates over all mapped reads
"""
assert samfile._isOpen()
- self.fp = samfile.samfile
+ if samfile.isbam: mode = "rb"
+ else: mode = "r"
+
+ # reopen the file to avoid iterator conflict
+ store = StderrStore()
+ self.fp = samopen( samfile.filename, mode, NULL )
+ store.release()
# allocate memory for alignment
self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
raise StopIteration
def __dealloc__(self):
- '''remember: dealloc cannot call other methods!'''
- pysam_bam_destroy1(self.b)
-
+ bam_destroy1(self.b)
+ samclose( self.fp )
+
+ctypedef struct __iterdata:
+ bamFile fp
+ bam_iter_t iter
+
+cdef int __advance( void * data, bam1_t * b ):
+ cdef __iterdata * d
+ d = <__iterdata*>data
+ return bam_iter_read( d.fp, d.iter, b )
+
cdef class IteratorColumn:
'''iterates over columns.
Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.
'''
- cdef bam_plbuf_t *buf
- # check if first iteration
- cdef int notfirst
# result of the last plbuf_push
- cdef int n_pu
- cdef int eof
cdef IteratorRow iter
-
+ cdef int tid
+ cdef int pos
+ cdef int n_plp
+ cdef bam_pileup1_t * plp
+ cdef bam_plp_t pileup_iter
+ cdef __iterdata iterdata
def __cinit__(self, Samfile samfile, int tid, int start, int end ):
self.iter = IteratorRow( samfile, tid, start, end )
- self.buf = bam_plbuf_init(NULL, NULL )
- self.n_pu = 0
- self.eof = 0
+ self.iterdata.fp = samfile.samfile.x.bam
+ self.iterdata.iter = self.iter.iter
+
+ self.pileup_iter = bam_plp_init( &__advance, &self.iterdata )
+ self.n_plp = 0
+ self.tid = 0
+ self.pos = 0
+ self.plp = NULL
def __iter__(self):
return self
cdef int cnext(self):
'''perform next iteration.
-
- return 1 if there is a buffer to emit. Return 0 for end of iteration.
'''
+ self.plp = bam_plp_auto( self.pileup_iter,
+ &self.tid,
+ &self.pos,
+ &self.n_plp )
- cdef int retval1, retval2
+ def __next__(self):
+ """python version of next().
- # pysam bam_plbuf_push returns:
- # 1: if buf is full and can be emitted
- # 0: if b has been added
- # -1: if there was an error
+ pyrex uses this non-standard name instead of next()
+ """
+ self.cnext()
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.plp == NULL:
+ raise StopIteration
- # check if previous plbuf was incomplete. If so, continue within
- # the loop and yield if necessary
- if self.n_pu > 0:
- self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1)
- if self.n_pu > 0: return 1
+ return makePileupProxy( self.plp, self.tid, self.pos, self.n_plp )
- if self.eof: return 0
+ def __dealloc__(self):
+ bam_plp_destroy(self.pileup_iter)
+
+cdef inline int32_t query_start(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p, op
+ cdef uint32_t k
+ cdef uint32_t start_offset = 0
+
+ if src.core.n_cigar:
+ cigar_p = bam1_cigar(src);
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op==BAM_CHARD_CLIP:
+ if start_offset!=0 and start_offset!=src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op==BAM_CSOFT_CLIP:
+ start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ return start_offset
+
+
+cdef inline int32_t query_end(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p, op
+ cdef uint32_t k
+ cdef uint32_t end_offset = src.core.l_qseq
+
+ if src.core.n_cigar>1:
+ cigar_p = bam1_cigar(src);
+ for k from src.core.n_cigar > k >= 1:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op==BAM_CHARD_CLIP:
+ if end_offset!=0 and end_offset!=src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op==BAM_CSOFT_CLIP:
+ end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
- # get next alignments and submit until plbuf indicates that
- # an new column has finished
- while self.n_pu == 0:
- retval1 = self.iter.cnext()
- # wrap up if no more input
- if retval1 == 0:
- self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0)
- self.eof = 1
- return self.n_pu
+ if end_offset==0:
+ end_offset = src.core.l_qseq
- # submit to plbuf
- self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0)
- if self.n_pu < 0: raise ValueError( "error while iterating" )
+ return end_offset
- # plbuf has yielded
- return 1
- def __next__(self):
- """python version of next().
+cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * s
+ cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
- pyrex uses this non-standard name instead of next()
- """
- cdef int ret
- ret = self.cnext()
- cdef bam_pileup1_t * pl
+ if not src.core.l_qseq:
+ return None
- if ret > 0 :
- return makePileupProxy( self.buf, self.n_pu )
- else:
- raise StopIteration
+ seq = PyString_FromStringAndSize(NULL, end-start)
+ s = PyString_AS_STRING(seq)
+ p = bam1_seq(src)
- def __dealloc__(self):
- bam_plbuf_destroy(self.buf);
+ for k from start <= k < end:
+ # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
+ # note: do not use string literal as it will be a python string
+ s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
+
+ return seq
+
+
+cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * q
+
+ p = bam1_qual(src)
+ if p[0] == 0xff:
+ return None
+
+ qual = PyString_FromStringAndSize(NULL, end-start)
+ q = PyString_AS_STRING(qual)
+
+ for k from start <= k < end:
+ ## equivalent to t[i] + 33 (see bam.c)
+ q[k-start] = p[k] + 33
+
+ return qual
cdef class AlignedRead:
'''
self._delegate.data_len = 0
def __dealloc__(self):
- '''clear up memory.'''
- pysam_bam_destroy1(self._delegate)
+ bam_destroy1(self._delegate)
def __str__(self):
"""todo"""
self.tags)))
- def __cmp__(self, AlignedRead other):
- '''return true, if contents in this are binary equal to ``other``.'''
+ def compare(self, AlignedRead other):
+ '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
+
cdef int retval, x
cdef bam1_t *t, *o
+
t = self._delegate
o = other._delegate
# oo = <unsigned char*>(o.data)
# for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
- retval = memcmp( &t.core,
- &o.core,
- sizeof( bam1_core_t ))
+ # Fast-path test for object identity
+ if t==o:
+ return 0
+
+ retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
if retval: return retval
- retval = cmp( t.data_len, o.data_len)
+ retval = cmp(t.data_len, o.data_len)
if retval: return retval
- return memcmp( t.data,
- o.data,
- sizeof( t.data_len ))
+ return memcmp(t.data, o.data, t.data_len)
+
+ # Disabled so long as __cmp__ is a special method
+ def __hash__(self):
+ return _Py_HashPointer(<void *>self)
property qname:
"""the query name (None if not present)"""
cdef bam1_t * src
src = self._delegate
if src.core.l_qname == 0: return None
- return <char *>pysam_bam1_qname( src )
+ return <char *>bam1_qname( src )
def __set__(self, qname ):
if qname == None or len(qname) == 0: return
cdef char * p
src = self._delegate
- p = pysam_bam1_qname( src )
+ p = bam1_qname( src )
# the qname is \0 terminated
l = len(qname) + 1
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_qname(src)
+ p = bam1_qname(src)
strncpy( p, qname, l )
cdef uint32_t * cigar_p
cdef bam1_t * src
cdef op, l, cigar
+ cdef int k
+
src = self._delegate
if src.core.n_cigar == 0: return None
cigar = []
- cigar_p = pysam_bam1_cigar(src);
+ cigar_p = bam1_cigar(src);
for k from 0 <= k < src.core.n_cigar:
op = cigar_p[k] & BAM_CIGAR_MASK
l = cigar_p[k] >> BAM_CIGAR_SHIFT
src = self._delegate
# get location of cigar string
- p = pysam_bam1_cigar(src)
+ p = bam1_cigar(src)
# create space for cigar data within src.data
pysam_bam_update( src,
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_cigar(src)
+ p = bam1_cigar(src)
# insert cigar operations
for op, l in values:
src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
property seq:
- """the query sequence (None if not present)"""
+ """read sequence bases, including :term:`soft clipped` bases (None if not present)"""
def __get__(self):
cdef bam1_t * src
- cdef uint8_t * p
cdef char * s
+
src = self._delegate
- bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
- ## parse qseq (bam1_seq)
+
if src.core.l_qseq == 0: return None
- s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char))
- p = pysam_bam1_seq( src )
- for k from 0 <= k < src.core.l_qseq:
- ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
- s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)]
- retval=s
- free(s)
- return retval
+ return get_seq_range(src, 0, src.core.l_qseq)
def __set__(self,seq):
# samtools manages sequence and quality length memory together
cdef bam1_t * src
cdef uint8_t * p
cdef char * s
- src = self._delegate
cdef int l, k, nbytes_new, nbytes_old
+ src = self._delegate
+
l = len(seq)
# as the sequence is stored in half-bytes, the total length (sequence
nbytes_new = (l+1)/2 + l
nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
# acquire pointer to location in memory
- p = pysam_bam1_seq( src )
+ p = bam1_seq( src )
src.core.l_qseq = l
pysam_bam_update( src,
p)
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_seq( src )
+ p = bam1_seq( src )
for k from 0 <= k < nbytes_new: p[k] = 0
# convert to C string
s = seq
p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
# erase qualities
- p = pysam_bam1_qual( src )
+ p = bam1_qual( src )
p[0] = 0xff
+
property qual:
- """the base quality (None if not present)"""
+ """read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
def __get__(self):
- cdef bam1_t * src
- cdef uint8_t * p
+
+ cdef bam1_t * src
cdef char * q
+
src = self._delegate
- if src.core.l_qseq == 0: return None
- p = pysam_bam1_qual( src )
- if p[0] == 0xff: return None
+ if src.core.l_qseq == 0: return None
- q = < char *>calloc(src.core.l_qseq + 1 , sizeof(char))
- for k from 0 <= k < src.core.l_qseq:
- ## equivalent to t[i] + 33 (see bam.c)
- q[k] = p[k] + 33
- # convert to python string
- retval=q
- # clean up
- free(q)
- return retval
+ return get_qual_range(src, 0, src.core.l_qseq)
def __set__(self,qual):
# note that space is already allocated via the sequences
cdef bam1_t * src
cdef uint8_t * p
cdef char * q
+ cdef int k
+
src = self._delegate
- p = pysam_bam1_qual( src )
+ p = bam1_qual( src )
if qual == None or len(qual) == 0:
# if absent - set to 0xff
p[0] = 0xff
for k from 0 <= k < l:
p[k] = <uint8_t>q[k] - 33
+ property query:
+ """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present)
+
+ SAM/BAM files may included extra flanking bases sequences that were
+ not part of the alignment. These bases may be the result of the
+ Smith-Waterman or other algorithms, which may not require alignments
+ that begin at the first residue or end at the last. In addition,
+ extra sequencing adapters, multiplex identifiers, and low-quality bases that
+ were not considered for alignment may have been retained."""
+
+ def __get__(self):
+ cdef bam1_t * src
+ cdef uint32_t start, end
+ cdef char * s
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0: return None
+
+ start = query_start(src)
+ end = query_end(src)
+
+ return get_seq_range(src, start, end)
+
+ property qqual:
+ """aligned query sequence quality values (None if not present)"""
+ def __get__(self):
+ cdef bam1_t * src
+ cdef uint32_t start, end
+ cdef char * q
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0: return None
+
+ start = query_start(src)
+ end = query_end(src)
+
+ return get_qual_range(src, start, end)
+
+ property qstart:
+ """start index of the aligned query portion of the sequence (0-based, inclusive)"""
+ def __get__(self):
+ return query_start(self._delegate)
+
+ property qend:
+ """end index of the aligned query portion of the sequence (0-based, exclusive)"""
+ def __get__(self):
+ return query_end(self._delegate)
+
+ property qlen:
+ """Length of the aligned query sequence"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ return query_end(src)-query_start(src)
+
property tags:
- """the tags in the AUX field."""
+ """the tags in the AUX field.
+ This property permits convenience access to
+ the tags. Changes it the returned list will
+ not update the tags automatically. Instead,
+ the following is required for adding a
+ new tag::
+
+ read.tags = read.tags + [("RG",0)]
+
+ """
def __get__(self):
cdef char * ctag
cdef bam1_t * src
src = self._delegate
if src.l_aux == 0: return None
- s = pysam_bam1_aux( src )
+ s = bam1_aux( src )
result = []
ctag = <char*>calloc( 3, sizeof(char) )
cdef int x
# how do I do char literal comparison in cython?
# the code below works (i.e, is C comparison)
tpe = toupper(s[0])
- if tpe == 'S'[0]:
+ if tpe == 'S':
value = <int>bam_aux2i(s)
s += 2
- elif tpe == 'I'[0]:
+ elif tpe == 'I':
value = <int>bam_aux2i(s)
s += 4
- elif tpe == 'F'[0]:
+ elif tpe == 'F':
value = <float>bam_aux2f(s)
s += 4
- elif tpe == 'D'[0]:
+ elif tpe == 'D':
value = <double>bam_aux2d(s)
s += 8
- elif tpe == 'C'[0]:
+ elif tpe == 'C':
value = <int>bam_aux2i(s)
s += 1
- elif tpe == 'A'[0]:
+ elif tpe == 'A':
# there might a more efficient way
# to convert a char into a string
value = "%c" % <char>bam_aux2A(s)
s += 1
- elif tpe == 'Z'[0]:
+ elif tpe == 'Z':
value = <char*>bam_aux2Z(s)
# +1 for NULL terminated string
s += len(value) + 1
pysam_bam_update( src,
src.l_aux,
offset,
- pysam_bam1_aux( src ) )
+ bam1_aux( src ) )
src.l_aux = offset
if offset == 0: return
# get location of new data
- s = pysam_bam1_aux( src )
+ s = bam1_aux( src )
# check if there is direct path from buffer.raw to tmp
cdef char * temp
cdef bam1_t * src
src = self._delegate
if src.core.n_cigar:
- src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) )
+ src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
else:
src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
self._delegate.core.pos = pos
property rlen:
'''length of the read (read only). Returns 0 if not given.'''
def __get__(self): return self._delegate.core.l_qseq
+ property aend:
+ '''aligned end position of the read (read only). Returns
+ None if not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
+ return None
+ return bam_calend(&src.core, bam1_cigar(src))
+ property alen:
+ '''aligned length of the read (read only). Returns None if
+ not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
+ return None
+ return bam_calend(&src.core,
+ bam1_cigar(src)) - \
+ self._delegate.core.pos
+
property mapq:
"""mapping quality"""
def __get__(self): return self._delegate.core.qual
If the underlying engine iterator advances, the results of this column
will change.
'''
- cdef bam_plbuf_t * buf
+ cdef bam_pileup1_t * plp
+ cdef int tid
+ cdef int pos
cdef int n_pu
-
+
def __cinit__(self ):
pass
property tid:
'''the chromosome ID as is defined in the header'''
- def __get__(self): return pysam_get_tid( self.buf )
+ def __get__(self): return self.tid
property n:
'''number of reads mapping to this column.'''
def __set__(self, n): self.n_pu = n
property pos:
- def __get__(self): return pysam_get_pos( self.buf )
+ def __get__(self): return self.pos
property pileups:
'''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
def __get__(self):
- cdef bam_pileup1_t * pl
- pl = pysam_get_pileup( self.buf )
+ cdef int x
pileups = []
# warning: there could be problems if self.n and self.buf are
# out of sync.
for x from 0 <= x < self.n_pu:
- pileups.append( makePileupRead( &pl[x]) )
+ pileups.append( makePileupRead( &(self.plp[x])) )
return pileups
cdef class PileupRead: