import collections
import re
import platform
-from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
-from cpython cimport PyErr_SetString
+import warnings
+from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize
+from cpython.version cimport PY_MAJOR_VERSION
+
+########################################################################
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+cdef from_string_and_size(char* s, size_t length):
+ if PY_MAJOR_VERSION < 3:
+ return s[:length]
+ else:
+ return s[:length].decode("ascii")
+
+# filename encoding (copied from lxml.etree.pyx)
+cdef str _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = 'ascii'
+
+#cdef char* _C_FILENAME_ENCODING
+#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
+
+cdef bytes _my_encodeFilename(object filename):
+ u"""Make sure a filename is 8-bit encoded (or None).
+ """
+ if filename is None:
+ return None
+ elif PyBytes_Check(filename):
+ return filename
+ elif PyUnicode_Check(filename):
+ return filename.encode(_FILENAME_ENCODING)
+ else:
+ raise TypeError, u"Argument must be string or unicode."
-#from cpython.string cimport PyString_FromStringAndSize, PyString_AS_STRING
-#from cpython.exc cimport PyErr_SetString, PyErr_NoMemory
+cdef bytes _force_bytes(object s):
+ u"""convert string or unicode object to bytes, assuming ascii encoding.
+ """
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif s is None:
+ return None
+ elif PyBytes_Check(s):
+ return s
+ elif PyUnicode_Check(s):
+ return s.encode('ascii')
+ else:
+ raise TypeError, u"Argument must be string, bytes or unicode."
+
+cdef inline bytes _force_cmdline_bytes(object s):
+ return _force_bytes(s)
+
+cdef _charptr_to_str(char* s):
+ if PY_MAJOR_VERSION < 3:
+ return s
+ else:
+ return s.decode("ascii")
+
+cdef _force_str(object s):
+ """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
+ if s is None:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif PyBytes_Check(s):
+ return s.decode('ascii')
+ else:
+ # assume unicode
+ return s
+########################################################################
+########################################################################
+########################################################################
+## Constants and global variables
+########################################################################
# defines imported from samtools
DEF SEEK_SET = 0
DEF SEEK_CUR = 1
## @abstract optical or PCR duplicate */
DEF BAM_FDUP =1024
+#####################################################################
+# CIGAR operations
DEF BAM_CIGAR_SHIFT=4
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
DEF BAM_CSOFT_CLIP = 4
DEF BAM_CHARD_CLIP = 5
DEF BAM_CPAD = 6
+DEF BAM_CEQUAL = 7
+DEF BAM_CDIFF = 8
+
+cdef char* CODE2CIGAR= "MIDNSHP=X"
+if IS_PYTHON3:
+ CIGAR2CODE = dict( [y,x] for x,y in enumerate( CODE2CIGAR) )
+else:
+ CIGAR2CODE = dict( [ord(y),x] for x,y in enumerate( CODE2CIGAR) )
+CIGAR_REGEX = re.compile( "([MIDNSHP=X])(\d+)" )
+
+#####################################################################
+## set pysam stderr to /dev/null
+pysam_unset_stderr()
#####################################################################
# hard-coded constants
cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
cdef int max_pos = 2 << 29
-# redirect stderr to 0
-_logfile = open(os.path.devnull, "w")
-pysam_set_stderr( PyFile_AsFile( _logfile ) )
-
#####################################################################
#####################################################################
#####################################################################
return dest
cdef class PileupProxy
-cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
+cdef makePileupProxy( bam_pileup1_t ** plp, int tid, int pos, int n ):
cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
dest.plp = plp
dest.tid = tid
dest._is_tail = src.is_tail
return dest
+cdef convertBinaryTagToList( uint8_t * s ):
+    """return byte size, number of values and list of values in s."""
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
+ # get byte size
+ auxtype = s[0]
+ byte_size = bam_aux_type2size( auxtype )
+ s += 1
+ # get number of values in array
+ nvalues = (<int32_t*>s)[0]
+ s += 4
+ # get values
+ values = []
+ if auxtype == 'c':
+ for x from 0 <= x < nvalues:
+ values.append((<int8_t*>s)[0])
+ s += 1
+ elif auxtype == 'C':
+ for x from 0 <= x < nvalues:
+ values.append((<uint8_t*>s)[0])
+ s += 1
+ elif auxtype == 's':
+ for x from 0 <= x < nvalues:
+ values.append((<int16_t*>s)[0])
+ s += 2
+ elif auxtype == 'S':
+ for x from 0 <= x < nvalues:
+ values.append((<uint16_t*>s)[0])
+ s += 2
+ elif auxtype == 'i':
+ for x from 0 <= x < nvalues:
+ values.append((<int32_t*>s)[0])
+ s += 4
+ elif auxtype == 'I':
+ for x from 0 <= x < nvalues:
+ values.append((<uint32_t*>s)[0])
+ s += 4
+ elif auxtype == 'f':
+ for x from 0 <= x < nvalues:
+ values.append((<float*>s)[0])
+ s += 4
+
+ return byte_size, nvalues, values
+
#####################################################################
#####################################################################
#####################################################################
## Generic callbacks for inserting python callbacks.
#####################################################################
cdef int fetch_callback( bam1_t *alignment, void *f):
- '''callback for bam_fetch.
-
+ '''callback for bam_fetch.
+
calls function in *f* with a new :class:`AlignedRead` object as parameter.
'''
a = makeAlignedRead( alignment )
(<object>f)(a)
-class PileupColumn(object):
- '''A pileup column. A pileup column contains
+class PileupColumn(object):
+ '''A pileup column. A pileup column contains
all the reads that map to a certain target base.
- tid
- chromosome ID as is defined in the header
- pos
- the target base coordinate (0-based)
- n
- number of reads mapping to this column
- pileups
- list of reads (:class:`pysam.PileupRead`) aligned to this column
- '''
- def __str__(self):
+ tid
+ chromosome ID as is defined in the header
+ pos
+ the target base coordinate (0-based)
+ n
+ number of reads mapping to this column
+ pileups
+ list of reads (:class:`pysam.PileupRead`) aligned to this column
+ '''
+ def __str__(self):
return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
"\n" + "\n".join( map(str, self.pileups) )
for x from 0 <= x < n:
pileups.append( makePileupRead( &(pl[x]) ) )
p.pileups = pileups
-
+
(<object>f)(p)
cdef int pileup_fetch_callback( bam1_t *b, void *data):
- '''callback for bam_fetch.
+ '''callback for bam_fetch.
Fetches reads and submits them to pileup.
'''
class StderrStore():
'''
- stderr is captured.
+ stderr is captured.
'''
def __init__(self):
return
self.stderr_h, self.stderr_f = tempfile.mkstemp()
self.stderr_save = Outs( sys.stderr.fileno() )
self.stderr_save.setfd( self.stderr_h )
-
+
def readAndRelease( self ):
return []
self.stderr_save.restore()
######################################################################
######################################################################
# valid types for sam headers
-VALID_HEADER_TYPES = { "HD" : dict,
- "SQ" : list,
- "RG" : list,
- "PG" : list,
+VALID_HEADER_TYPES = { "HD" : dict,
+ "SQ" : list,
+ "RG" : list,
+ "PG" : list,
"CO" : list }
# order of records within sam headers
# type conversions within sam header records
VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
"SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
- "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
+ "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str,
+ "CN" : str, "DT" : str, "PL" : str, "FO" : str, "KS" : str },
"PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }
# output order of fields within records
VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
"SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
- "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
+ "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL", "FO", "KS" ),
"PG" : ( "PN", "ID", "VN", "CL" ), }
######################################################################
cdef class Fastafile:
'''*(filename)*
-
+
A *FASTA* file. The file is automatically opened.
The file expects an indexed fasta file.
- TODO:
+ TODO:
add automatic indexing.
add function to get sequence names.
'''
- cdef char * _filename
- # pointer to fastafile
- cdef faidx_t * fastafile
-
def __cinit__(self, *args, **kwargs ):
self.fastafile = NULL
self._filename = NULL
return faidx_fetch_nseq(self.fastafile)
- def _open( self,
- char * filename ):
+ def _open( self,
+ filename ):
'''open an indexed fasta file.
This method expects an indexed fasta file.
# close a previously opened file
if self.fastafile != NULL: self.close()
if self._filename != NULL: free(self._filename)
+ filename = _my_encodeFilename(filename)
self._filename = strdup(filename)
self.fastafile = fai_load( filename )
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
return self._filename
- def fetch( self,
- reference = None,
- start = None,
+ def fetch( self,
+ reference = None,
+ start = None,
end = None,
region = None):
-
+
'''*(reference = None, start = None, end = None, region = None)*
-
- fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.
-
- The region is specified by :term:`reference`, *start* and *end*.
-
+
+    fetch sequences in a :term:`region` using 0-based indexing.
+
+ The region is specified by :term:`reference`, *start* and *end*.
+
fetch returns an empty string if the region is out of range or addresses an unknown *reference*.
- If *reference* is given and *start* is None, the sequence from the
- first base is returned. Similarly, if *end* is None, the sequence
+ If *reference* is given and *start* is None, the sequence from the
+ first base is returned. Similarly, if *end* is None, the sequence
until the last base is returned.
-
+
Alternatively, a samtools :term:`region` string can be supplied.
'''
-
+
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
if end is None: end = max_pos -1
if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
- if start == end: return ""
+ if start == end: return b""
# valid ranges are from 0 to 2^29-1
if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
# note: faidx_fetch_seq has a bug such that out-of-range access
# always returns the last residue. Hence do not use faidx_fetch_seq,
- # but use fai_fetch instead
- # seq = faidx_fetch_seq(self.fastafile,
- # reference,
+ # but use fai_fetch instead
+ # seq = faidx_fetch_seq(self.fastafile,
+ # reference,
# start,
- # end-1,
+ # end-1,
# &length)
region = "%s:%i-%i" % (reference, start+1, end)
- seq = fai_fetch( self.fastafile,
+ if PY_MAJOR_VERSION >= 3:
+ region = region.encode('ascii')
+ seq = fai_fetch( self.fastafile,
region,
&length )
else:
# copy to python
if seq == NULL:
- return ""
+ return b""
else:
try:
- py_seq = PyString_FromStringAndSize(seq, length)
+ py_seq = seq[:length]
finally:
free(seq)
cdef char * _fetch( self, char * reference, int start, int end, int * length ):
'''fetch sequence for reference, start and end'''
-
- return faidx_fetch_seq(self.fastafile,
- reference,
+
+ return faidx_fetch_seq(self.fastafile,
+ reference,
start,
- end-1,
+ end-1,
length )
#------------------------------------------------------------------------
'''callback for bam_fetch = filter mate
'''
cdef MateData * d = (<MateData*>f)
- # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
+ # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
# d.mate, d.name, bam1_qname(alignment),
# d.flag, alignment.core.flag, alignment.core.flag & d.flag)
- if d.mate == NULL:
+ if d.mate == NULL:
# could be sped up by comparing the lengths of query strings first
# using l_qname
#
- # also, make sure that we get the other read by comparing
+ # also, make sure that we get the other read by comparing
# the flags
if alignment.core.flag & d.flag != 0 and \
strcmp( bam1_qname( alignment ), d.name ) == 0:
cdef class Samfile:
- '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
-
+ '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None,
+ add_sq_text = False, check_header = True, check_sq = True )*
+
A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.
-
- *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
- (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
+
+ *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
+ (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
Use ``h`` to output header information in text (:term:`TAM`) mode.
- If ``b`` is present, it must immediately follow ``r`` or ``w``.
- Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
+ If ``b`` is present, it must immediately follow ``r`` or ``w``.
+ Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
a :term:`BAM` formatted file for reading, type::
f = pysam.Samfile('ex1.bam','rb')
- If mode is not specified, we will try to auto-detect in the order 'r', 'rb', thus both the following
+ If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following
should work::
f1 = pysam.Samfile('ex1.bam' )
- f2 = pysam.Samfile('ex1.bam' )
+ f2 = pysam.Samfile('ex1.sam' )
If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
sources (see also the samtools format specification):
- 1. If *template* is given, the header is copied from a another *Samfile*
+ 1. If *template* is given, the header is copied from a another *Samfile*
(*template* must be of type *Samfile*).
- 2. If *header* is given, the header is built from a multi-level dictionary. The first level
- are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
- being a list of tag-value pairs.
+ 2. If *header* is given, the header is built from a multi-level dictionary. The first level
+ are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
+ being a list of tag-value pairs. The header is constructed first from all the defined fields,
+ followed by user tags in alphabetical order.
3. If *text* is given, new header text is copied from raw text.
4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
+ By default, 'SQ' and 'LN' tags will be added to the header text. This option can be
+ changed by unsetting the flag *add_sq_text*.
+    By default, if a file is opened in mode 'r', it is checked for a valid header
+ (*check_header* = True) and a definition of chromosome names (*check_sq* = True).
+
'''
def __cinit__(self, *args, **kwargs ):
self.samfile = NULL
self._filename = NULL
self.isbam = False
+ self.isstream = False
self._open( *args, **kwargs )
# allocate memory for iterator
'''return true if samfile has an existing (and opened) index.'''
return self.index != NULL
- def _open( self,
- char * filename,
+ def _open( self,
+ filename,
mode = None,
Samfile template = None,
referencenames = None,
text = None,
header = None,
port = None,
+ add_sq_text = True,
+ check_header = True,
+ check_sq = True,
):
'''open a sam/bam file.
# read mode autodetection
if mode is None:
try:
- self._open(filename, 'r', template=template,
+ self._open(filename, 'rb', template=template,
referencenames=referencenames,
referencelengths=referencelengths,
- text=text, header=header, port=port)
+ text=text, header=header, port=port,
+ check_header=check_header,
+ check_sq=check_sq)
return
except ValueError, msg:
pass
- self._open(filename, 'rb', template=template,
+
+ self._open(filename, 'r', template=template,
referencenames=referencenames,
referencelengths=referencelengths,
- text=text, header=header, port=port)
+ text=text, header=header, port=port,
+ check_header=check_header,
+ check_sq=check_sq)
return
- assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
- assert filename != NULL
+ assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode
+
+ #assert filename != NULL
# close a previously opened file
if self.samfile != NULL: self.close()
cdef bam_header_t * header_to_write
header_to_write = NULL
-
+
if self._filename != NULL: free(self._filename )
- self._filename = strdup( filename )
+ filename = _my_encodeFilename(filename)
+ cdef bytes bmode = mode.encode('ascii')
+ #cdef char* cfilename
+ #cfilename = filename.encode(_FILENAME_ENCODING)
+ self._filename = strdup(filename)
+ self.isstream = strcmp( filename, "-" ) == 0
self.isbam = len(mode) > 1 and mode[1] == 'b'
self.isremote = strncmp(filename,"http:",5) == 0 or \
- strncmp(filename,"ftp:",4) == 0
+ strncmp(filename,"ftp:",4) == 0
cdef char * ctext
ctext = NULL
if mode[0] == 'w':
# open file for writing
-
+
# header structure (used for writing)
if template:
# copy header from another file
assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
# allocate and fill header
+ referencenames = [ _force_bytes(ref) for ref in referencenames ]
header_to_write = bam_header_init()
header_to_write.n_targets = len(referencenames)
n = 0
header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
strncpy( header_to_write.target_name[x], name, len(name) )
+ # Optionally, if there is no text, add a SAM compatible header to output
+ # file.
+ if text is None and add_sq_text:
+ text = []
+ for x from 0 <= x < header_to_write.n_targets:
+ text.append( "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] ) )
+ text = ''.join(text)
+
if text != None:
# copy without \0
+ text = _force_bytes(text)
ctext = text
header_to_write.l_text = strlen(ctext)
header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
header_to_write.hash = NULL
header_to_write.rg2lib = NULL
-
+
# open file. Header gets written to file at the same time for bam files
# and sam files (in the latter case, the mode needs to be wh)
store = StderrStore()
- self.samfile = samopen( filename, mode, header_to_write )
+ self.samfile = samopen( filename, bmode, header_to_write )
store.release()
# bam_header_destroy takes care of cleaning up of all the members
raise IOError( "file `%s` not found" % filename)
# try to detect errors
- self.samfile = samopen( filename, mode, NULL )
+ self.samfile = samopen( filename, bmode, NULL )
if self.samfile == NULL:
raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)
- if self.samfile.header == NULL:
- raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode )
-
- #disabled for autodetection to work
- if self.samfile.header.n_targets == 0:
+ # bam files require a valid header
+ if self.isbam:
+ if self.samfile.header == NULL:
+ raise ValueError( "file does not have valid header (mode='%s') - is it BAM format?" % mode )
+ else:
+ # in sam files it is optional (samfile full of unmapped reads)
+ if check_header and self.samfile.header == NULL:
+ raise ValueError( "file does not have valid header (mode='%s') - is it SAM format?" % mode )
+
+ # disabled for autodetection to work
+ # needs to be disabled so that reading from sam-files without headers works
+ if check_sq and self.samfile.header.n_targets == 0:
raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)
if self.samfile == NULL:
if mode[0] == "r" and self.isbam:
if not self.isremote:
- if not os.path.exists(filename +".bai"):
+ if not os.path.exists(filename + b".bai"):
self.index = NULL
else:
# returns NULL if there is no index or index could not be opened
self.index = bam_index_load(filename)
if self.index == NULL:
raise IOError("error while opening index `%s` " % filename )
-
+
+ if not self.isstream:
+ self.start_offset = bam_tell( self.samfile.x.bam )
+
def gettid( self, reference ):
'''
convert :term:`reference` name into numerical :term:`tid`
returns -1 if reference is not known.
'''
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ reference = _force_bytes(reference)
return pysam_reference2tid( self.samfile.header, reference )
def getrname( self, tid ):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
if not 0 <= tid < self.samfile.header.n_targets:
raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
- return self.samfile.header.target_name[tid]
+ return _charptr_to_str(self.samfile.header.target_name[tid])
- cdef char * _getrname( self, int tid ):
+ cdef char * _getrname( self, int tid ): # TODO unused
'''
convert numerical :term:`tid` into :term:`reference` name.'''
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
return self.samfile.header.target_name[tid]
- def _parseRegion( self,
- reference = None,
- start = None,
+ def _parseRegion( self,
+ reference = None,
+ start = None,
end = None,
region = None ):
'''
Note that regions are 1-based, while start,end are python coordinates.
'''
- # This method's main objective is to translate from a reference to a tid.
+ # This method's main objective is to translate from a reference to a tid.
# For now, it calls bam_parse_region, which is clumsy. Might be worth
# implementing it all in pysam (makes use of khash).
-
+
cdef int rtid
cdef long long rstart
cdef long long rend
rtid = -1
rstart = 0
rend = max_pos
- if start != None:
+ if start != None:
try:
rstart = start
except OverflowError:
raise ValueError( 'start out of range (%i)' % start )
-
- if end != None:
+
+ if end != None:
try:
rend = end
except OverflowError:
raise ValueError( 'end out of range (%i)' % end )
if region:
+ region = _force_str(region)
parts = re.split( "[:-]", region )
reference = parts[0]
if len(parts) >= 2: rstart = int(parts[1]) - 1
if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )
return 1, rtid, rstart, rend
-
+
+ def reset( self ):
+ '''reset file position to beginning of read section.'''
+ return self.seek( self.start_offset, 0 )
+
def seek( self, uint64_t offset, int where = 0):
'''
move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
raise ValueError( "I/O operation on closed file" )
if not self.isbam:
raise NotImplementedError("seek only available in bam files")
+ if self.isstream:
+ raise OSError("seek no available in streams")
+
return bam_seek( self.samfile.x.bam, offset, where )
def tell( self ):
return bam_tell( self.samfile.x.bam )
- def fetch( self,
- reference = None,
- start = None,
- end = None,
- region = None,
+ def fetch( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
callback = None,
until_eof = False ):
'''
fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
+ :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
be supplied.
- Without *reference* or *region* all reads will be fetched. The reads will be returned
+ Without *reference* or *region* all mapped reads will be fetched. The reads will be returned
ordered by reference sequence, which will not necessarily be the order within the file.
+
If *until_eof* is given, all reads from the current file position will be returned
- *in order as they are within the file*.
-
+ in order as they are within the file. Using this option will also fetch unmapped reads.
+
If only *reference* is set, all reads aligned to *reference* will be fetched.
The method returns an iterator of type :class:`pysam.IteratorRow` unless
- a *callback is provided. If *callback* is given, the callback will be executed
+        a *callback* is provided. If *callback* is given, the callback will be executed
for each position within the :term:`region`. Note that callbacks currently work
only, if *region* or *reference* is given.
raise ValueError( "I/O operation on closed file" )
has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
-
+
+ if self.isstream: reopen = False
+ else: reopen = True
+
if self.isbam:
- if not until_eof and not self._hasIndex() and not self.isremote:
+ if not until_eof and not self._hasIndex() and not self.isremote:
raise ValueError( "fetch called on bamfile without index" )
if callback:
if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
if not self._hasIndex(): raise ValueError( "no index available for fetch" )
- return bam_fetch(self.samfile.x.bam,
- self.index,
- rtid,
- rstart,
- rend,
- <void*>callback,
+ return bam_fetch(self.samfile.x.bam,
+ self.index,
+ rtid,
+ rstart,
+ rend,
+ <void*>callback,
fetch_callback )
else:
if has_coord:
- return IteratorRowRegion( self, rtid, rstart, rend )
+ return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen )
else:
if until_eof:
- return IteratorRowAll( self )
+ return IteratorRowAll( self, reopen=reopen )
else:
- return IteratorRowAllRefs(self)
- else:
- # check if header is present - otherwise sam_read1 aborts
- # this happens if a bamfile is opened with mode 'r'
- if self.samfile.header.n_targets == 0:
- raise ValueError( "fetch called for samfile without header")
-
- if region != None:
- raise ValueError ("fetch for a region is not available for sam files" )
+ # AH: check - reason why no reopen for AllRefs?
+ return IteratorRowAllRefs(self ) # , reopen=reopen )
+ else:
+ if has_coord:
+ raise ValueError ("fetching by region is not available for sam files" )
+
if callback:
raise NotImplementedError( "callback not implemented yet" )
- else:
- return IteratorRowAll( self )
- def mate( self,
+ if self.samfile.header == NULL:
+ raise ValueError( "fetch called for samfile without header")
+
+ # check if targets are defined
+ # give warning, sam_read1 segfaults
+ if self.samfile.header.n_targets == 0:
+ warnings.warn( "fetch called for samfile without header")
+
+ return IteratorRowAll( self, reopen=reopen )
+
+ def mate( self,
AlignedRead read ):
'''return the mate of :class:`AlignedRead` *read*.
raise ValueError( "read %s: is unpaired" % (read.qname))
if flag & BAM_FMUNMAP != 0:
raise ValueError( "mate %s: is unmapped" % (read.qname))
-
+
cdef MateData mate_data
mate_data.name = <char *>bam1_qname(read._delegate)
cdef int x = BAM_FREAD1 + BAM_FREAD2
mate_data.flag = ( flag ^ x) & x
- bam_fetch(self.samfile.x.bam,
- self.index,
- read._delegate.core.mtid,
+ bam_fetch(self.samfile.x.bam,
+ self.index,
+ read._delegate.core.mtid,
read._delegate.core.mpos,
read._delegate.core.mpos + 1,
- <void*>&mate_data,
+ <void*>&mate_data,
mate_callback )
if mate_data.mate == NULL:
dest._delegate = mate_data.mate
return dest
- def count( self,
- reference = None,
- start = None,
- end = None,
- region = None,
+ def count( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
until_eof = False ):
'''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
-
+
count reads :term:`region` using 0-based indexing. The region is specified by
:term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
-
+
region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
cdef int counter
counter = 0;
if self.isbam:
- if not until_eof and not self._hasIndex() and not self.isremote:
+ if not until_eof and not self._hasIndex() and not self.isremote:
raise ValueError( "fetch called on bamfile without index" )
if not region:
raise ValueError( "counting functionality requires a region/reference" )
if not self._hasIndex(): raise ValueError( "no index available for fetch" )
- bam_fetch(self.samfile.x.bam,
- self.index,
- rtid,
- rstart,
- rend,
- <void*>&counter,
+ bam_fetch(self.samfile.x.bam,
+ self.index,
+ rtid,
+ rstart,
+ rend,
+ <void*>&counter,
count_callback )
return counter
- else:
+ else:
raise ValueError ("count for a region is not available for sam files" )
- def pileup( self,
- reference = None,
- start = None,
- end = None,
- region = None,
+ def pileup( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
callback = None,
**kwargs ):
'''
perform a :term:`pileup` within a :term:`region`. The region is specified by
- :term:`reference`, *start* and *end* (using 0-based indexing).
+ :term:`reference`, *start* and *end* (using 0-based indexing).
Alternatively, a samtools *region* string can be supplied.
Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
The method returns an iterator of type :class:`pysam.IteratorColumn` unless
- a *callback is provided. If a *callback* is given, the callback will be executed
- for each column within the :term:`region`.
+        a *callback* is provided. If a *callback* is given, the callback will be executed
+ for each column within the :term:`region`.
- Note that :term:`SAM` formatted files do not allow random access.
+ Note that :term:`SAM` formatted files do not allow random access.
In these files, if a *region* or *reference* are given an exception is raised.
-
+
Optional *kwargs* to the iterator:
stepper
- The stepper controlls how the iterator advances.
- Possible options for the stepper are
-
+            The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
``all``
use all reads for pileup.
``samtools``
A :class:`FastaFile` object
mask
- Skip all reads with bits set in mask.
+ Skip all reads with bits set in mask if mask=True.
+
+ max_depth
+ Maximum read depth permitted. The default limit is *8000*.
+ truncate
+ By default, the samtools pileup engine outputs all reads overlapping a region (see note below).
+ If truncate is True and a region is given, only output columns in the exact region
+           specified.
.. note::
- *all* reads which overlap the region are returned. The first base returned will be the
+ *all* reads which overlap the region are returned. The first base returned will be the
first base of the first read *not* necessarily the first base of the region used in the query.
- The maximum number of reads considered for pileup is *8000*. This limit is set by
- :term:`csamtools`.
-
'''
cdef int rtid, rstart, rend, has_coord
cdef bam_plbuf_t *buf
if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
- bam_fetch(self.samfile.x.bam,
- self.index, rtid, rstart, rend,
+ bam_fetch(self.samfile.x.bam,
+ self.index, rtid, rstart, rend,
buf, pileup_fetch_callback )
-
+
# finalize pileup
bam_plbuf_push( NULL, buf)
bam_plbuf_destroy(buf)
else:
if has_coord:
- return IteratorColumnRegion( self,
- tid = rtid,
- start = rstart,
- end = rend,
+ return IteratorColumnRegion( self,
+ tid = rtid,
+ start = rstart,
+ end = rend,
**kwargs )
else:
return IteratorColumnAllRefs(self, **kwargs )
def __enter__(self):
return self
-
+
def __exit__(self, exc_type, exc_value, traceback):
self.close()
return False
def __get__(self):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
return self._filename
-
+
property nreferences:
'''number of :term:`reference` sequences in the file.'''
def __get__(self):
property references:
"""tuple with the names of :term:`reference` sequences."""
- def __get__(self):
+ def __get__(self):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
t = []
for x from 0 <= x < self.samfile.header.n_targets:
- t.append( self.samfile.header.target_name[x] )
+ t.append( _charptr_to_str(self.samfile.header.target_name[x]) )
return tuple(t)
property lengths:
- """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
+ """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
:attr:`pysam.Samfile.references`
"""
- def __get__(self):
+ def __get__(self):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
t = []
for x from 0 <= x < self.samfile.header.n_targets:
t.append( self.samfile.header.target_len[x] )
return tuple(t)
+ property mapped:
+ """total number of mapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" )
+
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_mapped( self.index, tid )
+ return total
+
+ property unmapped:
+ """total number of unmapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" )
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_unmapped( self.index, tid )
+ # get unmapped reads without coordinates
+ total += pysam_get_unmapped( self.index, -1 )
+ return total
+
property text:
'''full contents of the :term:`sam file` header as a string.'''
def __get__(self):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
- return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
+ return from_string_and_size(self.samfile.header.text, self.samfile.header.l_text)
property header:
- '''header information within the :term:`sam file`. The records and fields are returned as
+ '''header information within the :term:`sam file`. The records and fields are returned as
a two-level dictionary.
'''
def __get__(self):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
result = {}
-
+
if self.samfile.header.text != NULL:
# convert to python string (note: call self.text to create 0-terminated string)
t = self.text
if record not in result: result[record] = []
result[record].append( x )
+ # if there are no SQ lines in the header, add the reference names
+ # from the information in the bam file.
+ # Background: c-samtools keeps the textual part of the header separate from
+ # the list of reference names and lengths. Thus, if a header contains only
+ # SQ lines, the SQ information is not part of the textual header and thus
+ # are missing from the output. See issue 84.
+ if "SQ" not in result:
+ sq = []
+ for ref, length in zip( self.references, self.lengths ):
+ sq.append( {'LN': length, 'SN': ref } )
+ result["SQ"] = sq
+
return result
def _buildLine( self, fields, record ):
# TODO: add checking for field and sort order
line = ["@%s" % record ]
+ # comment
if record == "CO":
line.append( fields )
+ # user tags
+ elif record.islower():
+ for key in sorted(fields):
+ line.append( "%s:%s" % (key, str(fields[key])))
+ # defined tags
else:
# write fields of the specification
for key in VALID_HEADER_ORDER[record]:
if not key.isupper():
line.append( "%s:%s" % (key, str(fields[key])))
- return "\t".join( line )
+ return "\t".join( line )
cdef bam_header_t * _buildHeader( self, new_header ):
'''return a new header built from a dictionary in *new_header*.
cdef bam_header_t * dest
dest = bam_header_init()
-
+
+ # first: defined tags
for record in VALID_HEADERS:
if record in new_header:
ttype = VALID_HEADER_TYPES[record]
data = new_header[record]
if type( data ) != type( ttype() ):
raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
- if type( data ) == types.DictType:
+ if type( data ) is dict:
lines.append( self._buildLine( data, record ) )
else:
for fields in new_header[record]:
lines.append( self._buildLine( fields, record ) )
+ # then: user tags (lower case), sorted alphabetically
+ for record, data in sorted(new_header.items()):
+ if record in VALID_HEADERS: continue
+ if type( data ) is dict:
+ lines.append( self._buildLine( data, record ) )
+ else:
+ for fields in new_header[record]:
+ lines.append( self._buildLine( fields, record ) )
+
text = "\n".join(lines) + "\n"
if dest.text != NULL: free( dest.text )
dest.text = <char*>calloc( len(text), sizeof(char))
dest.l_text = len(text)
- strncpy( dest.text, text, dest.l_text )
+ cdef bytes btext = text.encode('ascii')
+ strncpy( dest.text, btext, dest.l_text )
+ cdef bytes bseqname
# collect targets
if "SQ" in new_header:
seqs = []
seqs.append( (fields["SN"], fields["LN"] ) )
except KeyError:
raise KeyError( "incomplete sequence information in '%s'" % str(fields))
-
+
dest.n_targets = len(seqs)
dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
-
+
for x from 0 <= x < dest.n_targets:
seqname, seqlen = seqs[x]
dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
- strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
+ bseqname = seqname.encode('ascii')
+ strncpy( dest.target_name[x], bseqname, len(seqname) + 1 )
dest.target_len[x] = seqlen
return dest
###############################################################
def __iter__(self):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
- return self
+ if not self.isbam and self.samfile.header.n_targets == 0:
+ raise NotImplementedError( "can not iterate over samfile without header")
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int ret
return samread(self.samfile, self.b)
- def __next__(self):
+ def __next__(self):
"""
python version of next().
"""
:class:`pysam.IteratorRowAllRefs`
iterate over all reads in all reference sequences.
-
+
The method :meth:`Samfile.fetch` returns an IteratorRow.
'''
pass
+
cdef class IteratorRowRegion(IteratorRow):
"""*(Samfile samfile, int tid, int beg, int end, int reopen = True )*
creates its own file handle by re-opening the
file.
- Note that the index will be shared between
+ Note that the index will be shared between
samfile and the iterator.
"""
-
- cdef bam_iter_t iter # iterator state object
- cdef bam1_t * b
- cdef int retval
- cdef Samfile samfile
- cdef samfile_t * fp
- # true if samfile belongs to this object
- cdef int owns_samfile
def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):
if not samfile._isOpen():
raise ValueError( "I/O operation on closed file" )
-
+
if not samfile._hasIndex():
raise ValueError( "no index available for iteration" )
-
+
# makes sure that samfile stays alive as long as the
# iterator is alive
self.samfile = samfile
- if samfile.isbam: mode = "rb"
- else: mode = "r"
+ if samfile.isbam: mode = b"rb"
+ else: mode = b"r"
# reopen the file - note that this makes the iterator
# slow and causes pileup to slow down significantly.
if reopen:
- store = StderrStore()
+ store = StderrStore()
self.fp = samopen( samfile._filename, mode, NULL )
store.release()
assert self.fp != NULL
self.retval = 0
- self.iter = bam_iter_query(self.samfile.index,
- tid,
- beg,
+ self.iter = bam_iter_query(self.samfile.index,
+ tid,
+ beg,
end)
self.b = bam_init1()
def __iter__(self):
- return self
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
'''cversion of iterator. Used by IteratorColumn'''
- self.retval = bam_iter_read( self.fp.x.bam,
- self.iter,
+ self.retval = bam_iter_read( self.fp.x.bam,
+ self.iter,
self.b)
-
- def __next__(self):
+
+ def __next__(self):
"""python version of next().
"""
self.cnext()
def __dealloc__(self):
bam_destroy1(self.b)
+ bam_iter_destroy( self.iter )
if self.owns_samfile: samclose( self.fp )
cdef class IteratorRowAll(IteratorRow):
to not re-open *samfile*.
"""
- cdef bam1_t * b
- cdef samfile_t * fp
- # true if samfile belongs to this object
- cdef int owns_samfile
-
def __cinit__(self, Samfile samfile, int reopen = True ):
if not samfile._isOpen():
raise ValueError( "I/O operation on closed file" )
- if samfile.isbam: mode = "rb"
- else: mode = "r"
+ if samfile.isbam: mode = b"rb"
+ else: mode = b"r"
# reopen the file to avoid iterator conflict
if reopen:
self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
def __iter__(self):
- return self
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
'''cversion of iterator. Used by IteratorColumn'''
- cdef int ret
return samread(self.fp, self.b)
- def __next__(self):
+ def __next__(self):
"""python version of next().
pyrex uses this non-standard name instead of next()
cdef class IteratorRowAllRefs(IteratorRow):
"""iterates over all mapped reads by chaining iterators over each reference
"""
- cdef Samfile samfile
- cdef int tid
- cdef IteratorRowRegion rowiter
def __cinit__(self, Samfile samfile):
assert samfile._isOpen()
iterate over reads in *samfile* at a given list of file positions.
"""
- cdef bam1_t * b
- cdef int current_pos
- cdef samfile_t * fp
- cdef positions
- # true if samfile belongs to this object
- cdef int owns_samfile
-
def __cinit__(self, Samfile samfile, positions, int reopen = True ):
if not samfile._isOpen():
raise ValueError( "I/O operation on closed file" )
assert samfile.isbam, "can only use this iterator on bam files"
- mode = "rb"
+ mode = b"rb"
# reopen the file to avoid iterator conflict
if reopen:
self.current_pos = 0
def __iter__(self):
- return self
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
# end iteration if out of positions
if self.current_pos >= len(self.positions): return -1
- bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
+ bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
self.current_pos += 1
return samread(self.fp, self.b)
- def __next__(self):
+ def __next__(self):
"""python version of next().
pyrex uses this non-standard name instead of next()
##-------------------------------------------------------------------
##-------------------------------------------------------------------
##-------------------------------------------------------------------
-ctypedef struct __iterdata:
- samfile_t * samfile
- bam_iter_t iter
- faidx_t * fastafile
- int tid
- char * seq
- int seq_len
-
cdef int __advance_all( void * data, bam1_t * b ):
'''advance without any read filtering.
'''
return bam_iter_read( d.samfile.x.bam, d.iter, b )
cdef int __advance_snpcalls( void * data, bam1_t * b ):
- '''advance using same filter and read processing as in
+ '''advance using same filter and read processing as in
the samtools pileup.
'''
cdef __iterdata * d
if d.fastafile != NULL and b.core.tid != d.tid:
if d.seq != NULL: free(d.seq)
d.tid = b.core.tid
- d.seq = faidx_fetch_seq(d.fastafile,
+ d.seq = faidx_fetch_seq(d.fastafile,
d.samfile.header.target_name[d.tid],
- 0, max_pos,
+ 0, max_pos,
&d.seq_len)
if d.seq == NULL:
raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
- (d.samfile.header.target_name[d.tid],
+ (d.samfile.header.target_name[d.tid],
d.tid))
'''abstract base class for iterators over columns.
IteratorColumn objects wrap the pileup functionality of samtools.
-
- For reasons of efficiency, the iterator points to the current
+
+ For reasons of efficiency, the iterator points to the current
pileup buffer. The pileup buffer is updated at every iteration.
This might cause some unexpected behavious. For example,
consider the conversion to a list::
-
+
f = Samfile("file.bam", "rb")
result = list( f.pileup() )
- Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
+ Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
but each object in ``result`` will contain the same information.
-
+
The desired behaviour can be achieved by list comprehension::
result = [ x.pileups() for x in f.pileup() ]
Optional kwargs to the iterator
stepper
- The stepper controlls how the iterator advances.
- Possible options for the stepper are
-
+ The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
all
use all reads for pileup.
samtools
same filter and read processing as in :term:`csamtools` pileup
+
+ The default is to use "all" if no stepper is given.
+
fastafile
A :class:`FastaFile` object
mask
Skip all reads with bits set in mask.
-
-
+ max_depth
+ maximum read depth. The default is 8000.
'''
- # result of the last plbuf_push
- cdef IteratorRowRegion iter
- cdef int tid
- cdef int pos
- cdef int n_plp
- cdef int mask
- cdef const_bam_pileup1_t_ptr plp
- cdef bam_plp_t pileup_iter
- cdef __iterdata iterdata
- cdef Samfile samfile
- cdef Fastafile fastafile
- cdef stepper
-
def __cinit__( self, Samfile samfile, **kwargs ):
self.samfile = samfile
self.mask = kwargs.get("mask", BAM_DEF_MASK )
self.fastafile = kwargs.get( "fastafile", None )
self.stepper = kwargs.get( "stepper", None )
+ self.max_depth = kwargs.get( "max_depth", 8000 )
self.iterdata.seq = NULL
self.tid = 0
self.pos = 0
self.plp = NULL
self.pileup_iter = <bam_plp_t>NULL
+
def __iter__(self):
- return self
+ return self
cdef int cnext(self):
'''perform next iteration.
This method is analogous to the samtools bam_plp_auto method.
It has been re-implemented to permit for filtering.
'''
- self.plp = bam_plp_auto( self.pileup_iter,
+ self.plp = bam_plp_auto( self.pileup_iter,
&self.tid,
&self.pos,
&self.n_plp )
self.mask = mask
bam_plp_set_mask( self.pileup_iter, self.mask )
- cdef setupIteratorData( self,
- int tid,
- int start,
- int end,
+ cdef setupIteratorData( self,
+ int tid,
+ int start,
+ int end,
int reopen = 0 ):
'''setup the iterator structure'''
if self.stepper == None or self.stepper == "all":
self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
- elif self.stepper == "samtools":
+ elif self.stepper == "samtools":
self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
else:
raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)
+ if self.max_depth:
+ bam_plp_set_maxcnt( self.pileup_iter, self.max_depth )
+
bam_plp_set_mask( self.pileup_iter, self.mask )
cdef reset( self, tid, start, end ):
'''
self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
self.iterdata.iter = self.iter.iter
-
+
# invalidate sequence if different tid
if self.tid != tid:
if self.iterdata.seq != NULL: free( self.iterdata.seq )
- self.iterdata.seq = NULL
+ self.iterdata.seq = NULL
self.iterdata.tid = -1
-
+
# self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
bam_plp_reset(self.pileup_iter)
bam_plp_destroy(self.pileup_iter)
self.pileup_iter = <bam_plp_t>NULL
- if self.iterdata.seq != NULL:
+ if self.iterdata.seq != NULL:
free(self.iterdata.seq)
self.iterdata.seq = NULL
cdef class IteratorColumnRegion(IteratorColumn):
'''iterates over a region only.
'''
- def __cinit__(self, Samfile samfile,
- int tid = 0,
- int start = 0,
+ def __cinit__(self, Samfile samfile,
+ int tid = 0,
+ int start = 0,
int end = max_pos,
+ int truncate = False,
**kwargs ):
# initialize iterator
self.setupIteratorData( tid, start, end, 1 )
+ self.start = start
+ self.end = end
+ self.truncate = truncate
- def __next__(self):
+ def __next__(self):
"""python version of next().
"""
self.cnext()
if self.n_plp < 0:
raise ValueError("error during iteration" )
-
+
if self.plp == NULL:
raise StopIteration
- return makePileupProxy( <bam_pileup1_t*>self.plp,
- self.tid,
- self.pos,
+ if self.truncate:
+ if self.start < self.pos: continue
+ if self.pos >= self.end: raise StopIteration
+
+ return makePileupProxy( &self.plp,
+ self.tid,
+ self.pos,
self.n_plp )
cdef class IteratorColumnAllRefs(IteratorColumn):
"""iterates over all columns by chaining iterators over each reference
"""
- def __cinit__(self,
+ def __cinit__(self,
Samfile samfile,
**kwargs ):
def __next__(self):
"""python version of next().
"""
-
+
while 1:
self.cnext()
if self.n_plp < 0:
raise ValueError("error during iteration" )
-
+
# return result, if within same reference
if self.plp != NULL:
- return makePileupProxy( <bam_pileup1_t*>self.plp,
- self.tid,
- self.pos,
+ return makePileupProxy( &self.plp,
+ self.tid,
+ self.pos,
self.n_plp )
# otherwise, proceed to next reference or stop
if not src.core.l_qseq:
return None
- seq = PyString_FromStringAndSize(NULL, end-start)
- s = PyString_AS_STRING(seq)
+ seq = PyBytes_FromStringAndSize(NULL, end - start)
+ s = <char*>seq
p = bam1_seq(src)
for k from start <= k < end:
if p[0] == 0xff:
return None
- qual = PyString_FromStringAndSize(NULL, end-start)
- q = PyString_AS_STRING(qual)
+ qual = PyBytes_FromStringAndSize(NULL, end - start)
+ q = <char*>qual
for k from start <= k < end:
## equivalent to t[i] + 33 (see bam.c)
cdef class AlignedRead:
'''
- Class representing an aligned read. see SAM format specification for
+ Class representing an aligned read. see SAM format specification for
the meaning of fields (http://samtools.sourceforge.net/).
This class stores a handle to the samtools C-structure representing
One issue to look out for is that the sequence should always
be set *before* the quality scores. Setting the sequence will
also erase any quality scores that were set previously.
+
+ In Python 3, the fields containing sequence and quality
+ (seq, query, qual and qqual) data are of type bytes. Other
+ string data, such as the qname field and strings in the
+ tags tuple, is represented as unicode strings. On assignment,
+ both bytes and unicode objects are allowed, but unicode strings
+ must contain only ASCII characters.
'''
# Now only called when instances are created from Python
def __init__(self):
# see bam_init1
self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
- # allocate some memory
+ # allocate some memory
# If size is 0, calloc does not return a pointer that can be passed to free()
# so allocate 40 bytes for a new read
self._delegate.m_data = 40
def __dealloc__(self):
bam_destroy1(self._delegate)
-
+
def __str__(self):
"""return string representation of alignment.
"""
# sam-parsing is done in sam.c/bam_format1_core which
# requires a valid header.
+ if sys.version_info[0] < 3:
+ seq = self.seq
+ qual = self.qual
+ else:
+ seq = self.seq.decode('ascii')
+ qual = self.qual.decode('ascii')
return "\t".join(map(str, (self.qname,
self.flag,
self.rname,
self.mrnm,
self.mpos,
self.rlen,
- self.seq,
- self.qual,
+ seq,
+ qual,
self.tags )))
-
+
def compare(self, AlignedRead other):
'''return -1,0,1, if contents in this are binary <,=,> to *other*'''
retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
if retval: return retval
- retval = cmp(t.data_len, o.data_len)
+ retval = (t.data_len > o.data_len) - (t.data_len < o.data_len) # cmp(t.data_len, o.data_len)
if retval: return retval
return memcmp(t.data, o.data, t.data_len)
property qname:
"""the query name (None if not present)"""
def __get__(self):
- cdef bam1_t * src
+ cdef bam1_t * src
src = self._delegate
if src.core.l_qname == 0: return None
- return <char *>bam1_qname( src )
+ return _charptr_to_str(<char *>bam1_qname( src ))
def __set__(self, qname ):
if qname == None or len(qname) == 0: return
- cdef bam1_t * src
- cdef int l
+ qname = _force_bytes(qname)
+ cdef bam1_t * src
+ cdef int l
cdef char * p
- src = self._delegate
+ src = self._delegate
p = bam1_qname( src )
# the qname is \0 terminated
l = len(qname) + 1
- pysam_bam_update( src,
- src.core.l_qname,
- l,
+ pysam_bam_update( src,
+ src.core.l_qname,
+ l,
<uint8_t*>p )
src.core.l_qname = l
p = bam1_qname(src)
strncpy( p, qname, l )
-
+
property cigar:
- """the :term:`cigar` alignment (None if not present).
+ """the :term:`cigar` alignment (None if not present). The alignment
+ is returned as a list of operations. The operations are:
+
+ +-----+--------------+-----+
+ |M |BAM_CMATCH |0 |
+ +-----+--------------+-----+
+ |I |BAM_CINS |1 |
+ +-----+--------------+-----+
+ |D |BAM_CDEL |2 |
+ +-----+--------------+-----+
+ |N |BAM_CREF_SKIP |3 |
+ +-----+--------------+-----+
+ |S |BAM_CSOFT_CLIP|4 |
+ +-----+--------------+-----+
+ |H |BAM_CHARD_CLIP|5 |
+ +-----+--------------+-----+
+ |P |BAM_CPAD |6 |
+ +-----+--------------+-----+
+ |= |BAM_CEQUAL |7 |
+ +-----+--------------+-----+
+ |X |BAM_CDIFF |8 |
+ +-----+--------------+-----+
+
"""
def __get__(self):
cdef uint32_t * cigar_p
- cdef bam1_t * src
+ cdef bam1_t * src
cdef op, l, cigar
cdef int k
src = self._delegate
if src.core.n_cigar == 0: return None
-
+
cigar = []
cigar_p = bam1_cigar(src);
for k from 0 <= k < src.core.n_cigar:
def __set__(self, values ):
if values == None or len(values) == 0: return
cdef uint32_t * p
- cdef bam1_t * src
+ cdef bam1_t * src
cdef op, l
cdef int k
p = bam1_cigar(src)
# create space for cigar data within src.data
- pysam_bam_update( src,
+ pysam_bam_update( src,
src.core.n_cigar * 4,
- len(values) * 4,
+ len(values) * 4,
<uint8_t*>p )
-
+
# length is number of cigar operations, not bytes
src.core.n_cigar = len(values)
## setting the cigar string also updates the "bin" attribute
src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
+ property cigarstring:
+ '''the :term:`cigar` alignment as a string.
+
+ Returns the empty string if not present.
+ '''
+ def __get__(self):
+ c = self.cigar
+ if c == None: return ""
+ else: return "".join([ "%c%i" % (CODE2CIGAR[x],y) for x,y in c])
+
+ def __set__(self, cigar):
+ if cigar == None or len(cigar) == 0: self.cigar = []
+ parts = CIGAR_REGEX.findall( cigar )
+ self.cigar = [ (CIGAR2CODE[ord(x)], int(y)) for x,y in parts ]
+
property seq:
- """read sequence bases, including :term:`soft clipped` bases (None if not present)"""
+ """read sequence bases, including :term:`soft clipped` bases (None if not present).
+
+ In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
def __get__(self):
cdef bam1_t * src
cdef char * s
def __set__(self,seq):
# samtools manages sequence and quality length memory together
# if no quality information is present, the first byte says 0xff.
-
+
if seq == None or len(seq) == 0: return
+ seq = _force_bytes(seq)
cdef bam1_t * src
- cdef uint8_t * p
+ cdef uint8_t * p
cdef char * s
cdef int l, k, nbytes_new, nbytes_old
src = self._delegate
l = len(seq)
-
+
# as the sequence is stored in half-bytes, the total length (sequence
# plus quality scores) is (l+1)/2 + l
nbytes_new = (l+1)/2 + l
p = bam1_seq( src )
src.core.l_qseq = l
- pysam_bam_update( src,
+ pysam_bam_update( src,
nbytes_old,
nbytes_new,
p)
property qual:
- """read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
+ """read sequence base qualities, including :term:`soft clipped` bases (None if not present).
+
+ In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
def __get__(self):
cdef bam1_t * src
# note that space is already allocated via the sequences
cdef bam1_t * src
cdef uint8_t * p
- cdef char * q
+ cdef char * q
cdef int k
src = self._delegate
# if absent - set to 0xff
p[0] = 0xff
return
+ qual = _force_bytes(qual)
cdef int l
# convert to C string
q = qual
p[k] = <uint8_t>q[k] - 33
property query:
- """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present)
+ """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present).
+
+ In Python 3, this property is of type bytes. Assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient.
SAM/BAM files may included extra flanking bases sequences that were
not part of the alignment. These bases may be the result of the
return get_seq_range(src, start, end)
property qqual:
- """aligned query sequence quality values (None if not present)"""
+ """aligned query sequence quality values (None if not present). This property is read-only.
+
+ In Python 3, this property is of type bytes."""
def __get__(self):
cdef bam1_t * src
cdef uint32_t start, end
- cdef char * q
src = self._delegate
property tags:
"""the tags in the AUX field.
- This property permits convenience access to
+ This property permits convenience access to
the tags. Changes it the returned list will
not update the tags automatically. Instead,
- the following is required for adding a
+ the following is required for adding a
new tag::
read.tags = read.tags + [("RG",0)]
+
+ This method will happily write the same tag
+ multiple times.
"""
def __get__(self):
cdef char * ctag
cdef uint8_t * s
cdef char auxtag[3]
cdef char auxtype
-
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
src = self._delegate
- if src.l_aux == 0: return None
-
+ if src.l_aux == 0: return []
s = bam1_aux( src )
result = []
auxtag[2] = 0
auxtag[1] = s[1]
s += 2
auxtype = s[0]
-
if auxtype in ('c', 'C'):
- value = <int>bam_aux2i(s)
+ value = <int>bam_aux2i(s)
s += 1
elif auxtype in ('s', 'S'):
- value = <int>bam_aux2i(s)
+ value = <int>bam_aux2i(s)
s += 2
elif auxtype in ('i', 'I'):
- value = <float>bam_aux2i(s)
+ value = <int32_t>bam_aux2i(s)
s += 4
elif auxtype == 'f':
value = <float>bam_aux2f(s)
value = "%c" % <char>bam_aux2A(s)
s += 1
elif auxtype in ('Z', 'H'):
- value = <char*>bam_aux2Z(s)
+ value = _charptr_to_str(<char*>bam_aux2Z(s))
# +1 for NULL terminated string
s += len(value) + 1
- #
+ elif auxtype == 'B':
+ s += 1
+ byte_size, nvalues, value = convertBinaryTagToList( s )
+ # 5 for 1 char and 1 int
+ s += 5 + ( nvalues * byte_size) - 1
+
s += 1
-
- result.append( (auxtag, value) )
+
+ result.append( (_charptr_to_str(auxtag), value) )
return result
def __set__(self, tags):
- cdef char * ctag
cdef bam1_t * src
cdef uint8_t * s
cdef uint8_t * new_data
- cdef char * temp
- cdef int guessed_size, control_size
- cdef int max_size, size, offset
+ cdef char * temp
src = self._delegate
- max_size = 4000
- offset = 0
- if tags != None:
+ fmts, args = ["<"], []
+
+ if tags != None:
# map samtools code to python.struct code and byte size
- buffer = ctypes.create_string_buffer(max_size)
-
for pytag, value in tags:
+ if not type(pytag) is bytes:
+ pytag = pytag.encode('ascii')
t = type(value)
- if t == types.FloatType:
- fmt, pytype = "<cccf", 'f'
- elif t == types.IntType:
+
+ if t is tuple or t is list:
+ # binary tags - treat separately
+ pytype = 'B'
+ # get data type - first value determines type
+ if type(value[0]) is float:
+ datafmt, datatype = "f", "f"
+ else:
+ mi, ma = min(value), max(value)
+ absmax = max( abs(mi), abs(ma) )
+ # signed ints
+ if mi < 0:
+ if mi >= -127: datafmt, datatype = "b", 'c'
+ elif mi >= -32767: datafmt, datatype = "h", 's'
+ elif absmax < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+ else: datafmt, datatype = "i", 'i'
+
+ # unsigned ints
+ else:
+ if absmax <= 255: datafmt, datatype = "B", 'C'
+ elif absmax <= 65535: datafmt, datatype = "H", 'S'
+ elif absmax > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+ else: datafmt, datatype = "I", 'I'
+
+ datafmt = "2sccI%i%s" % (len(value), datafmt)
+ args.extend( [pytag[:2],
+ pytype.encode('ascii'),
+ datatype.encode('ascii'),
+ len(value)] + list(value) )
+ fmts.append( datafmt )
+ continue
+
+ if t is float:
+ fmt, pytype = "2scf", 'f'
+ elif t is int:
+ # negative values
if value < 0:
- if value >= -127: fmt, pytype = "<cccb", 'c'
- elif value >= -32767: fmt, pytype = "<ccch", 's'
+ if value >= -127: fmt, pytype = "2scb", 'c'
+ elif value >= -32767: fmt, pytype = "2sch", 's'
elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, pytype = "<ccci", 'i'[0]
+ else: fmt, pytype = "2sci", 'i'
+ # positive values
else:
- if value <= 255: fmt, pytype = "<cccB", 'C'
- elif value <= 65535: fmt, pytype = "<cccH", 'S'
+ if value <= 255: fmt, pytype = "2scB", 'C'
+ elif value <= 65535: fmt, pytype = "2scH", 'S'
elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, pytype = "<cccI", 'I'
+ else: fmt, pytype = "2scI", 'I'
else:
# Note: hex strings (H) are not supported yet
+ if t is not bytes:
+ value = value.encode('ascii')
if len(value) == 1:
- fmt, pytype = "<cccc", 'A'
+ fmt, pytype = "2scc", 'A'
else:
- fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
-
- size = struct.calcsize(fmt)
- if offset + size > max_size:
- raise NotImplementedError("tags field too large")
-
- struct.pack_into( fmt,
- buffer,
- offset,
- pytag[0],
- pytag[1],
- pytype,
- value )
- offset += size
-
- # delete the old data and allocate new
- # if offset == 0, the aux field will be
+ fmt, pytype = "2sc%is" % (len(value)+1), 'Z'
+
+ args.extend( [pytag[:2],
+ pytype.encode('ascii'),
+ value ] )
+
+ fmts.append( fmt )
+
+ fmt = "".join(fmts)
+ total_size = struct.calcsize(fmt)
+ buffer = ctypes.create_string_buffer(total_size)
+ struct.pack_into( fmt,
+ buffer,
+ 0,
+ *args )
+
+ # delete the old data and allocate new space.
+ # If total_size == 0, the aux field will be
# empty
- pysam_bam_update( src,
+ pysam_bam_update( src,
src.l_aux,
- offset,
+ total_size,
bam1_aux( src ) )
-
- src.l_aux = offset
- # copy data only if there is any
- if offset != 0:
+ src.l_aux = total_size
+ # copy data only if there is any
+ if total_size != 0:
+
# get location of new data
- s = bam1_aux( src )
-
+ s = bam1_aux( src )
+
# check if there is direct path from buffer.raw to tmp
temp = buffer.raw
- memcpy( s, temp, offset )
+ memcpy( s, temp, total_size )
- property flag:
+ property flag:
"""properties flag"""
def __get__(self): return self._delegate.core.flag
def __set__(self, flag): self._delegate.core.flag = flag
- property rname:
+ property rname:
"""
:term:`target` ID
.. note::
- This field contains the index of the reference sequence
+ This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
-
+
"""
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid
- property tid:
+ property tid:
"""
:term:`target` ID
.. note::
- This field contains the index of the reference sequence
+ This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
-
+
"""
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid
- property pos:
+ property pos:
"""0-based leftmost coordinate"""
def __get__(self): return self._delegate.core.pos
- def __set__(self, pos):
+ def __set__(self, pos):
## setting the cigar string also updates the "bin" attribute
cdef bam1_t * src
src = self._delegate
else:
src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
self._delegate.core.pos = pos
- property bin:
+ property bin:
"""properties bin"""
def __get__(self): return self._delegate.core.bin
def __set__(self, bin): self._delegate.core.bin = bin
'''length of the read (read only). Returns 0 if not given.'''
def __get__(self): return self._delegate.core.l_qseq
property aend:
- '''aligned end position of the read (read only). Returns
- None if not available.'''
+ '''aligned reference position of the read on the reference genome.
+
+ aend points to one past the last aligned residue.
+ Returns None if not available.'''
def __get__(self):
cdef bam1_t * src
src = self._delegate
if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
return None
return bam_calend(&src.core, bam1_cigar(src))
+
property alen:
- '''aligned length of the read (read only). Returns None if
+ '''aligned length of the read on the reference genome. Returns None if
not available.'''
def __get__(self):
cdef bam1_t * src
src = self._delegate
if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
return None
- return bam_calend(&src.core,
+ return bam_calend(&src.core,
bam1_cigar(src)) - \
self._delegate.core.pos
- property mapq:
+ property mapq:
"""mapping quality"""
def __get__(self): return self._delegate.core.qual
def __set__(self, qual): self._delegate.core.qual = qual
+
property mrnm:
- """the :term:`reference` id of the mate """
+ """the :term:`reference` id of the mate
+ deprecated, use RNEXT instead.
+ """
+ def __get__(self): return self._delegate.core.mtid
+ def __set__(self, mtid): self._delegate.core.mtid = mtid
+ property rnext:
+ """the :term:`reference` id of the mate """
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid
- property mpos:
+ property mpos:
+ """the position of the mate
+ deprecated, use PNEXT instead."""
+ def __get__(self): return self._delegate.core.mpos
+ def __set__(self, mpos): self._delegate.core.mpos = mpos
+ property pnext:
"""the position of the mate"""
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos
- property isize:
+ property isize:
+ """the insert size
+ deprecated, use TLEN instead."""
+ def __get__(self): return self._delegate.core.isize
+ def __set__(self, isize): self._delegate.core.isize = isize
+ property tlen:
"""the insert size"""
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize
- property is_paired:
+ property is_paired:
"""true if read is paired in sequencing"""
def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FPAIRED
else: self._delegate.core.flag &= ~BAM_FPAIRED
property is_proper_pair:
"""true if read is mapped in a proper pair"""
def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
property is_unmapped:
"""true if read itself is unmapped"""
def __get__(self): return (self.flag & BAM_FUNMAP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FUNMAP
else: self._delegate.core.flag &= ~BAM_FUNMAP
- property mate_is_unmapped:
- """true if the mate is unmapped"""
+ property mate_is_unmapped:
+ """true if the mate is unmapped"""
def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FMUNMAP
else: self._delegate.core.flag &= ~BAM_FMUNMAP
property is_reverse:
"""true if read is mapped to reverse strand"""
def __get__(self): return (self.flag & BAM_FREVERSE) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREVERSE
else: self._delegate.core.flag &= ~BAM_FREVERSE
property mate_is_reverse:
"""true is read is mapped to reverse strand"""
def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FMREVERSE
else: self._delegate.core.flag &= ~BAM_FMREVERSE
- property is_read1:
+ property is_read1:
"""true if this is read1"""
def __get__(self): return (self.flag & BAM_FREAD1) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREAD1
else: self._delegate.core.flag &= ~BAM_FREAD1
property is_read2:
"""true if this is read2"""
def __get__(self): return (self.flag & BAM_FREAD2) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREAD2
else: self._delegate.core.flag &= ~BAM_FREAD2
property is_secondary:
"""true if not primary alignment"""
def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FSECONDARY
else: self._delegate.core.flag &= ~BAM_FSECONDARY
property is_qcfail:
"""true if QC failure"""
def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FQCFAIL
else: self._delegate.core.flag &= ~BAM_FQCFAIL
property is_duplicate:
"""true if optical or PCR duplicate"""
def __get__(self): return (self.flag & BAM_FDUP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FDUP
else: self._delegate.core.flag &= ~BAM_FDUP
property positions:
cdef uint32_t k, i, pos
cdef int op
cdef uint32_t * cigar_p
- cdef bam1_t * src
+ cdef bam1_t * src
- result = []
src = self._delegate
if src.core.n_cigar == 0: return []
+ result = []
pos = src.core.pos
-
cigar_p = bam1_cigar(src)
+
for k from 0 <= k < src.core.n_cigar:
op = cigar_p[k] & BAM_CIGAR_MASK
l = cigar_p[k] >> BAM_CIGAR_SHIFT
pos += l
return result
+
+ property aligned_pairs:
+ """a list of aligned read and reference positions.
+
+ Unaligned positions are marked by None.
+ """
+ def __get__(self):
+ cdef uint32_t k, i, pos, qpos
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+ if src.core.n_cigar == 0: return []
+
+ result = []
+ pos = src.core.pos
+ qpos = 0
+ cigar_p = bam1_cigar(src)
+
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH:
+ for i from pos <= i < pos + l:
+ result.append( (qpos, i) )
+ qpos += 1
+ pos += l
+
+ elif op == BAM_CINS:
+ for i from pos <= i < pos + l:
+ result.append( (qpos, None) )
+ qpos += 1
+
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ for i from pos <= i < pos + l:
+ result.append( (None, i) )
+ pos += l
+
+ return result
+
+
def overlap( self, uint32_t start, uint32_t end ):
- """return number of bases on reference overlapping *start* and *end*
+ """return number of aligned bases of read overlapping the interval *start* and *end*
+ on the reference sequence.
"""
cdef uint32_t k, i, pos, overlap
cdef int op, o
cdef uint32_t * cigar_p
- cdef bam1_t * src
+ cdef bam1_t * src
overlap = 0
def opt(self, tag):
"""retrieves optional data given a two-letter *tag*"""
- #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
+ #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
cdef uint8_t * v
- v = bam_aux_get(self._delegate, tag)
+ cdef int nvalues
+ btag = _force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
if v == NULL: raise KeyError( "tag '%s' not present" % tag )
- type = chr(v[0])
- if type == 'c' or type == 'C' or type == 's' or type == 'S':
- return <int>bam_aux2i(v)
- elif type == 'i' or type == 'I':
- return <int32_t>bam_aux2i(v)
- elif type == 'f' or type == 'F':
+ auxtype = chr(v[0])
+ if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
+ return <int>bam_aux2i(v)
+ elif auxtype == 'i' or auxtype == 'I':
+ return <int32_t>bam_aux2i(v)
+ elif auxtype == 'f' or auxtype == 'F':
return <float>bam_aux2f(v)
- elif type == 'd' or type == 'D':
+ elif auxtype == 'd' or auxtype == 'D':
return <double>bam_aux2d(v)
- elif type == 'A':
+ elif auxtype == 'A':
# there might a more efficient way
# to convert a char into a string
return '%c' % <char>bam_aux2A(v)
- elif type == 'Z':
- return <char*>bam_aux2Z(v)
-
+ elif auxtype == 'Z':
+ return _charptr_to_str(<char*>bam_aux2Z(v))
+ elif auxtype == 'B':
+ bytesize, nvalues, values = convertBinaryTagToList( v + 1 )
+ return values
+ else:
+ raise ValueError("unknown auxilliary type '%s'" % auxtype)
+
+
def fancy_str (self):
"""returns list of fieldnames/values in pretty format for debugging
"""
"m_data": "Maximum data length",
"data_len": "Current data length",
}
- fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
- "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
+ fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
+ "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
"l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
-
+
for f in fields_names_in_order:
if not f in self.__dict__:
continue
pos
the target base coordinate (0-based)
n
- number of reads mapping to this column
+ number of reads mapping to this column
pileups
list of reads (:class:`pysam.PileupRead`) aligned to this column
If the underlying engine iterator advances, the results of this column
will change.
'''
- cdef bam_pileup1_t * plp
- cdef int tid
- cdef int pos
- cdef int n_pu
-
def __init__(self):
raise TypeError("This class cannot be instantiated from Python")
def __get__(self):
cdef int x
pileups = []
+
+ if self.plp[0] == NULL:
+ raise ValueError("PileupProxy accessed after iterator finished")
+
# warning: there could be problems if self.n and self.buf are
# out of sync.
for x from 0 <= x < self.n_pu:
- pileups.append( makePileupRead( &(self.plp[x])) )
+ pileups.append( makePileupRead( &(self.plp[0][x])) )
return pileups
cdef class PileupRead:
'''A read aligned to a column.
'''
- cdef:
- AlignedRead _alignment
- int32_t _qpos
- int _indel
- int _level
- uint32_t _is_del
- uint32_t _is_head
- uint32_t _is_tail
-
def __init__(self):
raise TypeError("This class cannot be instantiated from Python")
def __str__(self):
return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
-
+
property alignment:
"""a :class:`pysam.AlignedRead` object of the aligned read"""
def __get__(self):
sys.stderr.flush() # Buffered data goes to old stream.
os.dup2(fd, self.id) # Open unit 1 on new stream.
os.close(fd) # Close other unit (look out, caller.)
-
+
def restore(self):
'''restore previous output stream'''
if self.streams:
os.close(self.streams[-1])
del self.streams[-1]
-def _samtools_dispatch( method,
+def _samtools_dispatch( method,
args = (),
- catch_stdout = True,
- catch_stderr = False,
- ):
+ catch_stdout = True ):
'''call ``method`` in samtools providing arguments in args.
.. note::
- This method redirects stdout and stderr to capture it
- from samtools. If for some reason stdout/stderr disappears
+ This method redirects stdout to capture it
+ from samtools. If for some reason stdout disappears
the reason might be in this method.
.. note::
The current implementation might only work on linux.
-
- .. note::
- This method captures stdout and stderr using temporary files,
+
+ .. note::
+ This method captures stdout and stderr using temporary files,
which are then read into memory in their entirety. This method
- is slow and might cause large memory overhead.
+ is slow and might cause large memory overhead.
See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
on the topic of redirecting stderr/stdout.
'''
# note that debugging this module can be a problem
- # as stdout/stderr will not appear
-
+ # as stdout/stderr will not appear on the terminal
+
# some special cases
if method == "index":
if not os.path.exists( args[0] ):
raise IOError( "No such file or directory: '%s'" % args[0] )
# redirect stderr and stdout to file
- if catch_stderr:
- stderr_h, stderr_f = tempfile.mkstemp()
- stderr_save = Outs( sys.stderr.fileno() )
- stderr_save.setfd( stderr_h )
-
+ stderr_h, stderr_f = tempfile.mkstemp()
+ pysam_set_stderr( stderr_h )
+
if catch_stdout:
stdout_h, stdout_f = tempfile.mkstemp()
- stdout_save = Outs( sys.stdout.fileno() )
- stdout_save.setfd( stdout_h )
+ try:
+ stdout_save = Outs( sys.stdout.fileno() )
+ stdout_save.setfd( stdout_h )
+ except AttributeError:
+ # stdout has already been redirected
+ catch_stdout = False
# patch for `samtools view`
# samtools `view` closes stdout, from which I can not
if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
args = ( "-o", stdout_f ) + args
-
# do the function call to samtools
cdef char ** cargs
cdef int i, n, retval
n = len(args)
+ method = _force_cmdline_bytes(method)
+ args = [ _force_cmdline_bytes(a) for a in args ]
+
# allocate two more for first (dummy) argument (contains command)
cargs = <char**>calloc( n+2, sizeof( char *) )
cargs[0] = "samtools"
cargs[1] = method
for i from 0 <= i < n: cargs[i+2] = args[i]
+
retval = pysam_dispatch(n+2, cargs)
free( cargs )
-
+
# restore stdout/stderr. This will also flush, so
# needs to be before reading back the file contents
if catch_stdout:
stdout_save.restore()
- out_stdout = open( stdout_f, "r").readlines()
+ try:
+ with open( stdout_f, "r") as inf:
+ out_stdout = inf.readlines()
+ except UnicodeDecodeError:
+ with open( stdout_f, "rb") as inf:
+ # read binary output
+ out_stdout = inf.read()
os.remove( stdout_f )
else:
out_stdout = []
- if catch_stderr:
- stderr_save.restore()
- out_stderr = open( stderr_f, "r").readlines()
+ # get error messages
+ pysam_unset_stderr()
+ try:
+ with open( stderr_f, "r") as inf:
+ out_stderr = inf.readlines()
+ except UnicodeDecodeError:
+ with open( stderr_f, "rb") as inf:
+ # read binary output
+ out_stderr = inf.read()
os.remove( stderr_f )
else:
out_stderr = []
-
+
return retval, out_stderr, out_stdout
cdef class SNPCall:
property tid:
'''the chromosome ID as is defined in the header'''
- def __get__(self):
+ def __get__(self):
return self._tid
-
+
property pos:
'''nucleotide position of SNP.'''
def __get__(self): return self._pos
property reference_base:
'''reference base at pos. ``N`` if no reference sequence supplied.'''
- def __get__(self): return PyString_FromStringAndSize( &self._reference_base, 1 )
+ def __get__(self): return from_string_and_size( &self._reference_base, 1 )
property genotype:
'''the genotype called.'''
- def __get__(self): return PyString_FromStringAndSize( &self._genotype, 1 )
+ def __get__(self): return from_string_and_size( &self._genotype, 1 )
property consensus_quality:
'''the genotype quality (Phred-scaled).'''
self.coverage ) ) )
-cdef class SNPCallerBase:
- '''Base class for SNP callers.
+# cdef class SNPCallerBase:
+# '''Base class for SNP callers.
- *min_baseQ*
- minimum base quality (possibly capped by BAQ)
- *capQ_threshold*
- coefficient for adjusting mapQ of poor mappings
- *theta*
- theta in maq consensus calling model
- *n_haplotypes*
- number of haplotypes in the sample
- *het_rate*
- prior of a difference between two haplotypes
- '''
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
- cdef bam_maqcns_t * c
- cdef IteratorColumn iter
+# cdef bam_maqcns_t * c
+# cdef IteratorColumn iter
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- self.iter = iterator_column
- self.c = bam_maqcns_init()
+# self.iter = iterator_column
+# self.c = bam_maqcns_init()
- # set the default parameterization according to
- # samtools
+# # set the default parameterization according to
+# # samtools
- # new default mode for samtools >0.1.10
- self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
+# # new default mode for samtools >0.1.10
+# self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
- self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
- # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
- self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
- self.c.het_rate = kwargs.get( "het_rate", 0.001 )
- self.c.theta = kwargs.get( "theta", 0.83 )
+# self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
+# # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
+# self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
+# self.c.het_rate = kwargs.get( "het_rate", 0.001 )
+# self.c.theta = kwargs.get( "theta", 0.83 )
- if self.c.errmod != BAM_ERRMOD_MAQ2:
- self.c.theta += 0.02
+# if self.c.errmod != BAM_ERRMOD_MAQ2:
+# self.c.theta += 0.02
- # call prepare AFTER setting parameters
- bam_maqcns_prepare( self.c )
+# # call prepare AFTER setting parameters
+# bam_maqcns_prepare( self.c )
- def __dealloc__(self):
- bam_maqcns_destroy( self.c )
-
- cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
- '''debugging output.'''
-
- pysam_dump_glf( g, self.c );
- print ""
- for x in range(self.iter.n_plp):
- print "--> read %i %s %i" % (x,
- bam1_qname(self.iter.plp[x].b),
- self.iter.plp[x].qpos,
- )
-
- print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
- % (self.iter.pos,
- cns,
- self.c.q_r,
- self.iter.n_plp,
- self.iter.n_plp,
- rb,
- cns >> 8 & 0xff,
- cns >> 16 & 0xff,
- cns & 0xff,
- cns >> 28,
- )
-
- printf("-------------------------------------\n");
- sys.stdout.flush()
-
-cdef class IteratorSNPCalls( SNPCallerBase ):
- """*(IteratorColumn iterator)*
-
- call SNPs within a region.
-
- *iterator* is a pileup iterator. SNPs will be called
- on all positions returned by this iterator.
-
- This caller is fast if SNPs are called over large continuous
- regions. It is slow, if instantiated frequently and in random
- order as the sequence will have to be reloaded.
+# def __dealloc__(self):
+# bam_maqcns_destroy( self.c )
- """
-
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+ # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
+ # '''debugging output.'''
- assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
+ # pysam_dump_glf( g, self.c );
+ # print ""
+ # for x in range(self.iter.n_plp):
+ # print "--> read %i %s %i" % (x,
+ # bam1_qname(self.iter.plp[x].b),
+ # self.iter.plp[x].qpos,
+ # )
- def __iter__(self):
- return self
+ # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
+ # % (self.iter.pos,
+ # cns,
+ # self.c.q_r,
+ # self.iter.n_plp,
+ # self.iter.n_plp,
+ # rb,
+ # cns >> 8 & 0xff,
+ # cns >> 16 & 0xff,
+ # cns & 0xff,
+ # cns >> 28,
+ # )
- def __next__(self):
- """python version of next().
- """
+ # printf("-------------------------------------\n");
+ # sys.stdout.flush()
- # the following code was adapted from bam_plcmd.c:pileup_func()
- self.iter.cnext()
+# cdef class IteratorSNPCalls( SNPCallerBase ):
+# """*(IteratorColumn iterator)*
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# call SNPs within a region.
- if self.iter.plp == NULL:
- raise StopIteration
+# *iterator* is a pileup iterator. SNPs will be called
+# on all positions returned by this iterator.
- cdef char * seq = self.iter.getSequence()
- cdef int seq_len = self.iter.seq_len
+# This caller is fast if SNPs are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
- assert seq != NULL
+# """
- # reference base
- if self.iter.pos >= seq_len:
- raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- cdef int rb = seq[self.iter.pos]
- cdef uint32_t cns
- cdef glf1_t * g
+# assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
- g = bam_maqcns_glfgen( self.iter.n_plp,
- self.iter.plp,
- bam_nt16_table[rb],
- self.c )
+# def __iter__(self):
+# return self
- if pysam_glf_depth( g ) == 0:
- cns = 0xfu << 28 | 0xf << 24
- else:
- cns = glf2cns(g, <int>(self.c.q_r + .499))
-
- free(g)
-
- cdef SNPCall call
+# def __next__(self):
+# """python version of next().
+# """
- call = SNPCall()
- call._tid = self.iter.tid
- call._pos = self.iter.pos
- call._reference_base = rb
- call._genotype = bam_nt16_rev_table[cns>>28]
- call._consensus_quality = cns >> 8 & 0xff
- call._snp_quality = cns & 0xff
- call._rms_mapping_quality = cns >> 16&0xff
- call._coverage = self.iter.n_plp
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
- return call
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
-cdef class SNPCaller( SNPCallerBase ):
- '''*(IteratorColumn iterator_column )*
+# if self.iter.plp == NULL:
+# raise StopIteration
- The samtools SNP caller.
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
- This object will call SNPs in *samfile* against the reference
- sequence in *fasta*.
+# assert seq != NULL
- This caller is fast for calling few SNPs in selected regions.
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
- It is slow, if called over large genomic regions.
- '''
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# cdef glf1_t * g
+# g = bam_maqcns_glfgen( self.iter.n_plp,
+# self.iter.plp,
+# bam_nt16_table[rb],
+# self.c )
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# if pysam_glf_depth( g ) == 0:
+# cns = 0xfu << 28 | 0xf << 24
+# else:
+# cns = glf2cns(g, <int>(self.c.q_r + .499))
- pass
+# free(g)
- def call(self, reference, int pos ):
- """call a snp on chromosome *reference*
- and position *pos*.
+# cdef SNPCall call
- returns a :class:`SNPCall` object.
- """
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
- cdef int tid = self.iter.samfile.gettid( reference )
+# return call
- self.iter.reset( tid, pos, pos + 1 )
+# cdef class SNPCaller( SNPCallerBase ):
+# '''*(IteratorColumn iterator_column )*
- while 1:
- self.iter.cnext()
-
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# The samtools SNP caller.
- if self.iter.plp == NULL:
- raise ValueError( "no reads in region - no call" )
-
- if self.iter.pos == pos: break
+# This object will call SNPs in *samfile* against the reference
+# sequence in *fasta*.
- cdef char * seq = self.iter.getSequence()
- cdef int seq_len = self.iter.seq_len
+# This caller is fast for calling few SNPs in selected regions.
- assert seq != NULL
+# It is slow, if called over large genomic regions.
+# '''
- # reference base
- if self.iter.pos >= seq_len:
- raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
- cdef int rb = seq[self.iter.pos]
- cdef uint32_t cns
- cdef glf1_t * g
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- g = bam_maqcns_glfgen( self.iter.n_plp,
- self.iter.plp,
- bam_nt16_table[rb],
- self.c )
+# pass
+# def call(self, reference, int pos ):
+# """call a snp on chromosome *reference*
+# and position *pos*.
- if pysam_glf_depth( g ) == 0:
- cns = 0xfu << 28 | 0xf << 24
- else:
- cns = glf2cns(g, <int>(self.c.q_r + .499))
+# returns a :class:`SNPCall` object.
+# """
- free(g)
-
- cdef SNPCall call
-
- call = SNPCall()
- call._tid = self.iter.tid
- call._pos = self.iter.pos
- call._reference_base = rb
- call._genotype = bam_nt16_rev_table[cns>>28]
- call._consensus_quality = cns >> 8 & 0xff
- call._snp_quality = cns & 0xff
- call._rms_mapping_quality = cns >> 16&0xff
- call._coverage = self.iter.n_plp
-
- return call
-
-cdef class IndelCall:
- '''the results of an indel call.'''
- cdef int _tid
- cdef int _pos
- cdef int _coverage
- cdef int _rms_mapping_quality
- cdef bam_maqindel_ret_t * _r
+# cdef int tid = self.iter.samfile.gettid( reference )
- def __cinit__(self):
- #assert r != NULL
- #self._r = r
- pass
+# self.iter.reset( tid, pos, pos + 1 )
- property tid:
- '''the chromosome ID as is defined in the header'''
- def __get__(self):
- return self._tid
-
- property pos:
- '''nucleotide position of SNP.'''
- def __get__(self): return self._pos
+# while 1:
+# self.iter.cnext()
- property genotype:
- '''the genotype called.'''
- def __get__(self):
- if self._r.gt == 0:
- s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
- return "%s/%s" % (s,s)
- elif self._r.gt == 1:
- s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
- return "%s/%s" % (s,s)
- else:
- return "%s/%s" % (self.first_allele, self.second_allele )
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
- property consensus_quality:
- '''the genotype quality (Phred-scaled).'''
- def __get__(self): return self._r.q_cns
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
- property snp_quality:
- '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
- def __get__(self): return self._r.q_ref
+# if self.iter.pos == pos: break
- property mapping_quality:
- '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
- def __get__(self): return self._rms_mapping_quality
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
- property coverage:
- '''coverage or read depth - the number of reads involved in the call.'''
- def __get__(self): return self._coverage
+# assert seq != NULL
- property first_allele:
- '''sequence of first allele.'''
- def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
- property second_allele:
- '''sequence of second allele.'''
- def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# # cdef glf1_t * g
+# #
+# # g = bam_maqcns_glfgen( self.iter.n_plp,
+# # self.iter.plp,
+# # bam_nt16_table[rb],
+# # self.c )
+# ##
+# #
+# # if pysam_glf_depth( g ) == 0:
+# # cns = 0xfu << 28 | 0xf << 24
+# # else:
+# # cns = glf2cns(g, <int>(self.c.q_r + .499))
+# #
+# # free(g)
+
+# cdef SNPCall call
+
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
+
+# return call
+
+# cdef class IndelCall:
+# '''the results of an indel call.'''
+# cdef int _tid
+# cdef int _pos
+# cdef int _coverage
+# cdef int _rms_mapping_quality
+# cdef bam_maqindel_ret_t * _r
+
+# def __cinit__(self):
+# #assert r != NULL
+# #self._r = r
+# pass
+
+# property tid:
+# '''the chromosome ID as is defined in the header'''
+# def __get__(self):
+# return self._tid
+
+# property pos:
+# '''nucleotide position of SNP.'''
+# def __get__(self): return self._pos
+
+# property genotype:
+# '''the genotype called.'''
+# def __get__(self):
+# if self._r.gt == 0:
+# s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+# return "%s/%s" % (s,s)
+# elif self._r.gt == 1:
+# s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+# return "%s/%s" % (s,s)
+# else:
+# return "%s/%s" % (self.first_allele, self.second_allele )
+
+# property consensus_quality:
+# '''the genotype quality (Phred-scaled).'''
+# def __get__(self): return self._r.q_cns
+
+# property snp_quality:
+# '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+# def __get__(self): return self._r.q_ref
+
+# property mapping_quality:
+# '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+# def __get__(self): return self._rms_mapping_quality
+
+# property coverage:
+# '''coverage or read depth - the number of reads involved in the call.'''
+# def __get__(self): return self._coverage
+
+# property first_allele:
+# '''sequence of first allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+
+# property second_allele:
+# '''sequence of second allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+
+# property reads_first:
+# '''reads supporting first allele.'''
+# def __get__(self): return self._r.cnt1
+
+# property reads_second:
+# '''reads supporting second allele.'''
+# def __get__(self): return self._r.cnt2
+
+# property reads_diff:
+# '''reads supporting neither allele.'''
+# def __get__(self): return self._r.cnt_anti
+
+# def __str__(self):
+
+# return "\t".join( map(str, (
+# self.tid,
+# self.pos,
+# self.genotype,
+# self.consensus_quality,
+# self.snp_quality,
+# self.mapping_quality,
+# self.coverage,
+# self.first_allele,
+# self.second_allele,
+# self.reads_first,
+# self.reads_second,
+# self.reads_diff ) ) )
+
+# def __dealloc__(self ):
+# bam_maqindel_ret_destroy(self._r)
+
+# cdef class IndelCallerBase:
+# '''Base class for SNP callers.
+
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
+
+# cdef bam_maqindel_opt_t * options
+# cdef IteratorColumn iter
+# cdef int cap_mapQ
+# cdef int max_depth
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- property reads_first:
- '''reads supporting first allele.'''
- def __get__(self): return self._r.cnt1
- property reads_second:
- '''reads supporting first allele.'''
- def __get__(self): return self._r.cnt2
+# self.iter = iterator_column
- property reads_diff:
- '''reads supporting first allele.'''
- def __get__(self): return self._r.cnt_anti
+# assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
- def __str__(self):
+# self.options = bam_maqindel_opt_init()
- return "\t".join( map(str, (
- self.tid,
- self.pos,
- self.genotype,
- self.consensus_quality,
- self.snp_quality,
- self.mapping_quality,
- self.coverage,
- self.first_allele,
- self.second_allele,
- self.reads_first,
- self.reads_second,
- self.reads_diff ) ) )
-
- def __dealloc__(self ):
- bam_maqindel_ret_destroy(self._r)
-
-cdef class IndelCallerBase:
- '''Base class for SNP callers.
-
- *min_baseQ*
- minimum base quality (possibly capped by BAQ)
- *capQ_threshold*
- coefficient for adjusting mapQ of poor mappings
- *theta*
- theta in maq consensus calling model
- *n_haplotypes*
- number of haplotypes in the sample
- *het_rate*
- prior of a difference between two haplotypes
- '''
+# # set the default parameterization according to
+# # samtools
- cdef bam_maqindel_opt_t * options
- cdef IteratorColumn iter
- cdef int cap_mapQ
- cdef int max_depth
+# self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
+# self.options.q_indel = kwargs.get( "q_indel", 40 )
+# self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
+# self.max_depth = kwargs.get( "max_depth", 1024 )
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# def __dealloc__(self):
+# free( self.options )
+# def _call( self ):
- self.iter = iterator_column
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
- assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
+# assert seq != NULL
- self.options = bam_maqindel_opt_init()
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
- # set the default parameterization according to
- # samtools
+# cdef bam_maqindel_ret_t * r
- self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
- self.options.q_indel = kwargs.get( "q_indel", 40 )
- self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
- self.max_depth = kwargs.get( "max_depth", 1024 )
+# cdef int m = min( self.max_depth, self.iter.n_plp )
- def __dealloc__(self):
- free( self.options )
+# # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
+# # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
+# # self.options.indel_err, self.options.ambi_thres );
- def _call( self ):
+# r = bam_maqindel(m,
+# self.iter.pos,
+# self.options,
+# self.iter.plp,
+# seq,
+# 0,
+# NULL)
- cdef char * seq = self.iter.getSequence()
- cdef int seq_len = self.iter.seq_len
+# if r == NULL: return None
- assert seq != NULL
+# cdef IndelCall call
+# call = IndelCall()
+# call._r = r
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._coverage = self.iter.n_plp
- # reference base
- if self.iter.pos >= seq_len:
- raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+# cdef uint64_t rms_aux = 0
+# cdef int i = 0
+# cdef bam_pileup1_t * p
+# cdef int tmp
- cdef bam_maqindel_ret_t * r
-
- cdef int m = min( self.max_depth, self.iter.n_plp )
-
- # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
- # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
- # self.options.indel_err, self.options.ambi_thres );
-
- r = bam_maqindel(m,
- self.iter.pos,
- self.options,
- self.iter.plp,
- seq,
- 0,
- NULL)
-
- if r == NULL: return None
-
- cdef IndelCall call
- call = IndelCall()
- call._r = r
- call._tid = self.iter.tid
- call._pos = self.iter.pos
- call._coverage = self.iter.n_plp
-
- cdef uint64_t rms_aux = 0
- cdef int i = 0
- cdef bam_pileup1_t * p
- cdef int tmp
-
- for i from 0 <= i < self.iter.n_plp:
- p = self.iter.plp + i
- if p.b.core.qual < self.cap_mapQ:
- tmp = p.b.core.qual
- else:
- tmp = self.cap_mapQ
- rms_aux += tmp * tmp
+# for i from 0 <= i < self.iter.n_plp:
+# p = self.iter.plp + i
+# if p.b.core.qual < self.cap_mapQ:
+# tmp = p.b.core.qual
+# else:
+# tmp = self.cap_mapQ
+# rms_aux += tmp * tmp
- call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
+# call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
- return call
+# return call
-cdef class IndelCaller( IndelCallerBase ):
- '''*(IteratorColumn iterator_column )*
+# cdef class IndelCaller( IndelCallerBase ):
+# '''*(IteratorColumn iterator_column )*
- The samtools SNP caller.
+# The samtools indel caller.
- This object will call SNPs in *samfile* against the reference
- sequence in *fasta*.
+# This object will call indels in *samfile* against the reference
+# sequence in *fasta*.
- This caller is fast for calling few SNPs in selected regions.
+# This caller is fast for calling few indels in selected regions.
- It is slow, if called over large genomic regions.
- '''
+# It is slow, if called over large genomic regions.
+# '''
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- pass
+# pass
- def call(self, reference, int pos ):
- """call a snp on chromosome *reference*
- and position *pos*.
+# def call(self, reference, int pos ):
+# """call an indel on chromosome *reference*
+# and position *pos*.
- returns a :class:`SNPCall` object or None, if no indel call could be made.
- """
+# returns a :class:`IndelCall` object or None, if no indel call could be made.
+# """
- cdef int tid = self.iter.samfile.gettid( reference )
+# cdef int tid = self.iter.samfile.gettid( reference )
- self.iter.reset( tid, pos, pos + 1 )
+# self.iter.reset( tid, pos, pos + 1 )
- while 1:
- self.iter.cnext()
-
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# while 1:
+# self.iter.cnext()
- if self.iter.plp == NULL:
- raise ValueError( "no reads in region - no call" )
-
- if self.iter.pos == pos: break
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
- return self._call()
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
-cdef class IteratorIndelCalls( IndelCallerBase ):
- """*(IteratorColumn iterator)*
+# if self.iter.pos == pos: break
- call indels within a region.
+# return self._call()
- *iterator* is a pileup iterator. SNPs will be called
- on all positions returned by this iterator.
+# cdef class IteratorIndelCalls( IndelCallerBase ):
+# """*(IteratorColumn iterator)*
- This caller is fast if SNPs are called over large continuous
- regions. It is slow, if instantiated frequently and in random
- order as the sequence will have to be reloaded.
+# call indels within a region.
- """
-
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
- pass
+# *iterator* is a pileup iterator. SNPs will be called
+# on all positions returned by this iterator.
+# This caller is fast if SNPs are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
- def __iter__(self):
- return self
+# """
- def __next__(self):
- """python version of next().
- """
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+# pass
- # the following code was adapted from bam_plcmd.c:pileup_func()
- self.iter.cnext()
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# def __iter__(self):
+# return self
- if self.iter.plp == NULL:
- raise StopIteration
+# def __next__(self):
+# """python version of next().
+# """
- return self._call()
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
+
+# if self.iter.plp == NULL:
+# raise StopIteration
+
+# return self._call()
to not re-open *samfile*.
"""
- cdef Samfile samfile
- cdef samfile_t * fp
- cdef index
- # true if samfile belongs to this object
- cdef int owns_samfile
-
def __init__(self, Samfile samfile, int reopen = True ):
self.samfile = samfile
- if samfile.isbam: mode = "rb"
- else: mode = "r"
+ if samfile.isbam: mode = b"rb"
+ else: mode = b"r"
# reopen the file - note that this makes the iterator
# slow and causes pileup to slow down significantly.
if reopen:
- store = StderrStore()
+ store = StderrStore()
self.fp = samopen( samfile._filename, mode, NULL )
store.release()
assert self.fp != NULL
def build( self ):
'''build index.'''
-
+
self.index = collections.defaultdict( list )
# this method will start indexing from the current file position
# if you decide
cdef int ret = 1
cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
-
+
cdef uint64_t pos
while ret > 0:
- pos = bam_tell( self.fp.x.bam )
+ pos = bam_tell( self.fp.x.bam )
ret = samread( self.fp, b)
if ret > 0:
- qname = bam1_qname( b )
- self.index[qname].append( pos )
-
+ qname = _charptr_to_str(bam1_qname( b ))
+ self.index[qname].append( pos )
+
bam_destroy1( b )
def find( self, qname ):
def __dealloc__(self):
if self.owns_samfile: samclose( self.fp )
-__all__ = ["Samfile",
+__all__ = ["Samfile",
"Fastafile",
- "IteratorRow",
- "IteratorColumn",
- "AlignedRead",
- "PileupColumn",
- "PileupProxy",
+ "IteratorRow",
+ "IteratorColumn",
+ "AlignedRead",
+ "PileupColumn",
+ "PileupProxy",
"PileupRead",
- "IteratorSNPCalls",
- "SNPCaller",
- "IndelCaller",
- "IteratorIndelCalls",
+ # "IteratorSNPCalls",
+ # "SNPCaller",
+ # "IndelCaller",
+ # "IteratorIndelCalls",
"IndexedReads" ]
-
+