# cython: embedsignature=True
+# cython: profile=True
# adds doc-strings for sphinx
-
-import tempfile, os, sys, types, itertools, struct, ctypes
-
+import tempfile
+import os
+import sys
+import types
+import itertools
+import struct
+import ctypes
+import collections
+import re
+import platform
+import warnings
+from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize
+from cpython.version cimport PY_MAJOR_VERSION
+
+########################################################################
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+cdef from_string_and_size(char* s, size_t length):
+ if PY_MAJOR_VERSION < 3:
+ return s[:length]
+ else:
+ return s[:length].decode("ascii")
+
+# filename encoding (copied from lxml.etree.pyx)
+cdef str _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = 'ascii'
+
+#cdef char* _C_FILENAME_ENCODING
+#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
+
+cdef bytes _my_encodeFilename(object filename):
+ u"""Make sure a filename is 8-bit encoded (or None).
+ """
+ if filename is None:
+ return None
+ elif PyBytes_Check(filename):
+ return filename
+ elif PyUnicode_Check(filename):
+ return filename.encode(_FILENAME_ENCODING)
+ else:
+ raise TypeError, u"Argument must be string or unicode."
+
+
+cdef bytes _force_bytes(object s):
+ u"""convert string or unicode object to bytes, assuming ascii encoding.
+ """
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif s is None:
+ return None
+ elif PyBytes_Check(s):
+ return s
+ elif PyUnicode_Check(s):
+ return s.encode('ascii')
+ else:
+ raise TypeError, u"Argument must be string, bytes or unicode."
+
+cdef inline bytes _force_cmdline_bytes(object s):
+ return _force_bytes(s)
+
+cdef _charptr_to_str(char* s):
+ if PY_MAJOR_VERSION < 3:
+ return s
+ else:
+ return s.decode("ascii")
+
+cdef _force_str(object s):
+ """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
+ if s is None:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif PyBytes_Check(s):
+ return s.decode('ascii')
+ else:
+ # assume unicode
+ return s
+########################################################################
+########################################################################
+########################################################################
+## Constants and global variables
+########################################################################
# defines imported from samtools
DEF SEEK_SET = 0
DEF SEEK_CUR = 1
## @abstract optical or PCR duplicate */
DEF BAM_FDUP =1024
+#####################################################################
+# CIGAR operations
DEF BAM_CIGAR_SHIFT=4
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
+DEF BAM_CMATCH = 0
+DEF BAM_CINS = 1
+DEF BAM_CDEL = 2
+DEF BAM_CREF_SKIP = 3
+DEF BAM_CSOFT_CLIP = 4
+DEF BAM_CHARD_CLIP = 5
+DEF BAM_CPAD = 6
+DEF BAM_CEQUAL = 7
+DEF BAM_CDIFF = 8
+
+cdef char* CODE2CIGAR= "MIDNSHP=X"
+if IS_PYTHON3:
+ CIGAR2CODE = dict( [y,x] for x,y in enumerate( CODE2CIGAR) )
+else:
+ CIGAR2CODE = dict( [ord(y),x] for x,y in enumerate( CODE2CIGAR) )
+CIGAR_REGEX = re.compile( "([MIDNSHP=X])(\d+)" )
+
+#####################################################################
+## set pysam stderr to /dev/null
+pysam_unset_stderr()
+
+#####################################################################
+# hard-coded constants
+cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
+cdef int max_pos = 2 << 29
+
#####################################################################
#####################################################################
#####################################################################
## private factory methods
#####################################################################
cdef class AlignedRead
-cdef makeAlignedRead( bam1_t * src):
+cdef makeAlignedRead(bam1_t * src):
'''enter src into AlignedRead.'''
- cdef AlignedRead dest
- dest = AlignedRead()
- # destroy dummy delegate created in constructor
- # to prevent memory leak.
- pysam_bam_destroy1(dest._delegate)
+ cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
dest._delegate = bam_dup1(src)
return dest
cdef class PileupProxy
-cdef makePileupProxy( bam_plbuf_t * buf, int n ):
- cdef PileupProxy dest
- dest = PileupProxy()
- dest.buf = buf
+cdef makePileupProxy( bam_pileup1_t ** plp, int tid, int pos, int n ):
+ cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
+ dest.plp = plp
+ dest.tid = tid
+ dest.pos = pos
dest.n = n
return dest
cdef class PileupRead
cdef makePileupRead( bam_pileup1_t * src ):
'''fill a PileupRead object from a bam_pileup1_t * object.'''
- cdef PileupRead dest
- dest = PileupRead()
+ cdef PileupRead dest = PileupRead.__new__(PileupRead)
dest._alignment = makeAlignedRead( src.b )
dest._qpos = src.qpos
dest._indel = src.indel
dest._is_tail = src.is_tail
return dest
+cdef convertBinaryTagToList( uint8_t * s ):
+ """return byte size, number of values, and a list of the values in s."""
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
+ # get byte size
+ auxtype = s[0]
+ byte_size = bam_aux_type2size( auxtype )
+ s += 1
+ # get number of values in array
+ nvalues = (<int32_t*>s)[0]
+ s += 4
+ # get values
+ values = []
+ if auxtype == 'c':
+ for x from 0 <= x < nvalues:
+ values.append((<int8_t*>s)[0])
+ s += 1
+ elif auxtype == 'C':
+ for x from 0 <= x < nvalues:
+ values.append((<uint8_t*>s)[0])
+ s += 1
+ elif auxtype == 's':
+ for x from 0 <= x < nvalues:
+ values.append((<int16_t*>s)[0])
+ s += 2
+ elif auxtype == 'S':
+ for x from 0 <= x < nvalues:
+ values.append((<uint16_t*>s)[0])
+ s += 2
+ elif auxtype == 'i':
+ for x from 0 <= x < nvalues:
+ values.append((<int32_t*>s)[0])
+ s += 4
+ elif auxtype == 'I':
+ for x from 0 <= x < nvalues:
+ values.append((<uint32_t*>s)[0])
+ s += 4
+ elif auxtype == 'f':
+ for x from 0 <= x < nvalues:
+ values.append((<float*>s)[0])
+ s += 4
+
+ return byte_size, nvalues, values
+
#####################################################################
#####################################################################
#####################################################################
## Generic callbacks for inserting python callbacks.
#####################################################################
cdef int fetch_callback( bam1_t *alignment, void *f):
- '''callback for bam_fetch.
-
+ '''callback for bam_fetch.
+
calls function in *f* with a new :class:`AlignedRead` object as parameter.
'''
a = makeAlignedRead( alignment )
(<object>f)(a)
-class PileupColumn(object):
- '''A pileup column. A pileup column contains
+class PileupColumn(object):
+ '''A pileup column. A pileup column contains
all the reads that map to a certain target base.
- tid
- chromosome ID as is defined in the header
- pos
- the target base coordinate (0-based)
- n
- number of reads mapping to this column
- pileups
- list of reads (:class:`pysam.PileupRead`) aligned to this column
- '''
- def __str__(self):
+ tid
+ chromosome ID as is defined in the header
+ pos
+ the target base coordinate (0-based)
+ n
+ number of reads mapping to this column
+ pileups
+ list of reads (:class:`pysam.PileupRead`) aligned to this column
+ '''
+ def __str__(self):
return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
"\n" + "\n".join( map(str, self.pileups) )
p.n = n
pileups = []
+ cdef int x
for x from 0 <= x < n:
pileups.append( makePileupRead( &(pl[x]) ) )
p.pileups = pileups
-
+
(<object>f)(p)
cdef int pileup_fetch_callback( bam1_t *b, void *data):
- '''callback for bam_fetch.
+ '''callback for bam_fetch.
Fetches reads and submits them to pileup.
'''
class StderrStore():
'''
- stderr is captured.
+ stderr is captured.
'''
def __init__(self):
+ return
self.stderr_h, self.stderr_f = tempfile.mkstemp()
self.stderr_save = Outs( sys.stderr.fileno() )
self.stderr_save.setfd( self.stderr_h )
-
+
+ def readAndRelease( self ):
+ return []
+ self.stderr_save.restore()
+ lines = []
+ if os.path.exists(self.stderr_f):
+ lines = open( self.stderr_f, "r" ).readlines()
+ os.remove( self.stderr_f )
+ return lines
+
def release(self):
+ return
self.stderr_save.restore()
if os.path.exists(self.stderr_f):
os.remove( self.stderr_f )
def __del__(self):
self.release()
+class StderrStoreWindows():
+ '''does nothing. stderr can't be redirected on windows'''
+ def __init__(self): pass
+ def readAndRelease(self): return []
+ def release(self): pass
+
+if platform.system()=='Windows':
+ del StderrStore
+ StderrStore = StderrStoreWindows
+
+
######################################################################
######################################################################
######################################################################
# valid types for sam headers
-VALID_HEADER_TYPES = { "HD" : dict,
- "SQ" : list,
- "RG" : list,
- "PG" : list,
+VALID_HEADER_TYPES = { "HD" : dict,
+ "SQ" : list,
+ "RG" : list,
+ "PG" : list,
"CO" : list }
# order of records within sam headers
# type conversions within sam header records
VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
"SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
- "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
- "PG" : { "ID" : str, "VN" : str, "CL" : str }, }
+ "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str,
+ "CN" : str, "DT" : str, "PL" : str, "FO" : str, "KS" : str },
+ "PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }
# output order of fields within records
VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
"SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
- "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
- "PG" : ( "ID", "VN", "CL" ), }
+ "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL", "FO", "KS" ),
+ "PG" : ( "PN", "ID", "VN", "CL" ), }
+
######################################################################
######################################################################
######################################################################
## Public methods
######################################################################
-cdef class Samfile:
- '''*(filename, mode='r', template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
-
- A *SAM* file. The file is automatically opened.
-
- *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode so for binary
- (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
- Use ``h`` to output header information in text (:term:`TAM`) mode.
+cdef class Fastafile:
+ '''*(filename)*
- If ``b`` is present, it must immediately follow ``r`` or ``w``.
- Currently valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``.
-
- so to open a :term:`BAM` file for reading::
+ A *FASTA* file. The file is automatically opened.
+
+ The file expects an indexed fasta file.
+
+ TODO:
+ add automatic indexing.
+ add function to get sequence names.
+ '''
- f=Samfile('ex1.bam','rb')
+ def __cinit__(self, *args, **kwargs ):
+ self.fastafile = NULL
+ self._filename = NULL
+ self._open( *args, **kwargs )
+
+ def _isOpen( self ):
+ '''return true if samfile has been opened.'''
+ return self.fastafile != NULL
+ def __len__(self):
+ if self.fastafile == NULL:
+ raise ValueError( "calling len() on closed file" )
- For writing, the header of a :term:`TAM` file/:term:`BAM` file can be constituted from several
- sources:
+ return faidx_fetch_nseq(self.fastafile)
- 1. If *template* is given, the header is copied from a another *Samfile* (*template* must be of type *Samfile*).
+ def _open( self,
+ filename ):
+ '''open an indexed fasta file.
- 2. If *header* is given, the header is build from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). The second level is then a list of lines, with each line being a list of tag-value pairs.
+ This method expects an indexed fasta file.
+ '''
- 3. If *text* is given, new header text is copied from raw text.
+ # close a previously opened file
+ if self.fastafile != NULL: self.close()
+ if self._filename != NULL: free(self._filename)
+ filename = _my_encodeFilename(filename)
+ self._filename = strdup(filename)
+ self.fastafile = fai_load( filename )
- 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
+ if self.fastafile == NULL:
+ raise IOError("could not open file `%s`" % filename )
+
+ def close( self ):
+ if self.fastafile != NULL:
+ fai_destroy( self.fastafile )
+ self.fastafile = NULL
+
+ def __dealloc__(self):
+ self.close()
+ if self._filename != NULL: free(self._filename)
+
+ property filename:
+ '''the :term:`filename` associated with this object.'''
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ return self._filename
+
+ def fetch( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None):
+
+ '''*(reference = None, start = None, end = None, region = None)*
+
+ fetch the sequence in a :term:`region` using 0-based indexing.
+
+ The region is specified by :term:`reference`, *start* and *end*.
+
+ fetch returns an empty string if the region is out of range or addresses an unknown *reference*.
+
+ If *reference* is given and *start* is None, the sequence from the
+ first base is returned. Similarly, if *end* is None, the sequence
+ until the last base is returned.
+
+ Alternatively, a samtools :term:`region` string can be supplied.
+ '''
+
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ cdef int length
+ cdef char * seq
+
+ if not region:
+ if reference is None: raise ValueError( 'no sequence/region supplied.' )
+ if start is None: start = 0
+ if end is None: end = max_pos -1
+
+ if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
+ if start == end: return b""
+ # valid ranges are from 0 to 2^29-1
+ if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
+ if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
+ # note: faidx_fetch_seq has a bug such that out-of-range access
+ # always returns the last residue. Hence do not use faidx_fetch_seq,
+ # but use fai_fetch instead
+ # seq = faidx_fetch_seq(self.fastafile,
+ # reference,
+ # start,
+ # end-1,
+ # &length)
+ region = "%s:%i-%i" % (reference, start+1, end)
+ if PY_MAJOR_VERSION >= 3:
+ region = region.encode('ascii')
+ seq = fai_fetch( self.fastafile,
+ region,
+ &length )
+ else:
+ # samtools adds a '\0' at the end
+ seq = fai_fetch( self.fastafile, region, &length )
+
+ # copy to python
+ if seq == NULL:
+ return b""
+ else:
+ try:
+ py_seq = seq[:length]
+ finally:
+ free(seq)
+
+ return py_seq
+
+ cdef char * _fetch( self, char * reference, int start, int end, int * length ):
+ '''fetch sequence for reference, start and end'''
+
+ return faidx_fetch_seq(self.fastafile,
+ reference,
+ start,
+ end-1,
+ length )
+
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+cdef int count_callback( bam1_t *alignment, void *f):
+ '''callback for bam_fetch - count number of reads.
+ '''
+ cdef int* counter = (<int*>f)
+ counter[0] += 1;
+
+ctypedef struct MateData:
+ char * name
+ bam1_t * mate
+ uint32_t flag
+
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+cdef int mate_callback( bam1_t *alignment, void *f):
+ '''callback for bam_fetch - filter mate
+ '''
+ cdef MateData * d = (<MateData*>f)
+ # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
+ # d.mate, d.name, bam1_qname(alignment),
+ # d.flag, alignment.core.flag, alignment.core.flag & d.flag)
+
+ if d.mate == NULL:
+ # could be sped up by comparing the lengths of query strings first
+ # using l_qname
+ #
+ # also, make sure that we get the other read by comparing
+ # the flags
+ if alignment.core.flag & d.flag != 0 and \
+ strcmp( bam1_qname( alignment ), d.name ) == 0:
+ d.mate = bam_dup1( alignment )
+
+
+cdef class Samfile:
+ '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None,
+ add_sq_text = False, check_header = True, check_sq = True )*
+
+ A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.
+
+ *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
+ (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
+ Use ``h`` to output header information in text (:term:`TAM`) mode.
+
+ If ``b`` is present, it must immediately follow ``r`` or ``w``.
+ Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
+ a :term:`BAM` formatted file for reading, type::
+
+ f = pysam.Samfile('ex1.bam','rb')
+
+ If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following
+ should work::
+
+ f1 = pysam.Samfile('ex1.bam' )
+ f2 = pysam.Samfile('ex1.sam' )
If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
- '''
- cdef char * filename
- # pointer to samfile
- cdef samfile_t * samfile
- # pointer to index
- cdef bam_index_t *index
- # true if file is a bam file
- cdef int isbam
+ For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
+ sources (see also the samtools format specification):
+
+ 1. If *template* is given, the header is copied from a another *Samfile*
+ (*template* must be of type *Samfile*).
+
+ 2. If *header* is given, the header is built from a multi-level dictionary. The first level
+ are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
+ being a list of tag-value pairs. The header is constructed first from all the defined fields,
+ followed by user tags in alphabetical order.
+
+ 3. If *text* is given, new header text is copied from raw text.
+
+ 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
+ By default, 'SQ' and 'LN' tags will be added to the header text. This option can be
+ changed by unsetting the flag *add_sq_text*.
- # current read within iteration
- cdef bam1_t * b
+ By default, if a file is opened in mode 'r', it is checked for a valid header
+ (*check_header* = True) and a definition of chromosome names (*check_sq* = True).
+
+ '''
def __cinit__(self, *args, **kwargs ):
self.samfile = NULL
+ self._filename = NULL
self.isbam = False
+ self.isstream = False
self._open( *args, **kwargs )
# allocate memory for iterator
'''return true if samfile has an existing (and opened) index.'''
return self.index != NULL
- def _open( self,
- char * filename,
- mode ='r',
+ def _open( self,
+ filename,
+ mode = None,
Samfile template = None,
referencenames = None,
referencelengths = None,
- char * text = NULL,
+ text = None,
header = None,
+ port = None,
+ add_sq_text = True,
+ check_header = True,
+ check_sq = True,
):
'''open a sam/bam file.
closed and a new file will be opened.
'''
- assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
+ # read mode autodetection
+ if mode is None:
+ try:
+ self._open(filename, 'rb', template=template,
+ referencenames=referencenames,
+ referencelengths=referencelengths,
+ text=text, header=header, port=port,
+ check_header=check_header,
+ check_sq=check_sq)
+ return
+ except ValueError, msg:
+ pass
+
+ self._open(filename, 'r', template=template,
+ referencenames=referencenames,
+ referencelengths=referencelengths,
+ text=text, header=header, port=port,
+ check_header=check_header,
+ check_sq=check_sq)
+ return
+
+ assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode
+
+ #assert filename != NULL
# close a previously opened file
if self.samfile != NULL: self.close()
cdef bam_header_t * header_to_write
header_to_write = NULL
- self.filename = filename
+ if self._filename != NULL: free(self._filename )
+ filename = _my_encodeFilename(filename)
+ cdef bytes bmode = mode.encode('ascii')
+ #cdef char* cfilename
+ #cfilename = filename.encode(_FILENAME_ENCODING)
+ self._filename = strdup(filename)
+ self.isstream = strcmp( filename, "-" ) == 0
self.isbam = len(mode) > 1 and mode[1] == 'b'
+ self.isremote = strncmp(filename,"http:",5) == 0 or \
+ strncmp(filename,"ftp:",4) == 0
+
+ cdef char * ctext
+ ctext = NULL
+
if mode[0] == 'w':
# open file for writing
-
+
# header structure (used for writing)
if template:
# copy header from another file
else:
# build header from a target names and lengths
- assert referencenames and referencelengths, "either supply options `template`, `header` or both `refernencenames` and `referencelengths` for writing"
+ assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing"
assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
# allocate and fill header
+ referencenames = [ _force_bytes(ref) for ref in referencenames ]
header_to_write = bam_header_init()
header_to_write.n_targets = len(referencenames)
n = 0
header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
strncpy( header_to_write.target_name[x], name, len(name) )
- if text != NULL:
+ # Optionally, if there is no text, add a SAM compatible header to output
+ # file.
+ if text is None and add_sq_text:
+ text = []
+ for x from 0 <= x < header_to_write.n_targets:
+ text.append( "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] ) )
+ text = ''.join(text)
+
+ if text != None:
# copy without \0
- header_to_write.l_text = strlen(text)
- header_to_write.text = <char*>calloc( strlen(text), sizeof(char) )
- memcpy( header_to_write.text, text, strlen(text) )
+ text = _force_bytes(text)
+ ctext = text
+ header_to_write.l_text = strlen(ctext)
+ header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
+ memcpy( header_to_write.text, ctext, strlen(ctext) )
header_to_write.hash = NULL
header_to_write.rg2lib = NULL
-
+
# open file. Header gets written to file at the same time for bam files
# and sam files (in the latter case, the mode needs to be wh)
store = StderrStore()
- self.samfile = samopen( filename, mode, header_to_write )
+ self.samfile = samopen( filename, bmode, header_to_write )
store.release()
# bam_header_destroy takes care of cleaning up of all the members
elif mode[0] == "r":
# open file for reading
- if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ):
+ if strncmp( filename, "-", 1) != 0 and \
+ not self.isremote and \
+ not os.path.exists( filename ):
raise IOError( "file `%s` not found" % filename)
- store = StderrStore()
- self.samfile = samopen( filename, mode, NULL )
- store.release()
+ # try to detect errors
+ self.samfile = samopen( filename, bmode, NULL )
+ if self.samfile == NULL:
+ raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)
+
+ # bam files require a valid header
+ if self.isbam:
+ if self.samfile.header == NULL:
+ raise ValueError( "file does not have valid header (mode='%s') - is it BAM format?" % mode )
+ else:
+ # in sam files it is optional (samfile full of unmapped reads)
+ if check_header and self.samfile.header == NULL:
+ raise ValueError( "file does not have valid header (mode='%s') - is it SAM format?" % mode )
+
+ # disabled for autodetection to work
+ # needs to be disabled so that reading from sam-files without headers works
+ if check_sq and self.samfile.header.n_targets == 0:
+ raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)
if self.samfile == NULL:
raise IOError("could not open file `%s`" % filename )
+ # check for index and open if present
if mode[0] == "r" and self.isbam:
- if not os.path.exists(filename + ".bai"):
- self.index = NULL
+
+ if not self.isremote:
+ if not os.path.exists(filename + b".bai"):
+ self.index = NULL
+ else:
+ # returns NULL if there is no index or index could not be opened
+ self.index = bam_index_load(filename)
+ if self.index == NULL:
+ raise IOError("error while opening index `%s` " % filename )
else:
- # returns NULL if there is no index or index could not be opened
self.index = bam_index_load(filename)
if self.index == NULL:
raise IOError("error while opening index `%s` " % filename )
+ if not self.isstream:
+ self.start_offset = bam_tell( self.samfile.x.bam )
+
+ def gettid( self, reference ):
+ '''
+ convert :term:`reference` name into numerical :term:`tid`
+
+ returns -1 if reference is not known.
+ '''
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ reference = _force_bytes(reference)
+ return pysam_reference2tid( self.samfile.header, reference )
+
def getrname( self, tid ):
- '''(tid )
- convert numerical :term:`tid` into :ref:`reference` name.'''
+ '''
+ convert numerical :term:`tid` into :term:`reference` name.'''
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
if not 0 <= tid < self.samfile.header.n_targets:
- raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets )
+ raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
+ return _charptr_to_str(self.samfile.header.target_name[tid])
+
+ cdef char * _getrname( self, int tid ): # TODO unused
+ '''
+ convert numerical :term:`tid` into :term:`reference` name.'''
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not 0 <= tid < self.samfile.header.n_targets:
+ raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
return self.samfile.header.target_name[tid]
- def _parseRegion( self,
- reference = None,
- start = None,
- end = None,
+ def _parseRegion( self,
+ reference = None,
+ start = None,
+ end = None,
region = None ):
- '''parse region information.
+ '''
+ parse region information.
- raise Value for for invalid regions.
+ raise ValueError for invalid regions.
- returns a tuple of region, tid, start and end. Region
- is a valid samtools :term:`region` or None if the region
- extends over the whole file.
+ returns a tuple of flag, tid, start and end. Flag indicates
+ whether some coordinates were supplied.
Note that regions are 1-based, while start,end are python coordinates.
'''
-
+ # This method's main objective is to translate from a reference to a tid.
+ # For now, it calls bam_parse_region, which is clumsy. Might be worth
+ # implementing it all in pysam (makes use of khash).
+
cdef int rtid
- cdef int rstart
- cdef int rend
- cdef int max_pos
- max_pos = 2 << 29
+ cdef long long rstart
+ cdef long long rend
+
+ rtid = -1
+ rstart = 0
+ rend = max_pos
+ if start != None:
+ try:
+ rstart = start
+ except OverflowError:
+ raise ValueError( 'start out of range (%i)' % start )
+
+ if end != None:
+ try:
+ rend = end
+ except OverflowError:
+ raise ValueError( 'end out of range (%i)' % end )
- rtid = rstart = rend = 0
+ if region:
+ region = _force_str(region)
+ parts = re.split( "[:-]", region )
+ reference = parts[0]
+ if len(parts) >= 2: rstart = int(parts[1]) - 1
+ if len(parts) >= 3: rend = int(parts[2])
- # translate to a region
- if reference:
- if start != None and end != None:
- region = "%s:%i-%i" % (reference, start+1, end)
- else:
- region = reference
+ if not reference: return 0, 0, 0, 0
- if region:
- store = StderrStore()
- bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)
- store.release()
- if rtid < 0: raise ValueError( "invalid region `%s`" % region )
- if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
- if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
- if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
-
- return region, rtid, rstart, rend
-
- def fetch( self,
- reference = None,
- start = None,
- end = None,
- region = None,
+ rtid = self.gettid( reference )
+ if rtid < 0: raise ValueError( "invalid reference `%s`" % reference )
+ if rstart > rend: raise ValueError( 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend) )
+ if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
+ if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )
+
+ return 1, rtid, rstart, rend
+
+ def reset( self ):
+ '''reset file position to beginning of read section.'''
+ return self.seek( self.start_offset, 0 )
+
+ def seek( self, uint64_t offset, int where = 0):
+ '''
+ move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
+ '''
+
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+ if not self.isbam:
+ raise NotImplementedError("seek only available in bam files")
+ if self.isstream:
+ raise OSError("seek no available in streams")
+
+ return bam_seek( self.samfile.x.bam, offset, where )
+
+ def tell( self ):
+ '''
+ return current file position
+ '''
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+ if not self.isbam:
+ raise NotImplementedError("seek only available in bam files")
+
+ return bam_tell( self.samfile.x.bam )
+
+ def fetch( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
callback = None,
until_eof = False ):
- '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
-
- fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
+ '''
+ fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
+ :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
+ be supplied.
- Without *reference* or *region* all reads will be fetched. The reads will be returned
+ Without *reference* or *region* all mapped reads will be fetched. The reads will be returned
ordered by reference sequence, which will not necessarily be the order within the file.
+
If *until_eof* is given, all reads from the current file position will be returned
- *as they are sorted within the file*.
-
- If only *reference* is set, all reads matching on *reference* will be fetched.
+ in order as they are within the file. Using this option will also fetch unmapped reads.
+
+ If only *reference* is set, all reads aligned to *reference* will be fetched.
The method returns an iterator of type :class:`pysam.IteratorRow` unless
- a *callback is provided. If *callback* is given, the callback will be executed
+ a *callback is provided. If *callback* is given, the callback will be executed
for each position within the :term:`region`. Note that callbacks currently work
only, if *region* or *reference* is given.
- Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
+ Note that a :term:`SAM` file does not allow random access. If *region* or *reference* are given,
an exception is raised.
'''
- cdef int rtid
- cdef int rstart
- cdef int rend
+ cdef int rtid, rstart, rend, has_coord
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
- region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+ has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+
+ if self.isstream: reopen = False
+ else: reopen = True
if self.isbam:
+ if not until_eof and not self._hasIndex() and not self.isremote:
+ raise ValueError( "fetch called on bamfile without index" )
+
if callback:
- if not region:
- raise ValueError( "callback functionality requires a region/reference" )
+ if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
if not self._hasIndex(): raise ValueError( "no index available for fetch" )
- return bam_fetch(self.samfile.x.bam,
- self.index, rtid, rstart, rend, <void*>callback, fetch_callback )
+ return bam_fetch(self.samfile.x.bam,
+ self.index,
+ rtid,
+ rstart,
+ rend,
+ <void*>callback,
+ fetch_callback )
else:
- if region:
- return IteratorRow( self, rtid, rstart, rend )
+ if has_coord:
+ return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen )
else:
if until_eof:
- return IteratorRowAll( self )
+ return IteratorRowAll( self, reopen=reopen )
else:
- # return all targets by chaining the individual targets together.
- if not self._hasIndex(): raise ValueError( "no index available for fetch" )
- i = []
- rstart = 0
- rend = 1<<29
- for rtid from 0 <= rtid < self.nreferences:
- i.append( IteratorRow( self, rtid, rstart, rend))
- return itertools.chain( *i )
- else:
- if region != None:
- raise ValueError ("fetch for a region is not available for sam files" )
+ # AH: check - reason why no reopen for AllRefs?
+ return IteratorRowAllRefs(self ) # , reopen=reopen )
+ else:
+ if has_coord:
+ raise ValueError ("fetching by region is not available for sam files" )
+
if callback:
raise NotImplementedError( "callback not implemented yet" )
- else:
- return IteratorRowAll( self )
- def pileup( self, reference = None, start = None, end = None, region = None, callback = None ):
- '''run a pileup within a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools *region* string can be supplied.
+ if self.samfile.header == NULL:
+ raise ValueError( "fetch called for samfile without header")
- Without *reference* or *region* all reads will be fetched. The reads will be returned
- ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
+ # check if targets are defined
+ # give warning, sam_read1 segfaults
+ if self.samfile.header.n_targets == 0:
+ warnings.warn( "fetch called for samfile without header")
+
+ return IteratorRowAll( self, reopen=reopen )
- The method returns an iterator of type :class:`pysam.IteratorColumn` unless
- a *callback is provided. If *callback* is given, the callback will be executed
- for each position within the :term:`region`.
+ def mate( self,
+ AlignedRead read ):
+ '''return the mate of :class:`AlignedRead` *read*.
- Note that samfiles do not allow random access. If *region* or *reference* are given,
- an exception is raised.
-
- .. Note::
+ Throws a ValueError if read is unpaired or the mate
+ is unmapped.
- *all* reads which overlap the region are returned. The first base returned will be the
- first base of the first read *not* necessarily the first base of the region used in the query.
+ .. note::
+ Calling this method will change the file position.
+ This might interfere with any iterators that have
+ not re-opened the file.
+
+ '''
+ cdef uint32_t flag = read._delegate.core.flag
+
+ if flag & BAM_FPAIRED == 0:
+ raise ValueError( "read %s: is unpaired" % (read.qname))
+ if flag & BAM_FMUNMAP != 0:
+ raise ValueError( "mate %s: is unmapped" % (read.qname))
+
+ cdef MateData mate_data
+
+ mate_data.name = <char *>bam1_qname(read._delegate)
+ mate_data.mate = NULL
+ # xor flags to get the other mate
+ cdef int x = BAM_FREAD1 + BAM_FREAD2
+ mate_data.flag = ( flag ^ x) & x
+
+ bam_fetch(self.samfile.x.bam,
+ self.index,
+ read._delegate.core.mtid,
+ read._delegate.core.mpos,
+ read._delegate.core.mpos + 1,
+ <void*>&mate_data,
+ mate_callback )
+
+ if mate_data.mate == NULL:
+ raise ValueError( "mate not found" )
+
+ cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
+ dest._delegate = mate_data.mate
+ return dest
+
+ def count( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
+ until_eof = False ):
+ '''*(reference = None, start = None, end = None, region = None, until_eof = False)*
+
+ count reads in a :term:`region` using 0-based indexing. The region is specified by
+ :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
+
+ Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
+ an exception is raised.
'''
cdef int rtid
cdef int rstart
cdef int rend
- cdef bam_plbuf_t *buf
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
-
+
+ cdef int counter
+ counter = 0;
+
+ if self.isbam:
+ if not until_eof and not self._hasIndex() and not self.isremote:
+ raise ValueError( "fetch called on bamfile without index" )
+
+ if not region:
+ raise ValueError( "counting functionality requires a region/reference" )
+ if not self._hasIndex(): raise ValueError( "no index available for fetch" )
+ bam_fetch(self.samfile.x.bam,
+ self.index,
+ rtid,
+ rstart,
+ rend,
+ <void*>&counter,
+ count_callback )
+ return counter
+ else:
+ raise ValueError ("count for a region is not available for sam files" )
+
+ def pileup( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
+ callback = None,
+ **kwargs ):
+ '''
+ perform a :term:`pileup` within a :term:`region`. The region is specified by
+ :term:`reference`, *start* and *end* (using 0-based indexing).
+ Alternatively, a samtools *region* string can be supplied.
+
+ Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
+ ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
+
+ The method returns an iterator of type :class:`pysam.IteratorColumn` unless
+ a *callback is provided. If a *callback* is given, the callback will be executed
+ for each column within the :term:`region`.
+
+ Note that :term:`SAM` formatted files do not allow random access.
+ In these files, if a *region* or *reference* are given an exception is raised.
+
+ Optional *kwargs* to the iterator:
+
+ stepper
+ The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
+ ``all``
+ use all reads for pileup.
+ ``samtools``
+ same filter and read processing as in :term:`csamtools` pileup
+
+ fastafile
+ A :class:`FastaFile` object
+
+ mask
+ Skip all reads with bits set in mask if mask=True.
+
+ max_depth
+ Maximum read depth permitted. The default limit is *8000*.
+
+ truncate
+ By default, the samtools pileup engine outputs all reads overlapping a region (see note below).
+ If truncate is True and a region is given, only output columns in the exact region
+ specified.
+
+ .. note::
+
+ *all* reads which overlap the region are returned. The first base returned will be the
+ first base of the first read *not* necessarily the first base of the region used in the query.
+
+ '''
+ cdef int rtid, rstart, rend, has_coord
+ cdef bam_plbuf_t *buf
+
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+
if self.isbam:
if not self._hasIndex(): raise ValueError( "no index available for pileup" )
if callback:
- if not region:
- raise ValueError( "callback functionality requires a region/reference" )
+ if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
- bam_fetch(self.samfile.x.bam,
- self.index, rtid, rstart, rend,
+ bam_fetch(self.samfile.x.bam,
+ self.index, rtid, rstart, rend,
buf, pileup_fetch_callback )
-
+
# finalize pileup
bam_plbuf_push( NULL, buf)
bam_plbuf_destroy(buf)
else:
- if region:
- return IteratorColumn( self, rtid, rstart, rend )
+ if has_coord:
+ return IteratorColumnRegion( self,
+ tid = rtid,
+ start = rstart,
+ end = rend,
+ **kwargs )
else:
- # return all targets by chaining the individual targets together.
- i = []
- rstart = 0
- rend = 1<<29
- for rtid from 0 <= rtid < self.nreferences:
- i.append( IteratorColumn( self, rtid, rstart, rend))
- return itertools.chain( *i )
+ return IteratorColumnAllRefs(self, **kwargs )
else:
raise NotImplementedError( "pileup of samfiles not implemented yet" )
def close( self ):
- '''closes file.'''
+ '''
+ closes the :class:`pysam.Samfile`.'''
if self.samfile != NULL:
samclose( self.samfile )
bam_index_destroy(self.index);
self.samfile = NULL
def __dealloc__( self ):
- '''clean up.'''
# remember: dealloc cannot call other methods
- # Note that __del__ is not called.
+ # note: no doc string
+ # note: __del__ is not called.
self.close()
- pysam_bam_destroy1(self.b)
+ bam_destroy1(self.b)
+ if self._filename != NULL: free( self._filename )
- def write( self, AlignedRead read ):
- '''(AlignedRead read )
- write a single :class:`pysam.AlignedRead`..
+ cpdef int write( self, AlignedRead read ) except -1:
+ '''
+ write a single :class:`pysam.AlignedRead` to disk.
- return the number of bytes written.
+ returns the number of bytes written.
'''
+ if not self._isOpen():
+ return 0
+
return samwrite( self.samfile, read._delegate )
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## properties
+ ###############################################################
+ property filename:
+ '''the :term:`filename` associated with this object.'''
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ return self._filename
+
property nreferences:
'''number of :term:`reference` sequences in the file.'''
def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
return self.samfile.header.n_targets
property references:
"""tuple with the names of :term:`reference` sequences."""
- def __get__(self):
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
t = []
for x from 0 <= x < self.samfile.header.n_targets:
- t.append( self.samfile.header.target_name[x] )
+ t.append( _charptr_to_str(self.samfile.header.target_name[x]) )
return tuple(t)
property lengths:
- """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.reference`
+ """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
+ :attr:`pysam.Samfile.references`
"""
- def __get__(self):
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
t = []
for x from 0 <= x < self.samfile.header.n_targets:
t.append( self.samfile.header.target_len[x] )
return tuple(t)
+ property mapped:
+ """total number of mapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" )
+
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_mapped( self.index, tid )
+ return total
+
+ property unmapped:
+ """total number of unmapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" )
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_unmapped( self.index, tid )
+ # get unmapped reads without coordinates
+ total += pysam_get_unmapped( self.index, -1 )
+ return total
+
property text:
'''full contents of the :term:`sam file` header as a string.'''
def __get__(self):
- # create a temporary 0-terminated copy
- cdef char * t
- t = <char*>calloc( self.samfile.header.l_text + 1, sizeof(char) )
- memcpy( t, self.samfile.header.text, self.samfile.header.l_text )
- result = t
- free(t)
- return result
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ return from_string_and_size(self.samfile.header.text, self.samfile.header.l_text)
property header:
- '''header information within the :term:`sam file`. The records and fields are returned as
+ '''header information within the :term:`sam file`. The records and fields are returned as
a two-level dictionary.
'''
def __get__(self):
- result = {}
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ result = {}
+
if self.samfile.header.text != NULL:
# convert to python string (note: call self.text to create 0-terminated string)
t = self.text
x = {}
for field in fields[1:]:
key, value = field.split(":",1)
- if key not in VALID_HEADER_FIELDS[record]:
+ # uppercase keys must be valid
+ # lowercase are permitted for user fields
+ if key in VALID_HEADER_FIELDS[record]:
+ x[key] = VALID_HEADER_FIELDS[record][key](value)
+ elif not key.isupper():
+ x[key] = value
+ else:
raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
- x[key] = VALID_HEADER_FIELDS[record][key](value)
if VALID_HEADER_TYPES[record] == dict:
if record in result:
if record not in result: result[record] = []
result[record].append( x )
+ # if there are no SQ lines in the header, add the reference names
+ # from the information in the bam file.
+ # Background: c-samtools keeps the textual part of the header separate from
+ # the list of reference names and lengths. Thus, if a header contains only
+ # SQ lines, the SQ information is not part of the textual header and thus
+ # are missing from the output. See issue 84.
+ if "SQ" not in result:
+ sq = []
+ for ref, length in zip( self.references, self.lengths ):
+ sq.append( {'LN': length, 'SN': ref } )
+ result["SQ"] = sq
+
return result
def _buildLine( self, fields, record ):
# TODO: add checking for field and sort order
line = ["@%s" % record ]
+ # comment
if record == "CO":
line.append( fields )
+ # user tags
+ elif record.islower():
+ for key in sorted(fields):
+ line.append( "%s:%s" % (key, str(fields[key])))
+ # defined tags
else:
+ # write fields of the specification
for key in VALID_HEADER_ORDER[record]:
if key in fields:
line.append( "%s:%s" % (key, str(fields[key])))
- return "\t".join( line )
+ # write user fields
+ for key in fields:
+ if not key.isupper():
+ line.append( "%s:%s" % (key, str(fields[key])))
+
+ return "\t".join( line )
cdef bam_header_t * _buildHeader( self, new_header ):
'''return a new header built from a dictionary in *new_header*.
cdef bam_header_t * dest
dest = bam_header_init()
-
+
+ # first: defined tags
for record in VALID_HEADERS:
if record in new_header:
ttype = VALID_HEADER_TYPES[record]
data = new_header[record]
if type( data ) != type( ttype() ):
raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
- if type( data ) == types.DictType:
+ if type( data ) is dict:
lines.append( self._buildLine( data, record ) )
else:
for fields in new_header[record]:
lines.append( self._buildLine( fields, record ) )
+ # then: user tags (lower case), sorted alphabetically
+ for record, data in sorted(new_header.items()):
+ if record in VALID_HEADERS: continue
+ if type( data ) is dict:
+ lines.append( self._buildLine( data, record ) )
+ else:
+ for fields in new_header[record]:
+ lines.append( self._buildLine( fields, record ) )
+
text = "\n".join(lines) + "\n"
if dest.text != NULL: free( dest.text )
dest.text = <char*>calloc( len(text), sizeof(char))
dest.l_text = len(text)
- strncpy( dest.text, text, dest.l_text )
+ cdef bytes btext = text.encode('ascii')
+ strncpy( dest.text, btext, dest.l_text )
+ cdef bytes bseqname
# collect targets
if "SQ" in new_header:
seqs = []
seqs.append( (fields["SN"], fields["LN"] ) )
except KeyError:
raise KeyError( "incomplete sequence information in '%s'" % str(fields))
-
+
dest.n_targets = len(seqs)
dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
-
+
for x from 0 <= x < dest.n_targets:
seqname, seqlen = seqs[x]
dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
- strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
+ bseqname = seqname.encode('ascii')
+ strncpy( dest.target_name[x], bseqname, len(seqname) + 1 )
dest.target_len[x] = seqlen
return dest
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## file-object like iterator access
+ ## note: concurrent access will cause errors (see IteratorRow
+ ## and reopen)
+ ## Possible solutions: deprecate or open new file handle
+ ###############################################################
def __iter__(self):
- return self
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam and self.samfile.header.n_targets == 0:
+ raise NotImplementedError( "can not iterate over samfile without header")
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
+ '''
+ cversion of iterator. Used by :class:`pysam.Samfile.IteratorColumn`.
+ '''
cdef int ret
return samread(self.samfile, self.b)
- def __next__(self):
- """python version of next().
-
- pyrex uses this non-standard name instead of next()
+ def __next__(self):
+ """
+ python version of next().
"""
cdef int ret
ret = samread(self.samfile, self.b)
else:
raise StopIteration
-cdef class Fastafile:
- '''*(filename)*
-
- A *FASTA* file. The file is automatically opened.
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef class IteratorRow:
+ '''abstract base class for iterators over mapped reads.
- The file expects an indexed fasta file.
+ Various iterators implement different behaviours for wrapping around
+ contig boundaries. Examples include:
- TODO:
- add automatic indexing.
- add function to get sequence names.
- '''
+ :class:`pysam.IteratorRowRegion`
+ iterate within a single contig and a defined region.
- cdef char * filename
- # pointer to fastafile
- cdef faidx_t * fastafile
+ :class:`pysam.IteratorRowAll`
+ iterate until EOF. This iterator will also include unmapped reads.
- def __cinit__(self, *args, **kwargs ):
- self.fastafile = NULL
- self._open( *args, **kwargs )
+ :class:`pysam.IteratorRowAllRefs`
+ iterate over all reads in all reference sequences.
- def _isOpen( self ):
- '''return true if samfile has been opened.'''
- return self.fastafile != NULL
+ The method :meth:`Samfile.fetch` returns an IteratorRow.
+ '''
+ pass
- def _open( self,
- char * filename ):
- '''open an indexed fasta file.
- This method expects an indexed fasta file.
- '''
+cdef class IteratorRowRegion(IteratorRow):
+ """*(Samfile samfile, int tid, int beg, int end, int reopen = True )*
- # close a previously opened file
- if self.fastafile != NULL: self.close()
- self.filename = filename
- self.fastafile = fai_load( filename )
+ iterate over mapped reads in a region.
- if self.fastafile == NULL:
- raise IOError("could not open file `%s`" % filename )
+ By default, the file is re-opened to avoid conflicts between
+ multiple iterators working on the same file. Set *reopen* = False
+ to not re-open *samfile*.
- def close( self ):
- if self.fastafile != NULL:
- fai_destroy( self.fastafile )
- self.fastafile = NULL
+ The samtools iterators assume that the file
+ position between iterations do not change.
+ As a consequence, no two iterators can work
+ on the same file. To permit this, each iterator
+ creates its own file handle by re-opening the
+ file.
- def fetch( self,
- reference = None,
- start = None,
- end = None,
- region = None):
-
- '''*(reference = None, start = None, end = None, region = None)*
-
- fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
- '''
-
- if not self._isOpen():
+ Note that the index will be shared between
+ samfile and the iterator.
+ """
+
+ def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):
+
+ if not samfile._isOpen():
raise ValueError( "I/O operation on closed file" )
- cdef int len, max_pos
- cdef char * seq
- max_pos = 2 << 29
+ if not samfile._hasIndex():
+ raise ValueError( "no index available for iteration" )
- if not region:
- if reference == None: raise ValueError( 'no sequence/region supplied.' )
- if start == None and end == None:
- region = "%s" % str(reference)
- elif start == None or end == None:
- raise ValueError( 'only start or only end of region supplied' )
- else:
- if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
- # valid ranges are from 0 to 2^29-1
- if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
- if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
- region = "%s:%i-%i" % (reference, start+1, end )
-
- # samtools adds a '\0' at the end
- seq = fai_fetch( self.fastafile, region, &len )
- # copy to python
- result = seq
- # clean up
- free(seq)
-
- return result
+ # makes sure that samfile stays alive as long as the
+ # iterator is alive
+ self.samfile = samfile
-## turning callbacks elegantly into iterators is an unsolved problem, see the following threads:
-## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1
-## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html
-## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code
-## changes, the changes have to be manually entered.
+ if samfile.isbam: mode = b"rb"
+ else: mode = b"r"
-cdef class IteratorRow:
- """iterates over mapped reads in a region.
+ # reopen the file - note that this makes the iterator
+ # slow and causes pileup to slow down significantly.
+ if reopen:
+ store = StderrStore()
+ self.fp = samopen( samfile._filename, mode, NULL )
+ store.release()
+ assert self.fp != NULL
+ self.owns_samfile = True
+ else:
+ self.fp = self.samfile.samfile
+ self.owns_samfile = False
+
+ self.retval = 0
+
+ self.iter = bam_iter_query(self.samfile.index,
+ tid,
+ beg,
+ end)
+ self.b = bam_init1()
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ self.retval = bam_iter_read( self.fp.x.bam,
+ self.iter,
+ self.b)
+
+ def __next__(self):
+ """python version of next().
+ """
+ self.cnext()
+ if self.retval < 0: raise StopIteration
+ return makeAlignedRead( self.b )
+
+ def __dealloc__(self):
+ bam_destroy1(self.b)
+ bam_iter_destroy( self.iter )
+ if self.owns_samfile: samclose( self.fp )
+
+cdef class IteratorRowAll(IteratorRow):
+ """*(Samfile samfile, int reopen = True)*
+
+ iterate over all reads in *samfile*
+
+ By default, the file is re-opened to avoid conflicts between
+ multiple iterators working on the same file. Set *reopen* = False
+ to not re-open *samfile*.
"""
-
- cdef bam_fetch_iterator_t* bam_iter # iterator state object
- cdef bam1_t * b
- cdef error_msg
- cdef int error_state
- cdef Samfile samfile
- def __cinit__(self, Samfile samfile, int tid, int beg, int end ):
- self.bam_iter = NULL
- assert samfile._isOpen()
- assert samfile._hasIndex()
-
- # makes sure that samfile stays alive as long as the
- # iterator is alive.
- self.samfile = samfile
+ def __cinit__(self, Samfile samfile, int reopen = True ):
- # parse the region
- self.error_state = 0
- self.error_msg = None
+ if not samfile._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ if samfile.isbam: mode = b"rb"
+ else: mode = b"r"
+
+ # reopen the file to avoid iterator conflict
+ if reopen:
+ store = StderrStore()
+ self.fp = samopen( samfile._filename, mode, NULL )
+ store.release()
+ assert self.fp != NULL
+ self.owns_samfile = True
+ else:
+ self.fp = samfile.samfile
+ self.owns_samfile = False
- cdef bamFile fp
- fp = samfile.samfile.x.bam
- self.bam_iter = bam_init_fetch_iterator(fp, samfile.index, tid, beg, end)
+ # allocate memory for alignment
+ self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
def __iter__(self):
- return self
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
'''cversion of iterator. Used by IteratorColumn'''
- self.b = bam_fetch_iterate(self.bam_iter)
- if self.b == NULL: return 0
- return 1
+ return samread(self.fp, self.b)
- def __next__(self):
+ def __next__(self):
"""python version of next().
pyrex uses this non-standard name instead of next()
"""
- if self.error_state:
- raise ValueError( self.error_msg)
-
- self.b = bam_fetch_iterate(self.bam_iter)
- if self.b != NULL:
+ cdef int ret
+ ret = samread(self.fp, self.b)
+ if (ret > 0):
return makeAlignedRead( self.b )
else:
raise StopIteration
def __dealloc__(self):
- '''remember: dealloc cannot call other methods!'''
- if self.bam_iter:
- bam_cleanup_fetch_iterator(self.bam_iter)
-
-cdef class IteratorRowAll:
- """iterates over all mapped reads
- """
+ bam_destroy1(self.b)
+ if self.owns_samfile: samclose( self.fp )
- cdef bam1_t * b
- cdef samfile_t * fp
+cdef class IteratorRowAllRefs(IteratorRow):
+ """iterates over all mapped reads by chaining iterators over each reference
+ """
def __cinit__(self, Samfile samfile):
-
assert samfile._isOpen()
+ if not samfile._hasIndex(): raise ValueError("no index available for fetch")
+ self.samfile = samfile
+ self.tid = -1
+
+ def nextiter(self):
+ self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, 1<<29)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ """python version of next().
+
+ pyrex uses this non-standard name instead of next()
+ """
+ # Create an initial iterator
+ if self.tid==-1:
+ if not self.samfile.nreferences:
+ raise StopIteration
+ self.tid = 0
+ self.nextiter()
+
+ while 1:
+ self.rowiter.cnext()
+
+ # If current iterator is not exhausted, return aligned read
+ if self.rowiter.retval>0:
+ return makeAlignedRead(self.rowiter.b)
+
+ self.tid += 1
+
+ # Otherwise, proceed to next reference or stop
+ if self.tid<self.samfile.nreferences:
+ self.nextiter()
+ else:
+ raise StopIteration
+
+cdef class IteratorRowSelection(IteratorRow):
+ """*(Samfile samfile)*
+
+ iterate over reads in *samfile* at a given list of file positions.
+ """
+
+ def __cinit__(self, Samfile samfile, positions, int reopen = True ):
+
+ if not samfile._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ if not samfile._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ assert samfile.isbam, "can only use this iterator on bam files"
+ mode = b"rb"
- self.fp = samfile.samfile
+ # reopen the file to avoid iterator conflict
+ if reopen:
+ store = StderrStore()
+ self.fp = samopen( samfile._filename, mode, NULL )
+ store.release()
+ assert self.fp != NULL
+ self.owns_samfile = True
+ else:
+ self.fp = samfile.samfile
+ self.owns_samfile = False
# allocate memory for alignment
self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
+ self.positions = positions
+ self.current_pos = 0
+
def __iter__(self):
- return self
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- cdef int ret
+ '''cversion of iterator'''
+
+ # end iteration if out of positions
+ if self.current_pos >= len(self.positions): return -1
+
+ bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
+ self.current_pos += 1
return samread(self.fp, self.b)
- def __next__(self):
+ def __next__(self):
"""python version of next().
pyrex uses this non-standard name instead of next()
"""
- cdef int ret
- ret = samread(self.fp, self.b)
+
+ cdef int ret = self.cnext()
if (ret > 0):
return makeAlignedRead( self.b )
else:
raise StopIteration
def __dealloc__(self):
- '''remember: dealloc cannot call other methods!'''
- pysam_bam_destroy1(self.b)
-
+ bam_destroy1(self.b)
+ if self.owns_samfile: samclose( self.fp )
+
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef int __advance_all( void * data, bam1_t * b ):
+ '''advance without any read filtering.
+ '''
+ cdef __iterdata * d
+ d = <__iterdata*>data
+ return bam_iter_read( d.samfile.x.bam, d.iter, b )
+
+cdef int __advance_snpcalls( void * data, bam1_t * b ):
+ '''advance using same filter and read processing as in
+ the samtools pileup.
+ '''
+ cdef __iterdata * d
+ d = <__iterdata*>data
+
+ cdef int ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
+ cdef int skip = 0
+ cdef int q
+ cdef int is_cns = 1
+ cdef int is_nobaq = 0
+ cdef int capQ_thres = 0
+
+ # reload sequence
+ if d.fastafile != NULL and b.core.tid != d.tid:
+ if d.seq != NULL: free(d.seq)
+ d.tid = b.core.tid
+ d.seq = faidx_fetch_seq(d.fastafile,
+ d.samfile.header.target_name[d.tid],
+ 0, max_pos,
+ &d.seq_len)
+ if d.seq == NULL:
+ raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
+ (d.samfile.header.target_name[d.tid],
+ d.tid))
+
+
+ while ret >= 0:
+
+ skip = 0
+
+ # realign read - changes base qualities
+ if d.seq != NULL and is_cns and not is_nobaq: bam_prob_realn( b, d.seq )
+
+ if d.seq != NULL and capQ_thres > 10:
+ q = bam_cap_mapQ(b, d.seq, capQ_thres)
+ if q < 0: skip = 1
+ elif b.core.qual > q: b.core.qual = q
+ if b.core.flag & BAM_FUNMAP: skip = 1
+ elif b.core.flag & 1 and not b.core.flag & 2: skip = 1
+
+ if not skip: break
+ # additional filters
+
+ ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
+
+ return ret
+
cdef class IteratorColumn:
- '''iterates over columns.
+ '''abstract base class for iterators over columns.
+
+ IteratorColumn objects wrap the pileup functionality of samtools.
+
+ For reasons of efficiency, the iterator points to the current
+ pileup buffer. The pileup buffer is updated at every iteration.
+ This might cause some unexpected behaviour. For example,
+ consider the conversion to a list::
- This iterator wraps the pileup functionality of samtools.
-
- For reasons of efficiency, the iterator returns the current
- pileup buffer. As this buffer is updated at every iteration,
- the contents of this iterator will change accordingly. Hence the conversion to
- a list will not produce the expected result::
-
f = Samfile("file.bam", "rb")
result = list( f.pileup() )
- Here, result will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
- but each object will contain the same information.
-
- If the results of several columns are required at the same time, the results
- need to be stored explicitely::
+ Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
+ but each object in ``result`` will contain the same information.
+
+ The desired behaviour can be achieved by list comprehension::
result = [ x.pileups() for x in f.pileup() ]
- Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.
+ ``result`` will be a list of ``n`` lists of objects of type :class:`PileupRead`.
- '''
- cdef bam_plbuf_t *buf
+ If the iterator is associated with a :class:`Fastafile` using the :meth:`addReference`
+ method, then the iterator will export the current sequence via the methods :meth:`getSequence`
+ and :meth:`seq_len`.
+
+ Optional kwargs to the iterator
+
+ stepper
+ The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
+ all
+ use all reads for pileup.
+ samtools
+ same filter and read processing as in :term:`csamtools` pileup
- # check if first iteration
- cdef int notfirst
- # result of the last plbuf_push
- cdef int n_pu
- cdef int eof
- cdef IteratorRow iter
+ The default is to use "all" if no stepper is given.
- def __cinit__(self, Samfile samfile, int tid, int start, int end ):
+ fastafile
+ A :class:`FastaFile` object
+ mask
+ Skip all reads with bits set in mask.
+ max_depth
+ maximum read depth. The default is 8000.
+ '''
+
+ def __cinit__( self, Samfile samfile, **kwargs ):
+ self.samfile = samfile
+ self.mask = kwargs.get("mask", BAM_DEF_MASK )
+ self.fastafile = kwargs.get( "fastafile", None )
+ self.stepper = kwargs.get( "stepper", None )
+ self.max_depth = kwargs.get( "max_depth", 8000 )
+ self.iterdata.seq = NULL
+ self.tid = 0
+ self.pos = 0
+ self.n_plp = 0
+ self.plp = NULL
+ self.pileup_iter = <bam_plp_t>NULL
- self.iter = IteratorRow( samfile, tid, start, end )
- self.buf = bam_plbuf_init(NULL, NULL )
- self.n_pu = 0
- self.eof = 0
def __iter__(self):
- return self
+ return self
cdef int cnext(self):
'''perform next iteration.
-
- return 1 if there is a buffer to emit. Return 0 for end of iteration.
+
+ This method is analogous to the samtools bam_plp_auto method.
+ It has been re-implemented to permit for filtering.
+ '''
+ self.plp = bam_plp_auto( self.pileup_iter,
+ &self.tid,
+ &self.pos,
+ &self.n_plp )
+
+ cdef char * getSequence( self ):
+ '''return current reference sequence underlying the iterator.
+ '''
+ return self.iterdata.seq
+
+ property seq_len:
+ '''current sequence length.'''
+ def __get__(self): return self.iterdata.seq_len
+
+ def addReference( self, Fastafile fastafile ):
+ '''
+ add reference sequences in *fastafile* to iterator.'''
+ self.fastafile = fastafile
+ if self.iterdata.seq != NULL: free(self.iterdata.seq)
+ self.iterdata.tid = -1
+ self.iterdata.fastafile = self.fastafile.fastafile
+
+ def hasReference( self ):
+ '''
+ return true if iterator is associated with a reference'''
+ return self.fastafile
+
+ cdef setMask( self, mask ):
+ '''set masking flag in iterator.
+
+ reads with bits set in *mask* will be skipped.
'''
+ self.mask = mask
+ bam_plp_set_mask( self.pileup_iter, self.mask )
+
+ cdef setupIteratorData( self,
+ int tid,
+ int start,
+ int end,
+ int reopen = 0 ):
+ '''setup the iterator structure'''
+
+ self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen )
+ self.iterdata.samfile = self.samfile.samfile
+ self.iterdata.iter = self.iter.iter
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
+
+ if self.fastafile != None:
+ self.iterdata.fastafile = self.fastafile.fastafile
+ else:
+ self.iterdata.fastafile = NULL
- cdef int retval1, retval2
+ if self.stepper == None or self.stepper == "all":
+ self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
+ elif self.stepper == "samtools":
+ self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
+ else:
+ raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)
- # pysam bam_plbuf_push returns:
- # 1: if buf is full and can be emitted
- # 0: if b has been added
- # -1: if there was an error
+ if self.max_depth:
+ bam_plp_set_maxcnt( self.pileup_iter, self.max_depth )
- # check if previous plbuf was incomplete. If so, continue within
- # the loop and yield if necessary
- if self.n_pu > 0:
- self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1)
- if self.n_pu > 0: return 1
+ bam_plp_set_mask( self.pileup_iter, self.mask )
- if self.eof: return 0
+ cdef reset( self, tid, start, end ):
+ '''reset iterator position.
- # get next alignments and submit until plbuf indicates that
- # an new column has finished
- while self.n_pu == 0:
- retval1 = self.iter.cnext()
- # wrap up if no more input
- if retval1 == 0:
- self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0)
- self.eof = 1
- return self.n_pu
+ This permits using the iterator multiple times without
+ having to incur the full set-up costs.
+ '''
+ self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
+ self.iterdata.iter = self.iter.iter
- # submit to plbuf
- self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0)
- if self.n_pu < 0: raise ValueError( "error while iterating" )
+ # invalidate sequence if different tid
+ if self.tid != tid:
+ if self.iterdata.seq != NULL: free( self.iterdata.seq )
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
- # plbuf has yielded
- return 1
+ # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
+ bam_plp_reset(self.pileup_iter)
- def __next__(self):
+ def __dealloc__(self):
+ # reset in order to avoid memory leak messages for iterators that have
+ # not been fully consumed
+ if self.pileup_iter != <bam_plp_t>NULL:
+ bam_plp_reset(self.pileup_iter)
+ bam_plp_destroy(self.pileup_iter)
+ self.pileup_iter = <bam_plp_t>NULL
+
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.seq = NULL
+
+cdef class IteratorColumnRegion(IteratorColumn):
+ '''iterates over a region only.
+ '''
+ def __cinit__(self, Samfile samfile,
+ int tid = 0,
+ int start = 0,
+ int end = max_pos,
+ int truncate = False,
+ **kwargs ):
+
+ # initialize iterator
+ self.setupIteratorData( tid, start, end, 1 )
+ self.start = start
+ self.end = end
+ self.truncate = truncate
+
+ def __next__(self):
"""python version of next().
+ """
- pyrex uses this non-standard name instead of next()
+ while 1:
+ self.cnext()
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.plp == NULL:
+ raise StopIteration
+
+ if self.truncate:
+ if self.start < self.pos: continue
+ if self.pos >= self.end: raise StopIteration
+
+ return makePileupProxy( &self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp )
+
+cdef class IteratorColumnAllRefs(IteratorColumn):
+ """iterates over all columns by chaining iterators over each reference
+ """
+
+ def __cinit__(self,
+ Samfile samfile,
+ **kwargs ):
+
+ # no iteration over empty files
+ if not samfile.nreferences: raise StopIteration
+
+ # initialize iterator
+ self.setupIteratorData( self.tid, 0, max_pos, 1 )
+
+ def __next__(self):
+ """python version of next().
"""
- cdef int ret
- ret = self.cnext()
- cdef bam_pileup1_t * pl
- if ret > 0 :
- return makePileupProxy( self.buf, self.n_pu )
- else:
- raise StopIteration
+ while 1:
+ self.cnext()
- def __dealloc__(self):
- bam_plbuf_destroy(self.buf);
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ # return result, if within same reference
+ if self.plp != NULL:
+ return makePileupProxy( &self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp )
+
+ # otherwise, proceed to next reference or stop
+ self.tid += 1
+ if self.tid < self.samfile.nreferences:
+ self.setupIteratorData( self.tid, 0, max_pos, 0 )
+ else:
+ raise StopIteration
+
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef inline int32_t query_start(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p, op
+ cdef uint32_t k
+ cdef uint32_t start_offset = 0
+
+ if src.core.n_cigar:
+ cigar_p = bam1_cigar(src);
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op==BAM_CHARD_CLIP:
+ if start_offset!=0 and start_offset!=src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op==BAM_CSOFT_CLIP:
+ start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ return start_offset
+
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef inline int32_t query_end(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p, op
+ cdef uint32_t k
+ cdef uint32_t end_offset = src.core.l_qseq
+
+ if src.core.n_cigar>1:
+ cigar_p = bam1_cigar(src);
+ for k from src.core.n_cigar > k >= 1:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op==BAM_CHARD_CLIP:
+ if end_offset!=0 and end_offset!=src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op==BAM_CSOFT_CLIP:
+ end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ if end_offset==0:
+ end_offset = src.core.l_qseq
+
+ return end_offset
+
+
+cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * s
+
+ if not src.core.l_qseq:
+ return None
+
+ seq = PyBytes_FromStringAndSize(NULL, end - start)
+ s = <char*>seq
+ p = bam1_seq(src)
+
+ for k from start <= k < end:
+ # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
+ # note: do not use string literal as it will be a python string
+ s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
+
+ return seq
+
+
+cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * q
+
+ p = bam1_qual(src)
+ if p[0] == 0xff:
+ return None
+
+ qual = PyBytes_FromStringAndSize(NULL, end - start)
+ q = <char*>qual
+
+ for k from start <= k < end:
+ ## equivalent to t[i] + 33 (see bam.c)
+ q[k-start] = p[k] + 33
+
+ return qual
cdef class AlignedRead:
'''
- Class representing an aligned read. see SAM format specification for meaning of fields (http://samtools.sourceforge.net/).
+ Class representing an aligned read. see SAM format specification for
+ the meaning of fields (http://samtools.sourceforge.net/).
This class stores a handle to the samtools C-structure representing
an aligned read. Member read access is forwarded to the C-structure
One issue to look out for is that the sequence should always
be set *before* the quality scores. Setting the sequence will
also erase any quality scores that were set previously.
+
+ In Python 3, the fields containing sequence and quality
+ (seq, query, qual and qqual) data are of type bytes. Other
+ string data, such as the qname field and strings in the
+ tags tuple, is represented as unicode strings. On assignment,
+ both bytes and unicode objects are allowed, but unicode strings
+ must contain only ASCII characters.
'''
- cdef:
- bam1_t * _delegate
- def __cinit__( self ):
+ # Now only called when instances are created from Python
+ def __init__(self):
# see bam_init1
self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
- # allocate some memory
+ # allocate some memory
# If size is 0, calloc does not return a pointer that can be passed to free()
# so allocate 40 bytes for a new read
self._delegate.m_data = 40
self._delegate.data_len = 0
def __dealloc__(self):
- '''clear up memory.'''
- pysam_bam_destroy1(self._delegate)
-
+ bam_destroy1(self._delegate)
+
def __str__(self):
- """todo"""
+ """return string representation of alignment.
+
+ The representation is an approximate :term:`sam` format.
+
+ An aligned read might not be associated with a :term:`Samfile`.
+ As a result :term:`tid` is shown instead of the reference name.
+
+ Similarly, the tags field is returned in its parsed state.
+ """
+ # sam-parsing is done in sam.c/bam_format1_core which
+ # requires a valid header.
+ if sys.version_info[0] < 3:
+ seq = self.seq
+ qual = self.qual
+ else:
+ seq = self.seq.decode('ascii')
+ qual = self.qual.decode('ascii')
return "\t".join(map(str, (self.qname,
+ self.flag,
self.rname,
self.pos,
- self.cigar,
- self.qual,
- self.flag,
- self.seq,
self.mapq,
- self.tags)))
-
-
- def __cmp__(self, AlignedRead other):
- '''return true, if contents in this are binary equal to ``other``.'''
+ self.cigar,
+ self.mrnm,
+ self.mpos,
+ self.rlen,
+ seq,
+ qual,
+ self.tags )))
+
+ def compare(self, AlignedRead other):
+ '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
+
cdef int retval, x
cdef bam1_t *t, *o
+
t = self._delegate
o = other._delegate
# oo = <unsigned char*>(o.data)
# for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
- retval = memcmp( &t.core,
- &o.core,
- sizeof( bam1_core_t ))
+ # Fast-path test for object identity
+ if t==o:
+ return 0
+
+ retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
if retval: return retval
- retval = cmp( t.data_len, o.data_len)
+ retval = (t.data_len > o.data_len) - (t.data_len < o.data_len) # cmp(t.data_len, o.data_len)
if retval: return retval
- return memcmp( t.data,
- o.data,
- sizeof( t.data_len ))
+ return memcmp(t.data, o.data, t.data_len)
+
+ # Disabled so long as __cmp__ is a special method
+ def __hash__(self):
+ return _Py_HashPointer(<void *>self)
property qname:
"""the query name (None if not present)"""
def __get__(self):
- cdef bam1_t * src
+ cdef bam1_t * src
src = self._delegate
if src.core.l_qname == 0: return None
- return <char *>pysam_bam1_qname( src )
+ return _charptr_to_str(<char *>bam1_qname( src ))
def __set__(self, qname ):
if qname == None or len(qname) == 0: return
- cdef bam1_t * src
- cdef int l
+ qname = _force_bytes(qname)
+ cdef bam1_t * src
+ cdef int l
cdef char * p
- src = self._delegate
- p = pysam_bam1_qname( src )
+ src = self._delegate
+ p = bam1_qname( src )
# the qname is \0 terminated
l = len(qname) + 1
- pysam_bam_update( src,
- src.core.l_qname,
- l,
+ pysam_bam_update( src,
+ src.core.l_qname,
+ l,
<uint8_t*>p )
src.core.l_qname = l
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_qname(src)
+ p = bam1_qname(src)
strncpy( p, qname, l )
-
+
property cigar:
- """the :term:`cigar` alignment (None if not present).
+ """the :term:`cigar` alignment (None if not present). The alignment
+ is returned as a list of operations. The operations are:
+
+ +-----+--------------+-----+
+ |M |BAM_CMATCH |0 |
+ +-----+--------------+-----+
+ |I |BAM_CINS |1 |
+ +-----+--------------+-----+
+ |D |BAM_CDEL |2 |
+ +-----+--------------+-----+
+ |N |BAM_CREF_SKIP |3 |
+ +-----+--------------+-----+
+ |S |BAM_CSOFT_CLIP|4 |
+ +-----+--------------+-----+
+ |H |BAM_CHARD_CLIP|5 |
+ +-----+--------------+-----+
+ |P |BAM_CPAD |6 |
+ +-----+--------------+-----+
+ |= |BAM_CEQUAL |7 |
+ +-----+--------------+-----+
+ |X |BAM_CDIFF |8 |
+ +-----+--------------+-----+
+
"""
def __get__(self):
cdef uint32_t * cigar_p
- cdef bam1_t * src
+ cdef bam1_t * src
cdef op, l, cigar
+ cdef int k
+
src = self._delegate
if src.core.n_cigar == 0: return None
-
+
cigar = []
- cigar_p = pysam_bam1_cigar(src);
+ cigar_p = bam1_cigar(src);
for k from 0 <= k < src.core.n_cigar:
op = cigar_p[k] & BAM_CIGAR_MASK
l = cigar_p[k] >> BAM_CIGAR_SHIFT
def __set__(self, values ):
if values == None or len(values) == 0: return
cdef uint32_t * p
- cdef bam1_t * src
+ cdef bam1_t * src
cdef op, l
cdef int k
src = self._delegate
# get location of cigar string
- p = pysam_bam1_cigar(src)
+ p = bam1_cigar(src)
# create space for cigar data within src.data
- pysam_bam_update( src,
+ pysam_bam_update( src,
src.core.n_cigar * 4,
- len(values) * 4,
- p )
-
- # length is number of cigar operations, not bytes
+ len(values) * 4,
+ <uint8_t*>p )
+
+ # length is number of cigar operations, not bytes
src.core.n_cigar = len(values)
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_cigar(src)
+ p = bam1_cigar(src)
# insert cigar operations
for op, l in values:
## setting the cigar string also updates the "bin" attribute
src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
+ property cigarstring:
+ '''the :term:`cigar` alignment as a string.
+
+ Returns the empty string if not present.
+ '''
+ def __get__(self):
+ c = self.cigar
+ if c == None: return ""
+ else: return "".join([ "%c%i" % (CODE2CIGAR[x],y) for x,y in c])
+
+ def __set__(self, cigar):
+ if cigar == None or len(cigar) == 0: self.cigar = []
+ parts = CIGAR_REGEX.findall( cigar )
+ self.cigar = [ (CIGAR2CODE[ord(x)], int(y)) for x,y in parts ]
+
property seq:
- """the query sequence (None if not present)"""
+ """read sequence bases, including :term:`soft clipped` bases (None if not present).
+
+ In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
def __get__(self):
cdef bam1_t * src
- cdef uint8_t * p
cdef char * s
src = self._delegate
- bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
- ## parse qseq (bam1_seq)
+
if src.core.l_qseq == 0: return None
- s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char))
- p = pysam_bam1_seq( src )
- for k from 0 <= k < src.core.l_qseq:
- ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
- s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)]
- retval=s
- free(s)
- return retval
+ return get_seq_range(src, 0, src.core.l_qseq)
def __set__(self,seq):
# samtools manages sequence and quality length memory together
# if no quality information is present, the first byte says 0xff.
-
+
if seq == None or len(seq) == 0: return
+ seq = _force_bytes(seq)
cdef bam1_t * src
- cdef uint8_t * p
+ cdef uint8_t * p
cdef char * s
- src = self._delegate
cdef int l, k, nbytes_new, nbytes_old
+ src = self._delegate
+
l = len(seq)
-
+
# as the sequence is stored in half-bytes, the total length (sequence
# plus quality scores) is (l+1)/2 + l
nbytes_new = (l+1)/2 + l
nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
# acquire pointer to location in memory
- p = pysam_bam1_seq( src )
+ p = bam1_seq( src )
src.core.l_qseq = l
- pysam_bam_update( src,
+ pysam_bam_update( src,
nbytes_old,
nbytes_new,
p)
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_seq( src )
+ p = bam1_seq( src )
for k from 0 <= k < nbytes_new: p[k] = 0
# convert to C string
s = seq
p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
# erase qualities
- p = pysam_bam1_qual( src )
+ p = bam1_qual( src )
p[0] = 0xff
+
property qual:
- """the base quality (None if not present)"""
+ """read sequence base qualities, including :term:`soft clipped` bases (None if not present).
+
+ In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
def __get__(self):
- cdef bam1_t * src
- cdef uint8_t * p
+
+ cdef bam1_t * src
cdef char * q
+
src = self._delegate
- if src.core.l_qseq == 0: return None
- p = pysam_bam1_qual( src )
- if p[0] == 0xff: return None
+ if src.core.l_qseq == 0: return None
- q = < char *>calloc(src.core.l_qseq + 1 , sizeof(char))
- for k from 0 <= k < src.core.l_qseq:
- ## equivalent to t[i] + 33 (see bam.c)
- q[k] = p[k] + 33
- # convert to python string
- retval=q
- # clean up
- free(q)
- return retval
+ return get_qual_range(src, 0, src.core.l_qseq)
def __set__(self,qual):
# note that space is already allocated via the sequences
cdef bam1_t * src
cdef uint8_t * p
- cdef char * q
+ cdef char * q
+ cdef int k
+
src = self._delegate
- p = pysam_bam1_qual( src )
+ p = bam1_qual( src )
if qual == None or len(qual) == 0:
# if absent - set to 0xff
p[0] = 0xff
return
+ qual = _force_bytes(qual)
cdef int l
# convert to C string
q = qual
for k from 0 <= k < l:
p[k] = <uint8_t>q[k] - 33
+ property query:
+ """aligned portion of the read, excluding any flanking bases that were :term:`soft clipped` (None if not present).
+
+ In Python 3, this property is of type bytes. Assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient.
+
+ SAM/BAM files may include extra flanking base sequences that were
+ not part of the alignment. These bases may be the result of the
+ Smith-Waterman or other algorithms, which may not require alignments
+ that begin at the first residue or end at the last. In addition,
+ extra sequencing adapters, multiplex identifiers, and low-quality bases that
+ were not considered for alignment may have been retained."""
+
+ def __get__(self):
+ cdef bam1_t * src
+ cdef uint32_t start, end
+ cdef char * s
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0: return None
+
+ start = query_start(src)
+ end = query_end(src)
+
+ return get_seq_range(src, start, end)
+
+ property qqual:
+ """aligned query sequence quality values (None if not present). This property is read-only.
+
+ In Python 3, this property is of type bytes."""
+ def __get__(self):
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0: return None
+
+ start = query_start(src)
+ end = query_end(src)
+
+ return get_qual_range(src, start, end)
+
+ property qstart:
+ """start index of the aligned query portion of the sequence (0-based, inclusive)"""
+ def __get__(self):
+ return query_start(self._delegate)
+
+ property qend:
+ """end index of the aligned query portion of the sequence (0-based, exclusive)"""
+ def __get__(self):
+ return query_end(self._delegate)
+
+ property qlen:
+ """Length of the aligned query sequence"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ return query_end(src)-query_start(src)
+
property tags:
- """the tags in the AUX field."""
+ """the tags in the AUX field.
+
+ This property permits convenience access to
+ the tags. Changes it the returned list will
+ not update the tags automatically. Instead,
+ the following is required for adding a
+ new tag::
+
+ read.tags = read.tags + [("RG",0)]
+
+
+ This method will happily write the same tag
+ multiple times.
+ """
def __get__(self):
cdef char * ctag
cdef bam1_t * src
cdef uint8_t * s
- cdef char tpe
-
+ cdef char auxtag[3]
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
src = self._delegate
- if src.l_aux == 0: return None
-
- s = pysam_bam1_aux( src )
+ if src.l_aux == 0: return []
+ s = bam1_aux( src )
result = []
- ctag = <char*>calloc( 3, sizeof(char) )
- cdef int x
+ auxtag[2] = 0
while s < (src.data + src.data_len):
# get tag
- ctag[0] = s[0]
- ctag[1] = s[1]
- pytag = ctag
-
+ auxtag[0] = s[0]
+ auxtag[1] = s[1]
s += 2
-
- # convert type - is there a better way?
- ctag[0] = s[0]
- ctag[1] = 0
- pytype = ctag
- # get type and value
- # how do I do char literal comparison in cython?
- # the code below works (i.e, is C comparison)
- tpe = toupper(s[0])
- if tpe == 'S'[0]:
- value = <int>bam_aux2i(s)
+ auxtype = s[0]
+ if auxtype in ('c', 'C'):
+ value = <int>bam_aux2i(s)
+ s += 1
+ elif auxtype in ('s', 'S'):
+ value = <int>bam_aux2i(s)
s += 2
- elif tpe == 'I'[0]:
- value = <int>bam_aux2i(s)
+ elif auxtype in ('i', 'I'):
+ value = <int32_t>bam_aux2i(s)
s += 4
- elif tpe == 'F'[0]:
+ elif auxtype == 'f':
value = <float>bam_aux2f(s)
s += 4
- elif tpe == 'D'[0]:
+ elif auxtype == 'd':
value = <double>bam_aux2d(s)
s += 8
- elif tpe == 'C'[0]:
- value = <int>bam_aux2i(s)
- s += 1
- elif tpe == 'A'[0]:
- # there might a more efficient way
- # to convert a char into a string
+ elif auxtype == 'A':
value = "%c" % <char>bam_aux2A(s)
s += 1
- elif tpe == 'Z'[0]:
- value = <char*>bam_aux2Z(s)
+ elif auxtype in ('Z', 'H'):
+ value = _charptr_to_str(<char*>bam_aux2Z(s))
# +1 for NULL terminated string
s += len(value) + 1
+ elif auxtype == 'B':
+ s += 1
+ byte_size, nvalues, value = convertBinaryTagToList( s )
+ # 5 for 1 char and 1 int
+ s += 5 + ( nvalues * byte_size) - 1
- # skip over type
s += 1
- # ignore pytype
- result.append( (pytag, value) )
+ result.append( (_charptr_to_str(auxtag), value) )
- free( ctag )
return result
def __set__(self, tags):
- cdef char * ctag
cdef bam1_t * src
cdef uint8_t * s
cdef uint8_t * new_data
- cdef int guessed_size, control_size
+ cdef char * temp
+
src = self._delegate
- cdef int max_size, size
- max_size = 4000
-
- # map samtools code to python.struct code and byte size
- buffer = ctypes.create_string_buffer(max_size)
-
- offset = 0
- for pytag, value in tags:
- t = type(value)
- if t == types.FloatType:
- fmt = "<cccf"
- elif t == types.IntType:
- if value < 0:
- if value >= -127: fmt, pytype = "<cccb", 'c'
- elif value >= -32767: fmt, pytype = "<ccch", 's'
- elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, ctype = "<ccci", 'i'[0]
- else:
- if value <= 255: fmt, pytype = "<cccB", 'C'
- elif value <= 65535: fmt, pytype = "<cccH", 'S'
- elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, pytype = "<cccI", 'I'
- else:
- # Note: hex strings (H) are not supported yet
- if len(value) == 1:
- fmt, pytype = "<cccc", 'A'
- else:
- fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
- size = struct.calcsize(fmt)
- if offset + size > max_size:
- raise NotImplementedError("tags field too large")
+ fmts, args = ["<"], []
+
+ if tags != None:
+
+ # map samtools code to python.struct code and byte size
+ for pytag, value in tags:
+ if not type(pytag) is bytes:
+ pytag = pytag.encode('ascii')
+ t = type(value)
+
+ if t is tuple or t is list:
+ # binary tags - treat separately
+ pytype = 'B'
+ # get data type - first value determines type
+ if type(value[0]) is float:
+ datafmt, datatype = "f", "f"
+ else:
+ mi, ma = min(value), max(value)
+ absmax = max( abs(mi), abs(ma) )
+ # signed ints
+ if mi < 0:
+ if mi >= -127: datafmt, datatype = "b", 'c'
+ elif mi >= -32767: datafmt, datatype = "h", 's'
+ elif mi < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % mi )
+ else: datafmt, datatype = "i", 'i'
+
+ # unsigned ints
+ else:
+ if absmax <= 255: datafmt, datatype = "B", 'C'
+ elif absmax <= 65535: datafmt, datatype = "H", 'S'
+ elif absmax > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % absmax )
+ else: datafmt, datatype = "I", 'I'
+
+ datafmt = "2sccI%i%s" % (len(value), datafmt)
+ args.extend( [pytag[:2],
+ pytype.encode('ascii'),
+ datatype.encode('ascii'),
+ len(value)] + list(value) )
+ fmts.append( datafmt )
+ continue
+
+ if t is float:
+ fmt, pytype = "2scf", 'f'
+ elif t is int:
+ # negative values
+ if value < 0:
+ if value >= -127: fmt, pytype = "2scb", 'c'
+ elif value >= -32767: fmt, pytype = "2sch", 's'
+ elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+ else: fmt, pytype = "2sci", 'i'
+ # positive values
+ else:
+ if value <= 255: fmt, pytype = "2scB", 'C'
+ elif value <= 65535: fmt, pytype = "2scH", 'S'
+ elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+ else: fmt, pytype = "2scI", 'I'
+ else:
+ # Note: hex strings (H) are not supported yet
+ if t is not bytes:
+ value = value.encode('ascii')
+ if len(value) == 1:
+ fmt, pytype = "2scc", 'A'
+ else:
+ fmt, pytype = "2sc%is" % (len(value)+1), 'Z'
+
+ args.extend( [pytag[:2],
+ pytype.encode('ascii'),
+ value ] )
+
+ fmts.append( fmt )
+ fmt = "".join(fmts)
+ total_size = struct.calcsize(fmt)
+ buffer = ctypes.create_string_buffer(total_size)
struct.pack_into( fmt,
buffer,
- offset,
- pytag[0],
- pytag[1],
- pytype,
- value )
- offset += size
-
- # delete the old data and allocate new
- pysam_bam_update( src,
+ 0,
+ *args )
+
+ # delete the old data and allocate new space.
+ # If total_size == 0, the aux field will be
+ # empty
+ pysam_bam_update( src,
src.l_aux,
- offset,
- pysam_bam1_aux( src ) )
-
- src.l_aux = offset
+ total_size,
+ bam1_aux( src ) )
- if offset == 0: return
+ src.l_aux = total_size
- # get location of new data
- s = pysam_bam1_aux( src )
-
- # check if there is direct path from buffer.raw to tmp
- cdef char * temp
- temp = buffer.raw
- memcpy( s, temp, offset )
+ # copy data only if there is any
+ if total_size != 0:
+
+ # get location of new data
+ s = bam1_aux( src )
+
+ # check if there is direct path from buffer.raw to tmp
+ temp = buffer.raw
+ memcpy( s, temp, total_size )
- property flag:
+ property flag:
"""properties flag"""
def __get__(self): return self._delegate.core.flag
def __set__(self, flag): self._delegate.core.flag = flag
- property rname:
+
+ property rname:
"""
:term:`target` ID
+ DEPRECATED from pysam-0.4 - use tid in the future.
+ The rname field caused a lot of confusion as it returns
+ the :term:`target` ID instead of the reference sequence
+ name.
+
.. note::
- This field contains the index of the reference sequence
+ This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
"""
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid
- property pos:
+
+ property tid:
+ """
+ :term:`target` ID
+
+ .. note::
+
+ This field contains the index of the reference sequence
+ in the sequence dictionary. To obtain the name
+ of the reference sequence, use :meth:`pysam.Samfile.getrname()`
+
+ """
+ def __get__(self): return self._delegate.core.tid
+ def __set__(self, tid): self._delegate.core.tid = tid
+
+ property pos:
"""0-based leftmost coordinate"""
def __get__(self): return self._delegate.core.pos
- def __set__(self, pos):
+ def __set__(self, pos):
## setting the cigar string also updates the "bin" attribute
cdef bam1_t * src
src = self._delegate
if src.core.n_cigar:
- src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) )
+ src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
else:
src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
self._delegate.core.pos = pos
- property bin:
+ property bin:
"""properties bin"""
def __get__(self): return self._delegate.core.bin
def __set__(self, bin): self._delegate.core.bin = bin
property rlen:
'''length of the read (read only). Returns 0 if not given.'''
def __get__(self): return self._delegate.core.l_qseq
- property mapq:
+ property aend:
+ '''aligned reference position of the read on the reference genome.
+
+ aend points to one past the last aligned residue.
+ Returns None if not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
+ return None
+ return bam_calend(&src.core, bam1_cigar(src))
+
+ property alen:
+ '''aligned length of the read on the reference genome. Returns None if
+ not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
+ return None
+ return bam_calend(&src.core,
+ bam1_cigar(src)) - \
+ self._delegate.core.pos
+
+ property mapq:
"""mapping quality"""
def __get__(self): return self._delegate.core.qual
def __set__(self, qual): self._delegate.core.qual = qual
+
property mrnm:
- """the :term:`reference` id of the mate """
+ """the :term:`reference` id of the mate
+ deprecated, use RNEXT instead.
+ """
+ def __get__(self): return self._delegate.core.mtid
+ def __set__(self, mtid): self._delegate.core.mtid = mtid
+ property rnext:
+ """the :term:`reference` id of the mate """
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid
- property mpos:
+ property mpos:
+ """the position of the mate
+ deprecated, use PNEXT instead."""
+ def __get__(self): return self._delegate.core.mpos
+ def __set__(self, mpos): self._delegate.core.mpos = mpos
+ property pnext:
"""the position of the mate"""
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos
- property isize:
+ property isize:
+ """the insert size
+ deprecated: use tlen instead"""
+ def __get__(self): return self._delegate.core.isize
+ def __set__(self, isize): self._delegate.core.isize = isize
+ property tlen:
"""the insert size"""
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize
- property is_paired:
+ property is_paired:
"""true if read is paired in sequencing"""
def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FPAIRED
else: self._delegate.core.flag &= ~BAM_FPAIRED
property is_proper_pair:
"""true if read is mapped in a proper pair"""
def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
property is_unmapped:
"""true if read itself is unmapped"""
def __get__(self): return (self.flag & BAM_FUNMAP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FUNMAP
else: self._delegate.core.flag &= ~BAM_FUNMAP
- property mate_is_unmapped:
- """true if the mate is unmapped"""
+ property mate_is_unmapped:
+ """true if the mate is unmapped"""
def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FMUNMAP
else: self._delegate.core.flag &= ~BAM_FMUNMAP
property is_reverse:
"""true if read is mapped to reverse strand"""
- def __get__(self):return (self.flag & BAM_FREVERSE) != 0
- def __set__(self,val):
+ def __get__(self): return (self.flag & BAM_FREVERSE) != 0
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREVERSE
else: self._delegate.core.flag &= ~BAM_FREVERSE
property mate_is_reverse:
"""true is read is mapped to reverse strand"""
def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FMREVERSE
else: self._delegate.core.flag &= ~BAM_FMREVERSE
- property is_read1:
+ property is_read1:
"""true if this is read1"""
def __get__(self): return (self.flag & BAM_FREAD1) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREAD1
else: self._delegate.core.flag &= ~BAM_FREAD1
property is_read2:
"""true if this is read2"""
def __get__(self): return (self.flag & BAM_FREAD2) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREAD2
else: self._delegate.core.flag &= ~BAM_FREAD2
property is_secondary:
"""true if not primary alignment"""
def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FSECONDARY
else: self._delegate.core.flag &= ~BAM_FSECONDARY
property is_qcfail:
"""true if QC failure"""
def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FQCFAIL
else: self._delegate.core.flag &= ~BAM_FQCFAIL
property is_duplicate:
- """ true if optical or PCR duplicate"""
+ """true if optical or PCR duplicate"""
def __get__(self): return (self.flag & BAM_FDUP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FDUP
else: self._delegate.core.flag &= ~BAM_FDUP
-
+ property positions:
+ """a list of reference positions that this read aligns to."""
+ def __get__(self):
+ cdef uint32_t k, i, pos
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+ if src.core.n_cigar == 0: return []
+
+ result = []
+ pos = src.core.pos
+ cigar_p = bam1_cigar(src)
+
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH:
+ for i from pos <= i < pos + l:
+ result.append( i )
+
+ if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return result
+
+ property aligned_pairs:
+ """a list of aligned read and reference positions.
+
+ Unaligned positions are marked by None.
+ """
+ def __get__(self):
+ cdef uint32_t k, i, pos, qpos
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+ if src.core.n_cigar == 0: return []
+
+ result = []
+ pos = src.core.pos
+ qpos = 0
+ cigar_p = bam1_cigar(src)
+
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH:
+ for i from pos <= i < pos + l:
+ result.append( (qpos, i) )
+ qpos += 1
+ pos += l
+
+ elif op == BAM_CINS:
+ for i from pos <= i < pos + l:
+ result.append( (qpos, None) )
+ qpos += 1
+
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ for i from pos <= i < pos + l:
+ result.append( (None, i) )
+ pos += l
+
+ return result
+
+
+ def overlap( self, uint32_t start, uint32_t end ):
+ """return number of aligned bases of read overlapping the interval *start* and *end*
+ on the reference sequence.
+ """
+ cdef uint32_t k, i, pos, overlap
+ cdef int op, o
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ overlap = 0
+
+ src = self._delegate
+ if src.core.n_cigar == 0: return 0
+ pos = src.core.pos
+ o = 0
+
+ cigar_p = bam1_cigar(src)
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH:
+ o = min( pos + l, end) - max( pos, start )
+ if o > 0: overlap += o
+
+ if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return overlap
+
def opt(self, tag):
"""retrieves optional data given a two-letter *tag*"""
- #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
+ #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
cdef uint8_t * v
- v = bam_aux_get(self._delegate, tag)
+ cdef int nvalues
+ btag = _force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
if v == NULL: raise KeyError( "tag '%s' not present" % tag )
- type = chr(v[0])
- if type == 'c' or type == 'C' or type == 's' or type == 'S' or type == 'i':
- return <int>bam_aux2i(v)
- elif type == 'f':
+ auxtype = chr(v[0])
+ if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
+ return <int>bam_aux2i(v)
+ elif auxtype == 'i' or auxtype == 'I':
+ return <int32_t>bam_aux2i(v)
+ elif auxtype == 'f' or auxtype == 'F':
return <float>bam_aux2f(v)
- elif type == 'd':
+ elif auxtype == 'd' or auxtype == 'D':
return <double>bam_aux2d(v)
- elif type == 'A':
+ elif auxtype == 'A':
# there might a more efficient way
# to convert a char into a string
return '%c' % <char>bam_aux2A(v)
- elif type == 'Z':
- return <char*>bam_aux2Z(v)
-
+ elif auxtype == 'Z':
+ return _charptr_to_str(<char*>bam_aux2Z(v))
+ elif auxtype == 'B':
+ bytesize, nvalues, values = convertBinaryTagToList( v + 1 )
+ return values
+ else:
+ raise ValueError("unknown auxiliary type '%s'" % auxtype)
+
+
def fancy_str (self):
"""returns list of fieldnames/values in pretty format for debugging
"""
"m_data": "Maximum data length",
"data_len": "Current data length",
}
- fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
- "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
+ fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
+ "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
"l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
-
+
for f in fields_names_in_order:
if not f in self.__dict__:
continue
pos
the target base coordinate (0-based)
n
- number of reads mapping to this column
+ number of reads mapping to this column
pileups
list of reads (:class:`pysam.PileupRead`) aligned to this column
If the underlying engine iterator advances, the results of this column
will change.
'''
- cdef bam_plbuf_t * buf
- cdef int n_pu
-
- def __cinit__(self ):
- pass
+ def __init__(self):
+ raise TypeError("This class cannot be instantiated from Python")
def __str__(self):
return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
property tid:
'''the chromosome ID as is defined in the header'''
- def __get__(self): return pysam_get_tid( self.buf )
+ def __get__(self): return self.tid
property n:
'''number of reads mapping to this column.'''
def __set__(self, n): self.n_pu = n
property pos:
- def __get__(self): return pysam_get_pos( self.buf )
+ def __get__(self): return self.pos
property pileups:
'''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
def __get__(self):
- cdef bam_pileup1_t * pl
- pl = pysam_get_pileup( self.buf )
+ cdef int x
pileups = []
+
+ if self.plp[0] == NULL:
+ raise ValueError("PileupProxy accessed after iterator finished")
+
# warning: there could be problems if self.n and self.buf are
# out of sync.
for x from 0 <= x < self.n_pu:
- pileups.append( makePileupRead( &pl[x]) )
+ pileups.append( makePileupRead( &(self.plp[0][x])) )
return pileups
cdef class PileupRead:
'''A read aligned to a column.
'''
- cdef:
- AlignedRead _alignment
- int32_t _qpos
- int _indel
- int _level
- uint32_t _is_del
- uint32_t _is_head
- uint32_t _is_tail
-
- def __cinit__( self ):
- pass
+ def __init__(self):
+ raise TypeError("This class cannot be instantiated from Python")
def __str__(self):
return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
-
+
property alignment:
"""a :class:`pysam.AlignedRead` object of the aligned read"""
def __get__(self):
ofd = os.dup(self.id) # Save old stream on new unit.
self.streams.append(ofd)
sys.stdout.flush() # Buffered data goes to old stream.
+ sys.stderr.flush() # Buffered data goes to old stream.
os.dup2(fd, self.id) # Open unit 1 on new stream.
os.close(fd) # Close other unit (look out, caller.)
-
+
def restore(self):
'''restore previous output stream'''
if self.streams:
os.close(self.streams[-1])
del self.streams[-1]
-def _samtools_dispatch( method, args = () ):
+def _samtools_dispatch( method,
+ args = (),
+ catch_stdout = True ):
'''call ``method`` in samtools providing arguments in args.
.. note::
- This method redirects stdout and stderr to capture it
- from samtools. If for some reason stdout/stderr disappears
+ This method redirects stdout to capture it
+ from samtools. If for some reason stdout disappears
the reason might be in this method.
.. note::
The current implementation might only work on linux.
-
- .. note::
- This method captures stdout and stderr using temporary files,
+
+ .. note::
+ This method captures stdout and stderr using temporary files,
which are then read into memory in their entirety. This method
- is slow and might cause large memory overhead.
+ is slow and might cause large memory overhead.
See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
on the topic of redirecting stderr/stdout.
'''
# note that debugging this module can be a problem
- # as stdout/stderr will not appear
+ # as stdout/stderr will not appear on the terminal
+
+ # some special cases
+ if method == "index":
+ if not os.path.exists( args[0] ):
+ raise IOError( "No such file or directory: '%s'" % args[0] )
# redirect stderr and stdout to file
-
- # open files and redirect into it
stderr_h, stderr_f = tempfile.mkstemp()
- stdout_h, stdout_f = tempfile.mkstemp()
-
- # patch for `samtools view`
- # samtools `view` closes stdout, from which I can not
- # recover. Thus redirect output to file with -o option.
- if method == "view":
- if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
- args = ( "-o", stdout_f ) + args
-
- stdout_save = Outs( sys.stdout.fileno() )
- stdout_save.setfd( stdout_h )
- stderr_save = Outs( sys.stderr.fileno() )
- stderr_save.setfd( stderr_h )
+ pysam_set_stderr( stderr_h )
+
+ if catch_stdout:
+ stdout_h, stdout_f = tempfile.mkstemp()
+ try:
+ stdout_save = Outs( sys.stdout.fileno() )
+ stdout_save.setfd( stdout_h )
+ except AttributeError:
+ # stdout has already been redirected
+ catch_stdout = False
+
+ # patch for `samtools view`
+ # samtools `view` closes stdout, from which I can not
+ # recover. Thus redirect output to file with -o option.
+ if method == "view":
+ if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
+ args = ( "-o", stdout_f ) + args
# do the function call to samtools
cdef char ** cargs
cdef int i, n, retval
n = len(args)
+ method = _force_cmdline_bytes(method)
+ args = [ _force_cmdline_bytes(a) for a in args ]
+
# allocate two more for first (dummy) argument (contains command)
cargs = <char**>calloc( n+2, sizeof( char *) )
cargs[0] = "samtools"
cargs[1] = method
for i from 0 <= i < n: cargs[i+2] = args[i]
+
retval = pysam_dispatch(n+2, cargs)
free( cargs )
-
+
# restore stdout/stderr. This will also flush, so
# needs to be before reading back the file contents
- stdout_save.restore()
- stderr_save.restore()
+ if catch_stdout:
+ stdout_save.restore()
+ try:
+ with open( stdout_f, "r") as inf:
+ out_stdout = inf.readlines()
+ except UnicodeDecodeError:
+ with open( stdout_f, "rb") as inf:
+ # read binary output
+ out_stdout = inf.read()
+ os.remove( stdout_f )
+ else:
+ out_stdout = []
+
+ # get error messages
+ pysam_unset_stderr()
+ try:
+ with open( stderr_f, "r") as inf:
+ out_stderr = inf.readlines()
+ except UnicodeDecodeError:
+ with open( stderr_f, "rb") as inf:
+ # read binary output
+ out_stderr = inf.read()
+ os.remove( stderr_f )
+ else:
+ out_stderr = []
- # capture stderr/stdout.
- out_stderr = open( stderr_f, "r").readlines()
- out_stdout = open( stdout_f, "r").readlines()
+ return retval, out_stderr, out_stdout
- # clean up files
- os.remove( stderr_f )
- os.remove( stdout_f )
+cdef class SNPCall:
+ '''the results of a SNP call.'''
+ cdef int _tid
+ cdef int _pos
+ cdef char _reference_base
+ cdef char _genotype
+ cdef int _consensus_quality
+ cdef int _snp_quality
+ cdef int _rms_mapping_quality
+ cdef int _coverage
- return retval, out_stderr, out_stdout
+ property tid:
+ '''the chromosome ID as is defined in the header'''
+ def __get__(self):
+ return self._tid
+
+ property pos:
+ '''nucleotide position of SNP.'''
+ def __get__(self): return self._pos
+
+ property reference_base:
+ '''reference base at pos. ``N`` if no reference sequence supplied.'''
+ def __get__(self): return from_string_and_size( &self._reference_base, 1 )
+
+ property genotype:
+ '''the genotype called.'''
+ def __get__(self): return from_string_and_size( &self._genotype, 1 )
+
+ property consensus_quality:
+ '''the genotype quality (Phred-scaled).'''
+ def __get__(self): return self._consensus_quality
+
+ property snp_quality:
+ '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+ def __get__(self): return self._snp_quality
+
+ property mapping_quality:
+ '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+ def __get__(self): return self._rms_mapping_quality
+
+ property coverage:
+ '''coverage or read depth - the number of reads involved in the call.'''
+ def __get__(self): return self._coverage
+
+ def __str__(self):
+
+ return "\t".join( map(str, (
+ self.tid,
+ self.pos,
+ self.reference_base,
+ self.genotype,
+ self.consensus_quality,
+ self.snp_quality,
+ self.mapping_quality,
+ self.coverage ) ) )
+
+
+# cdef class SNPCallerBase:
+# '''Base class for SNP callers.
+
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
+
+# cdef bam_maqcns_t * c
+# cdef IteratorColumn iter
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# self.iter = iterator_column
+# self.c = bam_maqcns_init()
+
+# # set the default parameterization according to
+# # samtools
+
+# # new default mode for samtools >0.1.10
+# self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
+
+# self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
+# # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
+# self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
+# self.c.het_rate = kwargs.get( "het_rate", 0.001 )
+# self.c.theta = kwargs.get( "theta", 0.83 )
+
+# if self.c.errmod != BAM_ERRMOD_MAQ2:
+# self.c.theta += 0.02
+
+# # call prepare AFTER setting parameters
+# bam_maqcns_prepare( self.c )
+
+# def __dealloc__(self):
+# bam_maqcns_destroy( self.c )
+
+ # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
+ # '''debugging output.'''
+
+ # pysam_dump_glf( g, self.c );
+ # print ""
+ # for x in range(self.iter.n_plp):
+ # print "--> read %i %s %i" % (x,
+ # bam1_qname(self.iter.plp[x].b),
+ # self.iter.plp[x].qpos,
+ # )
+
+ # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
+ # % (self.iter.pos,
+ # cns,
+ # self.c.q_r,
+ # self.iter.n_plp,
+ # self.iter.n_plp,
+ # rb,
+ # cns >> 8 & 0xff,
+ # cns >> 16 & 0xff,
+ # cns & 0xff,
+ # cns >> 28,
+ # )
+
+ # printf("-------------------------------------\n");
+ # sys.stdout.flush()
+
+# cdef class IteratorSNPCalls( SNPCallerBase ):
+# """*(IteratorColumn iterator)*
+
+# call SNPs within a region.
+
+# *iterator* is a pileup iterator. SNPs will be called
+# on all positions returned by this iterator.
+
+# This caller is fast if SNPs are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
+
+# """
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# assert self.iter.hasReference(), "IteratorSNPCalls requires a pileup iterator with reference sequence"
+
+# def __iter__(self):
+# return self
+
+# def __next__(self):
+# """python version of next().
+# """
+
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
+
+# if self.iter.plp == NULL:
+# raise StopIteration
+
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
+
+# assert seq != NULL
+
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# cdef glf1_t * g
+
+# g = bam_maqcns_glfgen( self.iter.n_plp,
+# self.iter.plp,
+# bam_nt16_table[rb],
+# self.c )
+
+# if pysam_glf_depth( g ) == 0:
+# cns = 0xfu << 28 | 0xf << 24
+# else:
+# cns = glf2cns(g, <int>(self.c.q_r + .499))
+
+# free(g)
+
+# cdef SNPCall call
+
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
+
+# return call
+
+# cdef class SNPCaller( SNPCallerBase ):
+# '''*(IteratorColumn iterator_column )*
+
+# The samtools SNP caller.
+
+# This object will call SNPs in *samfile* against the reference
+# sequence in *fasta*.
+
+# This caller is fast for calling few SNPs in selected regions.
+
+# It is slow, if called over large genomic regions.
+# '''
+
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# pass
+
+# def call(self, reference, int pos ):
+# """call a snp on chromosome *reference*
+# and position *pos*.
+
+# returns a :class:`SNPCall` object.
+# """
+
+# cdef int tid = self.iter.samfile.gettid( reference )
+
+# self.iter.reset( tid, pos, pos + 1 )
+
+# while 1:
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
-__all__ = ["Samfile",
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
+
+# if self.iter.pos == pos: break
+
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
+
+# assert seq != NULL
+
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# # cdef glf1_t * g
+# #
+# # g = bam_maqcns_glfgen( self.iter.n_plp,
+# # self.iter.plp,
+# # bam_nt16_table[rb],
+# # self.c )
+# ##
+# #
+# # if pysam_glf_depth( g ) == 0:
+# # cns = 0xfu << 28 | 0xf << 24
+# # else:
+# # cns = glf2cns(g, <int>(self.c.q_r + .499))
+# #
+# # free(g)
+
+# cdef SNPCall call
+
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
+
+# return call
+
+# cdef class IndelCall:
+# '''the results of an indel call.'''
+# cdef int _tid
+# cdef int _pos
+# cdef int _coverage
+# cdef int _rms_mapping_quality
+# cdef bam_maqindel_ret_t * _r
+
+# def __cinit__(self):
+# #assert r != NULL
+# #self._r = r
+# pass
+
+# property tid:
+# '''the chromosome ID as is defined in the header'''
+# def __get__(self):
+# return self._tid
+
+# property pos:
+# '''nucleotide position of SNP.'''
+# def __get__(self): return self._pos
+
+# property genotype:
+# '''the genotype called.'''
+# def __get__(self):
+# if self._r.gt == 0:
+# s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+# return "%s/%s" % (s,s)
+# elif self._r.gt == 1:
+# s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+# return "%s/%s" % (s,s)
+# else:
+# return "%s/%s" % (self.first_allele, self.second_allele )
+
+# property consensus_quality:
+# '''the genotype quality (Phred-scaled).'''
+# def __get__(self): return self._r.q_cns
+
+# property snp_quality:
+# '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+# def __get__(self): return self._r.q_ref
+
+# property mapping_quality:
+# '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+# def __get__(self): return self._rms_mapping_quality
+
+# property coverage:
+# '''coverage or read depth - the number of reads involved in the call.'''
+# def __get__(self): return self._coverage
+
+# property first_allele:
+# '''sequence of first allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+
+# property second_allele:
+# '''sequence of second allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+
+# property reads_first:
+# '''reads supporting first allele.'''
+# def __get__(self): return self._r.cnt1
+
+# property reads_second:
+# '''reads supporting second allele.'''
+# def __get__(self): return self._r.cnt2
+
+# property reads_diff:
+# '''reads supporting neither allele.'''
+# def __get__(self): return self._r.cnt_anti
+
+# def __str__(self):
+
+# return "\t".join( map(str, (
+# self.tid,
+# self.pos,
+# self.genotype,
+# self.consensus_quality,
+# self.snp_quality,
+# self.mapping_quality,
+# self.coverage,
+# self.first_allele,
+# self.second_allele,
+# self.reads_first,
+# self.reads_second,
+# self.reads_diff ) ) )
+
+# def __dealloc__(self ):
+# bam_maqindel_ret_destroy(self._r)
+
+# cdef class IndelCallerBase:
+# '''Base class for SNP callers.
+
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
+
+# cdef bam_maqindel_opt_t * options
+# cdef IteratorColumn iter
+# cdef int cap_mapQ
+# cdef int max_depth
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+
+# self.iter = iterator_column
+
+# assert iterator_column.hasReference(), "IndelCallerBase requires a pileup iterator with reference sequence"
+
+# self.options = bam_maqindel_opt_init()
+
+# # set the default parameterization according to
+# # samtools
+
+# self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
+# self.options.q_indel = kwargs.get( "q_indel", 40 )
+# self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
+# self.max_depth = kwargs.get( "max_depth", 1024 )
+
+# def __dealloc__(self):
+# free( self.options )
+
+# def _call( self ):
+
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
+
+# assert seq != NULL
+
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+# cdef bam_maqindel_ret_t * r
+
+# cdef int m = min( self.max_depth, self.iter.n_plp )
+
+# # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
+# # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
+# # self.options.indel_err, self.options.ambi_thres );
+
+# r = bam_maqindel(m,
+# self.iter.pos,
+# self.options,
+# self.iter.plp,
+# seq,
+# 0,
+# NULL)
+
+# if r == NULL: return None
+
+# cdef IndelCall call
+# call = IndelCall()
+# call._r = r
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._coverage = self.iter.n_plp
+
+# cdef uint64_t rms_aux = 0
+# cdef int i = 0
+# cdef bam_pileup1_t * p
+# cdef int tmp
+
+# for i from 0 <= i < self.iter.n_plp:
+# p = self.iter.plp + i
+# if p.b.core.qual < self.cap_mapQ:
+# tmp = p.b.core.qual
+# else:
+# tmp = self.cap_mapQ
+# rms_aux += tmp * tmp
+
+# call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
+
+# return call
+
+# cdef class IndelCaller( IndelCallerBase ):
+# '''*(IteratorColumn iterator_column )*
+
+#     The samtools indel caller.
+
+#     This object will call indels in *samfile* against the reference
+#     sequence in *fasta*.
+
+#     This caller is fast for calling few indels in selected regions.
+
+# It is slow, if called over large genomic regions.
+# '''
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# pass
+
+# def call(self, reference, int pos ):
+# """call a snp on chromosome *reference*
+# and position *pos*.
+
+#         returns a :class:`IndelCall` object or None, if no indel call could be made.
+# """
+
+# cdef int tid = self.iter.samfile.gettid( reference )
+
+# self.iter.reset( tid, pos, pos + 1 )
+
+# while 1:
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
+
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
+
+# if self.iter.pos == pos: break
+
+# return self._call()
+
+# cdef class IteratorIndelCalls( IndelCallerBase ):
+# """*(IteratorColumn iterator)*
+
+# call indels within a region.
+
+#     *iterator* is a pileup iterator. Indels will be called
+#     on all positions returned by this iterator.
+
+#     This caller is fast if indels are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
+
+# """
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+# pass
+
+
+# def __iter__(self):
+# return self
+
+# def __next__(self):
+# """python version of next().
+# """
+
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
+
+# if self.iter.plp == NULL:
+# raise StopIteration
+
+# return self._call()
+
+
+
+cdef class IndexedReads:
+    """in-memory index of a BAM file by read name.
+
+    Maps each read name (qname) to the file offsets of its alignment
+    records, so that all alignments for a given read can be retrieved
+    directly via :meth:`find`.
+
+    By default, the file is re-opened to avoid conflicts if
+    multiple operators work on the same file. Set *reopen* = False
+    to not re-open *samfile*.
+    """
+
+    def __init__(self, Samfile samfile, int reopen = True ):
+        # keep a reference so find() can hand it to IteratorRowSelection
+        self.samfile = samfile
+
+        if samfile.isbam: mode = b"rb"
+        else: mode = b"r"
+
+        # reopen the file - note that this makes the iterator
+        # slow and causes pileup to slow down significantly.
+        if reopen:
+            # StderrStore silences samopen's stderr output during open
+            store = StderrStore()
+            self.fp = samopen( samfile._filename, mode, NULL )
+            store.release()
+            assert self.fp != NULL
+            # we own this handle and must samclose() it in __dealloc__
+            self.owns_samfile = True
+        else:
+            # borrow the caller's handle; do not close it on dealloc
+            self.fp = samfile.samfile
+            self.owns_samfile = False
+
+        # NOTE(review): only BAM supports bam_tell()-based offsets, so SAM
+        # input is rejected here - after the file has already been opened.
+        assert samfile.isbam, "can only IndexReads on bam files"
+
+    def build( self ):
+        '''build the read-name index.
+
+        Scans the file from its current position to the end and records,
+        for every alignment, the file offset at which it starts.
+        '''
+
+        # qname -> list of file offsets (a read may have multiple alignments)
+        self.index = collections.defaultdict( list )
+
+        # indexing starts from the current file position, not from the
+        # beginning of the file
+        cdef int ret = 1
+        cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
+
+        cdef uint64_t pos
+
+        while ret > 0:
+            # record the offset *before* reading, so it points at the record
+            pos = bam_tell( self.fp.x.bam )
+            ret = samread( self.fp, b)
+            if ret > 0:
+                qname = _charptr_to_str(bam1_qname( b ))
+                self.index[qname].append( pos )
+
+        # bam_destroy1 frees both the record's data and the struct itself
+        bam_destroy1( b )
+
+    def find( self, qname ):
+        '''return an iterator over all alignments with read name *qname*.
+
+        Raises KeyError if *qname* is not in the index.
+        '''
+        if qname in self.index:
+            return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
+        else:
+            raise KeyError( "read %s not found" % qname )
+
+    def __dealloc__(self):
+        # close the file handle only if we opened it ourselves
+        if self.owns_samfile: samclose( self.fp )
+
+__all__ = ["Samfile",
"Fastafile",
- "IteratorRow",
- "IteratorRowAll",
- "IteratorColumn",
- "AlignedRead",
- "PileupColumn",
- "PileupProxy",
- "PileupRead" ]
-
-
+ "IteratorRow",
+ "IteratorColumn",
+ "AlignedRead",
+ "PileupColumn",
+ "PileupProxy",
+ "PileupRead",
+ # "IteratorSNPCalls",
+ # "SNPCaller",
+ # "IndelCaller",
+ # "IteratorIndelCalls",
+ "IndexedReads" ]
+
+