cdef class Samfile:
- '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
+ '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None,
+ add_sq_text = False )*
A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.
f = pysam.Samfile('ex1.bam','rb')
- If mode is not specified, we will try to auto-detect in the order 'r', 'rb', thus both the following
+ If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following
should work::
f1 = pysam.Samfile('ex1.bam' )
- f2 = pysam.Samfile('ex1.bam' )
+ f2 = pysam.Samfile('ex1.sam' )
If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
3. If *text* is given, new header text is copied from raw text.
4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
+ By default, 'SQ' and 'LN' tags will be added to the header text. This option can be
+ changed by unsetting the flag *add_sq_text*.
'''
self.samfile = NULL
self._filename = NULL
self.isbam = False
+ self.isstream = False
self._open( *args, **kwargs )
# allocate memory for iterator
text = None,
header = None,
port = None,
+ add_sq_text = True,
):
'''open a sam/bam file.
# read mode autodetection
if mode is None:
try:
- self._open(filename, 'r', template=template,
+ self._open(filename, 'rb', template=template,
referencenames=referencenames,
referencelengths=referencelengths,
text=text, header=header, port=port)
return
except ValueError, msg:
pass
- self._open(filename, 'rb', template=template,
+
+ self._open(filename, 'r', template=template,
referencenames=referencenames,
referencelengths=referencelengths,
text=text, header=header, port=port)
return
- assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
+ assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode
assert filename != NULL
# close a previously opened file
if self._filename != NULL: free(self._filename )
self._filename = strdup( filename )
+ self.isstream = strcmp( filename, "-" ) == 0
self.isbam = len(mode) > 1 and mode[1] == 'b'
header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
strncpy( header_to_write.target_name[x], name, len(name) )
+ # Optionally, if there is no text, add a SAM compatible header to output
+ # file.
+ if text is None and add_sq_text:
+ text = ''
+ for x from 0 <= x < header_to_write.n_targets:
+ text += "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] )
+
if text != None:
# copy without \0
ctext = text
raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode )
#disabled for autodetection to work
- if self.samfile.header.n_targets == 0:
- raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)
+ # needs to be disabled so that reading from sam-files without headers works
+ #if self.samfile.header.n_targets == 0:
+ # raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)
if self.samfile == NULL:
raise IOError("could not open file `%s`" % filename )
self.index = bam_index_load(filename)
if self.index == NULL:
raise IOError("error while opening index `%s` " % filename )
-
+
+ if not self.isstream:
+ self.start_offset = bam_tell( self.samfile.x.bam )
+
def gettid( self, reference ):
'''
convert :term:`reference` name into numerical :term:`tid`
return 1, rtid, rstart, rend
+ def reset( self ):
+ '''reset file position to beginning of read section.'''
+ return self.seek( self.start_offset, 0 )
+
def seek( self, uint64_t offset, int where = 0):
'''
move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
raise ValueError( "I/O operation on closed file" )
if not self.isbam:
raise NotImplementedError("seek only available in bam files")
+ if self.isstream:
+ raise OSError("seek no available in streams")
+
return bam_seek( self.samfile.x.bam, offset, where )
def tell( self ):
:term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
be supplied.
- Without *reference* or *region* all reads will be fetched. The reads will be returned
+ Without *reference* or *region* all mapped reads will be fetched. The reads will be returned
ordered by reference sequence, which will not necessarily be the order within the file.
+
If *until_eof* is given, all reads from the current file position will be returned
- *in order as they are within the file*.
+ in order as they are within the file. Using this option will also fetch unmapped reads.
If only *reference* is set, all reads aligned to *reference* will be fetched.
has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+ if self.isstream: reopen = False
+ else: reopen = True
+
if self.isbam:
if not until_eof and not self._hasIndex() and not self.isremote:
raise ValueError( "fetch called on bamfile without index" )
fetch_callback )
else:
if has_coord:
- return IteratorRowRegion( self, rtid, rstart, rend )
+ return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen )
else:
if until_eof:
- return IteratorRowAll( self )
+ return IteratorRowAll( self, reopen=reopen )
else:
- return IteratorRowAllRefs(self)
+ # AH: check - reason why no reopen for AllRefs?
+ return IteratorRowAllRefs(self ) # , reopen=reopen )
else:
# check if header is present - otherwise sam_read1 aborts
# this happens if a bamfile is opened with mode 'r'
+ if has_coord:
+ raise ValueError ("fetching by region is not available for sam files" )
+
if self.samfile.header.n_targets == 0:
raise ValueError( "fetch called for samfile without header")
-
- if region != None:
- raise ValueError ("fetch for a region is not available for sam files" )
+
if callback:
raise NotImplementedError( "callback not implemented yet" )
else:
- return IteratorRowAll( self )
+ return IteratorRowAll( self, reopen=reopen )
def mate( self,
AlignedRead read ):
mask
Skip all reads with bits set in mask.
+ max_depth
+ Maximum read depth permitted. The default limit is *8000*.
.. note::
*all* reads which overlap the region are returned. The first base returned will be the
first base of the first read *not* necessarily the first base of the region used in the query.
- The maximum number of reads considered for pileup is *8000*. This limit is set by
- :term:`csamtools`.
-
'''
cdef int rtid, rstart, rend, has_coord
cdef bam_plbuf_t *buf
t.append( self.samfile.header.target_len[x] )
return tuple(t)
+ property mapped:
+ """total number of mapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" )
+
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_mapped( self.index, tid )
+ return total
+
+ property unmapped:
+ """total number of unmapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" )
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_unmapped( self.index, tid )
+ # get unmapped reads without coordinates
+ total += pysam_get_unmapped( self.index, -1 )
+ return total
+
property text:
'''full contents of the :term:`sam file` header as a string.'''
def __get__(self):
###############################################################
def __iter__(self):
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam and self.samfile.header.n_targets == 0:
+ raise NotImplementedError( "can not iterate over samfile without header")
return self
cdef bam1_t * getCurrent( self ):
to not re-open *samfile*.
"""
- cdef bam1_t * b
- cdef samfile_t * fp
- # true if samfile belongs to this object
- cdef int owns_samfile
+ # cdef bam1_t * b
+ # cdef samfile_t * fp
+ # # true if samfile belongs to this object
+ # cdef int owns_samfile
def __cinit__(self, Samfile samfile, int reopen = True ):
A :class:`FastaFile` object
mask
Skip all reads with bits set in mask.
-
-
+ max_depth
+ maximum read depth. The default is 8000.
'''
# result of the last plbuf_push
cdef Samfile samfile
cdef Fastafile fastafile
cdef stepper
+ cdef int max_depth
def __cinit__( self, Samfile samfile, **kwargs ):
self.samfile = samfile
self.mask = kwargs.get("mask", BAM_DEF_MASK )
self.fastafile = kwargs.get( "fastafile", None )
self.stepper = kwargs.get( "stepper", None )
+ self.max_depth = kwargs.get( "max_depth", 8000 )
self.iterdata.seq = NULL
self.tid = 0
self.pos = 0
self.plp = NULL
self.pileup_iter = <bam_plp_t>NULL
+
def __iter__(self):
return self
else:
raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)
+ if self.max_depth:
+ bam_plp_set_maxcnt( self.pileup_iter, self.max_depth )
+
bam_plp_set_mask( self.pileup_iter, self.mask )
cdef reset( self, tid, start, end ):
read.tags = read.tags + [("RG",0)]
+
+ This method will happily write the same tag
+ multiple times.
"""
def __get__(self):
cdef char * ctag
cdef char auxtype
src = self._delegate
- if src.l_aux == 0: return None
+ if src.l_aux == 0: return []
s = bam1_aux( src )
result = []
'''length of the read (read only). Returns 0 if not given.'''
def __get__(self): return self._delegate.core.l_qseq
property aend:
- '''aligned end position of the read (read only). Returns
+ '''aligned end position of the read on the reference genome. Returns
None if not available.'''
def __get__(self):
cdef bam1_t * src
return None
return bam_calend(&src.core, bam1_cigar(src))
property alen:
- '''aligned length of the read (read only). Returns None if
+ '''aligned length of the read on the reference genome. Returns None if
not available.'''
def __get__(self):
cdef bam1_t * src
def __get__(self): return self._delegate.core.qual
def __set__(self, qual): self._delegate.core.qual = qual
property mrnm:
+ """the :term:`reference` id of the mate
+ deprecated, use RNEXT instead.
+ """
+ def __get__(self): return self._delegate.core.mtid
+ def __set__(self, mtid): self._delegate.core.mtid = mtid
+ property rnext:
"""the :term:`reference` id of the mate """
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid
property mpos:
+ """the position of the mate
+ deprecated, use PNEXT instead."""
+ def __get__(self): return self._delegate.core.mpos
+ def __set__(self, mpos): self._delegate.core.mpos = mpos
+ property pnext:
"""the position of the mate"""
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos
property isize:
+ """the insert size
+ deprecated: use tlen instead"""
+ def __get__(self): return self._delegate.core.isize
+ def __set__(self, isize): self._delegate.core.isize = isize
+ property tlen:
"""the insert size"""
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize
pos += l
return result
+
def overlap( self, uint32_t start, uint32_t end ):
- """return number of bases on reference overlapping *start* and *end*
+ """return number of aligned bases of read overlapping the interval *start* and *end*
+ on the reference sequence.
"""
cdef uint32_t k, i, pos, overlap
cdef int op, o
'''call ``method`` in samtools providing arguments in args.
.. note::
- This method redirects stdout and stderr to capture it
+ This method redirects stdout and (optionally) stderr to capture it
from samtools. If for some reason stdout/stderr disappears
the reason might be in this method.
'''
# note that debugging this module can be a problem
- # as stdout/stderr will not appear
-
+ # as stdout/stderr will not appear on the terminal
+
# some special cases
if method == "index":
if not os.path.exists( args[0] ):
if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
args = ( "-o", stdout_f ) + args
-
# do the function call to samtools
cdef char ** cargs
cdef int i, n, retval
cargs[0] = "samtools"
cargs[1] = method
for i from 0 <= i < n: cargs[i+2] = args[i]
+
retval = pysam_dispatch(n+2, cargs)
free( cargs )
self.coverage ) ) )
-cdef class SNPCallerBase:
- '''Base class for SNP callers.
-
- *min_baseQ*
- minimum base quality (possibly capped by BAQ)
- *capQ_threshold*
- coefficient for adjusting mapQ of poor mappings
- *theta*
- theta in maq consensus calling model
- *n_haplotypes*
- number of haplotypes in the sample
- *het_rate*
- prior of a difference between two haplotypes
- '''
-
- cdef bam_maqcns_t * c
- cdef IteratorColumn iter
-
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
-
- self.iter = iterator_column
- self.c = bam_maqcns_init()
-
- # set the default parameterization according to
- # samtools
-
- # new default mode for samtools >0.1.10
- self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
-
- self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
- # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
- self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
- self.c.het_rate = kwargs.get( "het_rate", 0.001 )
- self.c.theta = kwargs.get( "theta", 0.83 )
-
- if self.c.errmod != BAM_ERRMOD_MAQ2:
- self.c.theta += 0.02
-
- # call prepare AFTER setting parameters
- bam_maqcns_prepare( self.c )
-
- def __dealloc__(self):
- bam_maqcns_destroy( self.c )
-
- cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
- '''debugging output.'''
-
- pysam_dump_glf( g, self.c );
- print ""
- for x in range(self.iter.n_plp):
- print "--> read %i %s %i" % (x,
- bam1_qname(self.iter.plp[x].b),
- self.iter.plp[x].qpos,
- )
-
- print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
- % (self.iter.pos,
- cns,
- self.c.q_r,
- self.iter.n_plp,
- self.iter.n_plp,
- rb,
- cns >> 8 & 0xff,
- cns >> 16 & 0xff,
- cns & 0xff,
- cns >> 28,
- )
-
- printf("-------------------------------------\n");
- sys.stdout.flush()
-
-cdef class IteratorSNPCalls( SNPCallerBase ):
- """*(IteratorColumn iterator)*
-
- call SNPs within a region.
-
- *iterator* is a pileup iterator. SNPs will be called
- on all positions returned by this iterator.
-
- This caller is fast if SNPs are called over large continuous
- regions. It is slow, if instantiated frequently and in random
- order as the sequence will have to be reloaded.
-
- """
+# cdef class SNPCallerBase:
+# '''Base class for SNP callers.
+
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
+
+# cdef bam_maqcns_t * c
+# cdef IteratorColumn iter
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# self.iter = iterator_column
+# self.c = bam_maqcns_init()
+
+# # set the default parameterization according to
+# # samtools
+
+# # new default mode for samtools >0.1.10
+# self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
+
+# self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
+# # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
+# self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
+# self.c.het_rate = kwargs.get( "het_rate", 0.001 )
+# self.c.theta = kwargs.get( "theta", 0.83 )
+
+# if self.c.errmod != BAM_ERRMOD_MAQ2:
+# self.c.theta += 0.02
+
+# # call prepare AFTER setting parameters
+# bam_maqcns_prepare( self.c )
+
+# def __dealloc__(self):
+# bam_maqcns_destroy( self.c )
+
+ # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
+ # '''debugging output.'''
+
+ # pysam_dump_glf( g, self.c );
+ # print ""
+ # for x in range(self.iter.n_plp):
+ # print "--> read %i %s %i" % (x,
+ # bam1_qname(self.iter.plp[x].b),
+ # self.iter.plp[x].qpos,
+ # )
+
+ # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
+ # % (self.iter.pos,
+ # cns,
+ # self.c.q_r,
+ # self.iter.n_plp,
+ # self.iter.n_plp,
+ # rb,
+ # cns >> 8 & 0xff,
+ # cns >> 16 & 0xff,
+ # cns & 0xff,
+ # cns >> 28,
+ # )
+
+ # printf("-------------------------------------\n");
+ # sys.stdout.flush()
+
+# cdef class IteratorSNPCalls( SNPCallerBase ):
+# """*(IteratorColumn iterator)*
+
+# call SNPs within a region.
+
+# *iterator* is a pileup iterator. SNPs will be called
+# on all positions returned by this iterator.
+
+# This caller is fast if SNPs are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
+
+# """
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
+# assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
- def __iter__(self):
- return self
+# def __iter__(self):
+# return self
- def __next__(self):
- """python version of next().
- """
+# def __next__(self):
+# """python version of next().
+# """
- # the following code was adapted from bam_plcmd.c:pileup_func()
- self.iter.cnext()
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
- if self.iter.plp == NULL:
- raise StopIteration
+# if self.iter.plp == NULL:
+# raise StopIteration
- cdef char * seq = self.iter.getSequence()
- cdef int seq_len = self.iter.seq_len
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
- assert seq != NULL
+# assert seq != NULL
- # reference base
- if self.iter.pos >= seq_len:
- raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
- cdef int rb = seq[self.iter.pos]
- cdef uint32_t cns
- cdef glf1_t * g
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# cdef glf1_t * g
- g = bam_maqcns_glfgen( self.iter.n_plp,
- self.iter.plp,
- bam_nt16_table[rb],
- self.c )
+# g = bam_maqcns_glfgen( self.iter.n_plp,
+# self.iter.plp,
+# bam_nt16_table[rb],
+# self.c )
- if pysam_glf_depth( g ) == 0:
- cns = 0xfu << 28 | 0xf << 24
- else:
- cns = glf2cns(g, <int>(self.c.q_r + .499))
-
- free(g)
+# if pysam_glf_depth( g ) == 0:
+# cns = 0xfu << 28 | 0xf << 24
+# else:
+# cns = glf2cns(g, <int>(self.c.q_r + .499))
+
+# free(g)
- cdef SNPCall call
+# cdef SNPCall call
- call = SNPCall()
- call._tid = self.iter.tid
- call._pos = self.iter.pos
- call._reference_base = rb
- call._genotype = bam_nt16_rev_table[cns>>28]
- call._consensus_quality = cns >> 8 & 0xff
- call._snp_quality = cns & 0xff
- call._rms_mapping_quality = cns >> 16&0xff
- call._coverage = self.iter.n_plp
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
- return call
+# return call
-cdef class SNPCaller( SNPCallerBase ):
- '''*(IteratorColumn iterator_column )*
+# cdef class SNPCaller( SNPCallerBase ):
+# '''*(IteratorColumn iterator_column )*
- The samtools SNP caller.
+# The samtools SNP caller.
- This object will call SNPs in *samfile* against the reference
- sequence in *fasta*.
+# This object will call SNPs in *samfile* against the reference
+# sequence in *fasta*.
- This caller is fast for calling few SNPs in selected regions.
+# This caller is fast for calling few SNPs in selected regions.
- It is slow, if called over large genomic regions.
- '''
+# It is slow, if called over large genomic regions.
+# '''
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- pass
+# pass
- def call(self, reference, int pos ):
- """call a snp on chromosome *reference*
- and position *pos*.
+# def call(self, reference, int pos ):
+# """call a snp on chromosome *reference*
+# and position *pos*.
- returns a :class:`SNPCall` object.
- """
+# returns a :class:`SNPCall` object.
+# """
- cdef int tid = self.iter.samfile.gettid( reference )
+# cdef int tid = self.iter.samfile.gettid( reference )
- self.iter.reset( tid, pos, pos + 1 )
+# self.iter.reset( tid, pos, pos + 1 )
- while 1:
- self.iter.cnext()
+# while 1:
+# self.iter.cnext()
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
- if self.iter.plp == NULL:
- raise ValueError( "no reads in region - no call" )
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
- if self.iter.pos == pos: break
-
- cdef char * seq = self.iter.getSequence()
- cdef int seq_len = self.iter.seq_len
-
- assert seq != NULL
-
- # reference base
- if self.iter.pos >= seq_len:
- raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
-
- cdef int rb = seq[self.iter.pos]
- cdef uint32_t cns
- cdef glf1_t * g
-
- g = bam_maqcns_glfgen( self.iter.n_plp,
- self.iter.plp,
- bam_nt16_table[rb],
- self.c )
-
-
- if pysam_glf_depth( g ) == 0:
- cns = 0xfu << 28 | 0xf << 24
- else:
- cns = glf2cns(g, <int>(self.c.q_r + .499))
-
- free(g)
+# if self.iter.pos == pos: break
+
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
+
+# assert seq != NULL
+
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# # cdef glf1_t * g
+# #
+# # g = bam_maqcns_glfgen( self.iter.n_plp,
+# # self.iter.plp,
+# # bam_nt16_table[rb],
+# # self.c )
+# ##
+# #
+# # if pysam_glf_depth( g ) == 0:
+# # cns = 0xfu << 28 | 0xf << 24
+# # else:
+# # cns = glf2cns(g, <int>(self.c.q_r + .499))
+# #
+# # free(g)
- cdef SNPCall call
-
- call = SNPCall()
- call._tid = self.iter.tid
- call._pos = self.iter.pos
- call._reference_base = rb
- call._genotype = bam_nt16_rev_table[cns>>28]
- call._consensus_quality = cns >> 8 & 0xff
- call._snp_quality = cns & 0xff
- call._rms_mapping_quality = cns >> 16&0xff
- call._coverage = self.iter.n_plp
-
- return call
-
-cdef class IndelCall:
- '''the results of an indel call.'''
- cdef int _tid
- cdef int _pos
- cdef int _coverage
- cdef int _rms_mapping_quality
- cdef bam_maqindel_ret_t * _r
-
- def __cinit__(self):
- #assert r != NULL
- #self._r = r
- pass
-
- property tid:
- '''the chromosome ID as is defined in the header'''
- def __get__(self):
- return self._tid
+# cdef SNPCall call
+
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
+
+# return call
+
+# cdef class IndelCall:
+# '''the results of an indel call.'''
+# cdef int _tid
+# cdef int _pos
+# cdef int _coverage
+# cdef int _rms_mapping_quality
+# cdef bam_maqindel_ret_t * _r
+
+# def __cinit__(self):
+# #assert r != NULL
+# #self._r = r
+# pass
+
+# property tid:
+# '''the chromosome ID as is defined in the header'''
+# def __get__(self):
+# return self._tid
- property pos:
- '''nucleotide position of SNP.'''
- def __get__(self): return self._pos
-
- property genotype:
- '''the genotype called.'''
- def __get__(self):
- if self._r.gt == 0:
- s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
- return "%s/%s" % (s,s)
- elif self._r.gt == 1:
- s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
- return "%s/%s" % (s,s)
- else:
- return "%s/%s" % (self.first_allele, self.second_allele )
-
- property consensus_quality:
- '''the genotype quality (Phred-scaled).'''
- def __get__(self): return self._r.q_cns
-
- property snp_quality:
- '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
- def __get__(self): return self._r.q_ref
-
- property mapping_quality:
- '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
- def __get__(self): return self._rms_mapping_quality
-
- property coverage:
- '''coverage or read depth - the number of reads involved in the call.'''
- def __get__(self): return self._coverage
-
- property first_allele:
- '''sequence of first allele.'''
- def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
-
- property second_allele:
- '''sequence of second allele.'''
- def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
-
- property reads_first:
- '''reads supporting first allele.'''
- def __get__(self): return self._r.cnt1
-
- property reads_second:
- '''reads supporting first allele.'''
- def __get__(self): return self._r.cnt2
-
- property reads_diff:
- '''reads supporting first allele.'''
- def __get__(self): return self._r.cnt_anti
-
- def __str__(self):
-
- return "\t".join( map(str, (
- self.tid,
- self.pos,
- self.genotype,
- self.consensus_quality,
- self.snp_quality,
- self.mapping_quality,
- self.coverage,
- self.first_allele,
- self.second_allele,
- self.reads_first,
- self.reads_second,
- self.reads_diff ) ) )
-
- def __dealloc__(self ):
- bam_maqindel_ret_destroy(self._r)
-
-cdef class IndelCallerBase:
- '''Base class for SNP callers.
-
- *min_baseQ*
- minimum base quality (possibly capped by BAQ)
- *capQ_threshold*
- coefficient for adjusting mapQ of poor mappings
- *theta*
- theta in maq consensus calling model
- *n_haplotypes*
- number of haplotypes in the sample
- *het_rate*
- prior of a difference between two haplotypes
- '''
-
- cdef bam_maqindel_opt_t * options
- cdef IteratorColumn iter
- cdef int cap_mapQ
- cdef int max_depth
-
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# property pos:
+# '''nucleotide position of SNP.'''
+# def __get__(self): return self._pos
+
+# property genotype:
+# '''the genotype called.'''
+# def __get__(self):
+# if self._r.gt == 0:
+# s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+# return "%s/%s" % (s,s)
+# elif self._r.gt == 1:
+# s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+# return "%s/%s" % (s,s)
+# else:
+# return "%s/%s" % (self.first_allele, self.second_allele )
+
+# property consensus_quality:
+# '''the genotype quality (Phred-scaled).'''
+# def __get__(self): return self._r.q_cns
+
+# property snp_quality:
+# '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+# def __get__(self): return self._r.q_ref
+
+# property mapping_quality:
+# '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+# def __get__(self): return self._rms_mapping_quality
+
+# property coverage:
+# '''coverage or read depth - the number of reads involved in the call.'''
+# def __get__(self): return self._coverage
+
+# property first_allele:
+# '''sequence of first allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+
+# property second_allele:
+# '''sequence of second allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+
+# property reads_first:
+# '''reads supporting first allele.'''
+# def __get__(self): return self._r.cnt1
+
+# property reads_second:
+# '''reads supporting first allele.'''
+# def __get__(self): return self._r.cnt2
+
+# property reads_diff:
+# '''reads supporting first allele.'''
+# def __get__(self): return self._r.cnt_anti
+
+# def __str__(self):
+
+# return "\t".join( map(str, (
+# self.tid,
+# self.pos,
+# self.genotype,
+# self.consensus_quality,
+# self.snp_quality,
+# self.mapping_quality,
+# self.coverage,
+# self.first_allele,
+# self.second_allele,
+# self.reads_first,
+# self.reads_second,
+# self.reads_diff ) ) )
+
+# def __dealloc__(self ):
+# bam_maqindel_ret_destroy(self._r)
+
+# cdef class IndelCallerBase:
+# '''Base class for SNP callers.
+
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
+
+# cdef bam_maqindel_opt_t * options
+# cdef IteratorColumn iter
+# cdef int cap_mapQ
+# cdef int max_depth
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+
+# self.iter = iterator_column
+
+# assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
+
+# self.options = bam_maqindel_opt_init()
+
+# # set the default parameterization according to
+# # samtools
+
+# self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
+# self.options.q_indel = kwargs.get( "q_indel", 40 )
+# self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
+# self.max_depth = kwargs.get( "max_depth", 1024 )
+
+# def __dealloc__(self):
+# free( self.options )
+# def _call( self ):
- self.iter = iterator_column
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
- assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
-
- self.options = bam_maqindel_opt_init()
-
- # set the default parameterization according to
- # samtools
-
- self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
- self.options.q_indel = kwargs.get( "q_indel", 40 )
- self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
- self.max_depth = kwargs.get( "max_depth", 1024 )
-
- def __dealloc__(self):
- free( self.options )
+# assert seq != NULL
- def _call( self ):
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
- cdef char * seq = self.iter.getSequence()
- cdef int seq_len = self.iter.seq_len
+# cdef bam_maqindel_ret_t * r
+
+# cdef int m = min( self.max_depth, self.iter.n_plp )
+
+# # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
+# # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
+# # self.options.indel_err, self.options.ambi_thres );
+
+# r = bam_maqindel(m,
+# self.iter.pos,
+# self.options,
+# self.iter.plp,
+# seq,
+# 0,
+# NULL)
+
+# if r == NULL: return None
- assert seq != NULL
+# cdef IndelCall call
+# call = IndelCall()
+# call._r = r
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._coverage = self.iter.n_plp
- # reference base
- if self.iter.pos >= seq_len:
- raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+# cdef uint64_t rms_aux = 0
+# cdef int i = 0
+# cdef bam_pileup1_t * p
+# cdef int tmp
- cdef bam_maqindel_ret_t * r
-
- cdef int m = min( self.max_depth, self.iter.n_plp )
-
- # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
- # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
- # self.options.indel_err, self.options.ambi_thres );
-
- r = bam_maqindel(m,
- self.iter.pos,
- self.options,
- self.iter.plp,
- seq,
- 0,
- NULL)
-
- if r == NULL: return None
-
- cdef IndelCall call
- call = IndelCall()
- call._r = r
- call._tid = self.iter.tid
- call._pos = self.iter.pos
- call._coverage = self.iter.n_plp
-
- cdef uint64_t rms_aux = 0
- cdef int i = 0
- cdef bam_pileup1_t * p
- cdef int tmp
-
- for i from 0 <= i < self.iter.n_plp:
- p = self.iter.plp + i
- if p.b.core.qual < self.cap_mapQ:
- tmp = p.b.core.qual
- else:
- tmp = self.cap_mapQ
- rms_aux += tmp * tmp
+# for i from 0 <= i < self.iter.n_plp:
+# p = self.iter.plp + i
+# if p.b.core.qual < self.cap_mapQ:
+# tmp = p.b.core.qual
+# else:
+# tmp = self.cap_mapQ
+# rms_aux += tmp * tmp
- call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
+# call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
- return call
+# return call
-cdef class IndelCaller( IndelCallerBase ):
- '''*(IteratorColumn iterator_column )*
+# cdef class IndelCaller( IndelCallerBase ):
+# '''*(IteratorColumn iterator_column )*
- The samtools SNP caller.
+# The samtools SNP caller.
- This object will call SNPs in *samfile* against the reference
- sequence in *fasta*.
+# This object will call SNPs in *samfile* against the reference
+# sequence in *fasta*.
- This caller is fast for calling few SNPs in selected regions.
+# This caller is fast for calling few SNPs in selected regions.
- It is slow, if called over large genomic regions.
- '''
+# It is slow, if called over large genomic regions.
+# '''
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
- pass
+# pass
- def call(self, reference, int pos ):
- """call a snp on chromosome *reference*
- and position *pos*.
+# def call(self, reference, int pos ):
+# """call a snp on chromosome *reference*
+# and position *pos*.
- returns a :class:`SNPCall` object or None, if no indel call could be made.
- """
+# returns a :class:`SNPCall` object or None, if no indel call could be made.
+# """
- cdef int tid = self.iter.samfile.gettid( reference )
+# cdef int tid = self.iter.samfile.gettid( reference )
- self.iter.reset( tid, pos, pos + 1 )
+# self.iter.reset( tid, pos, pos + 1 )
- while 1:
- self.iter.cnext()
+# while 1:
+# self.iter.cnext()
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
- if self.iter.plp == NULL:
- raise ValueError( "no reads in region - no call" )
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
- if self.iter.pos == pos: break
+# if self.iter.pos == pos: break
- return self._call()
+# return self._call()
-cdef class IteratorIndelCalls( IndelCallerBase ):
- """*(IteratorColumn iterator)*
+# cdef class IteratorIndelCalls( IndelCallerBase ):
+# """*(IteratorColumn iterator)*
- call indels within a region.
+# call indels within a region.
- *iterator* is a pileup iterator. SNPs will be called
- on all positions returned by this iterator.
+# *iterator* is a pileup iterator. SNPs will be called
+# on all positions returned by this iterator.
- This caller is fast if SNPs are called over large continuous
- regions. It is slow, if instantiated frequently and in random
- order as the sequence will have to be reloaded.
+# This caller is fast if SNPs are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
- """
+# """
- def __cinit__(self,
- IteratorColumn iterator_column,
- **kwargs ):
- pass
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+# pass
- def __iter__(self):
- return self
+# def __iter__(self):
+# return self
- def __next__(self):
- """python version of next().
- """
+# def __next__(self):
+# """python version of next().
+# """
- # the following code was adapted from bam_plcmd.c:pileup_func()
- self.iter.cnext()
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
- if self.iter.n_plp < 0:
- raise ValueError("error during iteration" )
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
- if self.iter.plp == NULL:
- raise StopIteration
+# if self.iter.plp == NULL:
+# raise StopIteration
- return self._call()
+# return self._call()
"PileupColumn",
"PileupProxy",
"PileupRead",
- "IteratorSNPCalls",
- "SNPCaller",
- "IndelCaller",
- "IteratorIndelCalls",
+ # "IteratorSNPCalls",
+ # "SNPCaller",
+ # "IndelCaller",
+ # "IteratorIndelCalls",
"IndexedReads" ]