1 # cython: embedsignature=True
3 # adds doc-strings for sphinx
15 from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize
16 from cpython.version cimport PY_MAJOR_VERSION
18 ########################################################################
19 ########################################################################
20 ########################################################################
21 ## Python 3 compatibility functions
22 ########################################################################
# True when running under Python 3 — selects the bytes/str conversion
# paths in the helpers below.
IS_PYTHON3 = PY_MAJOR_VERSION >= 3
cdef from_string_and_size(char* s, size_t length):
    """Convert a C char buffer of known *length* to the native string type.

    Returns bytes under Python 2 and an ascii-decoded str under Python 3.
    """
    if PY_MAJOR_VERSION < 3:
        # Python 2: slicing a char* yields bytes (== str), return as-is.
        # (This return was missing in the excerpt, making the function
        # fall through to None on the Py3 path.)
        return s[:length]
    else:
        return s[:length].decode("ascii")
# filename encoding (copied from lxml.etree.pyx)
# Codec used to encode unicode filenames before handing them to the C
# library: filesystem encoding first, then the default encoding, then ascii.
cdef str _FILENAME_ENCODING
_FILENAME_ENCODING = sys.getfilesystemencoding()
if _FILENAME_ENCODING is None:
    _FILENAME_ENCODING = sys.getdefaultencoding()
if _FILENAME_ENCODING is None:
    _FILENAME_ENCODING = 'ascii'

#cdef char* _C_FILENAME_ENCODING
#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
cdef bytes _my_encodeFilename(object filename):
    u"""Make sure a filename is 8-bit encoded (or None).

    bytes objects are passed through unchanged; unicode is encoded with
    the filesystem encoding determined above.
    """
    # The leading branches were lost in the excerpt (orphan elif chain);
    # restored: None passes through, bytes pass through unchanged.
    if filename is None:
        return None
    elif PyBytes_Check(filename):
        return filename
    elif PyUnicode_Check(filename):
        return filename.encode(_FILENAME_ENCODING)
    else:
        raise TypeError, u"Argument must be string or unicode."
cdef bytes _force_bytes(object s):
    u"""convert string or unicode object to bytes, assuming ascii encoding.
    """
    if PY_MAJOR_VERSION < 3:
        # Python 2: str is already bytes, return unchanged
        return s
    elif s is None:
        return None
    elif PyBytes_Check(s):
        return s
    elif PyUnicode_Check(s):
        return s.encode('ascii')
    else:
        raise TypeError, u"Argument must be string, bytes or unicode."
cdef inline bytes _force_cmdline_bytes(object s):
    # Command-line arguments use the same plain-ascii conversion as
    # everything else; kept as a separate name for call-site clarity.
    return _force_bytes(s)
cdef _charptr_to_str(char* s):
    """Convert a NUL-terminated C string to the native str type
    (bytes on Python 2, ascii-decoded str on Python 3)."""
    if PY_MAJOR_VERSION < 3:
        # Python 2: char* coerces to bytes (== str); this return was
        # missing in the excerpt.
        return s
    else:
        return s.decode("ascii")
cdef _force_str(object s):
    """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
    if s is None:
        return None
    if PY_MAJOR_VERSION < 3:
        # Python 2: str is bytes already
        return s
    elif PyBytes_Check(s):
        return s.decode('ascii')
    else:
        # assume already unicode on Python 3
        return s
88 ########################################################################
89 ########################################################################
90 ########################################################################
91 ## Constants and global variables
92 ########################################################################
93 # defines imported from samtools
## These are bits set in the flag.
## have to put these definitions here, in csamtools.pxd they got ignored
## Values follow the SAM format specification (FLAG field). Several DEF
## lines were missing from the excerpt and are restored per the spec;
## the surviving values (2, 32, 256) anchor the sequence.
## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
DEF BAM_FPAIRED       =1
## @abstract the read is mapped in a proper pair */
DEF BAM_FPROPER_PAIR  =2
## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
DEF BAM_FUNMAP        =4
## @abstract the mate is unmapped */
DEF BAM_FMUNMAP       =8
## @abstract the read is mapped to the reverse strand */
DEF BAM_FREVERSE      =16
## @abstract the mate is mapped to the reverse strand */
DEF BAM_FMREVERSE     =32
## @abstract this is read1 */
DEF BAM_FREAD1        =64
## @abstract this is read2 */
DEF BAM_FREAD2        =128
## @abstract not primary alignment */
DEF BAM_FSECONDARY    =256
## @abstract QC failure */
DEF BAM_FQCFAIL       =512
## @abstract optical or PCR duplicate */
DEF BAM_FDUP          =1024
#####################################################################
# CIGAR operations (SAM spec): the op code occupies the low 4 bits of
# each packed cigar element, the length the remaining bits.
DEF BAM_CIGAR_SHIFT=4
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)

# Op codes per the SAM specification; the missing DEFs are restored
# around the surviving N/S/H entries (3, 4, 5).
DEF BAM_CMATCH     = 0
DEF BAM_CINS       = 1
DEF BAM_CDEL       = 2
DEF BAM_CREF_SKIP  = 3
DEF BAM_CSOFT_CLIP = 4
DEF BAM_CHARD_CLIP = 5
DEF BAM_CPAD       = 6
DEF BAM_CEQUAL     = 7
DEF BAM_CDIFF      = 8

cdef char* CODE2CIGAR= "MIDNSHP=X"
# The excerpt ran both assignments unconditionally (the second would
# raise TypeError on Python 3, where iterating bytes yields ints);
# restore the version guard so exactly one mapping is built.
if IS_PYTHON3:
    CIGAR2CODE = dict( [y,x] for x,y in enumerate( CODE2CIGAR) )
else:
    CIGAR2CODE = dict( [ord(y),x] for x,y in enumerate( CODE2CIGAR) )
CIGAR_REGEX = re.compile( "([MIDNSHP=X])(\d+)" )
#####################################################################
## set pysam stderr to /dev/null
## NOTE(review): the statement(s) that performed this redirection appear
## to be missing from this excerpt — TODO confirm against full source.
#####################################################################
# hard-coded constants
# reverse lookup from samtools' 4-bit nucleotide encoding to base character
cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
# maximum permitted coordinate: samtools limits positions to < 2^30
cdef int max_pos = 2 << 29
154 #####################################################################
155 #####################################################################
156 #####################################################################
157 ## private factory methods
158 #####################################################################
cdef class AlignedRead
cdef makeAlignedRead(bam1_t * src):
    '''enter src into AlignedRead.'''
    # bypass __init__ and store a private duplicate of the C record so the
    # returned object owns its own memory
    cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
    dest._delegate = bam_dup1(src)
    # restored: the return was missing in the excerpt, so the factory
    # produced None
    return dest
cdef class PileupProxy
cdef makePileupProxy( bam_pileup1_t ** plp, int tid, int pos, int n ):
    '''fill a PileupProxy object from the pileup buffer.'''
    cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
    # NOTE(review): the field assignments and return were missing from the
    # excerpt; reconstructed from the factory's parameters — confirm against
    # the full source.
    dest.plp = plp
    dest.tid = tid
    dest.pos = pos
    dest.n = n
    return dest
cdef class PileupRead
cdef makePileupRead( bam_pileup1_t * src ):
    '''fill a PileupRead object from a bam_pileup1_t * object.'''
    cdef PileupRead dest = PileupRead.__new__(PileupRead)
    # copy each field of the C pileup record into the python proxy
    dest._alignment = makeAlignedRead( src.b )
    dest._qpos = src.qpos
    dest._indel = src.indel
    dest._level = src.level
    dest._is_del = src.is_del
    dest._is_head = src.is_head
    dest._is_tail = src.is_tail
    # restored: the return was missing in the excerpt
    return dest
cdef convertBinaryTagToList( uint8_t * s ):
    """return bytesize, number of values list of values in s."""
    cdef char auxtype
    cdef uint8_t byte_size
    cdef int32_t nvalues
    cdef int x

    # NOTE(review): the type dispatch and pointer advances were missing in
    # the excerpt (every append re-read the same address); reconstructed
    # from the BAM 'B' array layout: one type byte, an int32 count, then
    # packed values. The increments match sizeof() of each cast — confirm
    # against the full source.
    auxtype = <char>s[0]
    byte_size = bam_aux_type2size( auxtype )
    s += 1
    # get number of values in array
    nvalues = (<int32_t*>s)[0]
    s += 4

    values = []
    if auxtype == 'c':
        for x from 0 <= x < nvalues:
            values.append((<int8_t*>s)[0])
            s += 1
    elif auxtype == 'C':
        for x from 0 <= x < nvalues:
            values.append((<uint8_t*>s)[0])
            s += 1
    elif auxtype == 's':
        for x from 0 <= x < nvalues:
            values.append((<int16_t*>s)[0])
            s += 2
    elif auxtype == 'S':
        for x from 0 <= x < nvalues:
            values.append((<uint16_t*>s)[0])
            s += 2
    elif auxtype == 'i':
        for x from 0 <= x < nvalues:
            values.append((<int32_t*>s)[0])
            s += 4
    elif auxtype == 'I':
        for x from 0 <= x < nvalues:
            values.append((<uint32_t*>s)[0])
            s += 4
    elif auxtype == 'f':
        for x from 0 <= x < nvalues:
            values.append((<float*>s)[0])
            s += 4

    return byte_size, nvalues, values
234 #####################################################################
235 #####################################################################
236 #####################################################################
237 ## Generic callbacks for inserting python callbacks.
238 #####################################################################
cdef int fetch_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch.

    calls function in *f* with a new :class:`AlignedRead` object as parameter.
    '''
    a = makeAlignedRead( alignment )
    # *f* is a borrowed reference to the python callable supplied by the
    # caller of fetch(); invoke it with the wrapped read. (The unterminated
    # docstring and this call were missing in the excerpt.)
    (<object>f)(a)
class PileupColumn(object):
    '''A pileup column. A pileup column contains
    all the reads that map to a certain target base.

    tid
        chromosome ID as is defined in the header
    pos
        the target base coordinate (0-based)
    n
        number of reads mapping to this column
    pileups
        list of reads (:class:`pysam.PileupRead`) aligned to this column
    '''
    # The `def __str__` line was lost in the excerpt, leaving the return
    # statement orphaned at class level; restored here.
    def __str__(self):
        # one tab-separated summary line, then one line per aligned read
        return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
            "\n" + "\n".join( map(str, self.pileups) )
cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
    '''callback for pileup.

    calls function in *f* with a new :class:`Pileup` object as parameter.

    tid
        chromosome ID as is defined in the header
    pos
        start coordinate of the alignment, 0-based
    n
        number of elements in pl array
    '''
    # NOTE(review): the body between docstring and loop was missing from the
    # excerpt; reconstructed so that, per the docstring, the callback
    # receives a single pileup-column object — confirm against full source.
    p = PileupColumn()
    p.tid = tid
    p.pos = pos
    p.n = n

    cdef int x
    pileups = []
    for x from 0 <= x < n:
        pileups.append( makePileupRead( &(pl[x]) ) )
    p.pileups = pileups

    (<object>f)( p )
cdef int pileup_fetch_callback( bam1_t *b, void *data):
    '''callback for bam_fetch.

    Fetches reads and submits them to pileup.
    '''
    # *data* is the pileup buffer created by the caller; each fetched read
    # is pushed into it. (Docstring terminator and return restored.)
    cdef bam_plbuf_t * buf
    buf = <bam_plbuf_t*>data
    bam_plbuf_push(b, buf)
    return 0
    # NOTE(review): fragmentary excerpt — the enclosing `class StderrStore`
    # header and the `def __init__` line are missing from this view.
    # __init__ tail: capture process-level stderr into a temp file by
    # swapping the underlying file descriptor.
        self.stderr_h, self.stderr_f = tempfile.mkstemp()
        self.stderr_save = Outs( sys.stderr.fileno() )
        self.stderr_save.setfd( self.stderr_h )

    def readAndRelease( self ):
        # restore the saved stderr fd, then collect and delete the capture file
        self.stderr_save.restore()
        if os.path.exists(self.stderr_f):
            lines = open( self.stderr_f, "r" ).readlines()
            os.remove( self.stderr_f )
        # NOTE(review): `return lines` appears to be missing here — TODO confirm

    # presumably the body of `def release(self):` (its def line is missing
    # from this excerpt): restore stderr and discard any captured output.
        self.stderr_save.restore()
        if os.path.exists(self.stderr_f):
            os.remove( self.stderr_f )
class StderrStoreWindows():
    '''Platform stub used on Windows, where stderr cannot be redirected
    via file-descriptor swapping: every operation is a no-op and no
    output is ever captured.'''

    def __init__(self):
        pass

    def readAndRelease(self):
        # nothing was captured, so there are never any lines to return
        return []

    def release(self):
        pass
# On Windows, substitute the no-op implementation for the fd-swapping one.
if platform.system()=='Windows':
    StderrStore = StderrStoreWindows
343 ######################################################################
344 ######################################################################
345 ######################################################################
# valid types for sam headers
# The dict literal was truncated in the excerpt; the record classes are
# those listed in VALID_HEADERS — "HD" is a single record (dict), the
# others may occur repeatedly (list).
VALID_HEADER_TYPES = { "HD" : dict,
                       "SQ" : list,
                       "RG" : list,
                       "PG" : list,
                       "CO" : list }
# order of records within sam headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )

# type conversions within sam header records
# (maps record class -> {tag -> python type used to parse/format the value})
VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
                        "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
                        "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str,
                                 "CN" : str, "DT" : str, "PL" : str, "FO" : str, "KS" : str },
                        "PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }

# output order of fields within records
VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
                       "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
                       "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL", "FO", "KS" ),
                       "PG" : ( "PN", "ID", "VN", "CL" ), }
370 ######################################################################
371 ######################################################################
372 ######################################################################
374 ######################################################################
cdef class Fastafile:
    # NOTE(review): this excerpt is fragmentary — several method definition
    # lines and docstring delimiters are missing; each gap is marked below.
    '''
    A *FASTA* file. The file is automatically opened.

    The file expects an indexed fasta file.

    TODO:
        add automatic indexing.
        add function to get sequence names.
    '''

    def __cinit__(self, *args, **kwargs ):
        # NULL the pointers before _open so cleanup is safe on failure
        self.fastafile = NULL
        self._filename = NULL
        self._open( *args, **kwargs )

    # presumably `def _isOpen(self):` — def line missing from this excerpt
        '''return true if samfile has been opened.'''
        return self.fastafile != NULL

    # presumably `def __len__(self):` — def line missing from this excerpt
        if self.fastafile == NULL:
            raise ValueError( "calling len() on closed file" )
        return faidx_fetch_nseq(self.fastafile)

    # presumably `def _open(self, filename):` — def line missing
        '''open an indexed fasta file.

        This method expects an indexed fasta file.
        '''
        # close a previously opened file
        if self.fastafile != NULL: self.close()
        if self._filename != NULL: free(self._filename)
        filename = _my_encodeFilename(filename)
        # keep a private C copy of the encoded filename for error messages
        self._filename = strdup(filename)
        self.fastafile = fai_load( filename )

        if self.fastafile == NULL:
            raise IOError("could not open file `%s`" % filename )

    # presumably `def close(self):` — def line missing from this excerpt
        if self.fastafile != NULL:
            fai_destroy( self.fastafile )
            self.fastafile = NULL

    def __dealloc__(self):
        # NOTE(review): a close/fai_destroy call appears to be missing here
        if self._filename != NULL: free(self._filename)

    # presumably `property filename:` with `def __get__(self):` — lines missing
        '''number of :term:`filename` associated with this object.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return self._filename

    # presumably `def fetch(self, reference=None, start=None, end=None,
    # region=None):` — the def line and parts of the body are missing
        '''*(reference = None, start = None, end = None, region = None)*

        fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.

        The region is specified by :term:`reference`, *start* and *end*.

        fetch returns an empty string if the region is out of range or addresses an unknown *reference*.

        If *reference* is given and *start* is None, the sequence from the
        first base is returned. Similarly, if *end* is None, the sequence
        until the last base is returned.

        Alternatively, a samtools :term:`region` string can be supplied.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if reference is None: raise ValueError( 'no sequence/region supplied.' )
        if start is None: start = 0
        if end is None: end = max_pos -1

        if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
        if start == end: return b""
        # valid ranges are from 0 to 2^29-1
        if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
        if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
        # note: faidx_fetch_seq has a bug such that out-of-range access
        # always returns the last residue. Hence do not use faidx_fetch_seq,
        # but use fai_fetch instead
        # seq = faidx_fetch_seq(self.fastafile,
        # samtools region strings are 1-based, closed intervals
        region = "%s:%i-%i" % (reference, start+1, end)
        if PY_MAJOR_VERSION >= 3:
            region = region.encode('ascii')
        seq = fai_fetch( self.fastafile,
        # samtools adds a '\0' at the end
        seq = fai_fetch( self.fastafile, region, &length )
        # NOTE(review): two fai_fetch calls survive above; the branch
        # structure around them is missing from this excerpt — TODO confirm
        py_seq = seq[:length]
        # NOTE(review): the return (and free of seq) appears to be missing

    cdef char * _fetch( self, char * reference, int start, int end, int * length ):
        '''fetch sequence for reference, start and end'''
        return faidx_fetch_seq(self.fastafile,
        # NOTE(review): remaining call arguments missing from this excerpt
509 #------------------------------------------------------------------------
510 #------------------------------------------------------------------------
511 #------------------------------------------------------------------------
cdef int count_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch - count number of reads.
    '''
    # *f* points at an int accumulator supplied by the caller; each
    # fetched read increments it. (The increment was missing in the excerpt.)
    cdef int* counter = (<int*>f)
    counter[0] += 1
# Search state for locating a read's mate via bam_fetch + mate_callback.
# Fields reconstructed from their uses in mate(): `mate_data.name`,
# `mate_data.mate` and `mate_data.flag` — confirm against full source.
ctypedef struct MateData:
    char * name
    bam1_t * mate
    uint32_t flag
523 #------------------------------------------------------------------------
524 #------------------------------------------------------------------------
525 #------------------------------------------------------------------------
cdef int mate_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch = filter mate
    '''
    cdef MateData * d = (<MateData*>f)
    # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
    #        d.mate, d.name, bam1_qname(alignment),
    #        d.flag, alignment.core.flag, alignment.core.flag & d.flag)

    # NOTE(review): the guard below was missing from the excerpt; without
    # it every matching read would overwrite the stored mate — confirm.
    if d.mate == NULL:
        # could be sped up by comparing the lengths of query strings first
        # also, make sure that we get the other read by comparing
        # the flags (d.flag selects the mate's READ1/READ2 bit)
        if alignment.core.flag & d.flag != 0 and \
                strcmp( bam1_qname( alignment ), d.name ) == 0:
            d.mate = bam_dup1( alignment )
546 '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None,
547 add_sq_text = False, check_header = True, check_sq = True )*
549 A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.
551 *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
552 (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
553 Use ``h`` to output header information in text (:term:`TAM`) mode.
555 If ``b`` is present, it must immediately follow ``r`` or ``w``.
556 Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
557 a :term:`BAM` formatted file for reading, type::
559 f = pysam.Samfile('ex1.bam','rb')
561 If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following
564 f1 = pysam.Samfile('ex1.bam' )
565 f2 = pysam.Samfile('ex1.sam' )
567 If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
568 access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
570 For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
571 sources (see also the samtools format specification):
573 1. If *template* is given, the header is copied from a another *Samfile*
574 (*template* must be of type *Samfile*).
576 2. If *header* is given, the header is built from a multi-level dictionary. The first level
577 are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
578 being a list of tag-value pairs. The header is constructed first from all the defined fields,
579 followed by user tags in alphabetical order.
581 3. If *text* is given, new header text is copied from raw text.
583 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
584 By default, 'SQ' and 'LN' tags will be added to the header text. This option can be
585 changed by unsetting the flag *add_sq_text*.
587 By default, if a file is opened in mode 'r', it is checked for a valid header
588 (*check_header* = True) and a definition of chromosome names (*check_sq* = True).
def __cinit__(self, *args, **kwargs ):
    # NOTE(review): some initialisation lines appear to be missing from this
    # excerpt (the samfile/index pointers are presumably NULLed here too).
    self._filename = NULL
    self.isstream = False
    self._open( *args, **kwargs )

    # allocate memory for iterator
    self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

# presumably `def _isOpen(self):` — the def line is missing from this excerpt
    '''return true if samfile has been opened.'''
    return self.samfile != NULL
def _hasIndex( self ):
    '''True if an index was found and loaded for this file (NULL means
    none was opened).'''
    return self.index != NULL
# NOTE(review): interior of `Samfile._open` — the `def _open(...)` line,
# several keyword parameters and most branch headers are missing from this
# excerpt; each gap is marked below.
Samfile template = None,
referencenames = None,
referencelengths = None,
# (remaining keyword parameters missing from this excerpt)
    '''open a sam/bam file.

    If _open is called on an existing bamfile, the current file will be
    closed and a new file will be opened.
    '''
    # read mode autodetection
    # presumably guarded by `if mode is None:` with a `try:` — lines missing
    self._open(filename, 'rb', template=template,
               referencenames=referencenames,
               referencelengths=referencelengths,
               text=text, header=header, port=port,
               check_header=check_header,
    except ValueError, msg:
        # binary open failed — fall back to text (SAM) mode
        self._open(filename, 'r', template=template,
                   referencenames=referencenames,
                   referencelengths=referencelengths,
                   text=text, header=header, port=port,
                   check_header=check_header,

    assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode

    #assert filename != NULL

    # close a previously opened file
    if self.samfile != NULL: self.close()

    cdef bam_header_t * header_to_write
    header_to_write = NULL

    if self._filename != NULL: free(self._filename )
    filename = _my_encodeFilename(filename)
    cdef bytes bmode = mode.encode('ascii')
    #cdef char* cfilename
    #cfilename = filename.encode(_FILENAME_ENCODING)
    self._filename = strdup(filename)
    # "-" denotes stdin/stdout streaming
    self.isstream = strcmp( filename, "-" ) == 0

    self.isbam = len(mode) > 1 and mode[1] == 'b'

    self.isremote = strncmp(filename,"http:",5) == 0 or \
                    strncmp(filename,"ftp:",4) == 0

    # open file for writing
    # presumably `if mode[0] == "w" ...:` — branch header missing
        # header structure (used for writing)
        # copy header from another file
        # presumably `if template:` — branch header missing
        header_to_write = template.samfile.header
        # presumably `elif header:` — branch header missing
        header_to_write = self._buildHeader( header )
        # presumably `else:` — branch header missing
        # build header from a target names and lengths
        assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing"
        assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"

        # allocate and fill header
        referencenames = [ _force_bytes(ref) for ref in referencenames ]
        header_to_write = bam_header_init()
        header_to_write.n_targets = len(referencenames)
        # NOTE(review): `n = 0` initialisation appears to be missing here
        for x in referencenames: n += len(x) + 1
        header_to_write.target_name = <char**>calloc(n, sizeof(char*))
        header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
        for x from 0 <= x < header_to_write.n_targets:
            header_to_write.target_len[x] = referencelengths[x]
            name = referencenames[x]
            header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
            strncpy( header_to_write.target_name[x], name, len(name) )

        # Optionally, if there is no text, add a SAM compatible header to output
        if text is None and add_sq_text:
            # NOTE(review): `text = []` initialisation appears to be missing
            for x from 0 <= x < header_to_write.n_targets:
                text.append( "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] ) )

        # presumably `if text is not None:` with `ctext = text` — lines missing
        text = _force_bytes(text)
        header_to_write.l_text = strlen(ctext)
        header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
        memcpy( header_to_write.text, ctext, strlen(ctext) )

        header_to_write.hash = NULL
        header_to_write.rg2lib = NULL

        # open file. Header gets written to file at the same time for bam files
        # and sam files (in the latter case, the mode needs to be wh)
        store = StderrStore()
        self.samfile = samopen( filename, bmode, header_to_write )
        # NOTE(review): a `store.release()` appears to be missing here

        # bam_header_destroy takes care of cleaning up of all the members
        if not template and header_to_write != NULL:
            bam_header_destroy( header_to_write )

    # presumably `elif mode[0] == "r":` — branch header missing
        # open file for reading
        if strncmp( filename, "-", 1) != 0 and \
                not self.isremote and \
                not os.path.exists( filename ):
            raise IOError( "file `%s` not found" % filename)

        # try to detect errors
        self.samfile = samopen( filename, bmode, NULL )
        if self.samfile == NULL:
            raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)

        # bam files require a valid header
        # presumably `if self.isbam:` — branch header missing
        if self.samfile.header == NULL:
            raise ValueError( "file does not have valid header (mode='%s') - is it BAM format?" % mode )
        # presumably `else:` — branch header missing
        # in sam files it is optional (samfile full of unmapped reads)
        if check_header and self.samfile.header == NULL:
            raise ValueError( "file does not have valid header (mode='%s') - is it SAM format?" % mode )

        # disabled for autodetection to work
        # needs to be disabled so that reading from sam-files without headers works
        if check_sq and self.samfile.header.n_targets == 0:
            raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)

    if self.samfile == NULL:
        raise IOError("could not open file `%s`" % filename )

    # check for index and open if present
    if mode[0] == "r" and self.isbam:
        if not self.isremote:
            if not os.path.exists(filename + b".bai"):
                # NOTE(review): the body of this branch (index absent) and
                # the `else:` for the existing-index case are missing here
            # returns NULL if there is no index or index could not be opened
            self.index = bam_index_load(filename)
            if self.index == NULL:
                raise IOError("error while opening index `%s` " % filename )
        # presumably `else:` (remote file) — branch header missing
            self.index = bam_index_load(filename)
            if self.index == NULL:
                raise IOError("error while opening index `%s` " % filename )

        if not self.isstream:
            self.start_offset = bam_tell( self.samfile.x.bam )
def gettid( self, reference ):
    '''
    convert :term:`reference` name into numerical :term:`tid`

    returns -1 if reference is not known.
    '''
    # (docstring delimiters were lost in the excerpt, leaving the text
    # parsed as code; restored)
    if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
    reference = _force_bytes(reference)
    return pysam_reference2tid( self.samfile.header, reference )
def getrname( self, tid ):
    '''
    convert numerical :term:`tid` into :term:`reference` name.'''
    # (the docstring opener was lost in the excerpt; restored)
    if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
    if not 0 <= tid < self.samfile.header.n_targets:
        raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
    return _charptr_to_str(self.samfile.header.target_name[tid])
cdef char * _getrname( self, int tid ): # TODO unused
    '''
    convert numerical :term:`tid` into :term:`reference` name.'''
    # (the docstring opener was lost in the excerpt; restored)
    if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
    if not 0 <= tid < self.samfile.header.n_targets:
        raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
    return self.samfile.header.target_name[tid]
def _parseRegion( self,
    # NOTE(review): the remaining parameters (reference, start, end, region)
    # are missing from this excerpt, as are the docstring delimiters and
    # the try-blocks around the coordinate conversions below.
    '''
    parse region information.

    raise ValueError for invalid regions.

    returns a tuple of flag, tid, start and end. Flag indicates
    whether some coordinates were supplied.

    Note that regions are 1-based, while start,end are python coordinates.
    '''
    # This method's main objective is to translate from a reference to a tid.
    # For now, it calls bam_parse_region, which is clumsy. Might be worth
    # implementing it all in pysam (makes use of khash).

    cdef long long rstart
    # presumably `try: rstart = start` — lines missing from this excerpt
    except OverflowError:
        raise ValueError( 'start out of range (%i)' % start )
    # presumably `try: rend = end` — lines missing from this excerpt
    except OverflowError:
        raise ValueError( 'end out of range (%i)' % end )

    # presumably `if region:` — branch header missing
    region = _force_str(region)
    # split "chrom:start-end"; parts[0] is the reference name, and the
    # 1-based closed region string becomes 0-based half-open coordinates
    parts = re.split( "[:-]", region )
    if len(parts) >= 2: rstart = int(parts[1]) - 1
    if len(parts) >= 3: rend = int(parts[2])

    if not reference: return 0, 0, 0, 0

    rtid = self.gettid( reference )
    if rtid < 0: raise ValueError( "invalid reference `%s`" % reference )
    if rstart > rend: raise ValueError( 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend) )
    if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
    if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )

    return 1, rtid, rstart, rend
def reset( self ):
    '''reset file position to beginning of read section.'''
    # (the `def` line was lost in the excerpt; restored — seek back to the
    # offset recorded in _open just after the header)
    return self.seek( self.start_offset, 0 )
def seek( self, uint64_t offset, int where = 0):
    '''
    move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
    '''
    # the two guard conditions below were missing from the excerpt;
    # reconstructed from the attributes set in _open (isbam, isstream)
    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )
    if not self.isbam:
        raise NotImplementedError("seek only available in bam files")
    if self.isstream:
        # fixed error message wording ("seek no available" -> "not available")
        raise OSError("seek not available in streams")

    return bam_seek( self.samfile.x.bam, offset, where )
def tell( self ):
    '''
    return current file position
    '''
    # (the `def` line, docstring delimiters and guard conditions were lost
    # in the excerpt; reconstructed to mirror seek())
    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )
    if not self.isbam:
        raise NotImplementedError("seek only available in bam files")

    return bam_tell( self.samfile.x.bam )
# NOTE(review): interior of `Samfile.fetch` — the def line/signature and
# several branch headers are missing from this excerpt; gaps are marked.
    '''
    fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
    :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
    be supplied.

    Without *reference* or *region* all mapped reads will be fetched. The reads will be returned
    ordered by reference sequence, which will not necessarily be the order within the file.

    If *until_eof* is given, all reads from the current file position will be returned
    in order as they are within the file. Using this option will also fetch unmapped reads.

    If only *reference* is set, all reads aligned to *reference* will be fetched.

    The method returns an iterator of type :class:`pysam.IteratorRow` unless
    a *callback is provided. If *callback* is given, the callback will be executed
    for each position within the :term:`region`. Note that callbacks currently work
    only, if *region* or *reference* is given.

    Note that a :term:`SAM` file does not allow random access. If *region* or *reference* are given,
    an exception is raised.
    '''
    cdef int rtid, rstart, rend, has_coord

    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )

    has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

    # a stream cannot be re-opened for an independent iterator
    if self.isstream: reopen = False

    # presumably `if self.isbam:` — branch header missing
    if not until_eof and not self._hasIndex() and not self.isremote:
        raise ValueError( "fetch called on bamfile without index" )

    # presumably `if callback:` — branch header missing
    if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
    if not self._hasIndex(): raise ValueError( "no index available for fetch" )
    return bam_fetch(self.samfile.x.bam,
    # NOTE(review): remaining bam_fetch arguments missing from this excerpt

    # presumably `else:` with `if has_coord:` — the iterator-returning
    # branches; their guard lines are missing from this excerpt
    return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen )
    return IteratorRowAll( self, reopen=reopen )
    # AH: check - reason why no reopen for AllRefs?
    return IteratorRowAllRefs(self ) # , reopen=reopen )

    # presumably the SAM (non-bam) branch follows:
    raise ValueError ("fetching by region is not available for sam files" )
    raise NotImplementedError( "callback not implemented yet" )

    if self.samfile.header == NULL:
        raise ValueError( "fetch called for samfile without header")

    # check if targets are defined
    # give warning, sam_read1 segfaults
    if self.samfile.header.n_targets == 0:
        warnings.warn( "fetch called for samfile without header")

    return IteratorRowAll( self, reopen=reopen )
# NOTE(review): interior of `Samfile.mate(read)` — the def line and the
# tail of the method are missing from this excerpt; gaps are marked.
    '''return the mate of :class:`AlignedRead` *read*.

    Throws a ValueError if read is unpaired or the mate
    is unmapped.

    .. note::
        Calling this method will change the file position.
        This might interfere with any iterators that have
        not re-opened the file.
    '''
    cdef uint32_t flag = read._delegate.core.flag

    if flag & BAM_FPAIRED == 0:
        raise ValueError( "read %s: is unpaired" % (read.qname))
    if flag & BAM_FMUNMAP != 0:
        raise ValueError( "mate %s: is unmapped" % (read.qname))

    cdef MateData mate_data

    mate_data.name = <char *>bam1_qname(read._delegate)
    mate_data.mate = NULL
    # xor flags to get the other mate
    cdef int x = BAM_FREAD1 + BAM_FREAD2
    mate_data.flag = ( flag ^ x) & x

    # search the one-base window at the mate's mapped position
    bam_fetch(self.samfile.x.bam,
    # NOTE(review): the index argument appears to be missing here
              read._delegate.core.mtid,
              read._delegate.core.mpos,
              read._delegate.core.mpos + 1,
    # NOTE(review): `&mate_data, mate_callback` arguments appear missing

    if mate_data.mate == NULL:
        raise ValueError( "mate not found" )

    cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
    dest._delegate = mate_data.mate
    # NOTE(review): `return dest` appears to be missing from this excerpt
# NOTE(review): interior of `Samfile.count` — the start of the signature
# and several lines are missing from this excerpt; gaps are marked.
           until_eof = False ):
    '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*

    count reads :term:`region` using 0-based indexing. The region is specified by
    :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.

    Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
    an exception is raised.
    '''
    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )

    # NOTE(review): the flag is unpacked into `region` here (not
    # `has_coord` as in fetch) — looks like the variable is being reused;
    # TODO confirm against full source
    region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

    # presumably `if self.isbam:` — branch header missing
    if not until_eof and not self._hasIndex() and not self.isremote:
        raise ValueError( "fetch called on bamfile without index" )

    # presumably an `if not region:` guard — its condition line is missing
    raise ValueError( "counting functionality requires a region/reference" )
    if not self._hasIndex(): raise ValueError( "no index available for fetch" )
    bam_fetch(self.samfile.x.bam,
    # NOTE(review): remaining arguments (index, coords, counter address,
    # count_callback) and the return of the counter are missing

    raise ValueError ("count for a region is not available for sam files" )
# NOTE(review): interior of `Samfile.pileup` — the def line/signature and
# several branch headers are missing from this excerpt; gaps are marked.
    '''
    perform a :term:`pileup` within a :term:`region`. The region is specified by
    :term:`reference`, *start* and *end* (using 0-based indexing).
    Alternatively, a samtools *region* string can be supplied.

    Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
    ordered by :term:`reference` sequence, which will not necessarily be the order within the file.

    The method returns an iterator of type :class:`pysam.IteratorColumn` unless
    a *callback is provided. If a *callback* is given, the callback will be executed
    for each column within the :term:`region`.

    Note that :term:`SAM` formatted files do not allow random access.
    In these files, if a *region* or *reference* are given an exception is raised.

    Optional *kwargs* to the iterator:

    stepper
        The stepper controlls how the iterator advances.
        Possible options for the stepper are

        ``all``
            use all reads for pileup.

        ``samtools``
            same filter and read processing as in :term:`csamtools` pileup

    fastafile
        A :class:`FastaFile` object

    mask
        Skip all reads with bits set in mask if mask=True.

    max_depth
        Maximum read depth permitted. The default limit is *8000*.

    truncate
        By default, the samtools pileup engine outputs all reads overlapping a region (see note below).
        If truncate is True and a region is given, only output columns in the exact region

    .. note::
        *all* reads which overlap the region are returned. The first base returned will be the
        first base of the first read *not* necessarily the first base of the region used in the query.
    '''
    cdef int rtid, rstart, rend, has_coord
    cdef bam_plbuf_t *buf

    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )

    has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

    # presumably `if self.isbam:` — branch header missing
    if not self._hasIndex(): raise ValueError( "no index available for pileup" )

    # presumably `if callback:` — branch header missing
    if not has_coord: raise ValueError( "callback functionality requires a region/reference" )

    buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
    bam_fetch(self.samfile.x.bam,
              self.index, rtid, rstart, rend,
              buf, pileup_fetch_callback )

    # flush the pileup buffer: pushing NULL signals end of data
    bam_plbuf_push( NULL, buf)
    bam_plbuf_destroy(buf)

    # presumably `else:` with `if has_coord:` — iterator-returning branches
    return IteratorColumnRegion( self,
    # NOTE(review): remaining constructor arguments missing from excerpt
    return IteratorColumnAllRefs(self, **kwargs )

    # presumably the SAM (non-bam) branch follows:
    raise NotImplementedError( "pileup of samfiles not implemented yet" )
def close( self ):
    '''
    closes the :class:`pysam.Samfile`.'''
    # (the `def` line and docstring opener were lost in the excerpt;
    # restored, plus NULLing the handle so a second close is a no-op)
    if self.samfile != NULL:
        samclose( self.samfile )
        bam_index_destroy(self.index)
        self.samfile = NULL
def __dealloc__( self ):
    # remember: dealloc cannot call other methods
    # note: no doc string
    # note: __del__ is not called.
    # NOTE(review): a cleanup statement appears to be missing from this
    # excerpt between the comments and bam_destroy1 (presumably the file
    # handle/close cleanup) — TODO confirm against full source.
    bam_destroy1(self.b)
    if self._filename != NULL: free( self._filename )
# Write one aligned read to the file via samtools' samwrite().
# Returns the number of bytes written (except -1 signals a Python
# exception to Cython callers).
1163 cpdef int write( self, AlignedRead read ) except -1:
1165 write a single :class:`pysam.AlignedRead` to disk.
1167 returns the number of bytes written.
# Guard clause: fails on a closed file. The raise statement itself
# (original lines 1170-1171) is elided in this listing — presumably
# ValueError("I/O operation on closed file"); verify in full source.
1169 if not self._isOpen():
1172 return samwrite( self.samfile, read._delegate )
# Context-manager protocol so the file can be used in a `with` block.
# Both method bodies are elided in this listing — presumably
# `return self` and `self.close()` respectively; TODO confirm.
1174 def __enter__(self):
1177 def __exit__(self, exc_type, exc_value, traceback):
1181 ###############################################################
1182 ###############################################################
1183 ###############################################################
1185 ###############################################################
1187 '''number of :term:`filename` associated with this object.'''
1189 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1190 return self._filename
1192 property nreferences:
1193 '''number of :term:`reference` sequences in the file.'''
# Read straight from the C header struct. The `def __get__(self):`
# line (original 1194) is elided in this listing.
1195 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1196 return self.samfile.header.n_targets
1198 property references:
1199 """tuple with the names of :term:`reference` sequences."""
# Collects each target name from the C header, decoded to str.
# The getter prologue (`def __get__`, `t = []`) and the final
# `return tuple(t)` are elided in this listing — verify in full source.
1201 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1203 for x from 0 <= x < self.samfile.header.n_targets:
1204 t.append( _charptr_to_str(self.samfile.header.target_name[x]) )
1208 """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
1209 :attr:`pysam.Samfile.references`
1212 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1214 for x from 0 <= x < self.samfile.header.n_targets:
1215 t.append( self.samfile.header.target_len[x] )
1219 """total number of mapped reads in file.
1222 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1223 if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" )
1226 cdef uint32_t total = 0
1227 for tid from 0 <= tid < self.samfile.header.n_targets:
1228 total += pysam_get_mapped( self.index, tid )
1232 """total number of unmapped reads in file.
1235 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1236 if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" )
1238 cdef uint32_t total = 0
1239 for tid from 0 <= tid < self.samfile.header.n_targets:
1240 total += pysam_get_unmapped( self.index, tid )
1241 # get unmapped reads without coordinates
1242 total += pysam_get_unmapped( self.index, -1 )
1246 '''full contents of the :term:`sam file` header as a string.'''
1248 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1249 return from_string_and_size(self.samfile.header.text, self.samfile.header.l_text)
1252 '''header information within the :term:`sam file`. The records and fields are returned as
1253 a two-level dictionary.
1256 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1260 if self.samfile.header.text != NULL:
1261 # convert to python string (note: call self.text to create 0-terminated string)
1263 for line in t.split("\n"):
1264 if not line.strip(): continue
1265 assert line.startswith("@"), "header line without '@': '%s'" % line
1266 fields = line[1:].split("\t")
1268 assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)
1272 if record not in result: result[record] = []
1273 result[record].append( "\t".join( fields[1:] ) )
1276 # the following is clumsy as generators do not work?
1278 for field in fields[1:]:
1279 key, value = field.split(":",1)
1280 # uppercase keys must be valid
1281 # lowercase are permitted for user fields
1282 if key in VALID_HEADER_FIELDS[record]:
1283 x[key] = VALID_HEADER_FIELDS[record][key](value)
1284 elif not key.isupper():
1287 raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
1289 if VALID_HEADER_TYPES[record] == dict:
1290 if record in result:
1291 raise ValueError( "multiple '%s' lines are not permitted" % record )
1293 elif VALID_HEADER_TYPES[record] == list:
1294 if record not in result: result[record] = []
1295 result[record].append( x )
1297 # if there are no SQ lines in the header, add the reference names
1298 # from the information in the bam file.
1299 # Background: c-samtools keeps the textual part of the header separate from
1300 # the list of reference names and lengths. Thus, if a header contains only
1301 # SQ lines, the SQ information is not part of the textual header and thus
1302 # are missing from the output. See issue 84.
1303 if "SQ" not in result:
1305 for ref, length in zip( self.references, self.lengths ):
1306 sq.append( {'LN': length, 'SN': ref } )
# Serialize one header record (e.g. "HD", "SQ", or a user-defined
# lower-case tag) plus its *fields* into a single tab-separated
# "@XX\tKEY:VALUE..." header line.
1311 def _buildLine( self, fields, record ):
1312 '''build a header line from *fields* dictionary for *record*'''
1314 # TODO: add checking for field and sort order
1315 line = ["@%s" % record ]
# NOTE(review): the condition guarding this append (original lines
# 1316-1317, presumably a string-typed *fields* branch) is elided here.
1318 line.append( fields )
# User-defined (lower-case) records: emit keys in sorted order.
1320 elif record.islower():
1321 for key in sorted(fields):
1322 line.append( "%s:%s" % (key, str(fields[key])))
# Known records: emit keys in the order prescribed by the SAM spec
# (VALID_HEADER_ORDER), then any extra lower-case user keys.
# Lines 1327 and 1329-1330 are elided in this listing.
1325 # write fields of the specification
1326 for key in VALID_HEADER_ORDER[record]:
1328 line.append( "%s:%s" % (key, str(fields[key])))
1331 if not key.isupper():
1332 line.append( "%s:%s" % (key, str(fields[key])))
1334 return "\t".join( line )
# Build a fresh C bam_header_t from the two-level dictionary produced by
# the `header` property: fills the text field and, for "SQ" records,
# the binary target_name/target_len arrays.
1336 cdef bam_header_t * _buildHeader( self, new_header ):
1337 '''return a new header built from a dictionary in *new_header*.
1339 This method inserts the text field, target_name and target_len.
1344 # check if hash exists
1346 # create new header and copy old data
1347 cdef bam_header_t * dest
1349 dest = bam_header_init()
# Known record types first, in canonical VALID_HEADERS order. Each
# record's payload must match its declared container type (dict for
# single-line records, list for repeatable ones).
1351 # first: defined tags
1352 for record in VALID_HEADERS:
1353 if record in new_header:
1354 ttype = VALID_HEADER_TYPES[record]
1355 data = new_header[record]
1356 if type( data ) != type( ttype() ):
1357 raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
1358 if type( data ) is dict:
1359 lines.append( self._buildLine( data, record ) )
# (the `else:` for the list case, original line 1360, is elided here)
1361 for fields in new_header[record]:
1362 lines.append( self._buildLine( fields, record ) )
1364 # then: user tags (lower case), sorted alphabetically
1365 for record, data in sorted(new_header.items()):
1366 if record in VALID_HEADERS: continue
1367 if type( data ) is dict:
1368 lines.append( self._buildLine( data, record ) )
1370 for fields in new_header[record]:
1371 lines.append( self._buildLine( fields, record ) )
# Copy assembled text into C-owned memory.
# NOTE(review): calloc(len(text)) leaves no room for a trailing NUL and
# strncpy copies exactly l_text bytes — downstream code must rely on
# l_text, not NUL-termination; confirm against samtools' expectations.
1373 text = "\n".join(lines) + "\n"
1374 if dest.text != NULL: free( dest.text )
1375 dest.text = <char*>calloc( len(text), sizeof(char))
1376 dest.l_text = len(text)
1377 cdef bytes btext = text.encode('ascii')
1378 strncpy( dest.text, btext, dest.l_text )
# Binary reference list from the "SQ" records: both SN (name) and
# LN (length) are required; KeyError propagates from the elided
# try/except (original lines 1385/1387 are missing in this listing).
1382 if "SQ" in new_header:
1384 for fields in new_header["SQ"]:
1386 seqs.append( (fields["SN"], fields["LN"] ) )
1388 raise KeyError( "incomplete sequence information in '%s'" % str(fields))
1390 dest.n_targets = len(seqs)
1391 dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
1392 dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
# +1 in both the allocation and the copy keeps the NUL terminator.
1394 for x from 0 <= x < dest.n_targets:
1395 seqname, seqlen = seqs[x]
1396 dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
1397 bseqname = seqname.encode('ascii')
1398 strncpy( dest.target_name[x], bseqname, len(seqname) + 1 )
1399 dest.target_len[x] = seqlen
1403 ###############################################################
1404 ###############################################################
1405 ###############################################################
1406 ## file-object like iterator access
1407 ## note: concurrent access will cause errors (see IteratorRow
1409 ## Possible solutions: deprecate or open new file handle
1410 ###############################################################
1412 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1413 if not self.isbam and self.samfile.header.n_targets == 0:
1414 raise NotImplementedError( "can not iterate over samfile without header")
# File-object-style iteration support on Samfile itself.
# getCurrent's body (original line 1418, presumably `return self.b`)
# is elided in this listing.
1417 cdef bam1_t * getCurrent( self ):
# C-level advance: reads the next record into the reusable buffer
# self.b; the samread() return code doubles as an EOF/error signal.
1420 cdef int cnext(self):
1422 cversion of iterator. Used by :class:`pysam.Samfile.IteratorColumn`.
1425 return samread(self.samfile, self.b)
# Python-level __next__ (its `def` line is elided here): re-reads into
# self.b and wraps it; the StopIteration branch (line 1433) is elided.
1429 python version of next().
1432 ret = samread(self.samfile, self.b)
1434 return makeAlignedRead( self.b )
1438 ##-------------------------------------------------------------------
1439 ##-------------------------------------------------------------------
1440 ##-------------------------------------------------------------------
# Abstract marker base class: no behaviour visible in this listing; the
# concrete subclasses below each implement __cinit__/cnext/__next__.
1441 cdef class IteratorRow:
1442 '''abstract base class for iterators over mapped reads.
1444 Various iterators implement different behaviours for wrapping around
1445 contig boundaries. Examples include:
1447 :class:`pysam.IteratorRowRegion`
1448 iterate within a single contig and a defined region.
1450 :class:`pysam.IteratorRowAll`
1451 iterate until EOF. This iterator will also include unmapped reads.
1453 :class:`pysam.IteratorRowAllRefs`
1454 iterate over all reads in all reference sequences.
1456 The method :meth:`Samfile.fetch` returns an IteratorRow.
1461 cdef class IteratorRowRegion(IteratorRow):
1462 """*(Samfile samfile, int tid, int beg, int end, int reopen = True )*
1464 iterate over mapped reads in a region.
1466 By default, the file is re-openend to avoid conflicts between
1467 multiple iterators working on the same file. Set *reopen* = False
1468 to not re-open *samfile*.
1470 The samtools iterators assume that the file
1471 position between iterations do not change.
1472 As a consequence, no two iterators can work
1473 on the same file. To permit this, each iterator
1474 creates its own file handle by re-opening the
1477 Note that the index will be shared between
1478 samfile and the iterator.
# Validates preconditions, optionally reopens the file for an
# independent handle, then builds the samtools index query iterator.
1481 def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):
1483 if not samfile._isOpen():
1484 raise ValueError( "I/O operation on closed file" )
1486 if not samfile._hasIndex():
1487 raise ValueError( "no index available for iteration" )
1489 # makes sure that samfile stays alive as long as the
1491 self.samfile = samfile
# Mode selection: the non-bam `else` branch (original line ~1494)
# is elided in this listing.
1493 if samfile.isbam: mode = b"rb"
# Reopen branch — the guarding `if reopen:` line is elided here.
# owns_samfile records whether __dealloc__ must close self.fp.
1496 # reopen the file - note that this makes the iterator
1497 # slow and causes pileup to slow down significantly.
1499 store = StderrStore()
1500 self.fp = samopen( samfile._filename, mode, NULL )
1502 assert self.fp != NULL
1503 self.owns_samfile = True
1505 self.fp = self.samfile.samfile
1506 self.owns_samfile = False
# Index query for [beg, end) on tid (argument lines 1511-1513 elided)
# plus a private bam1_t buffer reused on every advance.
1510 self.iter = bam_iter_query(self.samfile.index,
1514 self.b = bam_init1()
1519 cdef bam1_t * getCurrent( self ):
# C-level advance; stores the return code in self.retval so __next__
# can distinguish success from end-of-iteration.
1522 cdef int cnext(self):
1523 '''cversion of iterator. Used by IteratorColumn'''
1524 self.retval = bam_iter_read( self.fp.x.bam,
# Python-level __next__ (its `def` line is elided in this listing):
# negative retval means exhausted.
1529 """python version of next().
1532 if self.retval < 0: raise StopIteration
1533 return makeAlignedRead( self.b )
# Tear down in reverse order of acquisition; only close self.fp when
# this iterator opened its own handle.
1535 def __dealloc__(self):
1536 bam_destroy1(self.b)
1537 bam_iter_destroy( self.iter )
1538 if self.owns_samfile: samclose( self.fp )
1540 cdef class IteratorRowAll(IteratorRow):
1541 """*(Samfile samfile, int reopen = True)*
1543 iterate over all reads in *samfile*
1545 By default, the file is re-openend to avoid conflicts between
1546 multiple iterators working on the same file. Set *reopen* = False
1547 to not re-open *samfile*.
# Sequential scan over the whole file — no index needed (unlike
# IteratorRowRegion). Unmapped reads are included.
1550 def __cinit__(self, Samfile samfile, int reopen = True ):
1552 if not samfile._isOpen():
1553 raise ValueError( "I/O operation on closed file" )
# Non-bam `else` branch of the mode selection is elided here.
1555 if samfile.isbam: mode = b"rb"
# The guarding `if reopen:` line is elided in this listing;
# owns_samfile records whether __dealloc__ must close self.fp.
1558 # reopen the file to avoid iterator conflict
1560 store = StderrStore()
1561 self.fp = samopen( samfile._filename, mode, NULL )
1563 assert self.fp != NULL
1564 self.owns_samfile = True
1566 self.fp = samfile.samfile
1567 self.owns_samfile = False
1569 # allocate memory for alignment
1570 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
1575 cdef bam1_t * getCurrent( self ):
# C-level advance: samread()'s return code signals EOF/error.
1578 cdef int cnext(self):
1579 '''cversion of iterator. Used by IteratorColumn'''
1580 return samread(self.fp, self.b)
# Python-level __next__ (its `def` line and the StopIteration branch,
# original ~1589, are elided in this listing).
1583 """python version of next().
1585 pyrex uses this non-standard name instead of next()
1588 ret = samread(self.fp, self.b)
1590 return makeAlignedRead( self.b )
1594 def __dealloc__(self):
1595 bam_destroy1(self.b)
1596 if self.owns_samfile: samclose( self.fp )
1598 cdef class IteratorRowAllRefs(IteratorRow):
1599 """iterates over all mapped reads by chaining iterators over each reference
# Requires an index: each reference is walked with its own
# IteratorRowRegion spanning the maximal coordinate range (1<<29).
1602 def __cinit__(self, Samfile samfile):
1603 assert samfile._isOpen()
1604 if not samfile._hasIndex(): raise ValueError("no index available for fetch")
1605 self.samfile = samfile
# Initialisation of self.tid (original lines 1606-1608) is elided
# in this listing.
1609 self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, 1<<29)
# Python-level __next__ (its `def` line is elided): advance the current
# per-reference iterator, falling through to the next reference when
# exhausted; several control-flow lines are elided in this listing.
1615 """python version of next().
1617 pyrex uses this non-standard name instead of next()
1619 # Create an initial iterator
1621 if not self.samfile.nreferences:
1627 self.rowiter.cnext()
1629 # If current iterator is not exhausted, return aligned read
1630 if self.rowiter.retval>0:
1631 return makeAlignedRead(self.rowiter.b)
1635 # Otherwise, proceed to next reference or stop
1636 if self.tid<self.samfile.nreferences:
1641 cdef class IteratorRowSelection(IteratorRow):
1642 """*(Samfile samfile)*
1644 iterate over reads in *samfile* at a given list of file positions.
# BAM-only: seeks to each stored virtual file offset in *positions*
# and reads one record there.
1647 def __cinit__(self, Samfile samfile, positions, int reopen = True ):
1649 if not samfile._isOpen():
1650 raise ValueError( "I/O operation on closed file" )
# NOTE(review): this _isOpen check duplicates the one directly above —
# looks like a copy/paste leftover; candidate for removal in the full
# source.
1652 if not samfile._isOpen():
1653 raise ValueError( "I/O operation on closed file" )
1655 assert samfile.isbam, "can only use this iterator on bam files"
# The `mode = b"rb"` assignment and the guarding `if reopen:` line
# are elided in this listing.
1658 # reopen the file to avoid iterator conflict
1660 store = StderrStore()
1661 self.fp = samopen( samfile._filename, mode, NULL )
1663 assert self.fp != NULL
1664 self.owns_samfile = True
1666 self.fp = samfile.samfile
1667 self.owns_samfile = False
1669 # allocate memory for alignment
1670 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
1672 self.positions = positions
1673 self.current_pos = 0
1678 cdef bam1_t * getCurrent( self ):
# C-level advance: seek to the next stored offset, then read one
# record; -1 signals exhaustion of the position list.
1681 cdef int cnext(self):
1682 '''cversion of iterator'''
1684 # end iteration if out of positions
1685 if self.current_pos >= len(self.positions): return -1
1687 bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
1688 self.current_pos += 1
1689 return samread(self.fp, self.b)
# Python-level __next__ (its `def` line and the StopIteration branch,
# original ~1698, are elided in this listing).
1692 """python version of next().
1694 pyrex uses this non-standard name instead of next()
1697 cdef int ret = self.cnext()
1699 return makeAlignedRead( self.b )
1703 def __dealloc__(self):
1704 bam_destroy1(self.b)
1705 if self.owns_samfile: samclose( self.fp )
1707 ##-------------------------------------------------------------------
1708 ##-------------------------------------------------------------------
1709 ##-------------------------------------------------------------------
# Pileup advance callback for stepper "all": passes every read through
# with no filtering. *data* is the __iterdata bundle set up by
# IteratorColumn.setupIteratorData.
1710 cdef int __advance_all( void * data, bam1_t * b ):
1711 '''advance without any read filtering.
1714 d = <__iterdata*>data
1715 return bam_iter_read( d.samfile.x.bam, d.iter, b )
# Pileup advance callback for stepper "samtools": reproduces the read
# filtering/processing of the samtools pileup engine (reference-based
# BAQ realignment, mapping-quality capping, flag filters). Many
# original lines are elided in this listing; several locals (is_cns,
# skip, q) appear only via their elided declarations.
1717 cdef int __advance_snpcalls( void * data, bam1_t * b ):
1718 '''advance using same filter and read processing as in
1719 the samtools pileup.
1722 d = <__iterdata*>data
1724 cdef int ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
1728 cdef int is_nobaq = 0
1729 cdef int capQ_thres = 0
# Lazily (re)fetch the reference sequence when the read's tid differs
# from the cached one; the fetch's remaining arguments and the
# d.tid update (lines 1737-1739, 1742-1748) are elided here.
1732 if d.fastafile != NULL and b.core.tid != d.tid:
1733 if d.seq != NULL: free(d.seq)
1735 d.seq = faidx_fetch_seq(d.fastafile,
1736 d.samfile.header.target_name[d.tid],
1740 raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
1741 (d.samfile.header.target_name[d.tid],
# BAQ realignment rewrites base qualities in-place on the record.
1749 # realign read - changes base qualities
1750 if d.seq != NULL and is_cns and not is_nobaq: bam_prob_realn( b, d.seq )
# Cap mapping quality; the branch for q < 0 (line 1754) is elided.
1752 if d.seq != NULL and capQ_thres > 10:
1753 q = bam_cap_mapQ(b, d.seq, capQ_thres)
1755 elif b.core.qual > q: b.core.qual = q
# Skip unmapped reads and paired reads that are not properly paired
# (flag bit 1 = paired, bit 2 = proper pair).
1756 if b.core.flag & BAM_FUNMAP: skip = 1
1757 elif b.core.flag & 1 and not b.core.flag & 2: skip = 1
# Loop back for the next read when the current one was skipped; the
# surrounding loop/exit lines are elided in this listing.
1760 # additional filters
1762 ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
1766 cdef class IteratorColumn:
1767 '''abstract base class for iterators over columns.
1769 IteratorColumn objects wrap the pileup functionality of samtools.
1771 For reasons of efficiency, the iterator points to the current
1772 pileup buffer. The pileup buffer is updated at every iteration.
1773 This might cause some unexpected behavious. For example,
1774 consider the conversion to a list::
1776 f = Samfile("file.bam", "rb")
1777 result = list( f.pileup() )
1779 Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
1780 but each object in ``result`` will contain the same information.
1782 The desired behaviour can be achieved by list comprehension::
1784 result = [ x.pileups() for x in f.pileup() ]
1786 ``result`` will be a list of ``n`` lists of objects of type :class:`PileupRead`.
1788 If the iterator is associated with a :class:`Fastafile` using the :meth:`addReference`
1789 method, then the iterator will export the current sequence via the methods :meth:`getSequence`
1790 and :meth:`seq_len`.
1792 Optional kwargs to the iterator
1795 The stepper controls how the iterator advances.
1796 Possible options for the stepper are
1799 use all reads for pileup.
1801 same filter and read processing as in :term:`csamtools` pileup
1803 The default is to use "all" if no stepper is given.
1806 A :class:`FastaFile` object
1808 Skip all reads with bits set in mask.
1810 maximum read depth. The default is 8000.
# Stores configuration only; the actual pileup engine is created later
# by setupIteratorData (subclasses call it with a concrete region).
1813 def __cinit__( self, Samfile samfile, **kwargs ):
1814 self.samfile = samfile
1815 self.mask = kwargs.get("mask", BAM_DEF_MASK )
1816 self.fastafile = kwargs.get( "fastafile", None )
1817 self.stepper = kwargs.get( "stepper", None )
1818 self.max_depth = kwargs.get( "max_depth", 8000 )
1819 self.iterdata.seq = NULL
1824 self.pileup_iter = <bam_plp_t>NULL
# Advance to the next pileup column; updates self.plp (and, per the
# elided argument lines 1837-1839, presumably tid/pos/n_plp outputs).
1830 cdef int cnext(self):
1831 '''perform next iteration.
1833 This method is analogous to the samtools bam_plp_auto method.
1834 It has been re-implemented to permit for filtering.
1836 self.plp = bam_plp_auto( self.pileup_iter,
# Reference sequence currently cached for the active tid (NULL when no
# fastafile is attached or nothing fetched yet).
1841 cdef char * getSequence( self ):
1842 '''return current reference sequence underlying the iterator.
1844 return self.iterdata.seq
# property seq_len — its `property` header line is elided here.
1847 '''current sequence length.'''
1848 def __get__(self): return self.iterdata.seq_len
# Attach a fasta reference after construction; invalidates any cached
# sequence so the next advance re-fetches for the current tid.
1850 def addReference( self, Fastafile fastafile ):
1852 add reference sequences in *fastafile* to iterator.'''
1853 self.fastafile = fastafile
1854 if self.iterdata.seq != NULL: free(self.iterdata.seq)
1855 self.iterdata.tid = -1
1856 self.iterdata.fastafile = self.fastafile.fastafile
# NOTE(review): returns the Fastafile object itself (truthy/None),
# not a bool, despite the docstring saying "true".
1858 def hasReference( self ):
1860 return true if iterator is associated with a reference'''
1861 return self.fastafile
# Pushes self.mask down into the samtools pileup buffer.
1863 cdef setMask( self, mask ):
1864 '''set masking flag in iterator.
1866 reads with bits set in *mask* will be skipped.
1869 bam_plp_set_mask( self.pileup_iter, self.mask )
# Build the row iterator + __iterdata bundle + pileup engine for a
# region (parameter lines 1872-1875 are elided in this listing).
1871 cdef setupIteratorData( self,
1876 '''setup the iterator structure'''
1878 self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen )
1879 self.iterdata.samfile = self.samfile.samfile
1880 self.iterdata.iter = self.iter.iter
1881 self.iterdata.seq = NULL
1882 self.iterdata.tid = -1
1884 if self.fastafile != None:
1885 self.iterdata.fastafile = self.fastafile.fastafile
# (the `else:` line 1886 is elided here)
1887 self.iterdata.fastafile = NULL
# Choose the advance callback by stepper name; "all" is the default.
1889 if self.stepper == None or self.stepper == "all":
1890 self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
1891 elif self.stepper == "samtools":
1892 self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
1894 raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)
1897 bam_plp_set_maxcnt( self.pileup_iter, self.max_depth )
1899 bam_plp_set_mask( self.pileup_iter, self.mask )
# Re-aim the existing pileup engine at a new region without paying the
# full setup cost again (note reopen=0: reuses the file handle).
1901 cdef reset( self, tid, start, end ):
1902 '''reset iterator position.
1904 This permits using the iterator multiple times without
1905 having to incur the full set-up costs.
1907 self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
1908 self.iterdata.iter = self.iter.iter
1910 # invalidate sequence if different tid
1912 if self.iterdata.seq != NULL: free( self.iterdata.seq )
1913 self.iterdata.seq = NULL
1914 self.iterdata.tid = -1
1916 # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
1917 bam_plp_reset(self.pileup_iter)
# Free the pileup engine and any cached reference sequence; resetting
# first suppresses samtools' leak warnings for unconsumed iterators.
1919 def __dealloc__(self):
1920 # reset in order to avoid memory leak messages for iterators that have
1921 # not been fully consumed
1922 if self.pileup_iter != <bam_plp_t>NULL:
1923 bam_plp_reset(self.pileup_iter)
1924 bam_plp_destroy(self.pileup_iter)
1925 self.pileup_iter = <bam_plp_t>NULL
1927 if self.iterdata.seq != NULL:
1928 free(self.iterdata.seq)
1929 self.iterdata.seq = NULL
1931 cdef class IteratorColumnRegion(IteratorColumn):
1932 '''iterates over a region only.
# Constructor parameter lines (tid/start/end, originals 1935-1937,
# 1939-1940) are elided in this listing.
1934 def __cinit__(self, Samfile samfile,
1938 int truncate = False,
1941 # initialize iterator
1942 self.setupIteratorData( tid, start, end, 1 )
1945 self.truncate = truncate
# Python-level __next__ (its `def`, the surrounding loop, and the
# cnext()/truncate guards at lines 1949-1959 are partially elided).
1948 """python version of next().
1954 raise ValueError("error during iteration" )
1956 if self.plp == NULL:
# NOTE(review): with truncate semantics ("only output columns in the
# exact region") one would expect `if self.pos < self.start: continue`;
# the comparison below reads reversed — verify against the full source.
1960 if self.start < self.pos: continue
1961 if self.pos >= self.end: raise StopIteration
1963 return makePileupProxy( &self.plp,
1968 cdef class IteratorColumnAllRefs(IteratorColumn):
1969 """iterates over all columns by chaining iterators over each reference
# Constructor (its `def __cinit__` line, originals 1970-1975, is
# elided): starts on the first reference over the maximal range.
1976 # no iteration over empty files
1977 if not samfile.nreferences: raise StopIteration
1979 # initialize iterator
1980 self.setupIteratorData( self.tid, 0, max_pos, 1 )
# Python-level __next__ (def line elided): advance, yield the column if
# still within the current reference, otherwise re-arm on the next
# reference (self.tid increment, line ~2000, is elided) or stop.
1983 """python version of next().
1990 raise ValueError("error during iteration" )
1992 # return result, if within same reference
1993 if self.plp != NULL:
1994 return makePileupProxy( &self.plp,
1999 # otherwise, proceed to next reference or stop
2001 if self.tid < self.samfile.nreferences:
2002 self.setupIteratorData( self.tid, 0, max_pos, 0 )
2006 ##-------------------------------------------------------------------
2007 ##-------------------------------------------------------------------
2008 ##-------------------------------------------------------------------
# Offset of the first aligned (non-soft-clipped) base within the query
# sequence: sums the lengths of leading soft clips, validating that
# hard clips only occur at the read ends. The `return -1` after the
# error, the loop-break for non-clip ops, and the final return
# (originals 2021, 2024-2027) are elided in this listing.
2009 cdef inline int32_t query_start(bam1_t *src) except -1:
2010 cdef uint32_t * cigar_p, op
2012 cdef uint32_t start_offset = 0
2014 if src.core.n_cigar:
2015 cigar_p = bam1_cigar(src);
# Each CIGAR element packs length<<BAM_CIGAR_SHIFT | op.
2016 for k from 0 <= k < src.core.n_cigar:
2017 op = cigar_p[k] & BAM_CIGAR_MASK
2018 if op==BAM_CHARD_CLIP:
2019 if start_offset!=0 and start_offset!=src.core.l_qseq:
2020 PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
2022 elif op==BAM_CSOFT_CLIP:
2023 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
2029 ##-------------------------------------------------------------------
2030 ##-------------------------------------------------------------------
2031 ##-------------------------------------------------------------------
# Offset one past the last aligned base within the query sequence:
# walks the CIGAR from the back, subtracting trailing soft clips.
# The error-return, loop-break, and final return lines (originals
# 2044, 2047-2050, 2053) are elided in this listing.
2032 cdef inline int32_t query_end(bam1_t *src) except -1:
2033 cdef uint32_t * cigar_p, op
2035 cdef uint32_t end_offset = src.core.l_qseq
2037 if src.core.n_cigar>1:
2038 cigar_p = bam1_cigar(src);
# Reverse iteration (Pyrex `for ... from` counts n_cigar-1 down to 1).
2039 for k from src.core.n_cigar > k >= 1:
2040 op = cigar_p[k] & BAM_CIGAR_MASK
2041 if op==BAM_CHARD_CLIP:
2042 if end_offset!=0 and end_offset!=src.core.l_qseq:
2043 PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
2045 elif op==BAM_CSOFT_CLIP:
2046 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
# Fallback (guard line elided, original 2050): an all-clip result
# resets to the full query length.
2051 end_offset = src.core.l_qseq
# Decode the 4-bit-packed BAM sequence into a bytes object for query
# positions [start, end). Intermediate lines (the None return for
# empty sequences, the s/p pointer setup, and the final return,
# originals 2057-2063, 2065-2067, 2072-2074) are elided here.
2056 cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
2061 if not src.core.l_qseq:
2064 seq = PyBytes_FromStringAndSize(NULL, end - start)
2068 for k from start <= k < end:
2069 # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
2070 # note: do not use string literal as it will be a python string
# Two bases per byte: high nibble is the even position, hence the
# shift by 4*(1-k%2) before masking.
2071 s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
# Convert raw BAM base qualities for positions [start, end) into a
# phred+33 (ASCII) bytes object. The 0xff "no qualities" check and the
# q/p pointer setup (originals 2077-2084, 2086-2087) are elided here.
2076 cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
2085 qual = PyBytes_FromStringAndSize(NULL, end - start)
2088 for k from start <= k < end:
2089 ## equivalent to t[i] + 33 (see bam.c)
2090 q[k-start] = p[k] + 33
2094 cdef class AlignedRead:
2096 Class representing an aligned read. see SAM format specification for
2097 the meaning of fields (http://samtools.sourceforge.net/).
2099 This class stores a handle to the samtools C-structure representing
2100 an aligned read. Member read access is forwarded to the C-structure
2101 and converted into python objects. This implementation should be fast,
2102 as only the data needed is converted.
2104 For write access, the C-structure is updated in-place. This is
2105 not the most efficient way to build BAM entries, as the variable
2106 length data is concatenated and thus needs to resized if
2107 a field is updated. Furthermore, the BAM entry might be
2108 in an inconsistent state. The :meth:`~validate` method can
2109 be used to check if an entry is consistent.
2111 One issue to look out for is that the sequence should always
2112 be set *before* the quality scores. Setting the sequence will
2113 also erase any quality scores that were set previously.
2115 In Python 3, the fields containing sequence and quality
2116 (seq, query, qual and qqual) data are of type bytes. Other
2117 string data, such as the qname field and strings in the
2118 tags tuple, is represented as unicode strings. On assignment,
2119 both bytes and unicode objects are allowed, but unicode strings
2120 must contain only ASCII characters.
2123 # Now only called when instances are created from Python
2126 self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
2127 # allocate some memory
2128 # If size is 0, calloc does not return a pointer that can be passed to free()
2129 # so allocate 40 bytes for a new read
2130 self._delegate.m_data = 40
2131 self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
2132 self._delegate.data_len = 0
# Free the owned C bam1_t (allocated in __cinit__ above).
2134 def __dealloc__(self):
2135 bam_destroy1(self._delegate)
2138 """return string representation of alignment.
2140 The representation is an approximate :term:`sam` format.
2142 An aligned read might not be associated with a :term:`Samfile`.
2143 As a result :term:`tid` is shown instead of the reference name.
2145 Similarly, the tags field is returned in its parsed state.
2147 # sam-parsing is done in sam.c/bam_format1_core which
2148 # requires a valid header.
2149 if sys.version_info[0] < 3:
2153 seq = self.seq.decode('ascii')
2154 qual = self.qual.decode('ascii')
2155 return "\t".join(map(str, (self.qname,
# Three-way binary comparison of two reads: first the fixed-size core
# struct, then data length, then the variable-length data blob.
# The t/o pointer setup and the identity fast-path body (originals
# 2170-2176, 2187-2189) are elided in this listing.
2168 def compare(self, AlignedRead other):
2169 '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
2177 # uncomment for debugging purposes
2178 # cdef unsigned char * oo, * tt
2179 # tt = <unsigned char*>(&t.core)
2180 # oo = <unsigned char*>(&o.core)
2181 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
2182 # tt = <unsigned char*>(t.data)
2183 # oo = <unsigned char*>(o.data)
2184 # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
2186 # Fast-path test for object identity
2190 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
2192 if retval: return retval
# Python-3-safe replacement for cmp(): (a>b)-(a<b) yields -1/0/1.
2193 retval = (t.data_len > o.data_len) - (t.data_len < o.data_len) # cmp(t.data_len, o.data_len)
2194 if retval: return retval
2195 return memcmp(t.data, o.data, t.data_len)
2197 # Disabled so long as __cmp__ is a special method
2199 return _Py_HashPointer(<void *>self)
2202 """the query name (None if not present)"""
2205 src = self._delegate
2206 if src.core.l_qname == 0: return None
2207 return _charptr_to_str(<char *>bam1_qname( src ))
2209 def __set__(self, qname ):
2210 if qname == None or len(qname) == 0: return
2211 qname = _force_bytes(qname)
2216 src = self._delegate
2217 p = bam1_qname( src )
2219 # the qname is \0 terminated
2221 pysam_bam_update( src,
2226 src.core.l_qname = l
2228 # re-acquire pointer to location in memory
2229 # as it might have moved
2232 strncpy( p, qname, l )
2235 """the :term:`cigar` alignment (None if not present). The alignment
2236 is returned as a list of operations. The operations are:
2238 +-----+--------------+-----+
2240 +-----+--------------+-----+
2242 +-----+--------------+-----+
2244 +-----+--------------+-----+
2245 |N |BAM_CREF_SKIP |3 |
2246 +-----+--------------+-----+
2247 |S |BAM_CSOFT_CLIP|4 |
2248 +-----+--------------+-----+
2249 |H |BAM_CHARD_CLIP|5 |
2250 +-----+--------------+-----+
2252 +-----+--------------+-----+
2254 +-----+--------------+-----+
2256 +-----+--------------+-----+
2260 cdef uint32_t * cigar_p
2265 src = self._delegate
2266 if src.core.n_cigar == 0: return None
2269 cigar_p = bam1_cigar(src);
2270 for k from 0 <= k < src.core.n_cigar:
2271 op = cigar_p[k] & BAM_CIGAR_MASK
2272 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2273 cigar.append((op, l))
2276 def __set__(self, values ):
2277 if values == None or len(values) == 0: return
2285 src = self._delegate
2287 # get location of cigar string
2290 # create space for cigar data within src.data
2291 pysam_bam_update( src,
2292 src.core.n_cigar * 4,
2296 # length is number of cigar operations, not bytes
2297 src.core.n_cigar = len(values)
2299 # re-acquire pointer to location in memory
2300 # as it might have moved
2303 # insert cigar operations
2304 for op, l in values:
2305 p[k] = l << BAM_CIGAR_SHIFT | op
2308 ## setting the cigar string also updates the "bin" attribute
2309 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
2311 property cigarstring:
2312 '''the :term:`cigar` alignment as a string.
2314 Returns the empty string if not present.
# Getter: renders the (op, length) tuples from self.cigar via the
# CODE2CIGAR lookup. The `def __get__` line and the `c = self.cigar`
# assignment (originals 2315-2317) are elided in this listing.
2318 if c == None: return ""
2319 else: return "".join([ "%c%i" % (CODE2CIGAR[x],y) for x,y in c])
def __set__(self, cigar):
    # Clearing: None or the empty string assigns an empty CIGAR and must
    # return here. The original fell through to the regex parse below,
    # so `cigarstring = None` raised TypeError from CIGAR_REGEX.findall.
    if cigar == None or len(cigar) == 0:
        self.cigar = []
        return
    # Parse "<length><op>" groups and map each op character to its
    # numeric BAM code for the cigar property.
    parts = CIGAR_REGEX.findall( cigar )
    self.cigar = [ (CIGAR2CODE[ord(x)], int(y)) for x,y in parts ]
2327 """read sequence bases, including :term:`soft clipped` bases (None if not present).
2329 In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
2333 src = self._delegate
2335 if src.core.l_qseq == 0: return None
2337 return get_seq_range(src, 0, src.core.l_qseq)
2339 def __set__(self,seq):
2340 # samtools manages sequence and quality length memory together
2341 # if no quality information is present, the first byte says 0xff.
2343 if seq == None or len(seq) == 0: return
2344 seq = _force_bytes(seq)
2348 cdef int l, k, nbytes_new, nbytes_old
2350 src = self._delegate
2354 # as the sequence is stored in half-bytes, the total length (sequence
2355 # plus quality scores) is (l+1)/2 + l
2356 nbytes_new = (l+1)/2 + l
2357 nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
2358 # acquire pointer to location in memory
2362 pysam_bam_update( src,
2366 # re-acquire pointer to location in memory
2367 # as it might have moved
2369 for k from 0 <= k < nbytes_new: p[k] = 0
2370 # convert to C string
2372 for k from 0 <= k < l:
2373 p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
2376 p = bam1_qual( src )
2381 """read sequence base qualities, including :term:`soft clipped` bases (None if not present).
2383 In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
2389 src = self._delegate
2391 if src.core.l_qseq == 0: return None
2393 return get_qual_range(src, 0, src.core.l_qseq)
2395 def __set__(self,qual):
2396 # note that space is already allocated via the sequences
2402 src = self._delegate
2403 p = bam1_qual( src )
2404 if qual == None or len(qual) == 0:
2405 # if absent - set to 0xff
2408 qual = _force_bytes(qual)
2410 # convert to C string
2413 if src.core.l_qseq != l:
2414 raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
2415 assert src.core.l_qseq == l
2416 for k from 0 <= k < l:
2417 p[k] = <uint8_t>q[k] - 33
2420 """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present).
2422 In Python 3, this property is of type bytes. Assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient.
2424 SAM/BAM files may included extra flanking bases sequences that were
2425 not part of the alignment. These bases may be the result of the
2426 Smith-Waterman or other algorithms, which may not require alignments
2427 that begin at the first residue or end at the last. In addition,
2428 extra sequencing adapters, multiplex identifiers, and low-quality bases that
2429 were not considered for alignment may have been retained."""
2433 cdef uint32_t start, end
2436 src = self._delegate
2438 if src.core.l_qseq == 0: return None
2440 start = query_start(src)
2441 end = query_end(src)
2443 return get_seq_range(src, start, end)
2446 """aligned query sequence quality values (None if not present). This property is read-only.
2448 In Python 3, this property is of type bytes."""
2451 cdef uint32_t start, end
2453 src = self._delegate
2455 if src.core.l_qseq == 0: return None
2457 start = query_start(src)
2458 end = query_end(src)
2460 return get_qual_range(src, start, end)
2463 """start index of the aligned query portion of the sequence (0-based, inclusive)"""
2465 return query_start(self._delegate)
2468 """end index of the aligned query portion of the sequence (0-based, exclusive)"""
2470 return query_end(self._delegate)
2473 """Length of the aligned query sequence"""
2476 src = self._delegate
2477 return query_end(src)-query_start(src)
2480 """the tags in the AUX field.
2482 This property permits convenience access to
2483 the tags. Changes in the returned list will
2484 not update the tags automatically. Instead,
2485 the following is required for adding a
2488 read.tags = read.tags + [("RG",0)]
2491 This method will happily write the same tag
2500 cdef uint8_t byte_size
2501 cdef int32_t nvalues
2503 src = self._delegate
2504 if src.l_aux == 0: return []
2508 while s < (src.data + src.data_len):
2514 if auxtype in ('c', 'C'):
2515 value = <int>bam_aux2i(s)
2517 elif auxtype in ('s', 'S'):
2518 value = <int>bam_aux2i(s)
2520 elif auxtype in ('i', 'I'):
2521 value = <int32_t>bam_aux2i(s)
2523 elif auxtype == 'f':
2524 value = <float>bam_aux2f(s)
2526 elif auxtype == 'd':
2527 value = <double>bam_aux2d(s)
2529 elif auxtype == 'A':
2530 value = "%c" % <char>bam_aux2A(s)
2532 elif auxtype in ('Z', 'H'):
2533 value = _charptr_to_str(<char*>bam_aux2Z(s))
2534 # +1 for NULL terminated string
2536 elif auxtype == 'B':
2538 byte_size, nvalues, value = convertBinaryTagToList( s )
2539 # 5 for 1 char and 1 int
2540 s += 5 + ( nvalues * byte_size) - 1
2544 result.append( (_charptr_to_str(auxtag), value) )
2548 def __set__(self, tags):
2551 cdef uint8_t * new_data
2554 src = self._delegate
2556 fmts, args = ["<"], []
2560 # map samtools code to python.struct code and byte size
2561 for pytag, value in tags:
2562 if not type(pytag) is bytes:
2563 pytag = pytag.encode('ascii')
2566 if t is tuple or t is list:
2567 # binary tags - treat separately
2569 # get data type - first value determines type
2570 if type(value[0]) is float:
2571 datafmt, datatype = "f", "f"
2573 mi, ma = min(value), max(value)
2574 absmax = max( abs(mi), abs(ma) )
2577 if mi >= -127: datafmt, datatype = "b", 'c'
2578 elif mi >= -32767: datafmt, datatype = "h", 's'
2579 elif absmax < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2580 else: datafmt, datatype = "i", 'i'
2584 if absmax <= 255: datafmt, datatype = "B", 'C'
2585 elif absmax <= 65535: datafmt, datatype = "H", 'S'
2586 elif absmax > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2587 else: datafmt, datatype = "I", 'I'
2589 datafmt = "2sccI%i%s" % (len(value), datafmt)
2590 args.extend( [pytag[:2],
2591 pytype.encode('ascii'),
2592 datatype.encode('ascii'),
2593 len(value)] + list(value) )
2594 fmts.append( datafmt )
2598 fmt, pytype = "2scf", 'f'
2602 if value >= -127: fmt, pytype = "2scb", 'c'
2603 elif value >= -32767: fmt, pytype = "2sch", 's'
2604 elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2605 else: fmt, pytype = "2sci", 'i'
2608 if value <= 255: fmt, pytype = "2scB", 'C'
2609 elif value <= 65535: fmt, pytype = "2scH", 'S'
2610 elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2611 else: fmt, pytype = "2scI", 'I'
2613 # Note: hex strings (H) are not supported yet
2615 value = value.encode('ascii')
2617 fmt, pytype = "2scc", 'A'
2619 fmt, pytype = "2sc%is" % (len(value)+1), 'Z'
2621 args.extend( [pytag[:2],
2622 pytype.encode('ascii'),
2628 total_size = struct.calcsize(fmt)
2629 buffer = ctypes.create_string_buffer(total_size)
2630 struct.pack_into( fmt,
2635 # delete the old data and allocate new space.
2636 # If total_size == 0, the aux field will be
2638 pysam_bam_update( src,
2643 src.l_aux = total_size
2645 # copy data only if there is any
2648 # get location of new data
2651 # check if there is direct path from buffer.raw to tmp
2653 memcpy( s, temp, total_size )
2656 """properties flag"""
2657 def __get__(self): return self._delegate.core.flag
2658 def __set__(self, flag): self._delegate.core.flag = flag
2664 DEPRECATED from pysam-0.4 - use tid in the future.
2665 The rname field caused a lot of confusion as it returns
2666 the :term:`target` ID instead of the reference sequence
2671 This field contains the index of the reference sequence
2672 in the sequence dictionary. To obtain the name
2673 of the reference sequence, use :meth:`pysam.Samfile.getrname()`
2676 def __get__(self): return self._delegate.core.tid
2677 def __set__(self, tid): self._delegate.core.tid = tid
2685 This field contains the index of the reference sequence
2686 in the sequence dictionary. To obtain the name
2687 of the reference sequence, use :meth:`pysam.Samfile.getrname()`
2690 def __get__(self): return self._delegate.core.tid
2691 def __set__(self, tid): self._delegate.core.tid = tid
2694 """0-based leftmost coordinate"""
2695 def __get__(self): return self._delegate.core.pos
2696 def __set__(self, pos):
2697 ## setting the cigar string also updates the "bin" attribute
2699 src = self._delegate
2700 if src.core.n_cigar:
2701 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
2703 src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
2704 self._delegate.core.pos = pos
2706 """properties bin"""
2707 def __get__(self): return self._delegate.core.bin
2708 def __set__(self, bin): self._delegate.core.bin = bin
2710 '''length of the read (read only). Returns 0 if not given.'''
2711 def __get__(self): return self._delegate.core.l_qseq
2713 '''aligned reference position of the read on the reference genome.
2715 aend points to one past the last aligned residue.
2716 Returns None if not available.'''
2719 src = self._delegate
2720 if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
2722 return bam_calend(&src.core, bam1_cigar(src))
2725 '''aligned length of the read on the reference genome. Returns None if
2729 src = self._delegate
2730 if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
2732 return bam_calend(&src.core,
2733 bam1_cigar(src)) - \
2734 self._delegate.core.pos
2737 """mapping quality"""
2738 def __get__(self): return self._delegate.core.qual
2739 def __set__(self, qual): self._delegate.core.qual = qual
2742 """the :term:`reference` id of the mate
2743 deprecated, use RNEXT instead.
2745 def __get__(self): return self._delegate.core.mtid
2746 def __set__(self, mtid): self._delegate.core.mtid = mtid
2748 """the :term:`reference` id of the mate """
2749 def __get__(self): return self._delegate.core.mtid
2750 def __set__(self, mtid): self._delegate.core.mtid = mtid
2752 """the position of the mate
2753 deprecated, use PNEXT instead."""
2754 def __get__(self): return self._delegate.core.mpos
2755 def __set__(self, mpos): self._delegate.core.mpos = mpos
2757 """the position of the mate"""
2758 def __get__(self): return self._delegate.core.mpos
2759 def __set__(self, mpos): self._delegate.core.mpos = mpos
2762 deprecated: use tlen instead"""
2763 def __get__(self): return self._delegate.core.isize
2764 def __set__(self, isize): self._delegate.core.isize = isize
2766 """the insert size"""
2767 def __get__(self): return self._delegate.core.isize
2768 def __set__(self, isize): self._delegate.core.isize = isize
2770 """true if read is paired in sequencing"""
2771 def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
2772 def __set__(self,val):
2773 if val: self._delegate.core.flag |= BAM_FPAIRED
2774 else: self._delegate.core.flag &= ~BAM_FPAIRED
property is_proper_pair:
    """true if read is mapped in a proper pair"""
    def __get__(self):
        # test the proper-pair bit of the FLAG field
        return bool(self.flag & BAM_FPROPER_PAIR)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FPROPER_PAIR
        else:
            self._delegate.core.flag &= ~BAM_FPROPER_PAIR
property is_unmapped:
    """true if the read itself is unmapped"""
    def __get__(self):
        # test the unmapped bit of the FLAG field
        return bool(self.flag & BAM_FUNMAP)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FUNMAP
        else:
            self._delegate.core.flag &= ~BAM_FUNMAP
property mate_is_unmapped:
    """true if the mate is unmapped"""
    def __get__(self):
        # test the mate-unmapped bit of the FLAG field
        return bool(self.flag & BAM_FMUNMAP)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FMUNMAP
        else:
            self._delegate.core.flag &= ~BAM_FMUNMAP
property is_reverse:
    """true if read is mapped to the reverse strand"""
    def __get__(self):
        # test the reverse-strand bit of the FLAG field
        return bool(self.flag & BAM_FREVERSE)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FREVERSE
        else:
            self._delegate.core.flag &= ~BAM_FREVERSE
property mate_is_reverse:
    """true if the mate is mapped to the reverse strand"""
    def __get__(self):
        # test the mate-reverse-strand bit of the FLAG field
        return bool(self.flag & BAM_FMREVERSE)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FMREVERSE
        else:
            self._delegate.core.flag &= ~BAM_FMREVERSE
2806 """true if this is read1"""
2807 def __get__(self): return (self.flag & BAM_FREAD1) != 0
2808 def __set__(self,val):
2809 if val: self._delegate.core.flag |= BAM_FREAD1
2810 else: self._delegate.core.flag &= ~BAM_FREAD1
2812 """true if this is read2"""
2813 def __get__(self): return (self.flag & BAM_FREAD2) != 0
2814 def __set__(self,val):
2815 if val: self._delegate.core.flag |= BAM_FREAD2
2816 else: self._delegate.core.flag &= ~BAM_FREAD2
property is_secondary:
    """true if not primary alignment"""
    def __get__(self):
        # test the secondary-alignment bit of the FLAG field
        return bool(self.flag & BAM_FSECONDARY)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FSECONDARY
        else:
            self._delegate.core.flag &= ~BAM_FSECONDARY
2824 """true if QC failure"""
2825 def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
2826 def __set__(self,val):
2827 if val: self._delegate.core.flag |= BAM_FQCFAIL
2828 else: self._delegate.core.flag &= ~BAM_FQCFAIL
property is_duplicate:
    """true if optical or PCR duplicate"""
    def __get__(self):
        # test the duplicate bit of the FLAG field
        return bool(self.flag & BAM_FDUP)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FDUP
        else:
            self._delegate.core.flag &= ~BAM_FDUP
2836 """a list of reference positions that this read aligns to."""
2838 cdef uint32_t k, i, pos
2840 cdef uint32_t * cigar_p
2843 src = self._delegate
2844 if src.core.n_cigar == 0: return []
2848 cigar_p = bam1_cigar(src)
2850 for k from 0 <= k < src.core.n_cigar:
2851 op = cigar_p[k] & BAM_CIGAR_MASK
2852 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2853 if op == BAM_CMATCH:
2854 for i from pos <= i < pos + l:
2857 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2862 property aligned_pairs:
2863 """a list of aligned read and reference positions.
2865 Unaligned position are marked by None.
2868 cdef uint32_t k, i, pos, qpos
2870 cdef uint32_t * cigar_p
2873 src = self._delegate
2874 if src.core.n_cigar == 0: return []
2879 cigar_p = bam1_cigar(src)
2881 for k from 0 <= k < src.core.n_cigar:
2882 op = cigar_p[k] & BAM_CIGAR_MASK
2883 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2885 if op == BAM_CMATCH:
2886 for i from pos <= i < pos + l:
2887 result.append( (qpos, i) )
2891 elif op == BAM_CINS:
2892 for i from pos <= i < pos + l:
2893 result.append( (qpos, None) )
2896 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
2897 for i from pos <= i < pos + l:
2898 result.append( (None, i) )
2904 def overlap( self, uint32_t start, uint32_t end ):
2905 """return number of aligned bases of read overlapping the interval *start* and *end*
2906 on the reference sequence.
2908 cdef uint32_t k, i, pos, overlap
2910 cdef uint32_t * cigar_p
2915 src = self._delegate
2916 if src.core.n_cigar == 0: return 0
2920 cigar_p = bam1_cigar(src)
2921 for k from 0 <= k < src.core.n_cigar:
2922 op = cigar_p[k] & BAM_CIGAR_MASK
2923 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2925 if op == BAM_CMATCH:
2926 o = min( pos + l, end) - max( pos, start )
2927 if o > 0: overlap += o
2929 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2935 """retrieves optional data given a two-letter *tag*"""
2936 #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
2939 btag = _force_bytes(tag)
2940 v = bam_aux_get(self._delegate, btag)
2941 if v == NULL: raise KeyError( "tag '%s' not present" % tag )
2943 if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
2944 return <int>bam_aux2i(v)
2945 elif auxtype == 'i' or auxtype == 'I':
2946 return <int32_t>bam_aux2i(v)
2947 elif auxtype == 'f' or auxtype == 'F':
2948 return <float>bam_aux2f(v)
2949 elif auxtype == 'd' or auxtype == 'D':
2950 return <double>bam_aux2d(v)
2951 elif auxtype == 'A':
2952 # there might a more efficient way
2953 # to convert a char into a string
2954 return '%c' % <char>bam_aux2A(v)
2955 elif auxtype == 'Z':
2956 return _charptr_to_str(<char*>bam_aux2Z(v))
2957 elif auxtype == 'B':
2958 bytesize, nvalues, values = convertBinaryTagToList( v + 1 )
2961 raise ValueError("unknown auxilliary type '%s'" % auxtype)
2964 def fancy_str (self):
2965 """returns list of fieldnames/values in pretty format for debugging
2969 "tid": "Contig index",
2970 "pos": "Mapped position on contig",
2971 "mtid": "Contig index for mate pair",
2972 "mpos": "Position of mate pair",
2973 "isize": "Insert size",
2974 "flag": "Binary flag",
2975 "n_cigar": "Count of cigar entries",
2976 "cigar": "Cigar entries",
2977 "qual": "Mapping quality",
2978 "bin": "Bam index bin number",
2979 "l_qname": "Length of query name",
2980 "qname": "Query name",
2981 "l_qseq": "Length of query sequence",
2982 "qseq": "Query sequence",
2983 "bqual": "Quality scores",
2984 "l_aux": "Length of auxilary data",
2985 "m_data": "Maximum data length",
2986 "data_len": "Current data length",
2988 fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
2989 "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
2990 "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
2992 for f in fields_names_in_order:
2993 if not f in self.__dict__:
2995 ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
2997 for f in self.__dict__:
2998 if not f in field_names:
2999 ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
3002 cdef class PileupProxy:
3003 '''A pileup column. A pileup column contains
3004 all the reads that map to a certain target base.
3007 chromosome ID as is defined in the header
3009 the target base coordinate (0-based)
3011 number of reads mapping to this column
3013 list of reads (:class:`pysam.PileupRead`) aligned to this column
3015 This class is a proxy for results returned by the samtools pileup engine.
3016 If the underlying engine iterator advances, the results of this column
3020 raise TypeError("This class cannot be instantiated from Python")
3023 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
3025 "\n".join( map(str, self.pileups) )
3028 '''the chromosome ID as is defined in the header'''
3029 def __get__(self): return self.tid
3032 '''number of reads mapping to this column.'''
3033 def __get__(self): return self.n_pu
3034 def __set__(self, n): self.n_pu = n
3037 def __get__(self): return self.pos
3040 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
3045 if self.plp[0] == NULL:
3046 raise ValueError("PileupProxy accessed after iterator finished")
3048 # warning: there could be problems if self.n and self.buf are
3050 for x from 0 <= x < self.n_pu:
3051 pileups.append( makePileupRead( &(self.plp[0][x])) )
3054 cdef class PileupRead:
3055 '''A read aligned to a column.
3059 raise TypeError("This class cannot be instantiated from Python")
3062 return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
3065 """a :class:`pysam.AlignedRead` object of the aligned read"""
3067 return self._alignment
3069 """position of the read base at the pileup site, 0-based"""
3073 """indel length; 0 for no indel, positive for ins and negative for del"""
3077 """1 iff the base on the padded read is a deletion"""
3082 return self._is_head
3085 return self._is_tail
3091 '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
3092 def __init__(self, id = 1):
3096 def setdevice(self, filename):
3097 '''open an existing file, like "/dev/null"'''
3098 fd = os.open(filename, os.O_WRONLY)
3101 def setfile(self, filename):
3102 '''open a new file.'''
3103 fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
def setfd(self, fd):
    '''redirect the stream with unit number *self.id* to *fd*.

    The current stream is duplicated and remembered on *self.streams*
    so it can be restored later; *fd* itself is closed once it has
    been cloned onto the target unit.
    '''
    saved = os.dup(self.id)     # save old stream on a new unit
    self.streams.append(saved)
    # flush so buffered data still goes to the old stream
    sys.stdout.flush()
    sys.stderr.flush()
    os.dup2(fd, self.id)        # open unit self.id on the new stream
    os.close(fd)                # close the other unit (look out, caller)
3115 '''restore previous output stream'''
3117 # the following was not sufficient, hence flush both stderr and stdout
3118 # os.fsync( self.id )
3121 os.dup2(self.streams[-1], self.id)
3122 os.close(self.streams[-1])
3123 del self.streams[-1]
3125 def _samtools_dispatch( method,
3127 catch_stdout = True ):
3128 '''call ``method`` in samtools providing arguments in args.
3131 This method redirects stdout to capture it
3132 from samtools. If for some reason stdout disappears
3133 the reason might be in this method.
3136 The current implementation might only work on linux.
3139 This method captures stdout and stderr using temporary files,
3140 which are then read into memory in their entirety. This method
3141 is slow and might cause large memory overhead.
3143 See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
3144 on the topic of redirecting stderr/stdout.
3147 # note that debugging this module can be a problem
3148 # as stdout/stderr will not appear on the terminal
3150 # some special cases
3151 if method == "index":
3152 if not os.path.exists( args[0] ):
3153 raise IOError( "No such file or directory: '%s'" % args[0] )
3155 # redirect stderr and stdout to file
3156 stderr_h, stderr_f = tempfile.mkstemp()
3157 pysam_set_stderr( stderr_h )
3160 stdout_h, stdout_f = tempfile.mkstemp()
3162 stdout_save = Outs( sys.stdout.fileno() )
3163 stdout_save.setfd( stdout_h )
3164 except AttributeError:
3165 # stdout has already been redirected
3166 catch_stdout = False
3168 # patch for `samtools view`
3169 # samtools `view` closes stdout, from which I can not
3170 # recover. Thus redirect output to file with -o option.
3171 if method == "view":
3172 if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
3173 args = ( "-o", stdout_f ) + args
3175 # do the function call to samtools
3177 cdef int i, n, retval
3180 method = _force_cmdline_bytes(method)
3181 args = [ _force_cmdline_bytes(a) for a in args ]
3183 # allocate two more for first (dummy) argument (contains command)
3184 cargs = <char**>calloc( n+2, sizeof( char *) )
3185 cargs[0] = "samtools"
3187 for i from 0 <= i < n: cargs[i+2] = args[i]
3189 retval = pysam_dispatch(n+2, cargs)
3192 # restore stdout/stderr. This will also flush, so
3193 # needs to be before reading back the file contents
3195 stdout_save.restore()
3197 with open( stdout_f, "r") as inf:
3198 out_stdout = inf.readlines()
3199 except UnicodeDecodeError:
3200 with open( stdout_f, "rb") as inf:
3201 # read binary output
3202 out_stdout = inf.read()
3203 os.remove( stdout_f )
3207 # get error messages
3208 pysam_unset_stderr()
3210 with open( stderr_f, "r") as inf:
3211 out_stderr = inf.readlines()
3212 except UnicodeDecodeError:
3213 with open( stderr_f, "rb") as inf:
3214 # read binary output
3215 out_stderr = inf.read()
3216 os.remove( stderr_f )
3220 return retval, out_stderr, out_stdout
3223 '''the results of a SNP call.'''
3226 cdef char _reference_base
3228 cdef int _consensus_quality
3229 cdef int _snp_quality
3230 cdef int _rms_mapping_quality
3234 '''the chromosome ID as is defined in the header'''
3239 '''nucleotide position of SNP.'''
3240 def __get__(self): return self._pos
property reference_base:
    '''reference base at pos. ``N`` if no reference sequence supplied.'''
    def __get__(self):
        # wrap the single stored base character in a Python string
        return from_string_and_size(&self._reference_base, 1)
3247 '''the genotype called.'''
3248 def __get__(self): return from_string_and_size( &self._genotype, 1 )
property consensus_quality:
    '''the genotype quality (Phred-scaled).'''
    def __get__(self):
        return self._consensus_quality
property snp_quality:
    '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
    def __get__(self):
        return self._snp_quality
property mapping_quality:
    '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
    def __get__(self):
        return self._rms_mapping_quality
3263 '''coverage or read depth - the number of reads involved in the call.'''
3264 def __get__(self): return self._coverage
3268 return "\t".join( map(str, (
3271 self.reference_base,
3273 self.consensus_quality,
3275 self.mapping_quality,
3279 # cdef class SNPCallerBase:
3280 # '''Base class for SNP callers.
3283 # minimum base quality (possibly capped by BAQ)
3285 # coefficient for adjusting mapQ of poor mappings
3287 # theta in maq consensus calling model
3289 # number of haplotypes in the sample
3291 # prior of a difference between two haplotypes
3294 # cdef bam_maqcns_t * c
3295 # cdef IteratorColumn iter
3297 # def __cinit__(self,
3298 # IteratorColumn iterator_column,
3301 # self.iter = iterator_column
3302 # self.c = bam_maqcns_init()
3304 # # set the default parameterization according to
3307 # # new default mode for samtools >0.1.10
3308 # self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
3310 # self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
3311 # # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
3312 # self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
3313 # self.c.het_rate = kwargs.get( "het_rate", 0.001 )
3314 # self.c.theta = kwargs.get( "theta", 0.83 )
3316 # if self.c.errmod != BAM_ERRMOD_MAQ2:
3317 # self.c.theta += 0.02
3319 # # call prepare AFTER setting parameters
3320 # bam_maqcns_prepare( self.c )
3322 # def __dealloc__(self):
3323 # bam_maqcns_destroy( self.c )
3325 # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
3326 # '''debugging output.'''
3328 # pysam_dump_glf( g, self.c );
3330 # for x in range(self.iter.n_plp):
3331 # print "--> read %i %s %i" % (x,
3332 # bam1_qname(self.iter.plp[x].b),
3333 # self.iter.plp[x].qpos,
3336 # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
3349 # printf("-------------------------------------\n");
3350 # sys.stdout.flush()
3352 # cdef class IteratorSNPCalls( SNPCallerBase ):
3353 # """*(IteratorColumn iterator)*
3355 # call SNPs within a region.
3357 # *iterator* is a pileup iterator. SNPs will be called
3358 # on all positions returned by this iterator.
3360 # This caller is fast if SNPs are called over large continuous
3361 # regions. It is slow, if instantiated frequently and in random
3362 # order as the sequence will have to be reloaded.
3366 # def __cinit__(self,
3367 # IteratorColumn iterator_column,
3370 # assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
3372 # def __iter__(self):
3375 # def __next__(self):
3376 # """python version of next().
3379 # # the following code was adapted from bam_plcmd.c:pileup_func()
3382 # if self.iter.n_plp < 0:
3383 # raise ValueError("error during iteration" )
3385 # if self.iter.plp == NULL:
3386 # raise StopIteration
3388 # cdef char * seq = self.iter.getSequence()
3389 # cdef int seq_len = self.iter.seq_len
3391 # assert seq != NULL
3394 # if self.iter.pos >= seq_len:
3395 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3397 # cdef int rb = seq[self.iter.pos]
3401 # g = bam_maqcns_glfgen( self.iter.n_plp,
3403 # bam_nt16_table[rb],
3406 # if pysam_glf_depth( g ) == 0:
3407 # cns = 0xfu << 28 | 0xf << 24
3409 # cns = glf2cns(g, <int>(self.c.q_r + .499))
3416 # call._tid = self.iter.tid
3417 # call._pos = self.iter.pos
3418 # call._reference_base = rb
3419 # call._genotype = bam_nt16_rev_table[cns>>28]
3420 # call._consensus_quality = cns >> 8 & 0xff
3421 # call._snp_quality = cns & 0xff
3422 # call._rms_mapping_quality = cns >> 16&0xff
3423 # call._coverage = self.iter.n_plp
3427 # cdef class SNPCaller( SNPCallerBase ):
3428 # '''*(IteratorColumn iterator_column )*
3430 # The samtools SNP caller.
3432 # This object will call SNPs in *samfile* against the reference
3433 # sequence in *fasta*.
3435 # This caller is fast for calling few SNPs in selected regions.
3437 # It is slow, if called over large genomic regions.
3441 # def __cinit__(self,
3442 # IteratorColumn iterator_column,
3447 # def call(self, reference, int pos ):
3448 # """call a snp on chromosome *reference*
3449 # and position *pos*.
3451 # returns a :class:`SNPCall` object.
3454 # cdef int tid = self.iter.samfile.gettid( reference )
3456 # self.iter.reset( tid, pos, pos + 1 )
3461 # if self.iter.n_plp < 0:
3462 # raise ValueError("error during iteration" )
3464 # if self.iter.plp == NULL:
3465 # raise ValueError( "no reads in region - no call" )
3467 # if self.iter.pos == pos: break
3469 # cdef char * seq = self.iter.getSequence()
3470 # cdef int seq_len = self.iter.seq_len
3472 # assert seq != NULL
3475 # if self.iter.pos >= seq_len:
3476 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3478 # cdef int rb = seq[self.iter.pos]
3482 # # g = bam_maqcns_glfgen( self.iter.n_plp,
3484 # # bam_nt16_table[rb],
3488 # # if pysam_glf_depth( g ) == 0:
3489 # # cns = 0xfu << 28 | 0xf << 24
3491 # # cns = glf2cns(g, <int>(self.c.q_r + .499))
3498 # call._tid = self.iter.tid
3499 # call._pos = self.iter.pos
3500 # call._reference_base = rb
3501 # call._genotype = bam_nt16_rev_table[cns>>28]
3502 # call._consensus_quality = cns >> 8 & 0xff
3503 # call._snp_quality = cns & 0xff
3504 # call._rms_mapping_quality = cns >> 16&0xff
3505 # call._coverage = self.iter.n_plp
3509 # cdef class IndelCall:
3510 # '''the results of an indel call.'''
3513 # cdef int _coverage
3514 # cdef int _rms_mapping_quality
3515 # cdef bam_maqindel_ret_t * _r
3517 # def __cinit__(self):
3523 # '''the chromosome ID as is defined in the header'''
3524 # def __get__(self):
3528 # '''nucleotide position of SNP.'''
3529 # def __get__(self): return self._pos
3531 # property genotype:
3532 # '''the genotype called.'''
3533 # def __get__(self):
3534 # if self._r.gt == 0:
3535 # s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3536 # return "%s/%s" % (s,s)
3537 # elif self._r.gt == 1:
3538 # s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3539 # return "%s/%s" % (s,s)
3541 # return "%s/%s" % (self.first_allele, self.second_allele )
3543 # property consensus_quality:
3544 # '''the genotype quality (Phred-scaled).'''
3545 # def __get__(self): return self._r.q_cns
3547 # property snp_quality:
3548 # '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
3549 # def __get__(self): return self._r.q_ref
3551 # property mapping_quality:
3552 # '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
3553 # def __get__(self): return self._rms_mapping_quality
3555 # property coverage:
3556 # '''coverage or read depth - the number of reads involved in the call.'''
3557 # def __get__(self): return self._coverage
3559 # property first_allele:
3560 # '''sequence of first allele.'''
3561 # def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3563 # property second_allele:
3564 # '''sequence of second allele.'''
3565 # def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3567 # property reads_first:
3568 # '''reads supporting first allele.'''
3569 # def __get__(self): return self._r.cnt1
3571 # property reads_second:
3572 # '''reads supporting second allele.'''
3573 # def __get__(self): return self._r.cnt2
3575 # property reads_diff:
3576 # '''reads supporting first allele.'''
3577 # def __get__(self): return self._r.cnt_anti
3579 # def __str__(self):
3581 # return "\t".join( map(str, (
3585 # self.consensus_quality,
3587 # self.mapping_quality,
3589 # self.first_allele,
3590 # self.second_allele,
3592 # self.reads_second,
3593 # self.reads_diff ) ) )
3595 # def __dealloc__(self ):
3596 # bam_maqindel_ret_destroy(self._r)
3598 # cdef class IndelCallerBase:
3599 # '''Base class for SNP callers.
3602 # minimum base quality (possibly capped by BAQ)
3604 # coefficient for adjusting mapQ of poor mappings
3606 # theta in maq consensus calling model
3608 # number of haplotypes in the sample
3610 # prior of a difference between two haplotypes
3613 # cdef bam_maqindel_opt_t * options
3614 # cdef IteratorColumn iter
3616 # cdef int max_depth
3618 # def __cinit__(self,
3619 # IteratorColumn iterator_column,
3623 # self.iter = iterator_column
3625 # assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
3627 # self.options = bam_maqindel_opt_init()
3629 # # set the default parameterization according to
3632 # self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
3633 # self.options.q_indel = kwargs.get( "q_indel", 40 )
3634 # self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
3635 # self.max_depth = kwargs.get( "max_depth", 1024 )
3637 # def __dealloc__(self):
3638 # free( self.options )
3640 # def _call( self ):
3642 # cdef char * seq = self.iter.getSequence()
3643 # cdef int seq_len = self.iter.seq_len
3645 # assert seq != NULL
3648 # if self.iter.pos >= seq_len:
3649 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3651 # cdef bam_maqindel_ret_t * r
3653 # cdef int m = min( self.max_depth, self.iter.n_plp )
3655 # # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
3656 # # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
3657 # # self.options.indel_err, self.options.ambi_thres );
3659 # r = bam_maqindel(m,
3667 # if r == NULL: return None
3669 # cdef IndelCall call
3670 # call = IndelCall()
3672 # call._tid = self.iter.tid
3673 # call._pos = self.iter.pos
3674 # call._coverage = self.iter.n_plp
3676 # cdef uint64_t rms_aux = 0
3678 # cdef bam_pileup1_t * p
3681 # for i from 0 <= i < self.iter.n_plp:
3682 # p = self.iter.plp + i
3683 # if p.b.core.qual < self.cap_mapQ:
3684 # tmp = p.b.core.qual
3686 # tmp = self.cap_mapQ
3687 # rms_aux += tmp * tmp
3689 # call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
3693 # cdef class IndelCaller( IndelCallerBase ):
3694 # '''*(IteratorColumn iterator_column )*
3696 # The samtools SNP caller.
3698 # This object will call SNPs in *samfile* against the reference
3699 # sequence in *fasta*.
3701 # This caller is fast for calling few SNPs in selected regions.
3703 # It is slow, if called over large genomic regions.
3706 # def __cinit__(self,
3707 # IteratorColumn iterator_column,
3712 # def call(self, reference, int pos ):
3713 # """call a snp on chromosome *reference*
3714 # and position *pos*.
3716 # returns a :class:`SNPCall` object or None, if no indel call could be made.
3719 # cdef int tid = self.iter.samfile.gettid( reference )
3721 # self.iter.reset( tid, pos, pos + 1 )
3726 # if self.iter.n_plp < 0:
3727 # raise ValueError("error during iteration" )
3729 # if self.iter.plp == NULL:
3730 # raise ValueError( "no reads in region - no call" )
3732 # if self.iter.pos == pos: break
3734 # return self._call()
3736 # cdef class IteratorIndelCalls( IndelCallerBase ):
3737 # """*(IteratorColumn iterator)*
3739 # call indels within a region.
3741 # *iterator* is a pileup iterator. SNPs will be called
3742 # on all positions returned by this iterator.
3744 # This caller is fast if SNPs are called over large continuous
3745 # regions. It is slow, if instantiated frequently and in random
3746 # order as the sequence will have to be reloaded.
3750 # def __cinit__(self,
3751 # IteratorColumn iterator_column,
3756 # def __iter__(self):
3759 # def __next__(self):
3760 # """python version of next().
3763 # # the following code was adapted from bam_plcmd.c:pileup_func()
3766 # if self.iter.n_plp < 0:
3767 # raise ValueError("error during iteration" )
3769 # if self.iter.plp == NULL:
3770 # raise StopIteration
3772 # return self._call()
# NOTE(review): this excerpt has the original file's line numbers fused onto
# each line and has lost indentation and several lines (gaps in the fused
# numbering). Code is kept byte-identical; only comments are added.
3776 cdef class IndexedReads:
3777 """index a bamfile by read.
3779 The index is kept in memory.
3781 By default, the file is re-openend to avoid conflicts if
3782 multiple operators work on the same file. Set *reopen* = False
3783 to not re-open *samfile*.
# __init__: keep a reference to *samfile* and either re-open the underlying
# file (self.owns_samfile = True) or share samfile's existing handle
# (self.owns_samfile = False), depending on *reopen*.
3786 def __init__(self, Samfile samfile, int reopen = True ):
3787 self.samfile = samfile
# bam files are opened in binary read mode; the non-bam branch (plain "r")
# is not visible in this excerpt -- TODO confirm against the full file.
3789 if samfile.isbam: mode = b"rb"
3792 # reopen the file - note that this makes the iterator
3793 # slow and causes pileup to slow down significantly.
# StderrStore presumably captures samtools' stderr output around samopen();
# the matching release/cleanup call is not visible here -- TODO confirm.
3795 store = StderrStore()
3796 self.fp = samopen( samfile._filename, mode, NULL )
# NOTE(review): assert is stripped under python -O; raising IOError on a
# NULL handle would be safer, but the code is left unchanged here.
3798 assert self.fp != NULL
3799 self.owns_samfile = True
# alternative branch (the `else:` line is elided): share the already-open
# handle from *samfile* and do not take ownership of it.
3801 self.fp = samfile.samfile
3802 self.owns_samfile = False
3804 assert samfile.isbam, "can only IndexReads on bam files"
# mapping read name (qname) -> list of file offsets; defaultdict(list)
# lets the build loop append without a key-existence check.
3809 self.index = collections.defaultdict( list )
3811 # this method will start indexing from the current file position
# build() (its `def` line is elided here): scan the bam file record by
# record, storing for each read name the bam_tell() offset of its record.
3814 cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
# the offset is taken BEFORE samread() so the stored position points at
# the start of the record just read; the surrounding while-loop and the
# free(b) cleanup are elided from this excerpt.
3819 pos = bam_tell( self.fp.x.bam )
3820 ret = samread( self.fp, b)
3822 qname = _charptr_to_str(bam1_qname( b ))
3823 self.index[qname].append( pos )
# find(): return an IteratorRowSelection over all alignments whose read
# name is *qname*; raises KeyError if the name was never indexed.
# reopen=False avoids re-opening the file a second time for the iterator.
3827 def find( self, qname ):
3828 if qname in self.index:
3829 return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
3831 raise KeyError( "read %s not found" % qname )
# close the re-opened samtools handle only if this object owns it; a
# shared handle is left for the owning Samfile to close.
3833 def __dealloc__(self):
3834 if self.owns_samfile: samclose( self.fp )
3836 __all__ = ["Samfile",
3844 # "IteratorSNPCalls",
3847 # "IteratorIndelCalls",