1 # cython: embedsignature=True
3 # adds doc-strings for sphinx
14 from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
15 from cpython cimport PyErr_SetString
17 #from cpython.string cimport PyString_FromStringAndSize, PyString_AS_STRING
18 #from cpython.exc cimport PyErr_SetString, PyErr_NoMemory
20 # defines imported from samtools
## These are bits set in the flag.
## have to put these definitions here, in csamtools.pxd they got ignored
## (values mirror the FLAG bit definitions of the SAM specification)
## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
## @abstract the read is mapped in a proper pair */
DEF BAM_FPROPER_PAIR =2
## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
## @abstract the mate is unmapped */
## @abstract the read is mapped to the reverse strand */
## @abstract the mate is mapped to the reverse strand */
## @abstract this is read1 */
## @abstract this is read2 */
## @abstract not primary alignment */
DEF BAM_FSECONDARY =256
## @abstract QC failure */
## @abstract optical or PCR duplicate */

# CIGAR encoding: low BAM_CIGAR_SHIFT bits of each op hold the op code
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
DEF BAM_CSOFT_CLIP = 4
DEF BAM_CHARD_CLIP = 5
#####################################################################
# hard-coded constants
# reverse lookup table from samtools' 4-bit nucleotide encoding to the
# IUPAC base character
cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
# upper bound for genomic coordinates.
# NOTE(review): 2 << 29 equals 2**30, while range checks elsewhere in this
# file speak of "0 to 2^29-1" -- confirm which limit is intended.
cdef int max_pos = 2 << 29

# redirect stderr to 0
# send samtools' C-level stderr output to the null device so that library
# messages do not clutter the caller's terminal
_logfile = open(os.path.devnull, "w")
pysam_set_stderr( PyFile_AsFile( _logfile ) )
#####################################################################
#####################################################################
#####################################################################
## private factory methods
#####################################################################
cdef class AlignedRead

cdef makeAlignedRead(bam1_t * src):
    '''enter src into AlignedRead.'''
    # construct without running __init__, then take ownership of a
    # duplicated copy of *src* (bam_dup1) so the returned object is
    # independent of the caller's buffer
    cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
    dest._delegate = bam_dup1(src)
cdef class PileupProxy

cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
    # factory: build a PileupProxy (bypassing __init__) for the pileup
    # buffer *plp* at reference *tid*, position *pos*, covering *n* reads
    cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
cdef makePileupRead( bam_pileup1_t * src ):
    '''fill a PileupRead object from a bam_pileup1_t * object.'''
    cdef PileupRead dest = PileupRead.__new__(PileupRead)
    # the aligned read is copied (see makeAlignedRead), the remaining
    # fields are plain value copies from the C struct
    dest._alignment = makeAlignedRead( src.b )
    dest._indel = src.indel
    dest._level = src.level
    dest._is_del = src.is_del
    dest._is_head = src.is_head
    dest._is_tail = src.is_tail
#####################################################################
#####################################################################
#####################################################################
## Generic callbacks for inserting python callbacks.
#####################################################################
cdef int fetch_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch.

    calls function in *f* with a new :class:`AlignedRead` object as parameter.
    '''
    # wrap the C record in a python object before handing it to *f*
    a = makeAlignedRead( alignment )
class PileupColumn(object):
    '''A pileup column. A pileup column contains
    all the reads that map to a certain target base.

    tid
        chromosome ID as is defined in the header
    pos
        the target base coordinate (0-based)
    n
        number of reads mapping to this column
    pileups
        list of reads (:class:`pysam.PileupRead`) aligned to this column
    '''
        # one tab-separated summary line followed by one line per read
        return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
            "\n" + "\n".join( map(str, self.pileups) )
cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
    '''callback for pileup.

    calls function in *f* with a new :class:`Pileup` object as parameter.

    tid
        chromosome ID as is defined in the header
    pos
        start coordinate of the alignment, 0-based
    n
        number of elements in pl array
    '''
    # wrap each entry of the C pileup array into a python PileupRead
    for x from 0 <= x < n:
        pileups.append( makePileupRead( &(pl[x]) ) )
cdef int pileup_fetch_callback( bam1_t *b, void *data):
    '''callback for bam_fetch.

    Fetches reads and submits them to pileup.
    '''
    cdef bam_plbuf_t * buf
    buf = <bam_plbuf_t*>data
    # feed the read into samtools' pileup buffer; columns are emitted
    # through the pileup callback registered on the buffer
    bam_plbuf_push(b, buf)
        # capture stderr in a temporary file and remember the original
        # stderr fd so it can be restored later
        self.stderr_h, self.stderr_f = tempfile.mkstemp()
        self.stderr_save = Outs( sys.stderr.fileno() )
        self.stderr_save.setfd( self.stderr_h )

    def readAndRelease( self ):
        # restore the original stderr, then collect and discard the capture file
        self.stderr_save.restore()
        if os.path.exists(self.stderr_f):
            # NOTE(review): file handle is never closed explicitly; this
            # relies on refcounting to close it after readlines()
            lines = open( self.stderr_f, "r" ).readlines()
            os.remove( self.stderr_f )

        # restore stderr and throw away any captured output
        self.stderr_save.restore()
        if os.path.exists(self.stderr_f):
            os.remove( self.stderr_f )
class StderrStoreWindows():
    '''No-op stand-in for StderrStore.

    stderr cannot be redirected on Windows, so every operation here
    does nothing and readAndRelease() reports no captured output.
    '''

    def __init__(self):
        pass

    def readAndRelease(self):
        # nothing was captured, so there is nothing to report
        return []

    def release(self):
        pass
# stderr redirection does not work on Windows; substitute the no-op
# implementation there
if platform.system()=='Windows':
    StderrStore = StderrStoreWindows
######################################################################
######################################################################
######################################################################
# valid types for sam headers
# a record type mapped to dict may occur at most once in a header;
# types mapped to list may repeat (see the `header` property)
VALID_HEADER_TYPES = { "HD" : dict,

# order of records within sam headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )

# type conversions within sam header records
VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
                        "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
                        "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
                        "PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }

# output order of fields within records
VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
                       "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
                       "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
                       "PG" : ( "PN", "ID", "VN", "CL" ), }
######################################################################
######################################################################
######################################################################
######################################################################
cdef class Fastafile:
    '''
    A *FASTA* file. The file is automatically opened.

    The file expects an indexed fasta file.

    TODO:
        add automatic indexing.
        add function to get sequence names.
    '''
    # name of the file; owned by this object (strdup'ed in _open)
    cdef char * _filename
    # pointer to fastafile
    cdef faidx_t * fastafile

    def __cinit__(self, *args, **kwargs ):
        self.fastafile = NULL
        self._filename = NULL
        self._open( *args, **kwargs )

        '''return true if samfile has been opened.'''
        return self.fastafile != NULL

        if self.fastafile == NULL:
            raise ValueError( "calling len() on closed file" )
        # number of sequences in the index
        return faidx_fetch_nseq(self.fastafile)

        '''open an indexed fasta file.

        This method expects an indexed fasta file.
        '''
        # close a previously opened file
        if self.fastafile != NULL: self.close()
        if self._filename != NULL: free(self._filename)
        self._filename = strdup(filename)
        self.fastafile = fai_load( filename )

        if self.fastafile == NULL:
            raise IOError("could not open file `%s`" % filename )

        if self.fastafile != NULL:
            fai_destroy( self.fastafile )
            self.fastafile = NULL

    def __dealloc__(self):
        if self._filename != NULL: free(self._filename)

        '''number of :term:`filename` associated with this object.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return self._filename

        '''*(reference = None, start = None, end = None, region = None)*

        fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.

        The region is specified by :term:`reference`, *start* and *end*.

        fetch returns an empty string if the region is out of range or addresses an unknown *reference*.

        If *reference* is given and *start* is None, the sequence from the
        first base is returned. Similarly, if *end* is None, the sequence
        until the last base is returned.

        Alternatively, a samtools :term:`region` string can be supplied.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if reference is None: raise ValueError( 'no sequence/region supplied.' )
        if start is None: start = 0
        if end is None: end = max_pos -1

        if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
        if start == end: return ""
        # valid ranges are from 0 to 2^29-1
        if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
        if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )

        # note: faidx_fetch_seq has a bug such that out-of-range access
        # always returns the last residue. Hence do not use faidx_fetch_seq,
        # but use fai_fetch instead
        # seq = faidx_fetch_seq(self.fastafile,
        # samtools region strings are 1-based, hence start+1
        region = "%s:%i-%i" % (reference, start+1, end)
        seq = fai_fetch( self.fastafile,
        # samtools adds a '\0' at the end
        seq = fai_fetch( self.fastafile, region, &length )
        # copy the C buffer into a python string of the reported length
        py_seq = PyString_FromStringAndSize(seq, length)

    cdef char * _fetch( self, char * reference, int start, int end, int * length ):
        '''fetch sequence for reference, start and end'''
        return faidx_fetch_seq(self.fastafile,
#------------------------------------------------------------------------
#------------------------------------------------------------------------
#------------------------------------------------------------------------
cdef int count_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch - count number of reads.
    '''
    # *f* points at the caller's int counter
    cdef int* counter = (<int*>f)

# communication structure between mate() and mate_callback()
ctypedef struct MateData:

#------------------------------------------------------------------------
#------------------------------------------------------------------------
#------------------------------------------------------------------------
cdef int mate_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch = filter mate

    stores a duplicate of the first alignment whose name and flag match
    the MateData passed via *f*.
    '''
    cdef MateData * d = (<MateData*>f)
    # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
    #        d.mate, d.name, bam1_qname(alignment),
    #        d.flag, alignment.core.flag, alignment.core.flag & d.flag)

    # could be sped up by comparing the lengths of query strings first

    # also, make sure that we get the other read by comparing
    if alignment.core.flag & d.flag != 0 and \
            strcmp( bam1_qname( alignment ), d.name ) == 0:
        d.mate = bam_dup1( alignment )
    '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None,
    add_sq_text = False )*

    A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.

    *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
    (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
    Use ``h`` to output header information in text (:term:`TAM`) mode.

    If ``b`` is present, it must immediately follow ``r`` or ``w``.
    Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
    a :term:`BAM` formatted file for reading, type::

        f = pysam.Samfile('ex1.bam','rb')

    If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following

        f1 = pysam.Samfile('ex1.bam' )
        f2 = pysam.Samfile('ex1.sam' )

    If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
    access to reads via :meth:`fetch` and :meth:`pileup` is disabled.

    For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
    sources (see also the samtools format specification):

        1. If *template* is given, the header is copied from a another *Samfile*
           (*template* must be of type *Samfile*).

        2. If *header* is given, the header is built from a multi-level dictionary. The first level
           are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
           being a list of tag-value pairs.

        3. If *text* is given, new header text is copied from raw text.

        4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
           By default, 'SQ' and 'LN' tags will be added to the header text. This option can be
           changed by unsetting the flag *add_sq_text*.
    '''

    def __cinit__(self, *args, **kwargs ):
        self._filename = NULL
        self.isstream = False
        self._open( *args, **kwargs )

        # allocate memory for iterator
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

        '''return true if samfile has been opened.'''
        return self.samfile != NULL

    def _hasIndex( self ):
        '''return true if samfile has an existing (and opened) index.'''
        return self.index != NULL
                     Samfile template = None,
                     referencenames = None,
                     referencelengths = None,
        '''open a sam/bam file.

        If _open is called on an existing bamfile, the current file will be
        closed and a new file will be opened.
        '''
        # read mode autodetection: try binary ('rb') first, fall back to text ('r')
            self._open(filename, 'rb', template=template,
                       referencenames=referencenames,
                       referencelengths=referencelengths,
                       text=text, header=header, port=port)
            except ValueError, msg:
                self._open(filename, 'r', template=template,
                           referencenames=referencenames,
                           referencelengths=referencelengths,
                           text=text, header=header, port=port)

        assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode
        assert filename != NULL

        # close a previously opened file
        if self.samfile != NULL: self.close()

        cdef bam_header_t * header_to_write
        header_to_write = NULL

        if self._filename != NULL: free(self._filename )
        self._filename = strdup( filename )
        # reading from stdin/stdout is signalled by the filename "-"
        self.isstream = strcmp( filename, "-" ) == 0

        self.isbam = len(mode) > 1 and mode[1] == 'b'

        self.isremote = strncmp(filename,"http:",5) == 0 or \
            strncmp(filename,"ftp:",4) == 0

        # open file for writing

            # header structure (used for writing)

            # copy header from another file
                header_to_write = template.samfile.header

                header_to_write = self._buildHeader( header )

                # build header from a target names and lengths
                assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing"
                assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"

                # allocate and fill header
                header_to_write = bam_header_init()
                header_to_write.n_targets = len(referencenames)
                for x in referencenames: n += len(x) + 1
                # NOTE(review): the arrays below are sized with n (summed name
                # lengths), not n_targets -- over-allocates; confirm intent
                header_to_write.target_name = <char**>calloc(n, sizeof(char*))
                header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
                for x from 0 <= x < header_to_write.n_targets:
                    header_to_write.target_len[x] = referencelengths[x]
                    name = referencenames[x]
                    header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
                    strncpy( header_to_write.target_name[x], name, len(name) )

                # Optionally, if there is no text, add a SAM compatible header to output
                if text is None and add_sq_text:
                    for x from 0 <= x < header_to_write.n_targets:
                        text += "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] )

                header_to_write.l_text = strlen(ctext)
                # NOTE(review): the text buffer is strlen(ctext) bytes, i.e.
                # without a trailing '\0'; l_text records the same length
                header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
                memcpy( header_to_write.text, ctext, strlen(ctext) )

                header_to_write.hash = NULL
                header_to_write.rg2lib = NULL

            # open file. Header gets written to file at the same time for bam files
            # and sam files (in the latter case, the mode needs to be wh)
            store = StderrStore()
            self.samfile = samopen( filename, mode, header_to_write )

            # bam_header_destroy takes care of cleaning up of all the members
            if not template and header_to_write != NULL:
                bam_header_destroy( header_to_write )

        # open file for reading
            if strncmp( filename, "-", 1) != 0 and \
                    not self.isremote and \
                    not os.path.exists( filename ):
                raise IOError( "file `%s` not found" % filename)

            # try to detect errors
            self.samfile = samopen( filename, mode, NULL )
            if self.samfile == NULL:
                raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)

            if self.samfile.header == NULL:
                raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode )

            #disabled for autodetection to work
            # needs to be disabled so that reading from sam-files without headers works
            #if self.samfile.header.n_targets == 0:
            #    raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)

        if self.samfile == NULL:
            raise IOError("could not open file `%s`" % filename )

        # check for index and open if present
        if mode[0] == "r" and self.isbam:
            if not self.isremote:
                if not os.path.exists(filename +".bai"):
                # returns NULL if there is no index or index could not be opened
                self.index = bam_index_load(filename)
                if self.index == NULL:
                    raise IOError("error while opening index `%s` " % filename )
                self.index = bam_index_load(filename)
                if self.index == NULL:
                    raise IOError("error while opening index `%s` " % filename )

            # remember where records start so reset() can return here
            if not self.isstream:
                self.start_offset = bam_tell( self.samfile.x.bam )
    def gettid( self, reference ):
        '''
        convert :term:`reference` name into numerical :term:`tid`

        returns -1 if reference is not known.
        '''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return pysam_reference2tid( self.samfile.header, reference )

    def getrname( self, tid ):
        '''
        convert numerical :term:`tid` into :term:`reference` name.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not 0 <= tid < self.samfile.header.n_targets:
            raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
        return self.samfile.header.target_name[tid]

    cdef char * _getrname( self, int tid ):
        '''
        convert numerical :term:`tid` into :term:`reference` name.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not 0 <= tid < self.samfile.header.n_targets:
            raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
        return self.samfile.header.target_name[tid]

    def _parseRegion( self,
        '''
        parse region information.

        raise ValueError for invalid regions.

        returns a tuple of flag, tid, start and end. Flag indicates
        whether some coordinates were supplied.

        Note that regions are 1-based, while start,end are python coordinates.
        '''
        # This method's main objective is to translate from a reference to a tid.
        # For now, it calls bam_parse_region, which is clumsy. Might be worth
        # implementing it all in pysam (makes use of khash).

        cdef long long rstart
        except OverflowError:
            raise ValueError( 'start out of range (%i)' % start )
        except OverflowError:
            raise ValueError( 'end out of range (%i)' % end )
        # region strings look like "ref:start-end"; split into components
        parts = re.split( "[:-]", region )
        if len(parts) >= 2: rstart = int(parts[1]) - 1
        if len(parts) >= 3: rend = int(parts[2])

        if not reference: return 0, 0, 0, 0

        rtid = self.gettid( reference )
        if rtid < 0: raise ValueError( "invalid reference `%s`" % reference )
        if rstart > rend: raise ValueError( 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend) )
        if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
        if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )

        return 1, rtid, rstart, rend
        '''reset file position to beginning of read section.'''
        return self.seek( self.start_offset, 0 )

    def seek( self, uint64_t offset, int where = 0):
        '''
        move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )
            raise NotImplementedError("seek only available in bam files")
            # NOTE(review): message typo, "no available" -> "not available"
            raise OSError("seek no available in streams")
        return bam_seek( self.samfile.x.bam, offset, where )

        '''
        return current file position
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )
            raise NotImplementedError("seek only available in bam files")
        return bam_tell( self.samfile.x.bam )
        fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
        :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
        be supplied.

        Without *reference* or *region* all mapped reads will be fetched. The reads will be returned
        ordered by reference sequence, which will not necessarily be the order within the file.

        If *until_eof* is given, all reads from the current file position will be returned
        in order as they are within the file. Using this option will also fetch unmapped reads.

        If only *reference* is set, all reads aligned to *reference* will be fetched.

        The method returns an iterator of type :class:`pysam.IteratorRow` unless
        a *callback is provided. If *callback* is given, the callback will be executed
        for each position within the :term:`region`. Note that callbacks currently work
        only, if *region* or *reference* is given.

        Note that a :term:`SAM` file does not allow random access. If *region* or *reference* are given,
        an exception is raised.

        cdef int rtid, rstart, rend, has_coord

        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

        # streams cannot be re-opened
        if self.isstream: reopen = False

        if not until_eof and not self._hasIndex() and not self.isremote:
            raise ValueError( "fetch called on bamfile without index" )

        if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
        if not self._hasIndex(): raise ValueError( "no index available for fetch" )
        return bam_fetch(self.samfile.x.bam,
        return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen )
        return IteratorRowAll( self, reopen=reopen )
        # AH: check - reason why no reopen for AllRefs?
        return IteratorRowAllRefs(self ) # , reopen=reopen )
        # check if header is present - otherwise sam_read1 aborts
        # this happens if a bamfile is opened with mode 'r'
        raise ValueError ("fetching by region is not available for sam files" )
        if self.samfile.header.n_targets == 0:
            raise ValueError( "fetch called for samfile without header")
        raise NotImplementedError( "callback not implemented yet" )
        return IteratorRowAll( self, reopen=reopen )
        '''return the mate of :class:`AlignedRead` *read*.

        Throws a ValueError if read is unpaired or the mate

        Calling this method will change the file position.
        This might interfere with any iterators that have
        not re-opened the file.
        '''
        cdef uint32_t flag = read._delegate.core.flag

        if flag & BAM_FPAIRED == 0:
            raise ValueError( "read %s: is unpaired" % (read.qname))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError( "mate %s: is unmapped" % (read.qname))

        cdef MateData mate_data

        mate_data.name = <char *>bam1_qname(read._delegate)
        mate_data.mate = NULL
        # xor flags to get the other mate
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        mate_data.flag = ( flag ^ x) & x

        # search the mate's coordinates for a matching read
        bam_fetch(self.samfile.x.bam,
                  read._delegate.core.mtid,
                  read._delegate.core.mpos,
                  read._delegate.core.mpos + 1,
        if mate_data.mate == NULL:
            raise ValueError( "mate not found" )

        # wrap the duplicated C record; the new object owns it
        cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
        dest._delegate = mate_data.mate
        '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*

        count reads :term:`region` using 0-based indexing. The region is specified by
        :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.

        Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
        an exception is raised.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

        if not until_eof and not self._hasIndex() and not self.isremote:
            raise ValueError( "fetch called on bamfile without index" )

            raise ValueError( "counting functionality requires a region/reference" )
        if not self._hasIndex(): raise ValueError( "no index available for fetch" )
        # counting happens in the C-level callback, which increments an int
        bam_fetch(self.samfile.x.bam,
        raise ValueError ("count for a region is not available for sam files" )
        perform a :term:`pileup` within a :term:`region`. The region is specified by
        :term:`reference`, *start* and *end* (using 0-based indexing).
        Alternatively, a samtools *region* string can be supplied.

        Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
        ordered by :term:`reference` sequence, which will not necessarily be the order within the file.

        The method returns an iterator of type :class:`pysam.IteratorColumn` unless
        a *callback is provided. If a *callback* is given, the callback will be executed
        for each column within the :term:`region`.

        Note that :term:`SAM` formatted files do not allow random access.
        In these files, if a *region* or *reference* are given an exception is raised.

        Optional *kwargs* to the iterator:

            The stepper controls how the iterator advances.
            Possible options for the stepper are

                use all reads for pileup.

                same filter and read processing as in :term:`csamtools` pileup

            A :class:`FastaFile` object

            Skip all reads with bits set in mask.

            Maximum read depth permitted. The default limit is *8000*.

            *all* reads which overlap the region are returned. The first base returned will be the
            first base of the first read *not* necessarily the first base of the region used in the query.

        cdef int rtid, rstart, rend, has_coord
        cdef bam_plbuf_t *buf

        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

        if not self._hasIndex(): raise ValueError( "no index available for pileup" )

        if not has_coord: raise ValueError( "callback functionality requires a region/reference" )

        # feed all reads in the region into samtools' pileup engine; columns
        # are delivered to *callback* via pileup_callback
        buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
        bam_fetch(self.samfile.x.bam,
                  self.index, rtid, rstart, rend,
                  buf, pileup_fetch_callback )

        # flush the columns still buffered in the pileup engine
        bam_plbuf_push( NULL, buf)
        bam_plbuf_destroy(buf)

        return IteratorColumnRegion( self,
        return IteratorColumnAllRefs(self, **kwargs )

        raise NotImplementedError( "pileup of samfiles not implemented yet" )
        closes the :class:`pysam.Samfile`.'''
        if self.samfile != NULL:
            samclose( self.samfile )
            bam_index_destroy(self.index);

    def __dealloc__( self ):
        # remember: dealloc cannot call other methods
        # note: no doc string
        # note: __del__ is not called.
        if self._filename != NULL: free( self._filename )

    cpdef int write( self, AlignedRead read ) except -1:
        '''
        write a single :class:`pysam.AlignedRead` to disk.

        returns the number of bytes written.
        '''
        if not self._isOpen():
        return samwrite( self.samfile, read._delegate )

    # context-manager protocol: close the file on block exit
    def __enter__(self):
    def __exit__(self, exc_type, exc_value, traceback):
    ###############################################################
    ###############################################################
    ###############################################################

    ###############################################################
        '''number of :term:`filename` associated with this object.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return self._filename

    property nreferences:
        '''number of :term:`reference` sequences in the file.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return self.samfile.header.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences."""
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        for x from 0 <= x < self.samfile.header.n_targets:
            t.append( self.samfile.header.target_name[x] )

        """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
        :attr:`pysam.Samfile.references`
        """
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        for x from 0 <= x < self.samfile.header.n_targets:
            t.append( self.samfile.header.target_len[x] )

        """total number of mapped reads in file.
        """
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" )
        cdef uint32_t total = 0
        # sum the per-reference mapped counts recorded in the index
        for tid from 0 <= tid < self.samfile.header.n_targets:
            total += pysam_get_mapped( self.index, tid )

        """total number of unmapped reads in file.
        """
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" )
        cdef uint32_t total = 0
        for tid from 0 <= tid < self.samfile.header.n_targets:
            total += pysam_get_unmapped( self.index, tid )
        # get unmapped reads without coordinates
        total += pysam_get_unmapped( self.index, -1 )

        '''full contents of the :term:`sam file` header as a string.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        # header text is not '\0'-terminated; use the recorded length
        return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
        '''header information within the :term:`sam file`. The records and fields are returned as
        a two-level dictionary.
        '''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )

        if self.samfile.header.text != NULL:
            # convert to python string (note: call self.text to create 0-terminated string)
            for line in t.split("\n"):
                if not line.strip(): continue
                assert line.startswith("@"), "header line without '@': '%s'" % line
                fields = line[1:].split("\t")
                assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)

                if record not in result: result[record] = []
                result[record].append( "\t".join( fields[1:] ) )

                # the following is clumsy as generators do not work?
                for field in fields[1:]:
                    key, value = field.split(":",1)
                    # uppercase keys must be valid
                    # lowercase are permitted for user fields
                    if key in VALID_HEADER_FIELDS[record]:
                        x[key] = VALID_HEADER_FIELDS[record][key](value)
                    elif not key.isupper():
                    raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )

                # dict-typed records may appear only once; list-typed repeat
                if VALID_HEADER_TYPES[record] == dict:
                    if record in result:
                        raise ValueError( "multiple '%s' lines are not permitted" % record )
                elif VALID_HEADER_TYPES[record] == list:
                    if record not in result: result[record] = []
                    result[record].append( x )

    def _buildLine( self, fields, record ):
        '''build a header line from *fields* dictionary for *record*'''

        # TODO: add checking for field and sort order
        line = ["@%s" % record ]
        line.append( fields )

        # write fields of the specification
        for key in VALID_HEADER_ORDER[record]:
            line.append( "%s:%s" % (key, str(fields[key])))
            # lowercase (user-defined) fields are appended afterwards
            if not key.isupper():
                line.append( "%s:%s" % (key, str(fields[key])))

        return "\t".join( line )
    cdef bam_header_t * _buildHeader( self, new_header ):
        '''return a new header built from a dictionary in *new_header*.

        This method inserts the text field, target_name and target_len.
        '''
        # check if hash exists

        # create new header and copy old data
        cdef bam_header_t * dest

        dest = bam_header_init()

        # serialize records in the canonical order
        for record in VALID_HEADERS:
            if record in new_header:
                ttype = VALID_HEADER_TYPES[record]
                data = new_header[record]
                if type( data ) != type( ttype() ):
                    raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
                # NOTE(review): types.DictType is Python-2 only
                if type( data ) == types.DictType:
                    lines.append( self._buildLine( data, record ) )
                    for fields in new_header[record]:
                        lines.append( self._buildLine( fields, record ) )

        text = "\n".join(lines) + "\n"
        if dest.text != NULL: free( dest.text )
        dest.text = <char*>calloc( len(text), sizeof(char))
        dest.l_text = len(text)
        strncpy( dest.text, text, dest.l_text )

        # collect the reference sequences from the 'SQ' records
        if "SQ" in new_header:
            for fields in new_header["SQ"]:
                seqs.append( (fields["SN"], fields["LN"] ) )
                raise KeyError( "incomplete sequence information in '%s'" % str(fields))

            dest.n_targets = len(seqs)
            dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
            dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )

            for x from 0 <= x < dest.n_targets:
                seqname, seqlen = seqs[x]
                dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
                strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
                dest.target_len[x] = seqlen
1208 ###############################################################
1209 ###############################################################
1210 ###############################################################
1211 ## file-object like iterator access
1212 ## note: concurrent access will cause errors (see IteratorRow
1214 ## Possible solutions: deprecate or open new file handle
1215 ###############################################################
# NOTE(review): this span is the tail of Samfile's file-object-like iterator
# protocol (__iter__ / getCurrent / cnext / next); several `def` headers,
# docstring delimiters and bodies are elided in this listing.
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
# SAM text files need header target information to resolve reference names
if not self.isbam and self.samfile.header.n_targets == 0:
    raise NotImplementedError( "can not iterate over samfile without header")

cdef bam1_t * getCurrent( self ):

cdef int cnext(self):
    # cversion of iterator. Used by :class:`pysam.Samfile.IteratorColumn`.
    # returns the samread() status code (>= 0 on success)
    return samread(self.samfile, self.b)

# python version of next().
ret = samread(self.samfile, self.b)
return makeAlignedRead( self.b )
1243 ##-------------------------------------------------------------------
1244 ##-------------------------------------------------------------------
1245 ##-------------------------------------------------------------------
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`Samfile.fetch` returns an IteratorRow.
    '''
cdef class IteratorRowRegion(IteratorRow):
    """*(Samfile samfile, int tid, int beg, int end, int reopen = True )*

    iterate over mapped reads in a region.

    By default, the file is re-openend to avoid conflicts between
    multiple iterators working on the same file. Set *reopen* = False
    to not re-open *samfile*.

    The samtools iterators assume that the file
    position between iterations do not change.
    As a consequence, no two iterators can work
    on the same file. To permit this, each iterator
    creates its own file handle by re-opening the
    file.

    Note that the index will be shared between
    samfile and the iterator.
    """

    cdef bam_iter_t iter  # iterator state object
    cdef Samfile samfile
    # true if samfile belongs to this object (i.e. was re-opened here)
    cdef int owns_samfile

    def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if not samfile._hasIndex():
            raise ValueError( "no index available for iteration" )

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        if samfile.isbam: mode = "rb"

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        # NOTE(review): the `if reopen:` / `else:` branch structure is elided
        # in this listing; only one of the two fp assignments below runs.
        store = StderrStore()
        self.fp = samopen( samfile._filename, mode, NULL )
        assert self.fp != NULL
        self.owns_samfile = True

        self.fp = self.samfile.samfile
        self.owns_samfile = False

        # query the shared index for the requested region
        self.iter = bam_iter_query(self.samfile.index,
                                   tid, beg, end)
        self.b = bam_init1()

    cdef bam1_t * getCurrent( self ):

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        self.retval = bam_iter_read( self.fp.x.bam,
                                     self.iter, self.b)

    def __next__(self):
        """python version of next().
        """
        if self.retval < 0: raise StopIteration
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        bam_destroy1(self.b)
        # only close the handle we opened ourselves
        if self.owns_samfile: samclose( self.fp )
cdef class IteratorRowAll(IteratorRow):
    """*(Samfile samfile, int reopen = True)*

    iterate over all reads in *samfile*

    By default, the file is re-openend to avoid conflicts between
    multiple iterators working on the same file. Set *reopen* = False
    to not re-open *samfile*.
    """

    # cdef samfile_t * fp
    # # true if samfile belongs to this object
    # cdef int owns_samfile

    def __cinit__(self, Samfile samfile, int reopen = True ):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if samfile.isbam: mode = "rb"

        # reopen the file to avoid iterator conflict
        # NOTE(review): the `if reopen:` / `else:` branches are elided in
        # this listing; only one of the fp assignments below executes.
        store = StderrStore()
        self.fp = samopen( samfile._filename, mode, NULL )
        assert self.fp != NULL
        self.owns_samfile = True

        self.fp = samfile.samfile
        self.owns_samfile = False

        # allocate memory for alignment
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

    cdef bam1_t * getCurrent( self ):

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # status code from samread; < 0 signals EOF/error
        return samread(self.fp, self.b)

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        ret = samread(self.fp, self.b)
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        bam_destroy1(self.b)
        if self.owns_samfile: samclose( self.fp )
cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each reference
    """
    cdef Samfile samfile
    cdef IteratorRowRegion rowiter

    def __cinit__(self, Samfile samfile):
        assert samfile._isOpen()
        if not samfile._hasIndex(): raise ValueError("no index available for fetch")
        self.samfile = samfile
        # per-reference region iterator covering the whole contig
        # (1<<29 is the samtools maximum coordinate)
        self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, 1<<29)

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        # Create an initial iterator
        if not self.samfile.nreferences:
            raise StopIteration

        self.rowiter.cnext()

        # If current iterator is not exhausted, return aligned read
        if self.rowiter.retval>0:
            return makeAlignedRead(self.rowiter.b)

        # Otherwise, proceed to next reference or stop
        if self.tid<self.samfile.nreferences:
cdef class IteratorRowSelection(IteratorRow):
    """*(Samfile samfile)*

    iterate over reads in *samfile* at a given list of file positions.
    """

    cdef int current_pos
    # true if samfile belongs to this object
    cdef int owns_samfile

    def __cinit__(self, Samfile samfile, positions, int reopen = True ):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        # NOTE(review): this _isOpen() check is an exact duplicate of the one
        # above — one of the two should be removed.
        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        assert samfile.isbam, "can only use this iterator on bam files"

        # reopen the file to avoid iterator conflict
        # NOTE(review): `mode` is used below but its assignment is not
        # visible in this listing — presumably `mode = "rb"`; confirm.
        store = StderrStore()
        self.fp = samopen( samfile._filename, mode, NULL )
        assert self.fp != NULL
        self.owns_samfile = True

        self.fp = samfile.samfile
        self.owns_samfile = False

        # allocate memory for alignment
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

        self.positions = positions
        self.current_pos = 0

    cdef bam1_t * getCurrent( self ):

    cdef int cnext(self):
        '''cversion of iterator'''

        # end iteration if out of positions
        if self.current_pos >= len(self.positions): return -1

        # jump directly to the stored virtual file offset, then read
        bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
        self.current_pos += 1
        return samread(self.fp, self.b)

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef int ret = self.cnext()
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        bam_destroy1(self.b)
        if self.owns_samfile: samclose( self.fp )
1534 ##-------------------------------------------------------------------
1535 ##-------------------------------------------------------------------
1536 ##-------------------------------------------------------------------
# shared state passed (as void*) into the bam_plp_* callback functions
ctypedef struct __iterdata:

cdef int __advance_all( void * data, bam1_t * b ):
    '''advance without any read filtering.
    '''
    d = <__iterdata*>data
    # delegate straight to the region iterator; no filtering applied
    return bam_iter_read( d.samfile.x.bam, d.iter, b )
cdef int __advance_snpcalls( void * data, bam1_t * b ):
    '''advance using same filter and read processing as in
    the samtools pileup.
    '''
    d = <__iterdata*>data

    cdef int ret = bam_iter_read( d.samfile.x.bam, d.iter, b )

    # NOTE(review): these are effectively hard-coded feature switches
    # (BAQ on, no mapQ capping); the elided lines may set them differently.
    cdef int is_nobaq = 0
    cdef int capQ_thres = 0

    # lazily (re-)fetch the reference sequence when the read's tid changes
    if d.fastafile != NULL and b.core.tid != d.tid:
        if d.seq != NULL: free(d.seq)
        d.seq = faidx_fetch_seq(d.fastafile,
                                d.samfile.header.target_name[d.tid],
                                0, max_pos, &d.seq_len)
        raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
                              (d.samfile.header.target_name[d.tid],
                               d.tid))

    # realign read - changes base qualities (BAQ, as in samtools)
    if d.seq != NULL and is_cns and not is_nobaq: bam_prob_realn( b, d.seq )

    # optionally cap mapping quality based on alignment quality
    if d.seq != NULL and capQ_thres > 10:
        q = bam_cap_mapQ(b, d.seq, capQ_thres)
        elif b.core.qual > q: b.core.qual = q
    # skip unmapped reads and paired reads that are not in a proper pair
    if b.core.flag & BAM_FUNMAP: skip = 1
    elif b.core.flag & 1 and not b.core.flag & 2: skip = 1

    # additional filters
    # fetch the next read and loop again while the current one is skipped
    ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behavious. For example,
    consider the conversion to a list::

       f = Samfile("file.bam", "rb")
       result = list( f.pileup() )

    Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
    but each object in ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

       result = [ x.pileups() for x in f.pileup() ]

    ``result`` will be a list of ``n`` lists of objects of type :class:`PileupRead`.

    If the iterator is associated with a :class:`Fastafile` using the :meth:`addReference`
    method, then the iterator will export the current sequence via the methods :meth:`getSequence`
    and :meth:`seq_len`.

    Optional kwargs to the iterator:

    stepper
       The stepper controlls how the iterator advances.
       Possible options for the stepper are

       "all"
          use all reads for pileup.
       "samtools"
          same filter and read processing as in :term:`csamtools` pileup
    fastafile
       A :class:`FastaFile` object
    mask
       Skip all reads with bits set in mask.
    max_depth
       maximum read depth. The default is 8000.
    '''

    # result of the last plbuf_push
    cdef IteratorRowRegion iter
    cdef const_bam_pileup1_t_ptr plp
    cdef bam_plp_t pileup_iter
    cdef __iterdata iterdata
    cdef Samfile samfile
    cdef Fastafile fastafile

    def __cinit__( self, Samfile samfile, **kwargs ):
        self.samfile = samfile
        self.mask = kwargs.get("mask", BAM_DEF_MASK )
        self.fastafile = kwargs.get( "fastafile", None )
        self.stepper = kwargs.get( "stepper", None )
        self.max_depth = kwargs.get( "max_depth", 8000 )
        self.iterdata.seq = NULL
        # NULL until setupIteratorData() builds the pileup engine
        self.pileup_iter = <bam_plp_t>NULL

    cdef int cnext(self):
        '''perform next iteration.

        This method is analogous to the samtools bam_plp_auto method.
        It has been re-implemented to permit for filtering.
        '''
        self.plp = bam_plp_auto( self.pileup_iter,
                                 &self.tid, &self.pos, &self.n_plp )

    cdef char * getSequence( self ):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self): return self.iterdata.seq_len

    def addReference( self, Fastafile fastafile ):
        '''
        add reference sequences in *fastafile* to iterator.'''
        self.fastafile = fastafile
        # invalidate any cached sequence; it belonged to the old reference
        if self.iterdata.seq != NULL: free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def hasReference( self ):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef setMask( self, mask ):
        '''set masking flag in iterator.

        reads with bits set in *mask* will be skipped.
        '''
        bam_plp_set_mask( self.pileup_iter, self.mask )

    cdef setupIteratorData( self,
                            int tid,
                            int start,
                            int end,
                            int reopen = 0 ):
        '''setup the iterator structure'''

        self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen )
        self.iterdata.samfile = self.samfile.samfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1

        if self.fastafile != None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # select the advance callback according to the stepper option
        if self.stepper == None or self.stepper == "all":
            self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
        elif self.stepper == "samtools":
            self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
        else:
            raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        bam_plp_set_maxcnt( self.pileup_iter, self.max_depth )
        bam_plp_set_mask( self.pileup_iter, self.mask )

    cdef reset( self, tid, start, end ):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.iterdata.seq != NULL: free( self.iterdata.seq )
        self.iterdata.seq = NULL
        self.iterdata.tid = -1

        # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
        bam_plp_reset(self.pileup_iter)

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators that have
        # not been fully consumed
        if self.pileup_iter != <bam_plp_t>NULL:
            bam_plp_reset(self.pileup_iter)
            bam_plp_destroy(self.pileup_iter)
            self.pileup_iter = <bam_plp_t>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL
cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self, Samfile samfile,
                  int tid,
                  int start,
                  int end,
                  **kwargs ):

        # initialize iterator; always reopen (last arg = 1) so concurrent
        # iterators on the same Samfile do not interfere
        self.setupIteratorData( tid, start, end, 1 )

    def __next__(self):
        """python version of next().
        """
        # negative n_plp signals an error inside the pileup engine
        raise ValueError("error during iteration" )

        if self.plp == NULL:
            raise StopIteration

        return makePileupProxy( <bam_pileup1_t*>self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp )
cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  Samfile samfile,
                  **kwargs ):

        # no iteration over empty files
        if not samfile.nreferences: raise StopIteration

        # initialize iterator over the first reference, full length
        self.setupIteratorData( self.tid, 0, max_pos, 1 )

    def __next__(self):
        """python version of next().
        """
        # negative n_plp signals an error inside the pileup engine
        raise ValueError("error during iteration" )

        # return result, if within same reference
        if self.plp != NULL:
            return makePileupProxy( <bam_pileup1_t*>self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp )

        # otherwise, proceed to next reference or stop
        if self.tid < self.samfile.nreferences:
            self.setupIteratorData( self.tid, 0, max_pos, 0 )
1844 ##-------------------------------------------------------------------
1845 ##-------------------------------------------------------------------
1846 ##-------------------------------------------------------------------
cdef inline int32_t query_start(bam1_t *src) except -1:
    # Return the 0-based index of the first aligned (non-clipped) base of
    # the query sequence, scanning CIGAR from the 5' end.
    cdef uint32_t * cigar_p, op
    cdef uint32_t start_offset = 0

    if src.core.n_cigar:
        cigar_p = bam1_cigar(src);
        for k from 0 <= k < src.core.n_cigar:
            op = cigar_p[k] & BAM_CIGAR_MASK
            if op==BAM_CHARD_CLIP:
                # hard clips are only legal at the ends of the alignment
                if start_offset!=0 and start_offset!=src.core.l_qseq:
                    PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
                    # NOTE(review): a `return -1` presumably follows here
                    # (elided) — `except -1` needs it for the error to surface.
            elif op==BAM_CSOFT_CLIP:
                # soft-clipped bases are present in seq but not aligned
                start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
1867 ##-------------------------------------------------------------------
1868 ##-------------------------------------------------------------------
1869 ##-------------------------------------------------------------------
cdef inline int32_t query_end(bam1_t *src) except -1:
    # Return the 0-based exclusive end index of the aligned (non-clipped)
    # part of the query, scanning CIGAR from the 3' end.
    cdef uint32_t * cigar_p, op
    cdef uint32_t end_offset = src.core.l_qseq

    # NOTE(review): `n_cigar>1` means a single-op CIGAR (e.g. "10S") skips
    # clip handling entirely, and the `k >= 1` loop never examines op 0 —
    # presumably intentional (op 0 is handled by query_start), but verify.
    if src.core.n_cigar>1:
        cigar_p = bam1_cigar(src);
        for k from src.core.n_cigar > k >= 1:
            op = cigar_p[k] & BAM_CIGAR_MASK
            if op==BAM_CHARD_CLIP:
                # hard clips are only legal at the ends of the alignment
                if end_offset!=0 and end_offset!=src.core.l_qseq:
                    PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
                    # NOTE(review): a `return -1` presumably follows (elided)
            elif op==BAM_CSOFT_CLIP:
                end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT

    # fallback when no clipping information applied
    end_offset = src.core.l_qseq
cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
    # Decode the 4-bit packed sequence of *src* into an ASCII string for
    # query positions [start, end).
    if not src.core.l_qseq:
        return None

    seq = PyString_FromStringAndSize(NULL, end-start)
    s   = PyString_AS_STRING(seq)

    for k from start <= k < end:
        # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
        # note: do not use string literal as it will be a python string
        # two bases per byte; high nibble first
        s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
    # Build the phred+33 ASCII quality string for query positions
    # [start, end); qualities are stored raw (no offset) in the BAM record.
    qual = PyString_FromStringAndSize(NULL, end-start)
    q    = PyString_AS_STRING(qual)

    for k from start <= k < end:
        ## equivalent to t[i] + 33 (see bam.c)
        q[k-start] = p[k] + 33
cdef class AlignedRead:
    '''
    Class representing an aligned read. see SAM format specification for
    the meaning of fields (http://samtools.sourceforge.net/).

    This class stores a handle to the samtools C-structure representing
    an aligned read. Member read access is forwarded to the C-structure
    and converted into python objects. This implementation should be fast,
    as only the data needed is converted.

    For write access, the C-structure is updated in-place. This is
    not the most efficient way to build BAM entries, as the variable
    length data is concatenated and thus needs to resized if
    a field is updated. Furthermore, the BAM entry might be
    in an inconsistent state. The :meth:`~validate` method can
    be used to check if an entry is consistent.

    One issue to look out for is that the sequence should always
    be set *before* the quality scores. Setting the sequence will
    also erase any quality scores that were set previously.
    '''
# Now only called when instances are created from Python
# (i.e. __cinit__; reads coming from a file use makeAlignedRead instead)
self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
# allocate some memory
# If size is 0, calloc does not return a pointer that can be passed to free()
# so allocate 40 bytes for a new read
self._delegate.m_data = 40
self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
self._delegate.data_len = 0

def __dealloc__(self):
    # frees both the bam1_t struct and its variable-length data block
    bam_destroy1(self._delegate)
"""return string representation of alignment.

The representation is an approximate :term:`sam` format.

An aligned read might not be associated with a :term:`Samfile`.
As a result :term:`tid` is shown instead of the reference name.

Similarly, the tags field is returned in its parsed state.
"""
# sam-parsing is done in sam.c/bam_format1_core which
# requires a valid header.
return "\t".join(map(str, (self.qname,

def compare(self, AlignedRead other):
    '''return -1,0,1, if contents in this are binary <,=,> to *other*'''

    # uncomment for debugging purposes
    # cdef unsigned char * oo, * tt
    # tt = <unsigned char*>(&t.core)
    # oo = <unsigned char*>(&o.core)
    # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
    # tt = <unsigned char*>(t.data)
    # oo = <unsigned char*>(o.data)
    # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])

    # Fast-path test for object identity
    # compare fixed-length core fields first, then the variable data block
    retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))

    if retval: return retval
    # NOTE(review): `cmp()` is Python-2 only; a Py3 port needs
    # (t.data_len > o.data_len) - (t.data_len < o.data_len).
    retval = cmp(t.data_len, o.data_len)
    if retval: return retval
    return memcmp(t.data, o.data, t.data_len)

# Disabled so long as __cmp__ is a special method
# (identity-based hash, matching default object semantics)
return _Py_HashPointer(<void *>self)
"""the query name (None if not present)"""
def __get__(self):
    src = self._delegate
    if src.core.l_qname == 0: return None
    return <char *>bam1_qname( src )

def __set__(self, qname ):
    # silently ignore empty names (keeps existing name untouched)
    if qname == None or len(qname) == 0: return
    src = self._delegate
    p = bam1_qname( src )

    # the qname is \0 terminated
    # resize the variable-data block in place to fit the new name
    pysam_bam_update( src,
                      src.core.l_qname,
                      l,
                      <uint8_t*>p )
    src.core.l_qname = l

    # re-acquire pointer to location in memory
    # as it might have moved
    strncpy( p, qname, l )
"""the :term:`cigar` alignment (None if not present).
"""
def __get__(self):
    cdef uint32_t * cigar_p
    src = self._delegate
    if src.core.n_cigar == 0: return None

    # decode packed 32-bit CIGAR ops into (operation, length) tuples
    cigar_p = bam1_cigar(src);
    for k from 0 <= k < src.core.n_cigar:
        op = cigar_p[k] & BAM_CIGAR_MASK
        l = cigar_p[k] >> BAM_CIGAR_SHIFT
        cigar.append((op, l))

def __set__(self, values ):
    # silently ignore empty values (keeps existing cigar untouched)
    if values == None or len(values) == 0: return
    src = self._delegate

    # get location of cigar string

    # create space for cigar data within src.data
    pysam_bam_update( src,
                      src.core.n_cigar * 4,
                      len(values) * 4,
                      <uint8_t*>p )

    # length is number of cigar operations, not bytes
    src.core.n_cigar = len(values)

    # re-acquire pointer to location in memory
    # as it might have moved

    # insert cigar operations (re-packing (op, l) tuples)
    for op, l in values:
        p[k] = l << BAM_CIGAR_SHIFT | op

    ## setting the cigar string also updates the "bin" attribute
    src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
"""read sequence bases, including :term:`soft clipped` bases (None if not present)"""
def __get__(self):
    src = self._delegate

    if src.core.l_qseq == 0: return None

    return get_seq_range(src, 0, src.core.l_qseq)

def __set__(self,seq):
    # samtools manages sequence and quality length memory together
    # if no quality information is present, the first byte says 0xff.
    if seq == None or len(seq) == 0: return

    cdef int l, k, nbytes_new, nbytes_old

    src = self._delegate

    # as the sequence is stored in half-bytes, the total length (sequence
    # plus quality scores) is (l+1)/2 + l
    nbytes_new = (l+1)/2 + l
    nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
    # acquire pointer to location in memory

    # resize the variable-data block for the new seq+qual payload
    pysam_bam_update( src,
                      nbytes_old,
                      nbytes_new,
                      p )

    # re-acquire pointer to location in memory
    # as it might have moved
    # zero the payload before packing the 4-bit encoded bases
    for k from 0 <= k < nbytes_new: p[k] = 0
    # convert to C string
    for k from 0 <= k < l:
        p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)

    # erase qualities: setting seq invalidates any previous qual values
    p = bam1_qual( src )
"""read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
def __get__(self):
    src = self._delegate

    if src.core.l_qseq == 0: return None

    return get_qual_range(src, 0, src.core.l_qseq)

def __set__(self,qual):
    # note that space is already allocated via the sequences
    src = self._delegate
    p = bam1_qual( src )
    if qual == None or len(qual) == 0:
        # if absent - set to 0xff
        return

    # convert to C string; qual length must match the stored sequence
    if src.core.l_qseq != l:
        raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
    # NOTE(review): this assert is redundant — the raise above already
    # guarantees equality; it can be removed.
    assert src.core.l_qseq == l
    # store raw phred values (input is phred+33 ASCII)
    for k from 0 <= k < l:
        p[k] = <uint8_t>q[k] - 33
"""aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present)

SAM/BAM files may included extra flanking bases sequences that were
not part of the alignment.  These bases may be the result of the
Smith-Waterman or other algorithms, which may not require alignments
that begin at the first residue or end at the last.  In addition,
extra sequencing adapters, multiplex identifiers, and low-quality bases that
were not considered for alignment may have been retained."""
def __get__(self):
    cdef uint32_t start, end
    src = self._delegate

    if src.core.l_qseq == 0: return None

    # clip boundaries are derived from the CIGAR soft-clip operations
    start = query_start(src)
    end   = query_end(src)

    return get_seq_range(src, start, end)

"""aligned query sequence quality values (None if not present)"""
def __get__(self):
    cdef uint32_t start, end
    src = self._delegate

    if src.core.l_qseq == 0: return None

    start = query_start(src)
    end   = query_end(src)

    return get_qual_range(src, start, end)

"""start index of the aligned query portion of the sequence (0-based, inclusive)"""
def __get__(self):
    return query_start(self._delegate)

"""end index of the aligned query portion of the sequence (0-based, exclusive)"""
def __get__(self):
    return query_end(self._delegate)

"""Length of the aligned query sequence"""
def __get__(self):
    src = self._delegate
    return query_end(src)-query_start(src)
"""the tags in the AUX field.

This property permits convenience access to
the tags. Changes it the returned list will
not update the tags automatically. Instead,
the following is required for adding a
new tag::

    read.tags = read.tags + [("RG",0)]

This method will happily write the same tag
multiple times.
"""
def __get__(self):
    src = self._delegate
    if src.l_aux == 0: return []

    # walk the raw aux byte stream: 2-byte tag, 1-byte type, then value
    while s < (src.data + src.data_len):
        if auxtype in ('c', 'C'):
            value = <int>bam_aux2i(s)
        elif auxtype in ('s', 'S'):
            value = <int>bam_aux2i(s)
        elif auxtype in ('i', 'I'):
            # NOTE(review): BUG — 'i'/'I' are *integer* aux types but the
            # value is cast to <float>, silently converting ints to floats.
            # Should be <int>bam_aux2i(s), matching the 'c'/'s' cases above.
            value = <float>bam_aux2i(s)
        elif auxtype == 'f':
            value = <float>bam_aux2f(s)
        elif auxtype == 'd':
            value = <double>bam_aux2d(s)
        elif auxtype == 'A':
            value = "%c" % <char>bam_aux2A(s)
        elif auxtype in ('Z', 'H'):
            value = <char*>bam_aux2Z(s)
            # +1 for NULL terminated string

        result.append( (auxtag, value) )
def __set__(self, tags):
    cdef uint8_t * new_data
    cdef int guessed_size, control_size
    cdef int max_size, size, offset

    src = self._delegate

    # map samtools code to python.struct code and byte size
    buffer = ctypes.create_string_buffer(max_size)

    for pytag, value in tags:
        # choose the narrowest BAM integer type that can hold *value*
        if t == types.FloatType:
            fmt, pytype = "<cccf", 'f'
        elif t == types.IntType:
            # NOTE(review): only *lower* bounds are checked on this signed
            # path — e.g. value=200 selects 'c' (signed byte) and overflows.
            # Upper-bound checks (<=127, <=32767) are needed as well.
            if value >= -127: fmt, pytype = "<cccb", 'c'
            elif value >= -32767: fmt, pytype = "<ccch", 's'
            elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
            # NOTE(review): `'i'[0]` is just 'i' — the indexing is a no-op.
            else: fmt, pytype = "<ccci", 'i'[0]
        # unsigned path (elided branch header)
        if value <= 255: fmt, pytype = "<cccB", 'C'
        elif value <= 65535: fmt, pytype = "<cccH", 'S'
        elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
        else: fmt, pytype = "<cccI", 'I'

        # Note: hex strings (H) are not supported yet
        fmt, pytype = "<cccc", 'A'
        # strings are packed with their trailing NUL byte
        fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'

        size = struct.calcsize(fmt)
        if offset + size > max_size:
            raise NotImplementedError("tags field too large")

        struct.pack_into( fmt,
                          buffer,
                          offset )

    # delete the old data and allocate new
    # if offset == 0, the aux field will be
    # empty
    pysam_bam_update( src,
                      src.l_aux,
                      offset,
                      bam1_aux( src ) )

    # copy data only if there is any

    # get location of new data

    # check if there is direct path from buffer.raw to tmp
    memcpy( s, temp, offset )
"""properties flag"""
# raw SAM FLAG bitfield; see the BAM_F* constants
def __get__(self): return self._delegate.core.flag
def __set__(self, flag): self._delegate.core.flag = flag

"""
DEPRECATED from pysam-0.4 - use tid in the future.
The rname field caused a lot of confusion as it returns
the :term:`target` ID instead of the reference sequence
name.

This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
"""
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid

"""
This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
"""
# `tid` is the preferred, non-deprecated accessor for the same field
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid
"""0-based leftmost coordinate"""
def __get__(self): return self._delegate.core.pos
def __set__(self, pos):
    ## setting the position also updates the "bin" attribute.
    ## FIX(review): assign the new position *before* computing the bin;
    ## the original computed bam_reg2bin from the stale src.core.pos and
    ## only assigned the new pos afterwards, leaving core.bin based on
    ## the previous coordinate.
    cdef bam1_t * src
    src = self._delegate
    src.core.pos = pos
    if src.core.n_cigar:
        # with a CIGAR, bin covers [pos, alignment end)
        src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
    else:
        # without a CIGAR, treat the read as length 1 for binning
        src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
"""properties bin"""
# BAI index bin of the alignment (computed via bam_reg2bin)
def __get__(self): return self._delegate.core.bin
def __set__(self, bin): self._delegate.core.bin = bin

'''length of the read (read only). Returns 0 if not given.'''
def __get__(self): return self._delegate.core.l_qseq

'''aligned end position of the read on the reference genome.  Returns
None if not available.'''
def __get__(self):
    src = self._delegate
    # unmapped reads / missing CIGAR have no defined end position
    if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
        return None
    return bam_calend(&src.core, bam1_cigar(src))

'''aligned length of the read on the reference genome.  Returns None if
not available.'''
def __get__(self):
    src = self._delegate
    if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
        return None
    # reference span = CIGAR-derived end minus start position
    return bam_calend(&src.core,
                       bam1_cigar(src)) - \
                       self._delegate.core.pos
"""mapping quality"""
def __get__(self): return self._delegate.core.qual
def __set__(self, qual): self._delegate.core.qual = qual

"""the :term:`reference` id of the mate
deprecated, use RNEXT instead.
"""
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid

"""the :term:`reference` id of the mate """
# preferred accessor for the same core.mtid field as the deprecated mrnm
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid

"""the position of the mate
deprecated, use PNEXT instead."""
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos

"""the position of the mate"""
# preferred accessor for the same core.mpos field as the deprecated mpos
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos

"""the insert size
deprecated: use tlen instead"""
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize

"""the insert size"""
# preferred accessor for the same core.isize field as the deprecated isize
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize
"""true if read is paired in sequencing"""
def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
def __set__(self,val):
    if val: self._delegate.core.flag |= BAM_FPAIRED
    else: self._delegate.core.flag &= ~BAM_FPAIRED

property is_proper_pair:
    """true if read is mapped in a proper pair"""
    def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
        else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR

property is_unmapped:
    """true if read itself is unmapped"""
    def __get__(self): return (self.flag & BAM_FUNMAP) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FUNMAP
        else: self._delegate.core.flag &= ~BAM_FUNMAP

property mate_is_unmapped:
    """true if the mate is unmapped"""
    def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FMUNMAP
        else: self._delegate.core.flag &= ~BAM_FMUNMAP

property is_reverse:
    """true if read is mapped to reverse strand"""
    def __get__(self): return (self.flag & BAM_FREVERSE) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FREVERSE
        else: self._delegate.core.flag &= ~BAM_FREVERSE

property mate_is_reverse:
    """true if the mate is mapped to the reverse strand"""
    def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FMREVERSE
        else: self._delegate.core.flag &= ~BAM_FMREVERSE
2541 """true if this is read1"""
2542 def __get__(self): return (self.flag & BAM_FREAD1) != 0
2543 def __set__(self,val):
2544 if val: self._delegate.core.flag |= BAM_FREAD1
2545 else: self._delegate.core.flag &= ~BAM_FREAD1
2547 """true if this is read2"""
2548 def __get__(self): return (self.flag & BAM_FREAD2) != 0
2549 def __set__(self,val):
2550 if val: self._delegate.core.flag |= BAM_FREAD2
2551 else: self._delegate.core.flag &= ~BAM_FREAD2
2552 property is_secondary:
2553 """true if not primary alignment"""
2554 def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
2555 def __set__(self,val):
2556 if val: self._delegate.core.flag |= BAM_FSECONDARY
2557 else: self._delegate.core.flag &= ~BAM_FSECONDARY
2559 """true if QC failure"""
2560 def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
2561 def __set__(self,val):
2562 if val: self._delegate.core.flag |= BAM_FQCFAIL
2563 else: self._delegate.core.flag &= ~BAM_FQCFAIL
2564 property is_duplicate:
2565 """true if optical or PCR duplicate"""
2566 def __get__(self): return (self.flag & BAM_FDUP) != 0
2567 def __set__(self,val):
2568 if val: self._delegate.core.flag |= BAM_FDUP
2569 else: self._delegate.core.flag &= ~BAM_FDUP
2571 """a list of reference positions that this read aligns to."""
2573 cdef uint32_t k, i, pos
2575 cdef uint32_t * cigar_p
2579 src = self._delegate
2580 if src.core.n_cigar == 0: return []
2584 cigar_p = bam1_cigar(src)
2585 for k from 0 <= k < src.core.n_cigar:
2586 op = cigar_p[k] & BAM_CIGAR_MASK
2587 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2588 if op == BAM_CMATCH:
2589 for i from pos <= i < pos + l:
2592 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2597 def overlap( self, uint32_t start, uint32_t end ):
2598 """return number of aligned bases of read overlapping the interval *start* and *end*
2599 on the reference sequence.
2601 cdef uint32_t k, i, pos, overlap
2603 cdef uint32_t * cigar_p
2608 src = self._delegate
2609 if src.core.n_cigar == 0: return 0
2613 cigar_p = bam1_cigar(src)
2614 for k from 0 <= k < src.core.n_cigar:
2615 op = cigar_p[k] & BAM_CIGAR_MASK
2616 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2618 if op == BAM_CMATCH:
2619 o = min( pos + l, end) - max( pos, start )
2620 if o > 0: overlap += o
2622 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2628 """retrieves optional data given a two-letter *tag*"""
2629 #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
2631 v = bam_aux_get(self._delegate, tag)
2632 if v == NULL: raise KeyError( "tag '%s' not present" % tag )
2634 if type == 'c' or type == 'C' or type == 's' or type == 'S':
2635 return <int>bam_aux2i(v)
2636 elif type == 'i' or type == 'I':
2637 return <int32_t>bam_aux2i(v)
2638 elif type == 'f' or type == 'F':
2639 return <float>bam_aux2f(v)
2640 elif type == 'd' or type == 'D':
2641 return <double>bam_aux2d(v)
2643 # there might a more efficient way
2644 # to convert a char into a string
2645 return '%c' % <char>bam_aux2A(v)
2647 return <char*>bam_aux2Z(v)
2649 def fancy_str (self):
2650 """returns list of fieldnames/values in pretty format for debugging
2654 "tid": "Contig index",
2655 "pos": "Mapped position on contig",
2656 "mtid": "Contig index for mate pair",
2657 "mpos": "Position of mate pair",
2658 "isize": "Insert size",
2659 "flag": "Binary flag",
2660 "n_cigar": "Count of cigar entries",
2661 "cigar": "Cigar entries",
2662 "qual": "Mapping quality",
2663 "bin": "Bam index bin number",
2664 "l_qname": "Length of query name",
2665 "qname": "Query name",
2666 "l_qseq": "Length of query sequence",
2667 "qseq": "Query sequence",
2668 "bqual": "Quality scores",
2669 "l_aux": "Length of auxilary data",
2670 "m_data": "Maximum data length",
2671 "data_len": "Current data length",
2673 fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
2674 "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
2675 "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
2677 for f in fields_names_in_order:
2678 if not f in self.__dict__:
2680 ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
2682 for f in self.__dict__:
2683 if not f in field_names:
2684 ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
2687 cdef class PileupProxy:
2688 '''A pileup column. A pileup column contains
2689 all the reads that map to a certain target base.
2692 chromosome ID as is defined in the header
2694 the target base coordinate (0-based)
2696 number of reads mapping to this column
2698 list of reads (:class:`pysam.PileupRead`) aligned to this column
2700 This class is a proxy for results returned by the samtools pileup engine.
2701 If the underlying engine iterator advances, the results of this column
2704 cdef bam_pileup1_t * plp
2710 raise TypeError("This class cannot be instantiated from Python")
2713 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
2715 "\n".join( map(str, self.pileups) )
2718 '''the chromosome ID as is defined in the header'''
2719 def __get__(self): return self.tid
2722 '''number of reads mapping to this column.'''
2723 def __get__(self): return self.n_pu
2724 def __set__(self, n): self.n_pu = n
2727 def __get__(self): return self.pos
2730 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
2734 # warning: there could be problems if self.n and self.buf are
2736 for x from 0 <= x < self.n_pu:
2737 pileups.append( makePileupRead( &(self.plp[x])) )
2740 cdef class PileupRead:
2741 '''A read aligned to a column.
2745 AlignedRead _alignment
2754 raise TypeError("This class cannot be instantiated from Python")
2757 return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
2760 """a :class:`pysam.AlignedRead` object of the aligned read"""
2762 return self._alignment
2764 """position of the read base at the pileup site, 0-based"""
2768 """indel length; 0 for no indel, positive for ins and negative for del"""
2772 """1 iff the base on the padded read is a deletion"""
2777 return self._is_head
2780 return self._is_tail
2786 '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
2787 def __init__(self, id = 1):
2791 def setdevice(self, filename):
2792 '''open an existing file, like "/dev/null"'''
2793 fd = os.open(filename, os.O_WRONLY)
2796 def setfile(self, filename):
2797 '''open a new file.'''
2798 fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
    def setfd(self, fd):
        '''redirect the stream on unit *self.id* to the already-open file
        descriptor *fd*.

        The current descriptor on *self.id* is duplicated and pushed onto
        ``self.streams`` so a later ``restore()`` can undo the redirection.
        *fd* itself is closed after being duplicated onto *self.id*, so the
        caller must not use it afterwards.

        NOTE(review): flushes both sys.stdout and sys.stderr regardless of
        which unit *self.id* refers to — presumably to make sure no buffered
        data ends up on the new stream.
        '''
        ofd = os.dup(self.id)      # Save old stream on new unit.
        self.streams.append(ofd)
        sys.stdout.flush()         # Buffered data goes to old stream.
        sys.stderr.flush()         # Buffered data goes to old stream.
        os.dup2(fd, self.id)       # Open unit 1 on new stream.
        os.close(fd)               # Close other unit (look out, caller.)
2810 '''restore previous output stream'''
2812 # the following was not sufficient, hence flush both stderr and stdout
2813 # os.fsync( self.id )
2816 os.dup2(self.streams[-1], self.id)
2817 os.close(self.streams[-1])
2818 del self.streams[-1]
2820 def _samtools_dispatch( method,
2822 catch_stdout = True,
2823 catch_stderr = False,
2825 '''call ``method`` in samtools providing arguments in args.
2828 This method redirects stdout and (optionally) stderr to capture it
2829 from samtools. If for some reason stdout/stderr disappears
2830 the reason might be in this method.
2833 The current implementation might only work on linux.
2836 This method captures stdout and stderr using temporary files,
2837 which are then read into memory in their entirety. This method
2838 is slow and might cause large memory overhead.
2840 See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
2841 on the topic of redirecting stderr/stdout.
2844 # note that debugging this module can be a problem
2845 # as stdout/stderr will not appear on the terminal
2847 # some special cases
2848 if method == "index":
2849 if not os.path.exists( args[0] ):
2850 raise IOError( "No such file or directory: '%s'" % args[0] )
2852 # redirect stderr and stdout to file
2854 stderr_h, stderr_f = tempfile.mkstemp()
2855 stderr_save = Outs( sys.stderr.fileno() )
2856 stderr_save.setfd( stderr_h )
2859 stdout_h, stdout_f = tempfile.mkstemp()
2860 stdout_save = Outs( sys.stdout.fileno() )
2861 stdout_save.setfd( stdout_h )
2863 # patch for `samtools view`
2864 # samtools `view` closes stdout, from which I can not
2865 # recover. Thus redirect output to file with -o option.
2866 if method == "view":
2867 if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
2868 args = ( "-o", stdout_f ) + args
2870 # do the function call to samtools
2872 cdef int i, n, retval
2875 # allocate two more for first (dummy) argument (contains command)
2876 cargs = <char**>calloc( n+2, sizeof( char *) )
2877 cargs[0] = "samtools"
2879 for i from 0 <= i < n: cargs[i+2] = args[i]
2881 retval = pysam_dispatch(n+2, cargs)
2884 # restore stdout/stderr. This will also flush, so
2885 # needs to be before reading back the file contents
2887 stdout_save.restore()
2888 out_stdout = open( stdout_f, "r").readlines()
2889 os.remove( stdout_f )
2894 stderr_save.restore()
2895 out_stderr = open( stderr_f, "r").readlines()
2896 os.remove( stderr_f )
2900 return retval, out_stderr, out_stdout
2903 '''the results of a SNP call.'''
2906 cdef char _reference_base
2908 cdef int _consensus_quality
2909 cdef int _snp_quality
2910 cdef int _rms_mapping_quality
2914 '''the chromosome ID as is defined in the header'''
2919 '''nucleotide position of SNP.'''
2920 def __get__(self): return self._pos
2922 property reference_base:
2923 '''reference base at pos. ``N`` if no reference sequence supplied.'''
2924 def __get__(self): return PyString_FromStringAndSize( &self._reference_base, 1 )
2927 '''the genotype called.'''
2928 def __get__(self): return PyString_FromStringAndSize( &self._genotype, 1 )
2930 property consensus_quality:
2931 '''the genotype quality (Phred-scaled).'''
2932 def __get__(self): return self._consensus_quality
2934 property snp_quality:
2935 '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
2936 def __get__(self): return self._snp_quality
2938 property mapping_quality:
2939 '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
2940 def __get__(self): return self._rms_mapping_quality
2943 '''coverage or read depth - the number of reads involved in the call.'''
2944 def __get__(self): return self._coverage
2948 return "\t".join( map(str, (
2951 self.reference_base,
2953 self.consensus_quality,
2955 self.mapping_quality,
2959 # cdef class SNPCallerBase:
2960 # '''Base class for SNP callers.
2963 # minimum base quality (possibly capped by BAQ)
2965 # coefficient for adjusting mapQ of poor mappings
2967 # theta in maq consensus calling model
2969 # number of haplotypes in the sample
2971 # prior of a difference between two haplotypes
2974 # cdef bam_maqcns_t * c
2975 # cdef IteratorColumn iter
2977 # def __cinit__(self,
2978 # IteratorColumn iterator_column,
2981 # self.iter = iterator_column
2982 # self.c = bam_maqcns_init()
2984 # # set the default parameterization according to
2987 # # new default mode for samtools >0.1.10
2988 # self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
2990 # self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
2991 # # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
2992 # self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
2993 # self.c.het_rate = kwargs.get( "het_rate", 0.001 )
2994 # self.c.theta = kwargs.get( "theta", 0.83 )
2996 # if self.c.errmod != BAM_ERRMOD_MAQ2:
2997 # self.c.theta += 0.02
2999 # # call prepare AFTER setting parameters
3000 # bam_maqcns_prepare( self.c )
3002 # def __dealloc__(self):
3003 # bam_maqcns_destroy( self.c )
3005 # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
3006 # '''debugging output.'''
3008 # pysam_dump_glf( g, self.c );
3010 # for x in range(self.iter.n_plp):
3011 # print "--> read %i %s %i" % (x,
3012 # bam1_qname(self.iter.plp[x].b),
3013 # self.iter.plp[x].qpos,
3016 # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
3029 # printf("-------------------------------------\n");
3030 # sys.stdout.flush()
3032 # cdef class IteratorSNPCalls( SNPCallerBase ):
3033 # """*(IteratorColumn iterator)*
3035 # call SNPs within a region.
3037 # *iterator* is a pileup iterator. SNPs will be called
3038 # on all positions returned by this iterator.
3040 # This caller is fast if SNPs are called over large continuous
3041 # regions. It is slow, if instantiated frequently and in random
3042 # order as the sequence will have to be reloaded.
3046 # def __cinit__(self,
3047 # IteratorColumn iterator_column,
3050 # assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
3052 # def __iter__(self):
3055 # def __next__(self):
3056 # """python version of next().
3059 # # the following code was adapted from bam_plcmd.c:pileup_func()
3062 # if self.iter.n_plp < 0:
3063 # raise ValueError("error during iteration" )
3065 # if self.iter.plp == NULL:
3066 # raise StopIteration
3068 # cdef char * seq = self.iter.getSequence()
3069 # cdef int seq_len = self.iter.seq_len
3071 # assert seq != NULL
3074 # if self.iter.pos >= seq_len:
3075 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3077 # cdef int rb = seq[self.iter.pos]
3081 # g = bam_maqcns_glfgen( self.iter.n_plp,
3083 # bam_nt16_table[rb],
3086 # if pysam_glf_depth( g ) == 0:
3087 # cns = 0xfu << 28 | 0xf << 24
3089 # cns = glf2cns(g, <int>(self.c.q_r + .499))
3096 # call._tid = self.iter.tid
3097 # call._pos = self.iter.pos
3098 # call._reference_base = rb
3099 # call._genotype = bam_nt16_rev_table[cns>>28]
3100 # call._consensus_quality = cns >> 8 & 0xff
3101 # call._snp_quality = cns & 0xff
3102 # call._rms_mapping_quality = cns >> 16&0xff
3103 # call._coverage = self.iter.n_plp
3107 # cdef class SNPCaller( SNPCallerBase ):
3108 # '''*(IteratorColumn iterator_column )*
3110 # The samtools SNP caller.
3112 # This object will call SNPs in *samfile* against the reference
3113 # sequence in *fasta*.
3115 # This caller is fast for calling few SNPs in selected regions.
3117 # It is slow, if called over large genomic regions.
3121 # def __cinit__(self,
3122 # IteratorColumn iterator_column,
3127 # def call(self, reference, int pos ):
3128 # """call a snp on chromosome *reference*
3129 # and position *pos*.
3131 # returns a :class:`SNPCall` object.
3134 # cdef int tid = self.iter.samfile.gettid( reference )
3136 # self.iter.reset( tid, pos, pos + 1 )
3141 # if self.iter.n_plp < 0:
3142 # raise ValueError("error during iteration" )
3144 # if self.iter.plp == NULL:
3145 # raise ValueError( "no reads in region - no call" )
3147 # if self.iter.pos == pos: break
3149 # cdef char * seq = self.iter.getSequence()
3150 # cdef int seq_len = self.iter.seq_len
3152 # assert seq != NULL
3155 # if self.iter.pos >= seq_len:
3156 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3158 # cdef int rb = seq[self.iter.pos]
3162 # # g = bam_maqcns_glfgen( self.iter.n_plp,
3164 # # bam_nt16_table[rb],
3168 # # if pysam_glf_depth( g ) == 0:
3169 # # cns = 0xfu << 28 | 0xf << 24
3171 # # cns = glf2cns(g, <int>(self.c.q_r + .499))
3178 # call._tid = self.iter.tid
3179 # call._pos = self.iter.pos
3180 # call._reference_base = rb
3181 # call._genotype = bam_nt16_rev_table[cns>>28]
3182 # call._consensus_quality = cns >> 8 & 0xff
3183 # call._snp_quality = cns & 0xff
3184 # call._rms_mapping_quality = cns >> 16&0xff
3185 # call._coverage = self.iter.n_plp
3189 # cdef class IndelCall:
3190 # '''the results of an indel call.'''
3193 # cdef int _coverage
3194 # cdef int _rms_mapping_quality
3195 # cdef bam_maqindel_ret_t * _r
3197 # def __cinit__(self):
3203 # '''the chromosome ID as is defined in the header'''
3204 # def __get__(self):
3208 # '''nucleotide position of SNP.'''
3209 # def __get__(self): return self._pos
3211 # property genotype:
3212 # '''the genotype called.'''
3213 # def __get__(self):
3214 # if self._r.gt == 0:
3215 # s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3216 # return "%s/%s" % (s,s)
3217 # elif self._r.gt == 1:
3218 # s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3219 # return "%s/%s" % (s,s)
3221 # return "%s/%s" % (self.first_allele, self.second_allele )
3223 # property consensus_quality:
3224 # '''the genotype quality (Phred-scaled).'''
3225 # def __get__(self): return self._r.q_cns
3227 # property snp_quality:
3228 # '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
3229 # def __get__(self): return self._r.q_ref
3231 # property mapping_quality:
3232 # '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
3233 # def __get__(self): return self._rms_mapping_quality
3235 # property coverage:
3236 # '''coverage or read depth - the number of reads involved in the call.'''
3237 # def __get__(self): return self._coverage
3239 # property first_allele:
3240 # '''sequence of first allele.'''
3241 # def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3243 # property second_allele:
3244 # '''sequence of second allele.'''
3245 # def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3247 # property reads_first:
3248 # '''reads supporting first allele.'''
3249 # def __get__(self): return self._r.cnt1
3251 # property reads_second:
3252 #             '''reads supporting second allele.'''
3253 # def __get__(self): return self._r.cnt2
3255 # property reads_diff:
3256 # '''reads supporting first allele.'''
3257 # def __get__(self): return self._r.cnt_anti
3259 # def __str__(self):
3261 # return "\t".join( map(str, (
3265 # self.consensus_quality,
3267 # self.mapping_quality,
3269 # self.first_allele,
3270 # self.second_allele,
3272 # self.reads_second,
3273 # self.reads_diff ) ) )
3275 # def __dealloc__(self ):
3276 # bam_maqindel_ret_destroy(self._r)
3278 # cdef class IndelCallerBase:
3279 # '''Base class for SNP callers.
3282 # minimum base quality (possibly capped by BAQ)
3284 # coefficient for adjusting mapQ of poor mappings
3286 # theta in maq consensus calling model
3288 # number of haplotypes in the sample
3290 # prior of a difference between two haplotypes
3293 # cdef bam_maqindel_opt_t * options
3294 # cdef IteratorColumn iter
3296 # cdef int max_depth
3298 # def __cinit__(self,
3299 # IteratorColumn iterator_column,
3303 # self.iter = iterator_column
3305 # assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
3307 # self.options = bam_maqindel_opt_init()
3309 # # set the default parameterization according to
3312 # self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
3313 # self.options.q_indel = kwargs.get( "q_indel", 40 )
3314 # self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
3315 # self.max_depth = kwargs.get( "max_depth", 1024 )
3317 # def __dealloc__(self):
3318 # free( self.options )
3320 # def _call( self ):
3322 # cdef char * seq = self.iter.getSequence()
3323 # cdef int seq_len = self.iter.seq_len
3325 # assert seq != NULL
3328 # if self.iter.pos >= seq_len:
3329 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3331 # cdef bam_maqindel_ret_t * r
3333 # cdef int m = min( self.max_depth, self.iter.n_plp )
3335 # # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
3336 # # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
3337 # # self.options.indel_err, self.options.ambi_thres );
3339 # r = bam_maqindel(m,
3347 # if r == NULL: return None
3349 # cdef IndelCall call
3350 # call = IndelCall()
3352 # call._tid = self.iter.tid
3353 # call._pos = self.iter.pos
3354 # call._coverage = self.iter.n_plp
3356 # cdef uint64_t rms_aux = 0
3358 # cdef bam_pileup1_t * p
3361 # for i from 0 <= i < self.iter.n_plp:
3362 # p = self.iter.plp + i
3363 # if p.b.core.qual < self.cap_mapQ:
3364 # tmp = p.b.core.qual
3366 # tmp = self.cap_mapQ
3367 # rms_aux += tmp * tmp
3369 # call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
3373 # cdef class IndelCaller( IndelCallerBase ):
3374 # '''*(IteratorColumn iterator_column )*
3376 # The samtools SNP caller.
3378 # This object will call SNPs in *samfile* against the reference
3379 # sequence in *fasta*.
3381 # This caller is fast for calling few SNPs in selected regions.
3383 # It is slow, if called over large genomic regions.
3386 # def __cinit__(self,
3387 # IteratorColumn iterator_column,
3392 # def call(self, reference, int pos ):
3393 # """call a snp on chromosome *reference*
3394 # and position *pos*.
3396 # returns a :class:`SNPCall` object or None, if no indel call could be made.
3399 # cdef int tid = self.iter.samfile.gettid( reference )
3401 # self.iter.reset( tid, pos, pos + 1 )
3406 # if self.iter.n_plp < 0:
3407 # raise ValueError("error during iteration" )
3409 # if self.iter.plp == NULL:
3410 # raise ValueError( "no reads in region - no call" )
3412 # if self.iter.pos == pos: break
3414 # return self._call()
3416 # cdef class IteratorIndelCalls( IndelCallerBase ):
3417 # """*(IteratorColumn iterator)*
3419 # call indels within a region.
3421 # *iterator* is a pileup iterator. SNPs will be called
3422 # on all positions returned by this iterator.
3424 # This caller is fast if SNPs are called over large continuous
3425 # regions. It is slow, if instantiated frequently and in random
3426 # order as the sequence will have to be reloaded.
3430 # def __cinit__(self,
3431 # IteratorColumn iterator_column,
3436 # def __iter__(self):
3439 # def __next__(self):
3440 # """python version of next().
3443 # # the following code was adapted from bam_plcmd.c:pileup_func()
3446 # if self.iter.n_plp < 0:
3447 # raise ValueError("error during iteration" )
3449 # if self.iter.plp == NULL:
3450 # raise StopIteration
3452 # return self._call()
3456 cdef class IndexedReads:
3457 """index a bamfile by read.
3459 The index is kept in memory.
3461 By default, the file is re-openend to avoid conflicts if
3462 multiple operators work on the same file. Set *reopen* = False
3463 to not re-open *samfile*.
3466 cdef Samfile samfile
3469 # true if samfile belongs to this object
3470 cdef int owns_samfile
3472 def __init__(self, Samfile samfile, int reopen = True ):
3473 self.samfile = samfile
3475 if samfile.isbam: mode = "rb"
3478 # reopen the file - note that this makes the iterator
3479 # slow and causes pileup to slow down significantly.
3481 store = StderrStore()
3482 self.fp = samopen( samfile._filename, mode, NULL )
3484 assert self.fp != NULL
3485 self.owns_samfile = True
3487 self.fp = samfile.samfile
3488 self.owns_samfile = False
3490 assert samfile.isbam, "can only IndexReads on bam files"
3495 self.index = collections.defaultdict( list )
3497 # this method will start indexing from the current file position
3500 cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
3505 pos = bam_tell( self.fp.x.bam )
3506 ret = samread( self.fp, b)
3508 qname = bam1_qname( b )
3509 self.index[qname].append( pos )
3513 def find( self, qname ):
3514 if qname in self.index:
3515 return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
3517 raise KeyError( "read %s not found" % qname )
3519 def __dealloc__(self):
3520 if self.owns_samfile: samclose( self.fp )
3522 __all__ = ["Samfile",
3530 # "IteratorSNPCalls",
3533 # "IteratorIndelCalls",