1 # cython: embedsignature=True
3 # adds doc-strings for sphinx
14 from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
15 from cpython cimport PyErr_SetString
17 #from cpython.string cimport PyString_FromStringAndSize, PyString_AS_STRING
18 #from cpython.exc cimport PyErr_SetString, PyErr_NoMemory
20 # defines imported from samtools
25 ## These are bits set in the flag.
26 ## have to put these definitions here, in csamtools.pxd they got ignored
27 ## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
29 ## @abstract the read is mapped in a proper pair */
30 DEF BAM_FPROPER_PAIR =2
31 ## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
33 ## @abstract the mate is unmapped */
35 ## @abstract the read is mapped to the reverse strand */
37 ## @abstract the mate is mapped to the reverse strand */
39 ## @abstract this is read1 */
41 ## @abstract this is read2 */
43 ## @abstract not primary alignment */
44 DEF BAM_FSECONDARY =256
45 ## @abstract QC failure */
47 ## @abstract optical or PCR duplicate */
51 DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
57 DEF BAM_CSOFT_CLIP = 4
58 DEF BAM_CHARD_CLIP = 5
61 #####################################################################
62 # hard-coded constants
63 cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
64 cdef int max_pos = 2 << 29
66 # redirect stderr to 0
67 _logfile = open(os.path.devnull, "w")
68 pysam_set_stderr( PyFile_AsFile( _logfile ) )
70 #####################################################################
71 #####################################################################
72 #####################################################################
73 ## private factory methods
74 #####################################################################
75 cdef class AlignedRead
76 cdef makeAlignedRead(bam1_t * src):
77 '''enter src into AlignedRead.'''
78 cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
79 dest._delegate = bam_dup1(src)
82 cdef class PileupProxy
83 cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
84 cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
92 cdef makePileupRead( bam_pileup1_t * src ):
93 '''fill a PileupRead object from a bam_pileup1_t * object.'''
94 cdef PileupRead dest = PileupRead.__new__(PileupRead)
95 dest._alignment = makeAlignedRead( src.b )
97 dest._indel = src.indel
98 dest._level = src.level
99 dest._is_del = src.is_del
100 dest._is_head = src.is_head
101 dest._is_tail = src.is_tail
104 #####################################################################
105 #####################################################################
106 #####################################################################
107 ## Generic callbacks for inserting python callbacks.
108 #####################################################################
109 cdef int fetch_callback( bam1_t *alignment, void *f):
110 '''callback for bam_fetch.
112 calls function in *f* with a new :class:`AlignedRead` object as parameter.
114 a = makeAlignedRead( alignment )
117 class PileupColumn(object):
118 '''A pileup column. A pileup column contains
119 all the reads that map to a certain target base.
122 chromosome ID as is defined in the header
124 the target base coordinate (0-based)
126 number of reads mapping to this column
128 list of reads (:class:`pysam.PileupRead`) aligned to this column
131 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
132 "\n" + "\n".join( map(str, self.pileups) )
134 cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
135 '''callback for pileup.
137 calls function in *f* with a new :class:`Pileup` object as parameter.
140 chromosome ID as is defined in the header
142 start coordinate of the alignment, 0-based
144 number of elements in pl array
158 for x from 0 <= x < n:
159 pileups.append( makePileupRead( &(pl[x]) ) )
164 cdef int pileup_fetch_callback( bam1_t *b, void *data):
165 '''callback for bam_fetch.
167 Fetches reads and submits them to pileup.
169 cdef bam_plbuf_t * buf
170 buf = <bam_plbuf_t*>data
171 bam_plbuf_push(b, buf)
180 self.stderr_h, self.stderr_f = tempfile.mkstemp()
181 self.stderr_save = Outs( sys.stderr.fileno() )
182 self.stderr_save.setfd( self.stderr_h )
184 def readAndRelease( self ):
186 self.stderr_save.restore()
188 if os.path.exists(self.stderr_f):
189 lines = open( self.stderr_f, "r" ).readlines()
190 os.remove( self.stderr_f )
195 self.stderr_save.restore()
196 if os.path.exists(self.stderr_f):
197 os.remove( self.stderr_f )
202 class StderrStoreWindows():
203 '''does nothing. stderr can't be redirected on windows'''
204 def __init__(self): pass
205 def readAndRelease(self): return []
206 def release(self): pass
208 if platform.system()=='Windows':
210 StderrStore = StderrStoreWindows
213 ######################################################################
214 ######################################################################
215 ######################################################################
216 # valid types for sam headers
217 VALID_HEADER_TYPES = { "HD" : dict,
223 # order of records within sam headers
224 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )
226 # type conversions within sam header records
227 VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
228 "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
229 "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
230 "PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }
232 # output order of fields within records
233 VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
234 "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
235 "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
236 "PG" : ( "PN", "ID", "VN", "CL" ), }
239 ######################################################################
240 ######################################################################
241 ######################################################################
243 ######################################################################
244 cdef class Fastafile:
247 A *FASTA* file. The file is automatically opened.
249 The file expects an indexed fasta file.
252 add automatic indexing.
253 add function to get sequence names.
256 cdef char * _filename
257 # pointer to fastafile
258 cdef faidx_t * fastafile
260 def __cinit__(self, *args, **kwargs ):
261 self.fastafile = NULL
262 self._filename = NULL
263 self._open( *args, **kwargs )
266 '''return true if samfile has been opened.'''
267 return self.fastafile != NULL
270 if self.fastafile == NULL:
271 raise ValueError( "calling len() on closed file" )
273 return faidx_fetch_nseq(self.fastafile)
277 '''open an indexed fasta file.
279 This method expects an indexed fasta file.
282 # close a previously opened file
283 if self.fastafile != NULL: self.close()
284 if self._filename != NULL: free(self._filename)
285 self._filename = strdup(filename)
286 self.fastafile = fai_load( filename )
288 if self.fastafile == NULL:
289 raise IOError("could not open file `%s`" % filename )
292 if self.fastafile != NULL:
293 fai_destroy( self.fastafile )
294 self.fastafile = NULL
296 def __dealloc__(self):
298 if self._filename != NULL: free(self._filename)
301 '''number of :term:`filename` associated with this object.'''
303 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
304 return self._filename
312 '''*(reference = None, start = None, end = None, region = None)*
314 fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.
316 The region is specified by :term:`reference`, *start* and *end*.
318 fetch returns an empty string if the region is out of range or addresses an unknown *reference*.
320 If *reference* is given and *start* is None, the sequence from the
321 first base is returned. Similarly, if *end* is None, the sequence
322 until the last base is returned.
324 Alternatively, a samtools :term:`region` string can be supplied.
327 if not self._isOpen():
328 raise ValueError( "I/O operation on closed file" )
334 if reference is None: raise ValueError( 'no sequence/region supplied.' )
335 if start is None: start = 0
336 if end is None: end = max_pos -1
338 if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
339 if start == end: return ""
340 # valid ranges are from 0 to 2^29-1
341 if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
342 if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
343 # note: faidx_fetch_seq has a bug such that out-of-range access
344 # always returns the last residue. Hence do not use faidx_fetch_seq,
345 # but use fai_fetch instead
346 # seq = faidx_fetch_seq(self.fastafile,
351 region = "%s:%i-%i" % (reference, start+1, end)
352 seq = fai_fetch( self.fastafile,
356 # samtools adds a '\0' at the end
357 seq = fai_fetch( self.fastafile, region, &length )
364 py_seq = PyString_FromStringAndSize(seq, length)
370 cdef char * _fetch( self, char * reference, int start, int end, int * length ):
371 '''fetch sequence for reference, start and end'''
373 return faidx_fetch_seq(self.fastafile,
379 #------------------------------------------------------------------------
380 #------------------------------------------------------------------------
381 #------------------------------------------------------------------------
382 cdef int count_callback( bam1_t *alignment, void *f):
383 '''callback for bam_fetch - count number of reads.
385 cdef int* counter = (<int*>f)
388 ctypedef struct MateData:
393 #------------------------------------------------------------------------
394 #------------------------------------------------------------------------
395 #------------------------------------------------------------------------
396 cdef int mate_callback( bam1_t *alignment, void *f):
397 '''callback for bam_fetch = filter mate
399 cdef MateData * d = (<MateData*>f)
400 # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
401 # d.mate, d.name, bam1_qname(alignment),
402 # d.flag, alignment.core.flag, alignment.core.flag & d.flag)
405 # could be sped up by comparing the lengths of query strings first
408 # also, make sure that we get the other read by comparing
410 if alignment.core.flag & d.flag != 0 and \
411 strcmp( bam1_qname( alignment ), d.name ) == 0:
412 d.mate = bam_dup1( alignment )
416 '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
418 A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.
420 *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
421 (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
422 Use ``h`` to output header information in text (:term:`TAM`) mode.
424 If ``b`` is present, it must immediately follow ``r`` or ``w``.
425 Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
426 a :term:`BAM` formatted file for reading, type::
428 f = pysam.Samfile('ex1.bam','rb')
430 If mode is not specified, we will try to auto-detect in the order 'r', 'rb', thus both the following
433 f1 = pysam.Samfile('ex1.bam' )
434 f2 = pysam.Samfile('ex1.bam' )
436 If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
437 access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
439 For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
440 sources (see also the samtools format specification):
442 1. If *template* is given, the header is copied from a another *Samfile*
443 (*template* must be of type *Samfile*).
445 2. If *header* is given, the header is built from a multi-level dictionary. The first level
446 are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
447 being a list of tag-value pairs.
449 3. If *text* is given, new header text is copied from raw text.
451 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
455 def __cinit__(self, *args, **kwargs ):
457 self._filename = NULL
459 self._open( *args, **kwargs )
461 # allocate memory for iterator
462 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
465 '''return true if samfile has been opened.'''
466 return self.samfile != NULL
468 def _hasIndex( self ):
469 '''return true if samfile has an existing (and opened) index.'''
470 return self.index != NULL
475 Samfile template = None,
476 referencenames = None,
477 referencelengths = None,
482 '''open a sam/bam file.
484 If _open is called on an existing bamfile, the current file will be
485 closed and a new file will be opened.
488 # read mode autodetection
491 self._open(filename, 'r', template=template,
492 referencenames=referencenames,
493 referencelengths=referencelengths,
494 text=text, header=header, port=port)
496 except ValueError, msg:
498 self._open(filename, 'rb', template=template,
499 referencenames=referencenames,
500 referencelengths=referencelengths,
501 text=text, header=header, port=port)
504 assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
505 assert filename != NULL
507 # close a previously opened file
508 if self.samfile != NULL: self.close()
511 cdef bam_header_t * header_to_write
512 header_to_write = NULL
514 if self._filename != NULL: free(self._filename )
515 self._filename = strdup( filename )
517 self.isbam = len(mode) > 1 and mode[1] == 'b'
519 self.isremote = strncmp(filename,"http:",5) == 0 or \
520 strncmp(filename,"ftp:",4) == 0
526 # open file for writing
528 # header structure (used for writing)
530 # copy header from another file
531 header_to_write = template.samfile.header
534 header_to_write = self._buildHeader( header )
537 # build header from a target names and lengths
538 assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing"
539 assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
541 # allocate and fill header
542 header_to_write = bam_header_init()
543 header_to_write.n_targets = len(referencenames)
545 for x in referencenames: n += len(x) + 1
546 header_to_write.target_name = <char**>calloc(n, sizeof(char*))
547 header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
548 for x from 0 <= x < header_to_write.n_targets:
549 header_to_write.target_len[x] = referencelengths[x]
550 name = referencenames[x]
551 header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
552 strncpy( header_to_write.target_name[x], name, len(name) )
557 header_to_write.l_text = strlen(ctext)
558 header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
559 memcpy( header_to_write.text, ctext, strlen(ctext) )
561 header_to_write.hash = NULL
562 header_to_write.rg2lib = NULL
564 # open file. Header gets written to file at the same time for bam files
565 # and sam files (in the latter case, the mode needs to be wh)
566 store = StderrStore()
567 self.samfile = samopen( filename, mode, header_to_write )
570 # bam_header_destroy takes care of cleaning up of all the members
571 if not template and header_to_write != NULL:
572 bam_header_destroy( header_to_write )
575 # open file for reading
576 if strncmp( filename, "-", 1) != 0 and \
577 not self.isremote and \
578 not os.path.exists( filename ):
579 raise IOError( "file `%s` not found" % filename)
581 # try to detect errors
582 self.samfile = samopen( filename, mode, NULL )
583 if self.samfile == NULL:
584 raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)
586 if self.samfile.header == NULL:
587 raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode )
589 #disabled for autodetection to work
590 if self.samfile.header.n_targets == 0:
591 raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)
593 if self.samfile == NULL:
594 raise IOError("could not open file `%s`" % filename )
596 # check for index and open if present
597 if mode[0] == "r" and self.isbam:
599 if not self.isremote:
600 if not os.path.exists(filename +".bai"):
603 # returns NULL if there is no index or index could not be opened
604 self.index = bam_index_load(filename)
605 if self.index == NULL:
606 raise IOError("error while opening index `%s` " % filename )
608 self.index = bam_index_load(filename)
609 if self.index == NULL:
610 raise IOError("error while opening index `%s` " % filename )
612 def gettid( self, reference ):
614 convert :term:`reference` name into numerical :term:`tid`
616 returns -1 if reference is not known.
618 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
619 return pysam_reference2tid( self.samfile.header, reference )
621 def getrname( self, tid ):
623 convert numerical :term:`tid` into :term:`reference` name.'''
624 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
625 if not 0 <= tid < self.samfile.header.n_targets:
626 raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
627 return self.samfile.header.target_name[tid]
629 cdef char * _getrname( self, int tid ):
631 convert numerical :term:`tid` into :term:`reference` name.'''
632 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
633 if not 0 <= tid < self.samfile.header.n_targets:
634 raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
635 return self.samfile.header.target_name[tid]
637 def _parseRegion( self,
643 parse region information.
645 raise ValueError for for invalid regions.
647 returns a tuple of flag, tid, start and end. Flag indicates
648 whether some coordinates were supplied.
650 Note that regions are 1-based, while start,end are python coordinates.
652 # This method's main objective is to translate from a reference to a tid.
653 # For now, it calls bam_parse_region, which is clumsy. Might be worth
654 # implementing it all in pysam (makes use of khash).
657 cdef long long rstart
666 except OverflowError:
667 raise ValueError( 'start out of range (%i)' % start )
672 except OverflowError:
673 raise ValueError( 'end out of range (%i)' % end )
676 parts = re.split( "[:-]", region )
678 if len(parts) >= 2: rstart = int(parts[1]) - 1
679 if len(parts) >= 3: rend = int(parts[2])
681 if not reference: return 0, 0, 0, 0
683 rtid = self.gettid( reference )
684 if rtid < 0: raise ValueError( "invalid reference `%s`" % reference )
685 if rstart > rend: raise ValueError( 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend) )
686 if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
687 if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )
689 return 1, rtid, rstart, rend
691 def seek( self, uint64_t offset, int where = 0):
693 move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
696 if not self._isOpen():
697 raise ValueError( "I/O operation on closed file" )
699 raise NotImplementedError("seek only available in bam files")
700 return bam_seek( self.samfile.x.bam, offset, where )
704 return current file position
706 if not self._isOpen():
707 raise ValueError( "I/O operation on closed file" )
709 raise NotImplementedError("seek only available in bam files")
711 return bam_tell( self.samfile.x.bam )
721 fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
722 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
725 Without *reference* or *region* all reads will be fetched. The reads will be returned
726 ordered by reference sequence, which will not necessarily be the order within the file.
727 If *until_eof* is given, all reads from the current file position will be returned
728 *in order as they are within the file*.
730 If only *reference* is set, all reads aligned to *reference* will be fetched.
732 The method returns an iterator of type :class:`pysam.IteratorRow` unless
733 a *callback is provided. If *callback* is given, the callback will be executed
734 for each position within the :term:`region`. Note that callbacks currently work
735 only, if *region* or *reference* is given.
737 Note that a :term:`SAM` file does not allow random access. If *region* or *reference* are given,
738 an exception is raised.
740 cdef int rtid, rstart, rend, has_coord
742 if not self._isOpen():
743 raise ValueError( "I/O operation on closed file" )
745 has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
748 if not until_eof and not self._hasIndex() and not self.isremote:
749 raise ValueError( "fetch called on bamfile without index" )
752 if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
753 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
754 return bam_fetch(self.samfile.x.bam,
763 return IteratorRowRegion( self, rtid, rstart, rend )
766 return IteratorRowAll( self )
768 return IteratorRowAllRefs(self)
770 # check if header is present - otherwise sam_read1 aborts
771 # this happens if a bamfile is opened with mode 'r'
772 if self.samfile.header.n_targets == 0:
773 raise ValueError( "fetch called for samfile without header")
776 raise ValueError ("fetch for a region is not available for sam files" )
778 raise NotImplementedError( "callback not implemented yet" )
780 return IteratorRowAll( self )
784 '''return the mate of :class:`AlignedRead` *read*.
786 Throws a ValueError if read is unpaired or the mate
790 Calling this method will change the file position.
791 This might interfere with any iterators that have
792 not re-opened the file.
795 cdef uint32_t flag = read._delegate.core.flag
797 if flag & BAM_FPAIRED == 0:
798 raise ValueError( "read %s: is unpaired" % (read.qname))
799 if flag & BAM_FMUNMAP != 0:
800 raise ValueError( "mate %s: is unmapped" % (read.qname))
802 cdef MateData mate_data
804 mate_data.name = <char *>bam1_qname(read._delegate)
805 mate_data.mate = NULL
806 # xor flags to get the other mate
807 cdef int x = BAM_FREAD1 + BAM_FREAD2
808 mate_data.flag = ( flag ^ x) & x
810 bam_fetch(self.samfile.x.bam,
812 read._delegate.core.mtid,
813 read._delegate.core.mpos,
814 read._delegate.core.mpos + 1,
818 if mate_data.mate == NULL:
819 raise ValueError( "mate not found" )
821 cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
822 dest._delegate = mate_data.mate
831 '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
833 count reads :term:`region` using 0-based indexing. The region is specified by
834 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
836 Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
837 an exception is raised.
843 if not self._isOpen():
844 raise ValueError( "I/O operation on closed file" )
846 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
852 if not until_eof and not self._hasIndex() and not self.isremote:
853 raise ValueError( "fetch called on bamfile without index" )
856 raise ValueError( "counting functionality requires a region/reference" )
857 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
858 bam_fetch(self.samfile.x.bam,
867 raise ValueError ("count for a region is not available for sam files" )
877 perform a :term:`pileup` within a :term:`region`. The region is specified by
878 :term:`reference`, *start* and *end* (using 0-based indexing).
879 Alternatively, a samtools *region* string can be supplied.
881 Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
882 ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
884 The method returns an iterator of type :class:`pysam.IteratorColumn` unless
885 a *callback is provided. If a *callback* is given, the callback will be executed
886 for each column within the :term:`region`.
888 Note that :term:`SAM` formatted files do not allow random access.
889 In these files, if a *region* or *reference* are given an exception is raised.
891 Optional *kwargs* to the iterator:
894 The stepper controlls how the iterator advances.
895 Possible options for the stepper are
898 use all reads for pileup.
900 same filter and read processing as in :term:`csamtools` pileup
903 A :class:`FastaFile` object
906 Skip all reads with bits set in mask.
911 *all* reads which overlap the region are returned. The first base returned will be the
912 first base of the first read *not* necessarily the first base of the region used in the query.
914 The maximum number of reads considered for pileup is *8000*. This limit is set by
918 cdef int rtid, rstart, rend, has_coord
919 cdef bam_plbuf_t *buf
921 if not self._isOpen():
922 raise ValueError( "I/O operation on closed file" )
924 has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
927 if not self._hasIndex(): raise ValueError( "no index available for pileup" )
930 if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
932 buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
933 bam_fetch(self.samfile.x.bam,
934 self.index, rtid, rstart, rend,
935 buf, pileup_fetch_callback )
938 bam_plbuf_push( NULL, buf)
939 bam_plbuf_destroy(buf)
942 return IteratorColumnRegion( self,
948 return IteratorColumnAllRefs(self, **kwargs )
951 raise NotImplementedError( "pileup of samfiles not implemented yet" )
955 closes the :class:`pysam.Samfile`.'''
956 if self.samfile != NULL:
957 samclose( self.samfile )
958 bam_index_destroy(self.index);
961 def __dealloc__( self ):
962 # remember: dealloc cannot call other methods
963 # note: no doc string
964 # note: __del__ is not called.
967 if self._filename != NULL: free( self._filename )
969 cpdef int write( self, AlignedRead read ) except -1:
971 write a single :class:`pysam.AlignedRead` to disk.
973 returns the number of bytes written.
975 if not self._isOpen():
978 return samwrite( self.samfile, read._delegate )
983 def __exit__(self, exc_type, exc_value, traceback):
987 ###############################################################
988 ###############################################################
989 ###############################################################
991 ###############################################################
993 '''number of :term:`filename` associated with this object.'''
995 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
996 return self._filename
998 property nreferences:
999 '''number of :term:`reference` sequences in the file.'''
1001 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1002 return self.samfile.header.n_targets
1004 property references:
1005 """tuple with the names of :term:`reference` sequences."""
1007 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1009 for x from 0 <= x < self.samfile.header.n_targets:
1010 t.append( self.samfile.header.target_name[x] )
1014 """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
1015 :attr:`pysam.Samfile.references`
1018 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1020 for x from 0 <= x < self.samfile.header.n_targets:
1021 t.append( self.samfile.header.target_len[x] )
1025 '''full contents of the :term:`sam file` header as a string.'''
1027 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1028 return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
1031 '''header information within the :term:`sam file`. The records and fields are returned as
1032 a two-level dictionary.
1035 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1039 if self.samfile.header.text != NULL:
1040 # convert to python string (note: call self.text to create 0-terminated string)
1042 for line in t.split("\n"):
1043 if not line.strip(): continue
1044 assert line.startswith("@"), "header line without '@': '%s'" % line
1045 fields = line[1:].split("\t")
1047 assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)
1051 if record not in result: result[record] = []
1052 result[record].append( "\t".join( fields[1:] ) )
1055 # the following is clumsy as generators do not work?
1057 for field in fields[1:]:
1058 key, value = field.split(":",1)
1059 # uppercase keys must be valid
1060 # lowercase are permitted for user fields
1061 if key in VALID_HEADER_FIELDS[record]:
1062 x[key] = VALID_HEADER_FIELDS[record][key](value)
1063 elif not key.isupper():
1066 raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
1068 if VALID_HEADER_TYPES[record] == dict:
1069 if record in result:
1070 raise ValueError( "multiple '%s' lines are not permitted" % record )
1072 elif VALID_HEADER_TYPES[record] == list:
1073 if record not in result: result[record] = []
1074 result[record].append( x )
1078 def _buildLine( self, fields, record ):
1079 '''build a header line from *fields* dictionary for *record*'''
1081 # TODO: add checking for field and sort order
1082 line = ["@%s" % record ]
1084 line.append( fields )
1086 # write fields of the specification
1087 for key in VALID_HEADER_ORDER[record]:
1089 line.append( "%s:%s" % (key, str(fields[key])))
1092 if not key.isupper():
1093 line.append( "%s:%s" % (key, str(fields[key])))
1095 return "\t".join( line )
1097 cdef bam_header_t * _buildHeader( self, new_header ):
1098 '''return a new header built from a dictionary in *new_header*.
1100 This method inserts the text field, target_name and target_len.
1105 # check if hash exists
1107 # create new header and copy old data
1108 cdef bam_header_t * dest
1110 dest = bam_header_init()
1112 for record in VALID_HEADERS:
1113 if record in new_header:
1114 ttype = VALID_HEADER_TYPES[record]
1115 data = new_header[record]
1116 if type( data ) != type( ttype() ):
1117 raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
1118 if type( data ) == types.DictType:
1119 lines.append( self._buildLine( data, record ) )
1121 for fields in new_header[record]:
1122 lines.append( self._buildLine( fields, record ) )
1124 text = "\n".join(lines) + "\n"
1125 if dest.text != NULL: free( dest.text )
1126 dest.text = <char*>calloc( len(text), sizeof(char))
1127 dest.l_text = len(text)
1128 strncpy( dest.text, text, dest.l_text )
1131 if "SQ" in new_header:
1133 for fields in new_header["SQ"]:
1135 seqs.append( (fields["SN"], fields["LN"] ) )
1137 raise KeyError( "incomplete sequence information in '%s'" % str(fields))
1139 dest.n_targets = len(seqs)
1140 dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
1141 dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
1143 for x from 0 <= x < dest.n_targets:
1144 seqname, seqlen = seqs[x]
1145 dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
1146 strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
1147 dest.target_len[x] = seqlen
1151 ###############################################################
1152 ###############################################################
1153 ###############################################################
1154 ## file-object like iterator access
1155 ## note: concurrent access will cause errors (see IteratorRow
1157 ## Possible solutions: deprecate or open new file handle
1158 ###############################################################
1160 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1163 cdef bam1_t * getCurrent( self ):
1166 cdef int cnext(self):
1168 cversion of iterator. Used by :class:`pysam.Samfile.IteratorColumn`.
1171 return samread(self.samfile, self.b)
1175 python version of next().
1178 ret = samread(self.samfile, self.b)
1180 return makeAlignedRead( self.b )
1184 ##-------------------------------------------------------------------
1185 ##-------------------------------------------------------------------
1186 ##-------------------------------------------------------------------
1187 cdef class IteratorRow:
1188 '''abstract base class for iterators over mapped reads.
1190 Various iterators implement different behaviours for wrapping around
1191 contig boundaries. Examples include:
1193 :class:`pysam.IteratorRowRegion`
1194 iterate within a single contig and a defined region.
1196 :class:`pysam.IteratorRowAll`
1197 iterate until EOF. This iterator will also include unmapped reads.
1199 :class:`pysam.IteratorRowAllRefs`
1200 iterate over all reads in all reference sequences.
1202 The method :meth:`Samfile.fetch` returns an IteratorRow.
1206 cdef class IteratorRowRegion(IteratorRow):
1207 """*(Samfile samfile, int tid, int beg, int end, int reopen = True )*
1209 iterate over mapped reads in a region.
1211 By default, the file is re-openend to avoid conflicts between
1212 multiple iterators working on the same file. Set *reopen* = False
1213 to not re-open *samfile*.
1215 The samtools iterators assume that the file
1216 position between iterations do not change.
1217 As a consequence, no two iterators can work
1218 on the same file. To permit this, each iterator
1219 creates its own file handle by re-opening the
1222 Note that the index will be shared between
1223 samfile and the iterator.
1226 cdef bam_iter_t iter # iterator state object
1229 cdef Samfile samfile
1231 # true if samfile belongs to this object
1232 cdef int owns_samfile
1234 def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):
1236 if not samfile._isOpen():
1237 raise ValueError( "I/O operation on closed file" )
1239 if not samfile._hasIndex():
1240 raise ValueError( "no index available for iteration" )
1242 # makes sure that samfile stays alive as long as the
1244 self.samfile = samfile
1246 if samfile.isbam: mode = "rb"
1249 # reopen the file - note that this makes the iterator
1250 # slow and causes pileup to slow down significantly.
1252 store = StderrStore()
1253 self.fp = samopen( samfile._filename, mode, NULL )
1255 assert self.fp != NULL
1256 self.owns_samfile = True
1258 self.fp = self.samfile.samfile
1259 self.owns_samfile = False
1263 self.iter = bam_iter_query(self.samfile.index,
1267 self.b = bam_init1()
1272 cdef bam1_t * getCurrent( self ):
1275 cdef int cnext(self):
1276 '''cversion of iterator. Used by IteratorColumn'''
1277 self.retval = bam_iter_read( self.fp.x.bam,
1282 """python version of next().
1285 if self.retval < 0: raise StopIteration
1286 return makeAlignedRead( self.b )
1288 def __dealloc__(self):
1289 bam_destroy1(self.b)
1290 if self.owns_samfile: samclose( self.fp )
1292 cdef class IteratorRowAll(IteratorRow):
1293 """*(Samfile samfile, int reopen = True)*
1295 iterate over all reads in *samfile*
1297 By default, the file is re-openend to avoid conflicts between
1298 multiple iterators working on the same file. Set *reopen* = False
1299 to not re-open *samfile*.
1304 # true if samfile belongs to this object
1305 cdef int owns_samfile
1307 def __cinit__(self, Samfile samfile, int reopen = True ):
1309 if not samfile._isOpen():
1310 raise ValueError( "I/O operation on closed file" )
1312 if samfile.isbam: mode = "rb"
1315 # reopen the file to avoid iterator conflict
1317 store = StderrStore()
1318 self.fp = samopen( samfile._filename, mode, NULL )
1320 assert self.fp != NULL
1321 self.owns_samfile = True
1323 self.fp = samfile.samfile
1324 self.owns_samfile = False
1326 # allocate memory for alignment
1327 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
1332 cdef bam1_t * getCurrent( self ):
1335 cdef int cnext(self):
1336 '''cversion of iterator. Used by IteratorColumn'''
1338 return samread(self.fp, self.b)
1341 """python version of next().
1343 pyrex uses this non-standard name instead of next()
1346 ret = samread(self.fp, self.b)
1348 return makeAlignedRead( self.b )
1352 def __dealloc__(self):
1353 bam_destroy1(self.b)
1354 if self.owns_samfile: samclose( self.fp )
1356 cdef class IteratorRowAllRefs(IteratorRow):
1357 """iterates over all mapped reads by chaining iterators over each reference
1359 cdef Samfile samfile
1361 cdef IteratorRowRegion rowiter
1363 def __cinit__(self, Samfile samfile):
1364 assert samfile._isOpen()
1365 if not samfile._hasIndex(): raise ValueError("no index available for fetch")
1366 self.samfile = samfile
1370 self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, 1<<29)
1376 """python version of next().
1378 pyrex uses this non-standard name instead of next()
1380 # Create an initial iterator
1382 if not self.samfile.nreferences:
1388 self.rowiter.cnext()
1390 # If current iterator is not exhausted, return aligned read
1391 if self.rowiter.retval>0:
1392 return makeAlignedRead(self.rowiter.b)
1396 # Otherwise, proceed to next reference or stop
1397 if self.tid<self.samfile.nreferences:
1402 cdef class IteratorRowSelection(IteratorRow):
1403 """*(Samfile samfile)*
1405 iterate over reads in *samfile* at a given list of file positions.
1409 cdef int current_pos
1412 # true if samfile belongs to this object
1413 cdef int owns_samfile
1415 def __cinit__(self, Samfile samfile, positions, int reopen = True ):
1417 if not samfile._isOpen():
1418 raise ValueError( "I/O operation on closed file" )
1420 if not samfile._isOpen():
1421 raise ValueError( "I/O operation on closed file" )
1423 assert samfile.isbam, "can only use this iterator on bam files"
1426 # reopen the file to avoid iterator conflict
1428 store = StderrStore()
1429 self.fp = samopen( samfile._filename, mode, NULL )
1431 assert self.fp != NULL
1432 self.owns_samfile = True
1434 self.fp = samfile.samfile
1435 self.owns_samfile = False
1437 # allocate memory for alignment
1438 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
1440 self.positions = positions
1441 self.current_pos = 0
1446 cdef bam1_t * getCurrent( self ):
1449 cdef int cnext(self):
1450 '''cversion of iterator'''
1452 # end iteration if out of positions
1453 if self.current_pos >= len(self.positions): return -1
1455 bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
1456 self.current_pos += 1
1457 return samread(self.fp, self.b)
1460 """python version of next().
1462 pyrex uses this non-standard name instead of next()
1465 cdef int ret = self.cnext()
1467 return makeAlignedRead( self.b )
1471 def __dealloc__(self):
1472 bam_destroy1(self.b)
1473 if self.owns_samfile: samclose( self.fp )
1475 ##-------------------------------------------------------------------
1476 ##-------------------------------------------------------------------
1477 ##-------------------------------------------------------------------
1478 ctypedef struct __iterdata:
1486 cdef int __advance_all( void * data, bam1_t * b ):
1487 '''advance without any read filtering.
1490 d = <__iterdata*>data
1491 return bam_iter_read( d.samfile.x.bam, d.iter, b )
1493 cdef int __advance_snpcalls( void * data, bam1_t * b ):
1494 '''advance using same filter and read processing as in
1495 the samtools pileup.
1498 d = <__iterdata*>data
1500 cdef int ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
1504 cdef int is_nobaq = 0
1505 cdef int capQ_thres = 0
1508 if d.fastafile != NULL and b.core.tid != d.tid:
1509 if d.seq != NULL: free(d.seq)
1511 d.seq = faidx_fetch_seq(d.fastafile,
1512 d.samfile.header.target_name[d.tid],
1516 raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
1517 (d.samfile.header.target_name[d.tid],
1525 # realign read - changes base qualities
1526 if d.seq != NULL and is_cns and not is_nobaq: bam_prob_realn( b, d.seq )
1528 if d.seq != NULL and capQ_thres > 10:
1529 q = bam_cap_mapQ(b, d.seq, capQ_thres)
1531 elif b.core.qual > q: b.core.qual = q
1532 if b.core.flag & BAM_FUNMAP: skip = 1
1533 elif b.core.flag & 1 and not b.core.flag & 2: skip = 1
1536 # additional filters
1538 ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
1542 cdef class IteratorColumn:
1543 '''abstract base class for iterators over columns.
1545 IteratorColumn objects wrap the pileup functionality of samtools.
1547 For reasons of efficiency, the iterator points to the current
1548 pileup buffer. The pileup buffer is updated at every iteration.
1549 This might cause some unexpected behavious. For example,
1550 consider the conversion to a list::
1552 f = Samfile("file.bam", "rb")
1553 result = list( f.pileup() )
1555 Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
1556 but each object in ``result`` will contain the same information.
1558 The desired behaviour can be achieved by list comprehension::
1560 result = [ x.pileups() for x in f.pileup() ]
1562 ``result`` will be a list of ``n`` lists of objects of type :class:`PileupRead`.
1564 If the iterator is associated with a :class:`Fastafile` using the :meth:`addReference`
1565 method, then the iterator will export the current sequence via the methods :meth:`getSequence`
1566 and :meth:`seq_len`.
1568 Optional kwargs to the iterator
1571 The stepper controlls how the iterator advances.
1572 Possible options for the stepper are
1575 use all reads for pileup.
1577 same filter and read processing as in :term:`csamtools` pileup
1579 A :class:`FastaFile` object
1581 Skip all reads with bits set in mask.
1586 # result of the last plbuf_push
1587 cdef IteratorRowRegion iter
1592 cdef const_bam_pileup1_t_ptr plp
1593 cdef bam_plp_t pileup_iter
1594 cdef __iterdata iterdata
1595 cdef Samfile samfile
1596 cdef Fastafile fastafile
1599 def __cinit__( self, Samfile samfile, **kwargs ):
1600 self.samfile = samfile
1601 self.mask = kwargs.get("mask", BAM_DEF_MASK )
1602 self.fastafile = kwargs.get( "fastafile", None )
1603 self.stepper = kwargs.get( "stepper", None )
1604 self.iterdata.seq = NULL
1609 self.pileup_iter = <bam_plp_t>NULL
1614 cdef int cnext(self):
1615 '''perform next iteration.
1617 This method is analogous to the samtools bam_plp_auto method.
1618 It has been re-implemented to permit for filtering.
1620 self.plp = bam_plp_auto( self.pileup_iter,
1625 cdef char * getSequence( self ):
1626 '''return current reference sequence underlying the iterator.
1628 return self.iterdata.seq
1631 '''current sequence length.'''
1632 def __get__(self): return self.iterdata.seq_len
1634 def addReference( self, Fastafile fastafile ):
1636 add reference sequences in *fastafile* to iterator.'''
1637 self.fastafile = fastafile
1638 if self.iterdata.seq != NULL: free(self.iterdata.seq)
1639 self.iterdata.tid = -1
1640 self.iterdata.fastafile = self.fastafile.fastafile
1642 def hasReference( self ):
1644 return true if iterator is associated with a reference'''
1645 return self.fastafile
1647 cdef setMask( self, mask ):
1648 '''set masking flag in iterator.
1650 reads with bits set in *mask* will be skipped.
1653 bam_plp_set_mask( self.pileup_iter, self.mask )
1655 cdef setupIteratorData( self,
1660 '''setup the iterator structure'''
1662 self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen )
1663 self.iterdata.samfile = self.samfile.samfile
1664 self.iterdata.iter = self.iter.iter
1665 self.iterdata.seq = NULL
1666 self.iterdata.tid = -1
1668 if self.fastafile != None:
1669 self.iterdata.fastafile = self.fastafile.fastafile
1671 self.iterdata.fastafile = NULL
1673 if self.stepper == None or self.stepper == "all":
1674 self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
1675 elif self.stepper == "samtools":
1676 self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
1678 raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)
1680 bam_plp_set_mask( self.pileup_iter, self.mask )
1682 cdef reset( self, tid, start, end ):
1683 '''reset iterator position.
1685 This permits using the iterator multiple times without
1686 having to incur the full set-up costs.
1688 self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
1689 self.iterdata.iter = self.iter.iter
1691 # invalidate sequence if different tid
1693 if self.iterdata.seq != NULL: free( self.iterdata.seq )
1694 self.iterdata.seq = NULL
1695 self.iterdata.tid = -1
1697 # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
1698 bam_plp_reset(self.pileup_iter)
1700 def __dealloc__(self):
1701 # reset in order to avoid memory leak messages for iterators that have
1702 # not been fully consumed
1703 if self.pileup_iter != <bam_plp_t>NULL:
1704 bam_plp_reset(self.pileup_iter)
1705 bam_plp_destroy(self.pileup_iter)
1706 self.pileup_iter = <bam_plp_t>NULL
1708 if self.iterdata.seq != NULL:
1709 free(self.iterdata.seq)
1710 self.iterdata.seq = NULL
1712 cdef class IteratorColumnRegion(IteratorColumn):
1713 '''iterates over a region only.
1715 def __cinit__(self, Samfile samfile,
1721 # initialize iterator
1722 self.setupIteratorData( tid, start, end, 1 )
1725 """python version of next().
1731 raise ValueError("error during iteration" )
1733 if self.plp == NULL:
1736 return makePileupProxy( <bam_pileup1_t*>self.plp,
1741 cdef class IteratorColumnAllRefs(IteratorColumn):
1742 """iterates over all columns by chaining iterators over each reference
1749 # no iteration over empty files
1750 if not samfile.nreferences: raise StopIteration
1752 # initialize iterator
1753 self.setupIteratorData( self.tid, 0, max_pos, 1 )
1756 """python version of next().
1763 raise ValueError("error during iteration" )
1765 # return result, if within same reference
1766 if self.plp != NULL:
1767 return makePileupProxy( <bam_pileup1_t*>self.plp,
1772 # otherwise, proceed to next reference or stop
1774 if self.tid < self.samfile.nreferences:
1775 self.setupIteratorData( self.tid, 0, max_pos, 0 )
1779 ##-------------------------------------------------------------------
1780 ##-------------------------------------------------------------------
1781 ##-------------------------------------------------------------------
1782 cdef inline int32_t query_start(bam1_t *src) except -1:
1783 cdef uint32_t * cigar_p, op
1785 cdef uint32_t start_offset = 0
1787 if src.core.n_cigar:
1788 cigar_p = bam1_cigar(src);
1789 for k from 0 <= k < src.core.n_cigar:
1790 op = cigar_p[k] & BAM_CIGAR_MASK
1791 if op==BAM_CHARD_CLIP:
1792 if start_offset!=0 and start_offset!=src.core.l_qseq:
1793 PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
1795 elif op==BAM_CSOFT_CLIP:
1796 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
1802 ##-------------------------------------------------------------------
1803 ##-------------------------------------------------------------------
1804 ##-------------------------------------------------------------------
1805 cdef inline int32_t query_end(bam1_t *src) except -1:
1806 cdef uint32_t * cigar_p, op
1808 cdef uint32_t end_offset = src.core.l_qseq
1810 if src.core.n_cigar>1:
1811 cigar_p = bam1_cigar(src);
1812 for k from src.core.n_cigar > k >= 1:
1813 op = cigar_p[k] & BAM_CIGAR_MASK
1814 if op==BAM_CHARD_CLIP:
1815 if end_offset!=0 and end_offset!=src.core.l_qseq:
1816 PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
1818 elif op==BAM_CSOFT_CLIP:
1819 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
1824 end_offset = src.core.l_qseq
1829 cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
1834 if not src.core.l_qseq:
1837 seq = PyString_FromStringAndSize(NULL, end-start)
1838 s = PyString_AS_STRING(seq)
1841 for k from start <= k < end:
1842 # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
1843 # note: do not use string literal as it will be a python string
1844 s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
1849 cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
1858 qual = PyString_FromStringAndSize(NULL, end-start)
1859 q = PyString_AS_STRING(qual)
1861 for k from start <= k < end:
1862 ## equivalent to t[i] + 33 (see bam.c)
1863 q[k-start] = p[k] + 33
1867 cdef class AlignedRead:
1869 Class representing an aligned read. see SAM format specification for
1870 the meaning of fields (http://samtools.sourceforge.net/).
1872 This class stores a handle to the samtools C-structure representing
1873 an aligned read. Member read access is forwarded to the C-structure
1874 and converted into python objects. This implementation should be fast,
1875 as only the data needed is converted.
1877 For write access, the C-structure is updated in-place. This is
1878 not the most efficient way to build BAM entries, as the variable
1879 length data is concatenated and thus needs to resized if
1880 a field is updated. Furthermore, the BAM entry might be
1881 in an inconsistent state. The :meth:`~validate` method can
1882 be used to check if an entry is consistent.
1884 One issue to look out for is that the sequence should always
1885 be set *before* the quality scores. Setting the sequence will
1886 also erase any quality scores that were set previously.
1889 # Now only called when instances are created from Python
1892 self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
1893 # allocate some memory
1894 # If size is 0, calloc does not return a pointer that can be passed to free()
1895 # so allocate 40 bytes for a new read
1896 self._delegate.m_data = 40
1897 self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
1898 self._delegate.data_len = 0
1900 def __dealloc__(self):
1901 bam_destroy1(self._delegate)
1904 """return string representation of alignment.
1906 The representation is an approximate :term:`sam` format.
1908 An aligned read might not be associated with a :term:`Samfile`.
1909 As a result :term:`tid` is shown instead of the reference name.
1911 Similarly, the tags field is returned in its parsed state.
1913 # sam-parsing is done in sam.c/bam_format1_core which
1914 # requires a valid header.
1915 return "\t".join(map(str, (self.qname,
1928 def compare(self, AlignedRead other):
1929 '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
1937 # uncomment for debugging purposes
1938 # cdef unsigned char * oo, * tt
1939 # tt = <unsigned char*>(&t.core)
1940 # oo = <unsigned char*>(&o.core)
1941 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
1942 # tt = <unsigned char*>(t.data)
1943 # oo = <unsigned char*>(o.data)
1944 # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
1946 # Fast-path test for object identity
1950 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
1952 if retval: return retval
1953 retval = cmp(t.data_len, o.data_len)
1954 if retval: return retval
1955 return memcmp(t.data, o.data, t.data_len)
1957 # Disabled so long as __cmp__ is a special method
1959 return _Py_HashPointer(<void *>self)
1962 """the query name (None if not present)"""
1965 src = self._delegate
1966 if src.core.l_qname == 0: return None
1967 return <char *>bam1_qname( src )
1969 def __set__(self, qname ):
1970 if qname == None or len(qname) == 0: return
1975 src = self._delegate
1976 p = bam1_qname( src )
1978 # the qname is \0 terminated
1980 pysam_bam_update( src,
1985 src.core.l_qname = l
1987 # re-acquire pointer to location in memory
1988 # as it might have moved
1991 strncpy( p, qname, l )
1994 """the :term:`cigar` alignment (None if not present).
1997 cdef uint32_t * cigar_p
2002 src = self._delegate
2003 if src.core.n_cigar == 0: return None
2006 cigar_p = bam1_cigar(src);
2007 for k from 0 <= k < src.core.n_cigar:
2008 op = cigar_p[k] & BAM_CIGAR_MASK
2009 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2010 cigar.append((op, l))
2013 def __set__(self, values ):
2014 if values == None or len(values) == 0: return
2022 src = self._delegate
2024 # get location of cigar string
2027 # create space for cigar data within src.data
2028 pysam_bam_update( src,
2029 src.core.n_cigar * 4,
2033 # length is number of cigar operations, not bytes
2034 src.core.n_cigar = len(values)
2036 # re-acquire pointer to location in memory
2037 # as it might have moved
2040 # insert cigar operations
2041 for op, l in values:
2042 p[k] = l << BAM_CIGAR_SHIFT | op
2045 ## setting the cigar string also updates the "bin" attribute
2046 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
2049 """read sequence bases, including :term:`soft clipped` bases (None if not present)"""
2053 src = self._delegate
2055 if src.core.l_qseq == 0: return None
2057 return get_seq_range(src, 0, src.core.l_qseq)
2059 def __set__(self,seq):
2060 # samtools manages sequence and quality length memory together
2061 # if no quality information is present, the first byte says 0xff.
2063 if seq == None or len(seq) == 0: return
2067 cdef int l, k, nbytes_new, nbytes_old
2069 src = self._delegate
2073 # as the sequence is stored in half-bytes, the total length (sequence
2074 # plus quality scores) is (l+1)/2 + l
2075 nbytes_new = (l+1)/2 + l
2076 nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
2077 # acquire pointer to location in memory
2081 pysam_bam_update( src,
2085 # re-acquire pointer to location in memory
2086 # as it might have moved
2088 for k from 0 <= k < nbytes_new: p[k] = 0
2089 # convert to C string
2091 for k from 0 <= k < l:
2092 p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
2095 p = bam1_qual( src )
2100 """read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
2106 src = self._delegate
2108 if src.core.l_qseq == 0: return None
2110 return get_qual_range(src, 0, src.core.l_qseq)
2112 def __set__(self,qual):
2113 # note that space is already allocated via the sequences
2119 src = self._delegate
2120 p = bam1_qual( src )
2121 if qual == None or len(qual) == 0:
2122 # if absent - set to 0xff
2126 # convert to C string
2129 if src.core.l_qseq != l:
2130 raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
2131 assert src.core.l_qseq == l
2132 for k from 0 <= k < l:
2133 p[k] = <uint8_t>q[k] - 33
2136 """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present)
2138 SAM/BAM files may included extra flanking bases sequences that were
2139 not part of the alignment. These bases may be the result of the
2140 Smith-Waterman or other algorithms, which may not require alignments
2141 that begin at the first residue or end at the last. In addition,
2142 extra sequencing adapters, multiplex identifiers, and low-quality bases that
2143 were not considered for alignment may have been retained."""
2147 cdef uint32_t start, end
2150 src = self._delegate
2152 if src.core.l_qseq == 0: return None
2154 start = query_start(src)
2155 end = query_end(src)
2157 return get_seq_range(src, start, end)
2160 """aligned query sequence quality values (None if not present)"""
2163 cdef uint32_t start, end
2166 src = self._delegate
2168 if src.core.l_qseq == 0: return None
2170 start = query_start(src)
2171 end = query_end(src)
2173 return get_qual_range(src, start, end)
2176 """start index of the aligned query portion of the sequence (0-based, inclusive)"""
2178 return query_start(self._delegate)
2181 """end index of the aligned query portion of the sequence (0-based, exclusive)"""
2183 return query_end(self._delegate)
2186 """Length of the aligned query sequence"""
2189 src = self._delegate
2190 return query_end(src)-query_start(src)
2193 """the tags in the AUX field.
2195 This property permits convenience access to
2196 the tags. Changes it the returned list will
2197 not update the tags automatically. Instead,
2198 the following is required for adding a
2201 read.tags = read.tags + [("RG",0)]
2211 src = self._delegate
2212 if src.l_aux == 0: return None
2217 while s < (src.data + src.data_len):
2224 if auxtype in ('c', 'C'):
2225 value = <int>bam_aux2i(s)
2227 elif auxtype in ('s', 'S'):
2228 value = <int>bam_aux2i(s)
2230 elif auxtype in ('i', 'I'):
2231 value = <float>bam_aux2i(s)
2233 elif auxtype == 'f':
2234 value = <float>bam_aux2f(s)
2236 elif auxtype == 'd':
2237 value = <double>bam_aux2d(s)
2239 elif auxtype == 'A':
2240 value = "%c" % <char>bam_aux2A(s)
2242 elif auxtype in ('Z', 'H'):
2243 value = <char*>bam_aux2Z(s)
2244 # +1 for NULL terminated string
2249 result.append( (auxtag, value) )
2253 def __set__(self, tags):
2257 cdef uint8_t * new_data
2259 cdef int guessed_size, control_size
2260 cdef int max_size, size, offset
2262 src = self._delegate
2268 # map samtools code to python.struct code and byte size
2269 buffer = ctypes.create_string_buffer(max_size)
2271 for pytag, value in tags:
2273 if t == types.FloatType:
2274 fmt, pytype = "<cccf", 'f'
2275 elif t == types.IntType:
2277 if value >= -127: fmt, pytype = "<cccb", 'c'
2278 elif value >= -32767: fmt, pytype = "<ccch", 's'
2279 elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2280 else: fmt, pytype = "<ccci", 'i'[0]
2282 if value <= 255: fmt, pytype = "<cccB", 'C'
2283 elif value <= 65535: fmt, pytype = "<cccH", 'S'
2284 elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2285 else: fmt, pytype = "<cccI", 'I'
2287 # Note: hex strings (H) are not supported yet
2289 fmt, pytype = "<cccc", 'A'
2291 fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
2293 size = struct.calcsize(fmt)
2294 if offset + size > max_size:
2295 raise NotImplementedError("tags field too large")
2297 struct.pack_into( fmt,
2306 # delete the old data and allocate new
2307 # if offset == 0, the aux field will be
2309 pysam_bam_update( src,
2316 # copy data only if there is any
2319 # get location of new data
2322 # check if there is direct path from buffer.raw to tmp
2324 memcpy( s, temp, offset )
2327 """properties flag"""
2328 def __get__(self): return self._delegate.core.flag
2329 def __set__(self, flag): self._delegate.core.flag = flag
2335 DEPRECATED from pysam-0.4 - use tid in the future.
2336 The rname field caused a lot of confusion as it returns
2337 the :term:`target` ID instead of the reference sequence
2342 This field contains the index of the reference sequence
2343 in the sequence dictionary. To obtain the name
2344 of the reference sequence, use :meth:`pysam.Samfile.getrname()`
2347 def __get__(self): return self._delegate.core.tid
2348 def __set__(self, tid): self._delegate.core.tid = tid
2356 This field contains the index of the reference sequence
2357 in the sequence dictionary. To obtain the name
2358 of the reference sequence, use :meth:`pysam.Samfile.getrname()`
2361 def __get__(self): return self._delegate.core.tid
2362 def __set__(self, tid): self._delegate.core.tid = tid
2365 """0-based leftmost coordinate"""
2366 def __get__(self): return self._delegate.core.pos
2367 def __set__(self, pos):
2368 ## setting the cigar string also updates the "bin" attribute
2370 src = self._delegate
2371 if src.core.n_cigar:
2372 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
2374 src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
2375 self._delegate.core.pos = pos
2377 """properties bin"""
2378 def __get__(self): return self._delegate.core.bin
2379 def __set__(self, bin): self._delegate.core.bin = bin
2381 '''length of the read (read only). Returns 0 if not given.'''
2382 def __get__(self): return self._delegate.core.l_qseq
2384 '''aligned end position of the read (read only). Returns
2385 None if not available.'''
2388 src = self._delegate
2389 if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
2391 return bam_calend(&src.core, bam1_cigar(src))
2393 '''aligned length of the read (read only). Returns None if
2397 src = self._delegate
2398 if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
2400 return bam_calend(&src.core,
2401 bam1_cigar(src)) - \
2402 self._delegate.core.pos
2405 """mapping quality"""
2406 def __get__(self): return self._delegate.core.qual
2407 def __set__(self, qual): self._delegate.core.qual = qual
2409 """the :term:`reference` id of the mate """
2410 def __get__(self): return self._delegate.core.mtid
2411 def __set__(self, mtid): self._delegate.core.mtid = mtid
2413 """the position of the mate"""
2414 def __get__(self): return self._delegate.core.mpos
2415 def __set__(self, mpos): self._delegate.core.mpos = mpos
2417 """the insert size"""
2418 def __get__(self): return self._delegate.core.isize
2419 def __set__(self, isize): self._delegate.core.isize = isize
2421 """true if read is paired in sequencing"""
2422 def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
2423 def __set__(self,val):
2424 if val: self._delegate.core.flag |= BAM_FPAIRED
2425 else: self._delegate.core.flag &= ~BAM_FPAIRED
2426 property is_proper_pair:
2427 """true if read is mapped in a proper pair"""
2428 def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
2429 def __set__(self,val):
2430 if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
2431 else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
2432 property is_unmapped:
2433 """true if read itself is unmapped"""
2434 def __get__(self): return (self.flag & BAM_FUNMAP) != 0
2435 def __set__(self,val):
2436 if val: self._delegate.core.flag |= BAM_FUNMAP
2437 else: self._delegate.core.flag &= ~BAM_FUNMAP
2438 property mate_is_unmapped:
2439 """true if the mate is unmapped"""
2440 def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
2441 def __set__(self,val):
2442 if val: self._delegate.core.flag |= BAM_FMUNMAP
2443 else: self._delegate.core.flag &= ~BAM_FMUNMAP
2444 property is_reverse:
2445 """true if read is mapped to reverse strand"""
2446 def __get__(self): return (self.flag & BAM_FREVERSE) != 0
2447 def __set__(self,val):
2448 if val: self._delegate.core.flag |= BAM_FREVERSE
2449 else: self._delegate.core.flag &= ~BAM_FREVERSE
2450 property mate_is_reverse:
2451 """true is read is mapped to reverse strand"""
2452 def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
2453 def __set__(self,val):
2454 if val: self._delegate.core.flag |= BAM_FMREVERSE
2455 else: self._delegate.core.flag &= ~BAM_FMREVERSE
2457 """true if this is read1"""
2458 def __get__(self): return (self.flag & BAM_FREAD1) != 0
2459 def __set__(self,val):
2460 if val: self._delegate.core.flag |= BAM_FREAD1
2461 else: self._delegate.core.flag &= ~BAM_FREAD1
2463 """true if this is read2"""
2464 def __get__(self): return (self.flag & BAM_FREAD2) != 0
2465 def __set__(self,val):
2466 if val: self._delegate.core.flag |= BAM_FREAD2
2467 else: self._delegate.core.flag &= ~BAM_FREAD2
2468 property is_secondary:
2469 """true if not primary alignment"""
2470 def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
2471 def __set__(self,val):
2472 if val: self._delegate.core.flag |= BAM_FSECONDARY
2473 else: self._delegate.core.flag &= ~BAM_FSECONDARY
2475 """true if QC failure"""
2476 def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
2477 def __set__(self,val):
2478 if val: self._delegate.core.flag |= BAM_FQCFAIL
2479 else: self._delegate.core.flag &= ~BAM_FQCFAIL
2480 property is_duplicate:
2481 """true if optical or PCR duplicate"""
2482 def __get__(self): return (self.flag & BAM_FDUP) != 0
2483 def __set__(self,val):
2484 if val: self._delegate.core.flag |= BAM_FDUP
2485 else: self._delegate.core.flag &= ~BAM_FDUP
2487 """a list of reference positions that this read aligns to."""
2489 cdef uint32_t k, i, pos
2491 cdef uint32_t * cigar_p
2495 src = self._delegate
2496 if src.core.n_cigar == 0: return []
2500 cigar_p = bam1_cigar(src)
2501 for k from 0 <= k < src.core.n_cigar:
2502 op = cigar_p[k] & BAM_CIGAR_MASK
2503 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2504 if op == BAM_CMATCH:
2505 for i from pos <= i < pos + l:
2508 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2512 def overlap( self, uint32_t start, uint32_t end ):
2513 """return number of bases on reference overlapping *start* and *end*
2515 cdef uint32_t k, i, pos, overlap
2517 cdef uint32_t * cigar_p
2522 src = self._delegate
2523 if src.core.n_cigar == 0: return 0
2527 cigar_p = bam1_cigar(src)
2528 for k from 0 <= k < src.core.n_cigar:
2529 op = cigar_p[k] & BAM_CIGAR_MASK
2530 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2532 if op == BAM_CMATCH:
2533 o = min( pos + l, end) - max( pos, start )
2534 if o > 0: overlap += o
2536 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2542 """retrieves optional data given a two-letter *tag*"""
2543 #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
2545 v = bam_aux_get(self._delegate, tag)
2546 if v == NULL: raise KeyError( "tag '%s' not present" % tag )
2548 if type == 'c' or type == 'C' or type == 's' or type == 'S':
2549 return <int>bam_aux2i(v)
2550 elif type == 'i' or type == 'I':
2551 return <int32_t>bam_aux2i(v)
2552 elif type == 'f' or type == 'F':
2553 return <float>bam_aux2f(v)
2554 elif type == 'd' or type == 'D':
2555 return <double>bam_aux2d(v)
2557 # there might a more efficient way
2558 # to convert a char into a string
2559 return '%c' % <char>bam_aux2A(v)
2561 return <char*>bam_aux2Z(v)
2563 def fancy_str (self):
2564 """returns list of fieldnames/values in pretty format for debugging
2568 "tid": "Contig index",
2569 "pos": "Mapped position on contig",
2570 "mtid": "Contig index for mate pair",
2571 "mpos": "Position of mate pair",
2572 "isize": "Insert size",
2573 "flag": "Binary flag",
2574 "n_cigar": "Count of cigar entries",
2575 "cigar": "Cigar entries",
2576 "qual": "Mapping quality",
2577 "bin": "Bam index bin number",
2578 "l_qname": "Length of query name",
2579 "qname": "Query name",
2580 "l_qseq": "Length of query sequence",
2581 "qseq": "Query sequence",
2582 "bqual": "Quality scores",
2583 "l_aux": "Length of auxilary data",
2584 "m_data": "Maximum data length",
2585 "data_len": "Current data length",
2587 fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
2588 "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
2589 "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
2591 for f in fields_names_in_order:
2592 if not f in self.__dict__:
2594 ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
2596 for f in self.__dict__:
2597 if not f in field_names:
2598 ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
2601 cdef class PileupProxy:
2602 '''A pileup column. A pileup column contains
2603 all the reads that map to a certain target base.
2606 chromosome ID as is defined in the header
2608 the target base coordinate (0-based)
2610 number of reads mapping to this column
2612 list of reads (:class:`pysam.PileupRead`) aligned to this column
2614 This class is a proxy for results returned by the samtools pileup engine.
2615 If the underlying engine iterator advances, the results of this column
2618 cdef bam_pileup1_t * plp
2624 raise TypeError("This class cannot be instantiated from Python")
2627 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
2629 "\n".join( map(str, self.pileups) )
2632 '''the chromosome ID as is defined in the header'''
2633 def __get__(self): return self.tid
2636 '''number of reads mapping to this column.'''
2637 def __get__(self): return self.n_pu
2638 def __set__(self, n): self.n_pu = n
2641 def __get__(self): return self.pos
2644 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
2648 # warning: there could be problems if self.n and self.buf are
2650 for x from 0 <= x < self.n_pu:
2651 pileups.append( makePileupRead( &(self.plp[x])) )
2654 cdef class PileupRead:
2655 '''A read aligned to a column.
2659 AlignedRead _alignment
2668 raise TypeError("This class cannot be instantiated from Python")
2671 return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
2674 """a :class:`pysam.AlignedRead` object of the aligned read"""
2676 return self._alignment
2678 """position of the read base at the pileup site, 0-based"""
2682 """indel length; 0 for no indel, positive for ins and negative for del"""
2686 """1 iff the base on the padded read is a deletion"""
2691 return self._is_head
2694 return self._is_tail
2700 '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
2701 def __init__(self, id = 1):
2705 def setdevice(self, filename):
2706 '''open an existing file, like "/dev/null"'''
2707 fd = os.open(filename, os.O_WRONLY)
2710 def setfile(self, filename):
2711 '''open a new file.'''
2712 fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
2715 def setfd(self, fd):
2716 ofd = os.dup(self.id) # Save old stream on new unit.
2717 self.streams.append(ofd)
2718 sys.stdout.flush() # Buffered data goes to old stream.
2719 sys.stderr.flush() # Buffered data goes to old stream.
2720 os.dup2(fd, self.id) # Open unit 1 on new stream.
2721 os.close(fd) # Close other unit (look out, caller.)
2724 '''restore previous output stream'''
2726 # the following was not sufficient, hence flush both stderr and stdout
2727 # os.fsync( self.id )
2730 os.dup2(self.streams[-1], self.id)
2731 os.close(self.streams[-1])
2732 del self.streams[-1]
2734 def _samtools_dispatch( method,
2736 catch_stdout = True,
2737 catch_stderr = False,
2739 '''call ``method`` in samtools providing arguments in args.
2742 This method redirects stdout and stderr to capture it
2743 from samtools. If for some reason stdout/stderr disappears
2744 the reason might be in this method.
2747 The current implementation might only work on linux.
2750 This method captures stdout and stderr using temporary files,
2751 which are then read into memory in their entirety. This method
2752 is slow and might cause large memory overhead.
2754 See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
2755 on the topic of redirecting stderr/stdout.
2758 # note that debugging this module can be a problem
2759 # as stdout/stderr will not appear
2761 # some special cases
2762 if method == "index":
2763 if not os.path.exists( args[0] ):
2764 raise IOError( "No such file or directory: '%s'" % args[0] )
2766 # redirect stderr and stdout to file
2768 stderr_h, stderr_f = tempfile.mkstemp()
2769 stderr_save = Outs( sys.stderr.fileno() )
2770 stderr_save.setfd( stderr_h )
2773 stdout_h, stdout_f = tempfile.mkstemp()
2774 stdout_save = Outs( sys.stdout.fileno() )
2775 stdout_save.setfd( stdout_h )
2777 # patch for `samtools view`
2778 # samtools `view` closes stdout, from which I can not
2779 # recover. Thus redirect output to file with -o option.
2780 if method == "view":
2781 if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
2782 args = ( "-o", stdout_f ) + args
2785 # do the function call to samtools
2787 cdef int i, n, retval
2790 # allocate two more for first (dummy) argument (contains command)
2791 cargs = <char**>calloc( n+2, sizeof( char *) )
2792 cargs[0] = "samtools"
2794 for i from 0 <= i < n: cargs[i+2] = args[i]
2795 retval = pysam_dispatch(n+2, cargs)
2798 # restore stdout/stderr. This will also flush, so
2799 # needs to be before reading back the file contents
2801 stdout_save.restore()
2802 out_stdout = open( stdout_f, "r").readlines()
2803 os.remove( stdout_f )
2808 stderr_save.restore()
2809 out_stderr = open( stderr_f, "r").readlines()
2810 os.remove( stderr_f )
2814 return retval, out_stderr, out_stdout
2817 '''the results of a SNP call.'''
2820 cdef char _reference_base
2822 cdef int _consensus_quality
2823 cdef int _snp_quality
2824 cdef int _rms_mapping_quality
2828 '''the chromosome ID as is defined in the header'''
2833 '''nucleotide position of SNP.'''
2834 def __get__(self): return self._pos
2836 property reference_base:
2837 '''reference base at pos. ``N`` if no reference sequence supplied.'''
2838 def __get__(self): return PyString_FromStringAndSize( &self._reference_base, 1 )
2841 '''the genotype called.'''
2842 def __get__(self): return PyString_FromStringAndSize( &self._genotype, 1 )
2844 property consensus_quality:
2845 '''the genotype quality (Phred-scaled).'''
2846 def __get__(self): return self._consensus_quality
2848 property snp_quality:
2849 '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
2850 def __get__(self): return self._snp_quality
2852 property mapping_quality:
2853 '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
2854 def __get__(self): return self._rms_mapping_quality
2857 '''coverage or read depth - the number of reads involved in the call.'''
2858 def __get__(self): return self._coverage
2862 return "\t".join( map(str, (
2865 self.reference_base,
2867 self.consensus_quality,
2869 self.mapping_quality,
2873 cdef class SNPCallerBase:
2874 '''Base class for SNP callers.
2877 minimum base quality (possibly capped by BAQ)
2879 coefficient for adjusting mapQ of poor mappings
2881 theta in maq consensus calling model
2883 number of haplotypes in the sample
2885 prior of a difference between two haplotypes
2888 cdef bam_maqcns_t * c
2889 cdef IteratorColumn iter
2892 IteratorColumn iterator_column,
2895 self.iter = iterator_column
2896 self.c = bam_maqcns_init()
2898 # set the default parameterization according to
2901 # new default mode for samtools >0.1.10
2902 self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
2904 self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
2905 # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
2906 self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
2907 self.c.het_rate = kwargs.get( "het_rate", 0.001 )
2908 self.c.theta = kwargs.get( "theta", 0.83 )
2910 if self.c.errmod != BAM_ERRMOD_MAQ2:
2911 self.c.theta += 0.02
2913 # call prepare AFTER setting parameters
2914 bam_maqcns_prepare( self.c )
2916 def __dealloc__(self):
2917 bam_maqcns_destroy( self.c )
2919 cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
2920 '''debugging output.'''
2922 pysam_dump_glf( g, self.c );
2924 for x in range(self.iter.n_plp):
2925 print "--> read %i %s %i" % (x,
2926 bam1_qname(self.iter.plp[x].b),
2927 self.iter.plp[x].qpos,
2930 print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
2943 printf("-------------------------------------\n");
2946 cdef class IteratorSNPCalls( SNPCallerBase ):
2947 """*(IteratorColumn iterator)*
2949 call SNPs within a region.
2951 *iterator* is a pileup iterator. SNPs will be called
2952 on all positions returned by this iterator.
2954 This caller is fast if SNPs are called over large continuous
2955 regions. It is slow, if instantiated frequently and in random
2956 order as the sequence will have to be reloaded.
2961 IteratorColumn iterator_column,
2964 assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
2970 """python version of next().
2973 # the following code was adapted from bam_plcmd.c:pileup_func()
2976 if self.iter.n_plp < 0:
2977 raise ValueError("error during iteration" )
2979 if self.iter.plp == NULL:
2982 cdef char * seq = self.iter.getSequence()
2983 cdef int seq_len = self.iter.seq_len
2988 if self.iter.pos >= seq_len:
2989 raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
2991 cdef int rb = seq[self.iter.pos]
2995 g = bam_maqcns_glfgen( self.iter.n_plp,
3000 if pysam_glf_depth( g ) == 0:
3001 cns = 0xfu << 28 | 0xf << 24
3003 cns = glf2cns(g, <int>(self.c.q_r + .499))
3010 call._tid = self.iter.tid
3011 call._pos = self.iter.pos
3012 call._reference_base = rb
3013 call._genotype = bam_nt16_rev_table[cns>>28]
3014 call._consensus_quality = cns >> 8 & 0xff
3015 call._snp_quality = cns & 0xff
3016 call._rms_mapping_quality = cns >> 16&0xff
3017 call._coverage = self.iter.n_plp
3021 cdef class SNPCaller( SNPCallerBase ):
3022 '''*(IteratorColumn iterator_column )*
3024 The samtools SNP caller.
3026 This object will call SNPs in *samfile* against the reference
3027 sequence in *fasta*.
3029 This caller is fast for calling few SNPs in selected regions.
3031 It is slow, if called over large genomic regions.
3036 IteratorColumn iterator_column,
3041 def call(self, reference, int pos ):
3042 """call a snp on chromosome *reference*
3045 returns a :class:`SNPCall` object.
3048 cdef int tid = self.iter.samfile.gettid( reference )
3050 self.iter.reset( tid, pos, pos + 1 )
3055 if self.iter.n_plp < 0:
3056 raise ValueError("error during iteration" )
3058 if self.iter.plp == NULL:
3059 raise ValueError( "no reads in region - no call" )
3061 if self.iter.pos == pos: break
3063 cdef char * seq = self.iter.getSequence()
3064 cdef int seq_len = self.iter.seq_len
3069 if self.iter.pos >= seq_len:
3070 raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3072 cdef int rb = seq[self.iter.pos]
3076 g = bam_maqcns_glfgen( self.iter.n_plp,
3082 if pysam_glf_depth( g ) == 0:
3083 cns = 0xfu << 28 | 0xf << 24
3085 cns = glf2cns(g, <int>(self.c.q_r + .499))
3092 call._tid = self.iter.tid
3093 call._pos = self.iter.pos
3094 call._reference_base = rb
3095 call._genotype = bam_nt16_rev_table[cns>>28]
3096 call._consensus_quality = cns >> 8 & 0xff
3097 call._snp_quality = cns & 0xff
3098 call._rms_mapping_quality = cns >> 16&0xff
3099 call._coverage = self.iter.n_plp
3103 cdef class IndelCall:
3104 '''the results of an indel call.'''
3108 cdef int _rms_mapping_quality
3109 cdef bam_maqindel_ret_t * _r
3111 def __cinit__(self):
3117 '''the chromosome ID as is defined in the header'''
3122 '''nucleotide position of SNP.'''
3123 def __get__(self): return self._pos
3126 '''the genotype called.'''
3129 s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3130 return "%s/%s" % (s,s)
3131 elif self._r.gt == 1:
3132 s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3133 return "%s/%s" % (s,s)
3135 return "%s/%s" % (self.first_allele, self.second_allele )
3137 property consensus_quality:
3138 '''the genotype quality (Phred-scaled).'''
3139 def __get__(self): return self._r.q_cns
3141 property snp_quality:
3142 '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
3143 def __get__(self): return self._r.q_ref
3145 property mapping_quality:
3146 '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
3147 def __get__(self): return self._rms_mapping_quality
3150 '''coverage or read depth - the number of reads involved in the call.'''
3151 def __get__(self): return self._coverage
3153 property first_allele:
3154 '''sequence of first allele.'''
3155 def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3157 property second_allele:
3158 '''sequence of second allele.'''
3159 def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3161 property reads_first:
3162 '''reads supporting first allele.'''
3163 def __get__(self): return self._r.cnt1
3165 property reads_second:
3166 '''reads supporting first allele.'''
3167 def __get__(self): return self._r.cnt2
3169 property reads_diff:
3170 '''reads supporting first allele.'''
3171 def __get__(self): return self._r.cnt_anti
3175 return "\t".join( map(str, (
3179 self.consensus_quality,
3181 self.mapping_quality,
3187 self.reads_diff ) ) )
3189 def __dealloc__(self ):
3190 bam_maqindel_ret_destroy(self._r)
3192 cdef class IndelCallerBase:
3193 '''Base class for SNP callers.
3196 minimum base quality (possibly capped by BAQ)
3198 coefficient for adjusting mapQ of poor mappings
3200 theta in maq consensus calling model
3202 number of haplotypes in the sample
3204 prior of a difference between two haplotypes
3207 cdef bam_maqindel_opt_t * options
3208 cdef IteratorColumn iter
3213 IteratorColumn iterator_column,
3217 self.iter = iterator_column
3219 assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
3221 self.options = bam_maqindel_opt_init()
3223 # set the default parameterization according to
3226 self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
3227 self.options.q_indel = kwargs.get( "q_indel", 40 )
3228 self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
3229 self.max_depth = kwargs.get( "max_depth", 1024 )
3231 def __dealloc__(self):
3232 free( self.options )
3236 cdef char * seq = self.iter.getSequence()
3237 cdef int seq_len = self.iter.seq_len
3242 if self.iter.pos >= seq_len:
3243 raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3245 cdef bam_maqindel_ret_t * r
3247 cdef int m = min( self.max_depth, self.iter.n_plp )
3249 # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
3250 # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
3251 # self.options.indel_err, self.options.ambi_thres );
3261 if r == NULL: return None
3266 call._tid = self.iter.tid
3267 call._pos = self.iter.pos
3268 call._coverage = self.iter.n_plp
3270 cdef uint64_t rms_aux = 0
3272 cdef bam_pileup1_t * p
3275 for i from 0 <= i < self.iter.n_plp:
3276 p = self.iter.plp + i
3277 if p.b.core.qual < self.cap_mapQ:
3281 rms_aux += tmp * tmp
3283 call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
3287 cdef class IndelCaller( IndelCallerBase ):
3288 '''*(IteratorColumn iterator_column )*
3290 The samtools SNP caller.
3292 This object will call SNPs in *samfile* against the reference
3293 sequence in *fasta*.
3295 This caller is fast for calling few SNPs in selected regions.
3297 It is slow, if called over large genomic regions.
3301 IteratorColumn iterator_column,
3306 def call(self, reference, int pos ):
3307 """call a snp on chromosome *reference*
3310 returns a :class:`SNPCall` object or None, if no indel call could be made.
3313 cdef int tid = self.iter.samfile.gettid( reference )
3315 self.iter.reset( tid, pos, pos + 1 )
3320 if self.iter.n_plp < 0:
3321 raise ValueError("error during iteration" )
3323 if self.iter.plp == NULL:
3324 raise ValueError( "no reads in region - no call" )
3326 if self.iter.pos == pos: break
3330 cdef class IteratorIndelCalls( IndelCallerBase ):
3331 """*(IteratorColumn iterator)*
3333 call indels within a region.
3335 *iterator* is a pileup iterator. SNPs will be called
3336 on all positions returned by this iterator.
3338 This caller is fast if SNPs are called over large continuous
3339 regions. It is slow, if instantiated frequently and in random
3340 order as the sequence will have to be reloaded.
3345 IteratorColumn iterator_column,
3354 """python version of next().
3357 # the following code was adapted from bam_plcmd.c:pileup_func()
3360 if self.iter.n_plp < 0:
3361 raise ValueError("error during iteration" )
3363 if self.iter.plp == NULL:
3370 cdef class IndexedReads:
3371 """index a bamfile by read.
3373 The index is kept in memory.
3375 By default, the file is re-openend to avoid conflicts if
3376 multiple operators work on the same file. Set *reopen* = False
3377 to not re-open *samfile*.
3380 cdef Samfile samfile
3383 # true if samfile belongs to this object
3384 cdef int owns_samfile
3386 def __init__(self, Samfile samfile, int reopen = True ):
3387 self.samfile = samfile
3389 if samfile.isbam: mode = "rb"
3392 # reopen the file - note that this makes the iterator
3393 # slow and causes pileup to slow down significantly.
3395 store = StderrStore()
3396 self.fp = samopen( samfile._filename, mode, NULL )
3398 assert self.fp != NULL
3399 self.owns_samfile = True
3401 self.fp = samfile.samfile
3402 self.owns_samfile = False
3404 assert samfile.isbam, "can only IndexReads on bam files"
3409 self.index = collections.defaultdict( list )
3411 # this method will start indexing from the current file position
3414 cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
3419 pos = bam_tell( self.fp.x.bam )
3420 ret = samread( self.fp, b)
3422 qname = bam1_qname( b )
3423 self.index[qname].append( pos )
3427 def find( self, qname ):
3428 if qname in self.index:
3429 return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
3431 raise KeyError( "read %s not found" % qname )
3433 def __dealloc__(self):
3434 if self.owns_samfile: samclose( self.fp )
3436 __all__ = ["Samfile",
3447 "IteratorIndelCalls",