1 # cython: embedsignature=True
2 # adds doc-strings for sphinx
4 import tempfile, os, sys, types, itertools, struct, ctypes
6 # defines imported from samtools
11 ## These are bits set in the flag.
12 ## have to put these definitions here, in csamtools.pxd they got ignored
13 ## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
15 ## @abstract the read is mapped in a proper pair */
16 DEF BAM_FPROPER_PAIR =2
17 ## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
19 ## @abstract the mate is unmapped */
21 ## @abstract the read is mapped to the reverse strand */
23 ## @abstract the mate is mapped to the reverse strand */
25 ## @abstract this is read1 */
27 ## @abstract this is read2 */
29 ## @abstract not primary alignment */
30 DEF BAM_FSECONDARY =256
31 ## @abstract QC failure */
33 ## @abstract optical or PCR duplicate */
37 DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
39 #####################################################################
40 #####################################################################
41 #####################################################################
42 ## private factory methods
43 #####################################################################
44 cdef class AlignedRead
45 cdef makeAlignedRead( bam1_t * src):
46 '''enter src into AlignedRead.'''
49 # destroy dummy delegate created in constructor
50 # to prevent memory leak.
51 pysam_bam_destroy1(dest._delegate)
52 dest._delegate = bam_dup1(src)
55 cdef class PileupProxy
56 cdef makePileupProxy( bam_plbuf_t * buf, int n ):
64 cdef makePileupRead( bam_pileup1_t * src ):
65 '''fill a PileupRead object from a bam_pileup1_t * object.'''
68 dest._alignment = makeAlignedRead( src.b )
70 dest._indel = src.indel
71 dest._level = src.level
72 dest._is_del = src.is_del
73 dest._is_head = src.is_head
74 dest._is_tail = src.is_tail
77 #####################################################################
78 #####################################################################
79 #####################################################################
80 ## Generic callbacks for inserting python callbacks.
81 #####################################################################
82 cdef int fetch_callback( bam1_t *alignment, void *f):
83 '''callback for bam_fetch.
85 calls function in *f* with a new :class:`AlignedRead` object as parameter.
87 a = makeAlignedRead( alignment )
90 class PileupColumn(object):
91 '''A pileup column. A pileup column contains
92 all the reads that map to a certain target base.
95 chromosome ID as is defined in the header
97 the target base coordinate (0-based)
99 number of reads mapping to this column
101 list of reads (:class:`pysam.PileupRead`) aligned to this column
104 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
105 "\n" + "\n".join( map(str, self.pileups) )
107 cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
108 '''callback for pileup.
110 calls function in *f* with a new :class:`Pileup` object as parameter.
113 chromosome ID as is defined in the header
115 start coordinate of the alignment, 0-based
117 number of elements in pl array
130 for x from 0 <= x < n:
131 pileups.append( makePileupRead( &(pl[x]) ) )
136 cdef int pileup_fetch_callback( bam1_t *b, void *data):
137 '''callback for bam_fetch.
139 Fetches reads and submits them to pileup.
141 cdef bam_plbuf_t * buf
142 buf = <bam_plbuf_t*>data
143 bam_plbuf_push(b, buf)
151 self.stderr_h, self.stderr_f = tempfile.mkstemp()
152 self.stderr_save = Outs( sys.stderr.fileno() )
153 self.stderr_save.setfd( self.stderr_h )
156 self.stderr_save.restore()
157 if os.path.exists(self.stderr_f):
158 os.remove( self.stderr_f )
163 ######################################################################
164 ######################################################################
165 ######################################################################
166 # valid types for sam headers
167 VALID_HEADER_TYPES = { "HD" : dict,
173 # order of records within sam headers
174 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )
176 # type conversions within sam header records
177 VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
178 "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
179 "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
180 "PG" : { "ID" : str, "VN" : str, "CL" : str }, }
182 # output order of fields within records
183 VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
184 "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
185 "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
186 "PG" : ( "ID", "VN", "CL" ), }
188 ######################################################################
189 ######################################################################
190 ######################################################################
192 ######################################################################
194 '''*(filename, mode='r', template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
196 A *SAM* file. The file is automatically opened.
198 *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode so for binary
199 (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
200 Use ``h`` to output header information in text (:term:`TAM`) mode.
202 If ``b`` is present, it must immediately follow ``r`` or ``w``.
203 Currently valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``.
205 so to open a :term:`BAM` file for reading::
207 f=Samfile('ex1.bam','rb')
210 For writing, the header of a :term:`TAM` file/:term:`BAM` file can be constituted from several
213 1. If *template* is given, the header is copied from a another *Samfile* (*template* must be of type *Samfile*).
215 2. If *header* is given, the header is build from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). The second level is then a list of lines, with each line being a list of tag-value pairs.
217 3. If *text* is given, new header text is copied from raw text.
219 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
221 If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
222 access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
227 cdef samfile_t * samfile
229 cdef bam_index_t *index
230 # true if file is a bam file
233 # current read within iteration
236 def __cinit__(self, *args, **kwargs ):
239 self._open( *args, **kwargs )
241 # allocate memory for iterator
242 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
245 '''return true if samfile has been opened.'''
246 return self.samfile != NULL
248 def _hasIndex( self ):
249 '''return true if samfile has an existing (and opened) index.'''
250 return self.index != NULL
255 Samfile template = None,
256 referencenames = None,
257 referencelengths = None,
261 '''open a sam/bam file.
263 If _open is called on an existing bamfile, the current file will be
264 closed and a new file will be opened.
267 assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
269 # close a previously opened file
270 if self.samfile != NULL: self.close()
273 cdef bam_header_t * header_to_write
274 header_to_write = NULL
276 self.filename = filename
278 self.isbam = len(mode) > 1 and mode[1] == 'b'
281 # open file for writing
283 # header structure (used for writing)
285 # copy header from another file
286 header_to_write = template.samfile.header
289 header_to_write = self._buildHeader( header )
292 # build header from a target names and lengths
293 assert referencenames and referencelengths, "either supply options `template`, `header` or both `refernencenames` and `referencelengths` for writing"
294 assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
296 # allocate and fill header
297 header_to_write = bam_header_init()
298 header_to_write.n_targets = len(referencenames)
300 for x in referencenames: n += len(x) + 1
301 header_to_write.target_name = <char**>calloc(n, sizeof(char*))
302 header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
303 for x from 0 <= x < header_to_write.n_targets:
304 header_to_write.target_len[x] = referencelengths[x]
305 name = referencenames[x]
306 header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
307 strncpy( header_to_write.target_name[x], name, len(name) )
311 header_to_write.l_text = strlen(text)
312 header_to_write.text = <char*>calloc( strlen(text), sizeof(char) )
313 memcpy( header_to_write.text, text, strlen(text) )
315 header_to_write.hash = NULL
316 header_to_write.rg2lib = NULL
318 # open file. Header gets written to file at the same time for bam files
319 # and sam files (in the latter case, the mode needs to be wh)
320 store = StderrStore()
321 self.samfile = samopen( filename, mode, header_to_write )
324 # bam_header_destroy takes care of cleaning up of all the members
325 if not template and header_to_write != NULL:
326 bam_header_destroy( header_to_write )
329 # open file for reading
330 if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ):
331 raise IOError( "file `%s` not found" % filename)
333 store = StderrStore()
334 self.samfile = samopen( filename, mode, NULL )
337 if self.samfile == NULL:
338 raise IOError("could not open file `%s`" % filename )
340 if mode[0] == "r" and self.isbam:
341 if not os.path.exists(filename + ".bai"):
344 # returns NULL if there is no index or index could not be opened
345 self.index = bam_index_load(filename)
346 if self.index == NULL:
347 raise IOError("error while opening index `%s` " % filename )
349 def getrname( self, tid ):
351 convert numerical :term:`tid` into :ref:`reference` name.'''
352 if not 0 <= tid < self.samfile.header.n_targets:
353 raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets )
354 return self.samfile.header.target_name[tid]
356 def _parseRegion( self,
361 '''parse region information.
363 raise Value for for invalid regions.
365 returns a tuple of region, tid, start and end. Region
366 is a valid samtools :term:`region` or None if the region
367 extends over the whole file.
369 Note that regions are 1-based, while start,end are python coordinates.
378 rtid = rstart = rend = 0
380 # translate to a region
382 if start != None and end != None:
383 region = "%s:%i-%i" % (reference, start+1, end)
388 store = StderrStore()
389 bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)
391 if rtid < 0: raise ValueError( "invalid region `%s`" % region )
392 if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
393 if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
394 if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
396 return region, rtid, rstart, rend
405 '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
407 fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
408 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
410 Without *reference* or *region* all reads will be fetched. The reads will be returned
411 ordered by reference sequence, which will not necessarily be the order within the file.
412 If *until_eof* is given, all reads from the current file position will be returned
413 *as they are sorted within the file*.
415 If only *reference* is set, all reads matching on *reference* will be fetched.
417 The method returns an iterator of type :class:`pysam.IteratorRow` unless
418 a *callback is provided. If *callback* is given, the callback will be executed
419 for each position within the :term:`region`. Note that callbacks currently work
420 only, if *region* or *reference* is given.
422 Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
423 an exception is raised.
429 if not self._isOpen():
430 raise ValueError( "I/O operation on closed file" )
432 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
437 raise ValueError( "callback functionality requires a region/reference" )
438 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
439 return bam_fetch(self.samfile.x.bam,
440 self.index, rtid, rstart, rend, <void*>callback, fetch_callback )
443 return IteratorRow( self, rtid, rstart, rend )
446 return IteratorRowAll( self )
448 # return all targets by chaining the individual targets together.
449 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
453 for rtid from 0 <= rtid < self.nreferences:
454 i.append( IteratorRow( self, rtid, rstart, rend))
455 return itertools.chain( *i )
458 raise ValueError ("fetch for a region is not available for sam files" )
460 raise NotImplementedError( "callback not implemented yet" )
462 return IteratorRowAll( self )
464 def pileup( self, reference = None, start = None, end = None, region = None, callback = None ):
465 '''run a pileup within a :term:`region` using 0-based indexing. The region is specified by
466 :term:`reference`, *start* and *end*. Alternatively, a samtools *region* string can be supplied.
468 Without *reference* or *region* all reads will be fetched. The reads will be returned
469 ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
471 The method returns an iterator of type :class:`pysam.IteratorColumn` unless
472 a *callback is provided. If *callback* is given, the callback will be executed
473 for each position within the :term:`region`.
475 Note that samfiles do not allow random access. If *region* or *reference* are given,
476 an exception is raised.
480 *all* reads which overlap the region are returned. The first base returned will be the
481 first base of the first read *not* necessarily the first base of the region used in the query.
486 cdef bam_plbuf_t *buf
488 if not self._isOpen():
489 raise ValueError( "I/O operation on closed file" )
491 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
494 if not self._hasIndex(): raise ValueError( "no index available for pileup" )
498 raise ValueError( "callback functionality requires a region/reference" )
500 buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
501 bam_fetch(self.samfile.x.bam,
502 self.index, rtid, rstart, rend,
503 buf, pileup_fetch_callback )
506 bam_plbuf_push( NULL, buf)
507 bam_plbuf_destroy(buf)
510 return IteratorColumn( self, rtid, rstart, rend )
512 # return all targets by chaining the individual targets together.
516 for rtid from 0 <= rtid < self.nreferences:
517 i.append( IteratorColumn( self, rtid, rstart, rend))
518 return itertools.chain( *i )
521 raise NotImplementedError( "pileup of samfiles not implemented yet" )
525 if self.samfile != NULL:
526 samclose( self.samfile )
527 bam_index_destroy(self.index);
530 def __dealloc__( self ):
532 # remember: dealloc cannot call other methods
533 # Note that __del__ is not called.
535 pysam_bam_destroy1(self.b)
537 def write( self, AlignedRead read ):
538 '''(AlignedRead read )
539 write a single :class:`pysam.AlignedRead`..
541 return the number of bytes written.
543 return samwrite( self.samfile, read._delegate )
545 property nreferences:
546 '''number of :term:`reference` sequences in the file.'''
548 return self.samfile.header.n_targets
551 """tuple with the names of :term:`reference` sequences."""
554 for x from 0 <= x < self.samfile.header.n_targets:
555 t.append( self.samfile.header.target_name[x] )
559 """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.reference`
563 for x from 0 <= x < self.samfile.header.n_targets:
564 t.append( self.samfile.header.target_len[x] )
568 '''full contents of the :term:`sam file` header as a string.'''
570 # create a temporary 0-terminated copy
572 t = <char*>calloc( self.samfile.header.l_text + 1, sizeof(char) )
573 memcpy( t, self.samfile.header.text, self.samfile.header.l_text )
579 '''header information within the :term:`sam file`. The records and fields are returned as
580 a two-level dictionary.
585 if self.samfile.header.text != NULL:
586 # convert to python string (note: call self.text to create 0-terminated string)
588 for line in t.split("\n"):
589 if not line.strip(): continue
590 assert line.startswith("@"), "header line without '@': '%s'" % line
591 fields = line[1:].split("\t")
593 assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)
597 if record not in result: result[record] = []
598 result[record].append( "\t".join( fields[1:] ) )
601 # the following is clumsy as generators do not work?
603 for field in fields[1:]:
604 key, value = field.split(":",1)
605 if key not in VALID_HEADER_FIELDS[record]:
606 raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
607 x[key] = VALID_HEADER_FIELDS[record][key](value)
609 if VALID_HEADER_TYPES[record] == dict:
611 raise ValueError( "multiple '%s' lines are not permitted" % record )
613 elif VALID_HEADER_TYPES[record] == list:
614 if record not in result: result[record] = []
615 result[record].append( x )
619 def _buildLine( self, fields, record ):
620 '''build a header line from *fields* dictionary for *record*'''
622 # TODO: add checking for field and sort order
623 line = ["@%s" % record ]
625 line.append( fields )
627 for key in VALID_HEADER_ORDER[record]:
629 line.append( "%s:%s" % (key, str(fields[key])))
630 return "\t".join( line )
632 cdef bam_header_t * _buildHeader( self, new_header ):
633 '''return a new header built from a dictionary in *new_header*.
635 This method inserts the text field, target_name and target_len.
640 # check if hash exists
642 # create new header and copy old data
643 cdef bam_header_t * dest
645 dest = bam_header_init()
647 for record in VALID_HEADERS:
648 if record in new_header:
649 ttype = VALID_HEADER_TYPES[record]
650 data = new_header[record]
651 if type( data ) != type( ttype() ):
652 raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
653 if type( data ) == types.DictType:
654 lines.append( self._buildLine( data, record ) )
656 for fields in new_header[record]:
657 lines.append( self._buildLine( fields, record ) )
659 text = "\n".join(lines) + "\n"
660 if dest.text != NULL: free( dest.text )
661 dest.text = <char*>calloc( len(text), sizeof(char))
662 dest.l_text = len(text)
663 strncpy( dest.text, text, dest.l_text )
666 if "SQ" in new_header:
668 for fields in new_header["SQ"]:
670 seqs.append( (fields["SN"], fields["LN"] ) )
672 raise KeyError( "incomplete sequence information in '%s'" % str(fields))
674 dest.n_targets = len(seqs)
675 dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
676 dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
678 for x from 0 <= x < dest.n_targets:
679 seqname, seqlen = seqs[x]
680 dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
681 strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
682 dest.target_len[x] = seqlen
689 cdef bam1_t * getCurrent( self ):
692 cdef int cnext(self):
693 '''cversion of iterator. Used by IteratorColumn'''
695 return samread(self.samfile, self.b)
698 """python version of next().
700 pyrex uses this non-standard name instead of next()
703 ret = samread(self.samfile, self.b)
705 return makeAlignedRead( self.b )
709 cdef class Fastafile:
712 A *FASTA* file. The file is automatically opened.
714 The file expects an indexed fasta file.
717 add automatic indexing.
718 add function to get sequence names.
722 # pointer to fastafile
723 cdef faidx_t * fastafile
725 def __cinit__(self, *args, **kwargs ):
726 self.fastafile = NULL
727 self._open( *args, **kwargs )
730 '''return true if samfile has been opened.'''
731 return self.fastafile != NULL
735 '''open an indexed fasta file.
737 This method expects an indexed fasta file.
740 # close a previously opened file
741 if self.fastafile != NULL: self.close()
742 self.filename = filename
743 self.fastafile = fai_load( filename )
745 if self.fastafile == NULL:
746 raise IOError("could not open file `%s`" % filename )
749 if self.fastafile != NULL:
750 fai_destroy( self.fastafile )
751 self.fastafile = NULL
759 '''*(reference = None, start = None, end = None, region = None)*
761 fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
762 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
765 if not self._isOpen():
766 raise ValueError( "I/O operation on closed file" )
768 cdef int len, max_pos
773 if reference == None: raise ValueError( 'no sequence/region supplied.' )
774 if start == None and end == None:
775 region = "%s" % str(reference)
776 elif start == None or end == None:
777 raise ValueError( 'only start or only end of region supplied' )
779 if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
780 # valid ranges are from 0 to 2^29-1
781 if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
782 if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
783 region = "%s:%i-%i" % (reference, start+1, end )
785 # samtools adds a '\0' at the end
786 seq = fai_fetch( self.fastafile, region, &len )
794 ## turning callbacks elegantly into iterators is an unsolved problem, see the following threads:
795 ## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1
796 ## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html
797 ## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code
798 ## changes, the changes have to be manually entered.
800 cdef class IteratorRow:
801 """iterates over mapped reads in a region.
804 cdef bam_fetch_iterator_t* bam_iter # iterator state object
809 def __cinit__(self, Samfile samfile, int tid, int beg, int end ):
812 assert samfile._isOpen()
813 assert samfile._hasIndex()
815 # makes sure that samfile stays alive as long as the
817 self.samfile = samfile
821 self.error_msg = None
824 fp = samfile.samfile.x.bam
825 self.bam_iter = bam_init_fetch_iterator(fp, samfile.index, tid, beg, end)
830 cdef bam1_t * getCurrent( self ):
833 cdef int cnext(self):
834 '''cversion of iterator. Used by IteratorColumn'''
835 self.b = bam_fetch_iterate(self.bam_iter)
836 if self.b == NULL: return 0
840 """python version of next().
842 pyrex uses this non-standard name instead of next()
845 raise ValueError( self.error_msg)
847 self.b = bam_fetch_iterate(self.bam_iter)
849 return makeAlignedRead( self.b )
853 def __dealloc__(self):
854 '''remember: dealloc cannot call other methods!'''
856 bam_cleanup_fetch_iterator(self.bam_iter)
858 cdef class IteratorRowAll:
859 """iterates over all mapped reads
865 def __cinit__(self, Samfile samfile):
867 assert samfile._isOpen()
869 self.fp = samfile.samfile
871 # allocate memory for alignment
872 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
877 cdef bam1_t * getCurrent( self ):
880 cdef int cnext(self):
881 '''cversion of iterator. Used by IteratorColumn'''
883 return samread(self.fp, self.b)
886 """python version of next().
888 pyrex uses this non-standard name instead of next()
891 ret = samread(self.fp, self.b)
893 return makeAlignedRead( self.b )
897 def __dealloc__(self):
898 '''remember: dealloc cannot call other methods!'''
899 pysam_bam_destroy1(self.b)
901 cdef class IteratorColumn:
902 '''iterates over columns.
904 This iterator wraps the pileup functionality of samtools.
906 For reasons of efficiency, the iterator returns the current
907 pileup buffer. As this buffer is updated at every iteration,
908 the contents of this iterator will change accordingly. Hence the conversion to
909 a list will not produce the expected result::
911 f = Samfile("file.bam", "rb")
912 result = list( f.pileup() )
914 Here, result will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
915 but each object will contain the same information.
917 If the results of several columns are required at the same time, the results
918 need to be stored explicitely::
920 result = [ x.pileups() for x in f.pileup() ]
922 Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.
925 cdef bam_plbuf_t *buf
927 # check if first iteration
929 # result of the last plbuf_push
932 cdef IteratorRow iter
934 def __cinit__(self, Samfile samfile, int tid, int start, int end ):
936 self.iter = IteratorRow( samfile, tid, start, end )
937 self.buf = bam_plbuf_init(NULL, NULL )
944 cdef int cnext(self):
945 '''perform next iteration.
947 return 1 if there is a buffer to emit. Return 0 for end of iteration.
950 cdef int retval1, retval2
952 # pysam bam_plbuf_push returns:
953 # 1: if buf is full and can be emitted
954 # 0: if b has been added
955 # -1: if there was an error
957 # check if previous plbuf was incomplete. If so, continue within
958 # the loop and yield if necessary
960 self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1)
961 if self.n_pu > 0: return 1
963 if self.eof: return 0
965 # get next alignments and submit until plbuf indicates that
966 # an new column has finished
967 while self.n_pu == 0:
968 retval1 = self.iter.cnext()
969 # wrap up if no more input
971 self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0)
976 self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0)
977 if self.n_pu < 0: raise ValueError( "error while iterating" )
983 """python version of next().
985 pyrex uses this non-standard name instead of next()
989 cdef bam_pileup1_t * pl
992 return makePileupProxy( self.buf, self.n_pu )
996 def __dealloc__(self):
997 bam_plbuf_destroy(self.buf);
999 cdef class AlignedRead:
1001 Class representing an aligned read. see SAM format specification for meaning of fields (http://samtools.sourceforge.net/).
1003 This class stores a handle to the samtools C-structure representing
1004 an aligned read. Member read access is forwarded to the C-structure
1005 and converted into python objects. This implementation should be fast,
1006 as only the data needed is converted.
1008 For write access, the C-structure is updated in-place. This is
1009 not the most efficient way to build BAM entries, as the variable
1010 length data is concatenated and thus needs to resized if
1011 a field is updated. Furthermore, the BAM entry might be
1012 in an inconsistent state. The :meth:`~validate` method can
1013 be used to check if an entry is consistent.
1015 One issue to look out for is that the sequence should always
1016 be set *before* the quality scores. Setting the sequence will
1017 also erase any quality scores that were set previously.
1022 def __cinit__( self ):
1024 self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
1025 # allocate some memory
1026 # If size is 0, calloc does not return a pointer that can be passed to free()
1027 # so allocate 40 bytes for a new read
1028 self._delegate.m_data = 40
1029 self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
1030 self._delegate.data_len = 0
1032 def __dealloc__(self):
1033 '''clear up memory.'''
1034 pysam_bam_destroy1(self._delegate)
1038 return "\t".join(map(str, (self.qname,
1049 def __cmp__(self, AlignedRead other):
1050 '''return true, if contents in this are binary equal to ``other``.'''
1056 # uncomment for debugging purposes
1057 # cdef unsigned char * oo, * tt
1058 # tt = <unsigned char*>(&t.core)
1059 # oo = <unsigned char*>(&o.core)
1060 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
1061 # tt = <unsigned char*>(t.data)
1062 # oo = <unsigned char*>(o.data)
1063 # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
1065 retval = memcmp( &t.core,
1067 sizeof( bam1_core_t ))
1069 if retval: return retval
1070 retval = cmp( t.data_len, o.data_len)
1071 if retval: return retval
1072 return memcmp( t.data,
1074 sizeof( t.data_len ))
1077 """the query name (None if not present)"""
1080 src = self._delegate
1081 if src.core.l_qname == 0: return None
1082 return <char *>pysam_bam1_qname( src )
1084 def __set__(self, qname ):
1085 if qname == None or len(qname) == 0: return
1090 src = self._delegate
1091 p = pysam_bam1_qname( src )
1093 # the qname is \0 terminated
1095 pysam_bam_update( src,
1100 src.core.l_qname = l
1102 # re-acquire pointer to location in memory
1103 # as it might have moved
1104 p = pysam_bam1_qname(src)
1106 strncpy( p, qname, l )
1109 """the :term:`cigar` alignment (None if not present).
1112 cdef uint32_t * cigar_p
1115 src = self._delegate
1116 if src.core.n_cigar == 0: return None
1119 cigar_p = pysam_bam1_cigar(src);
1120 for k from 0 <= k < src.core.n_cigar:
1121 op = cigar_p[k] & BAM_CIGAR_MASK
1122 l = cigar_p[k] >> BAM_CIGAR_SHIFT
1123 cigar.append((op, l))
1126 def __set__(self, values ):
1127 if values == None or len(values) == 0: return
1135 src = self._delegate
1137 # get location of cigar string
1138 p = pysam_bam1_cigar(src)
1140 # create space for cigar data within src.data
1141 pysam_bam_update( src,
1142 src.core.n_cigar * 4,
1146 # length is number of cigar operations, not bytes
1147 src.core.n_cigar = len(values)
1149 # re-acquire pointer to location in memory
1150 # as it might have moved
1151 p = pysam_bam1_cigar(src)
1153 # insert cigar operations
1154 for op, l in values:
1155 p[k] = l << BAM_CIGAR_SHIFT | op
1158 ## setting the cigar string also updates the "bin" attribute
1159 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
1162 """the query sequence (None if not present)"""
1167 src = self._delegate
1168 bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
1169 ## parse qseq (bam1_seq)
1170 if src.core.l_qseq == 0: return None
1172 s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char))
1173 p = pysam_bam1_seq( src )
1174 for k from 0 <= k < src.core.l_qseq:
1175 ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
1176 s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)]
1181 def __set__(self,seq):
1182 # samtools manages sequence and quality length memory together
1183 # if no quality information is present, the first byte says 0xff.
1185 if seq == None or len(seq) == 0: return
1189 src = self._delegate
1190 cdef int l, k, nbytes_new, nbytes_old
1194 # as the sequence is stored in half-bytes, the total length (sequence
1195 # plus quality scores) is (l+1)/2 + l
1196 nbytes_new = (l+1)/2 + l
1197 nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
1198 # acquire pointer to location in memory
1199 p = pysam_bam1_seq( src )
1202 pysam_bam_update( src,
1206 # re-acquire pointer to location in memory
1207 # as it might have moved
1208 p = pysam_bam1_seq( src )
1209 for k from 0 <= k < nbytes_new: p[k] = 0
1210 # convert to C string
1212 for k from 0 <= k < l:
1213 p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
1216 p = pysam_bam1_qual( src )
1220 """the base quality (None if not present)"""
1225 src = self._delegate
1226 if src.core.l_qseq == 0: return None
1228 p = pysam_bam1_qual( src )
1229 if p[0] == 0xff: return None
1231 q = < char *>calloc(src.core.l_qseq + 1 , sizeof(char))
1232 for k from 0 <= k < src.core.l_qseq:
1233 ## equivalent to t[i] + 33 (see bam.c)
1235 # convert to python string
1241 def __set__(self,qual):
1242 # note that space is already allocated via the sequences
1246 src = self._delegate
1247 p = pysam_bam1_qual( src )
1248 if qual == None or len(qual) == 0:
1249 # if absent - set to 0xff
1253 # convert to C string
1256 if src.core.l_qseq != l:
1257 raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
1258 assert src.core.l_qseq == l
1259 for k from 0 <= k < l:
1260 p[k] = <uint8_t>q[k] - 33
1263 """the tags in the AUX field."""
1270 src = self._delegate
1271 if src.l_aux == 0: return None
1273 s = pysam_bam1_aux( src )
1275 ctag = <char*>calloc( 3, sizeof(char) )
1277 while s < (src.data + src.data_len):
1285 # convert type - is there a better way?
1289 # get type and value
1290 # how do I do char literal comparison in cython?
1291 # the code below works (i.e, is C comparison)
1294 value = <int>bam_aux2i(s)
1297 value = <int>bam_aux2i(s)
1300 value = <float>bam_aux2f(s)
1303 value = <double>bam_aux2d(s)
1306 value = <int>bam_aux2i(s)
1309 # there might a more efficient way
1310 # to convert a char into a string
1311 value = "%c" % <char>bam_aux2A(s)
1314 value = <char*>bam_aux2Z(s)
1315 # +1 for NULL terminated string
1322 result.append( (pytag, value) )
1327 def __set__(self, tags):
1331 cdef uint8_t * new_data
1332 cdef int guessed_size, control_size
1333 src = self._delegate
1334 cdef int max_size, size
1337 # map samtools code to python.struct code and byte size
1338 buffer = ctypes.create_string_buffer(max_size)
1341 for pytag, value in tags:
1343 if t == types.FloatType:
1345 elif t == types.IntType:
1347 if value >= -127: fmt, pytype = "<cccb", 'c'
1348 elif value >= -32767: fmt, pytype = "<ccch", 's'
1349 elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
1350 else: fmt, ctype = "<ccci", 'i'[0]
1352 if value <= 255: fmt, pytype = "<cccB", 'C'
1353 elif value <= 65535: fmt, pytype = "<cccH", 'S'
1354 elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
1355 else: fmt, pytype = "<cccI", 'I'
1357 # Note: hex strings (H) are not supported yet
1359 fmt, pytype = "<cccc", 'A'
1361 fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
1363 size = struct.calcsize(fmt)
1364 if offset + size > max_size:
1365 raise NotImplementedError("tags field too large")
1367 struct.pack_into( fmt,
1376 # delete the old data and allocate new
1377 pysam_bam_update( src,
1380 pysam_bam1_aux( src ) )
1384 if offset == 0: return
1386 # get location of new data
1387 s = pysam_bam1_aux( src )
1389 # check if there is direct path from buffer.raw to tmp
1392 memcpy( s, temp, offset )
1395 """properties flag"""
1396 def __get__(self): return self._delegate.core.flag
1397 def __set__(self, flag): self._delegate.core.flag = flag
1404 This field contains the index of the reference sequence
1405 in the sequence dictionary. To obtain the name
1406 of the reference sequence, use :meth:`pysam.Samfile.getrname()`
1409 def __get__(self): return self._delegate.core.tid
1410 def __set__(self, tid): self._delegate.core.tid = tid
1412 """0-based leftmost coordinate"""
1413 def __get__(self): return self._delegate.core.pos
1414 def __set__(self, pos):
1415 ## setting the cigar string also updates the "bin" attribute
1417 src = self._delegate
1418 if src.core.n_cigar:
1419 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) )
1421 src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
1422 self._delegate.core.pos = pos
1424 """properties bin"""
1425 def __get__(self): return self._delegate.core.bin
1426 def __set__(self, bin): self._delegate.core.bin = bin
1428 '''length of the read (read only). Returns 0 if not given.'''
1429 def __get__(self): return self._delegate.core.l_qseq
1431 """mapping quality"""
1432 def __get__(self): return self._delegate.core.qual
1433 def __set__(self, qual): self._delegate.core.qual = qual
1435 """the :term:`reference` id of the mate """
1436 def __get__(self): return self._delegate.core.mtid
1437 def __set__(self, mtid): self._delegate.core.mtid = mtid
1439 """the position of the mate"""
1440 def __get__(self): return self._delegate.core.mpos
1441 def __set__(self, mpos): self._delegate.core.mpos = mpos
1443 """the insert size"""
1444 def __get__(self): return self._delegate.core.isize
1445 def __set__(self, isize): self._delegate.core.isize = isize
1447 """true if read is paired in sequencing"""
1448 def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
1449 def __set__(self,val):
1450 if val: self._delegate.core.flag |= BAM_FPAIRED
1451 else: self._delegate.core.flag &= ~BAM_FPAIRED
1452 property is_proper_pair:
1453 """true if read is mapped in a proper pair"""
1454 def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
1455 def __set__(self,val):
1456 if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
1457 else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
1458 property is_unmapped:
1459 """true if read itself is unmapped"""
1460 def __get__(self): return (self.flag & BAM_FUNMAP) != 0
1461 def __set__(self,val):
1462 if val: self._delegate.core.flag |= BAM_FUNMAP
1463 else: self._delegate.core.flag &= ~BAM_FUNMAP
1464 property mate_is_unmapped:
1465 """true if the mate is unmapped"""
1466 def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
1467 def __set__(self,val):
1468 if val: self._delegate.core.flag |= BAM_FMUNMAP
1469 else: self._delegate.core.flag &= ~BAM_FMUNMAP
1470 property is_reverse:
1471 """true if read is mapped to reverse strand"""
1472 def __get__(self):return (self.flag & BAM_FREVERSE) != 0
1473 def __set__(self,val):
1474 if val: self._delegate.core.flag |= BAM_FREVERSE
1475 else: self._delegate.core.flag &= ~BAM_FREVERSE
1476 property mate_is_reverse:
1477 """true is read is mapped to reverse strand"""
1478 def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
1479 def __set__(self,val):
1480 if val: self._delegate.core.flag |= BAM_FMREVERSE
1481 else: self._delegate.core.flag &= ~BAM_FMREVERSE
1483 """true if this is read1"""
1484 def __get__(self): return (self.flag & BAM_FREAD1) != 0
1485 def __set__(self,val):
1486 if val: self._delegate.core.flag |= BAM_FREAD1
1487 else: self._delegate.core.flag &= ~BAM_FREAD1
1489 """true if this is read2"""
1490 def __get__(self): return (self.flag & BAM_FREAD2) != 0
1491 def __set__(self,val):
1492 if val: self._delegate.core.flag |= BAM_FREAD2
1493 else: self._delegate.core.flag &= ~BAM_FREAD2
1494 property is_secondary:
1495 """true if not primary alignment"""
1496 def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
1497 def __set__(self,val):
1498 if val: self._delegate.core.flag |= BAM_FSECONDARY
1499 else: self._delegate.core.flag &= ~BAM_FSECONDARY
1501 """true if QC failure"""
1502 def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
1503 def __set__(self,val):
1504 if val: self._delegate.core.flag |= BAM_FQCFAIL
1505 else: self._delegate.core.flag &= ~BAM_FQCFAIL
1506 property is_duplicate:
1507 """ true if optical or PCR duplicate"""
1508 def __get__(self): return (self.flag & BAM_FDUP) != 0
1509 def __set__(self,val):
1510 if val: self._delegate.core.flag |= BAM_FDUP
1511 else: self._delegate.core.flag &= ~BAM_FDUP
1514 """retrieves optional data given a two-letter *tag*"""
1515 #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
1517 v = bam_aux_get(self._delegate, tag)
1518 if v == NULL: raise KeyError( "tag '%s' not present" % tag )
1520 if type == 'c' or type == 'C' or type == 's' or type == 'S' or type == 'i':
1521 return <int>bam_aux2i(v)
1523 return <float>bam_aux2f(v)
1525 return <double>bam_aux2d(v)
1527 # there might a more efficient way
1528 # to convert a char into a string
1529 return '%c' % <char>bam_aux2A(v)
1531 return <char*>bam_aux2Z(v)
1533 def fancy_str (self):
1534 """returns list of fieldnames/values in pretty format for debugging
1538 "tid": "Contig index",
1539 "pos": "Mapped position on contig",
1540 "mtid": "Contig index for mate pair",
1541 "mpos": "Position of mate pair",
1542 "isize": "Insert size",
1543 "flag": "Binary flag",
1544 "n_cigar": "Count of cigar entries",
1545 "cigar": "Cigar entries",
1546 "qual": "Mapping quality",
1547 "bin": "Bam index bin number",
1548 "l_qname": "Length of query name",
1549 "qname": "Query name",
1550 "l_qseq": "Length of query sequence",
1551 "qseq": "Query sequence",
1552 "bqual": "Quality scores",
1553 "l_aux": "Length of auxilary data",
1554 "m_data": "Maximum data length",
1555 "data_len": "Current data length",
1557 fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
1558 "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
1559 "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
1561 for f in fields_names_in_order:
1562 if not f in self.__dict__:
1564 ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
1566 for f in self.__dict__:
1567 if not f in field_names:
1568 ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
1571 cdef class PileupProxy:
1572 '''A pileup column. A pileup column contains
1573 all the reads that map to a certain target base.
1576 chromosome ID as is defined in the header
1578 the target base coordinate (0-based)
1580 number of reads mapping to this column
1582 list of reads (:class:`pysam.PileupRead`) aligned to this column
1584 This class is a proxy for results returned by the samtools pileup engine.
1585 If the underlying engine iterator advances, the results of this column
1588 cdef bam_plbuf_t * buf
1591 def __cinit__(self ):
1595 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
1597 "\n".join( map(str, self.pileups) )
1600 '''the chromosome ID as is defined in the header'''
1601 def __get__(self): return pysam_get_tid( self.buf )
1604 '''number of reads mapping to this column.'''
1605 def __get__(self): return self.n_pu
1606 def __set__(self, n): self.n_pu = n
1609 def __get__(self): return pysam_get_pos( self.buf )
1612 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
1614 cdef bam_pileup1_t * pl
1615 pl = pysam_get_pileup( self.buf )
1617 # warning: there could be problems if self.n and self.buf are
1619 for x from 0 <= x < self.n_pu:
1620 pileups.append( makePileupRead( &pl[x]) )
1623 cdef class PileupRead:
1624 '''A read aligned to a column.
1628 AlignedRead _alignment
1636 def __cinit__( self ):
1640 return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
1643 """a :class:`pysam.AlignedRead` object of the aligned read"""
1645 return self._alignment
1647 """position of the read base at the pileup site, 0-based"""
1651 """indel length; 0 for no indel, positive for ins and negative for del"""
1655 """1 iff the base on the padded read is a deletion"""
1660 return self._is_head
1663 return self._is_tail
1669 '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
1670 def __init__(self, id = 1):
1674 def setdevice(self, filename):
1675 '''open an existing file, like "/dev/null"'''
1676 fd = os.open(filename, os.O_WRONLY)
1679 def setfile(self, filename):
1680 '''open a new file.'''
1681 fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
1684 def setfd(self, fd):
1685 ofd = os.dup(self.id) # Save old stream on new unit.
1686 self.streams.append(ofd)
1687 sys.stdout.flush() # Buffered data goes to old stream.
1688 os.dup2(fd, self.id) # Open unit 1 on new stream.
1689 os.close(fd) # Close other unit (look out, caller.)
1692 '''restore previous output stream'''
1694 # the following was not sufficient, hence flush both stderr and stdout
1695 # os.fsync( self.id )
1698 os.dup2(self.streams[-1], self.id)
1699 os.close(self.streams[-1])
1700 del self.streams[-1]
1702 def _samtools_dispatch( method, args = () ):
1703 '''call ``method`` in samtools providing arguments in args.
1706 This method redirects stdout and stderr to capture it
1707 from samtools. If for some reason stdout/stderr disappears
1708 the reason might be in this method.
1711 The current implementation might only work on linux.
1714 This method captures stdout and stderr using temporary files,
1715 which are then read into memory in their entirety. This method
1716 is slow and might cause large memory overhead.
1718 See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
1719 on the topic of redirecting stderr/stdout.
1722 # note that debugging this module can be a problem
1723 # as stdout/stderr will not appear
1725 # redirect stderr and stdout to file
1727 # open files and redirect into it
1728 stderr_h, stderr_f = tempfile.mkstemp()
1729 stdout_h, stdout_f = tempfile.mkstemp()
1731 # patch for `samtools view`
1732 # samtools `view` closes stdout, from which I can not
1733 # recover. Thus redirect output to file with -o option.
1734 if method == "view":
1735 if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
1736 args = ( "-o", stdout_f ) + args
1738 stdout_save = Outs( sys.stdout.fileno() )
1739 stdout_save.setfd( stdout_h )
1740 stderr_save = Outs( sys.stderr.fileno() )
1741 stderr_save.setfd( stderr_h )
1743 # do the function call to samtools
1745 cdef int i, n, retval
1748 # allocate two more for first (dummy) argument (contains command)
1749 cargs = <char**>calloc( n+2, sizeof( char *) )
1750 cargs[0] = "samtools"
1752 for i from 0 <= i < n: cargs[i+2] = args[i]
1753 retval = pysam_dispatch(n+2, cargs)
1756 # restore stdout/stderr. This will also flush, so
1757 # needs to be before reading back the file contents
1758 stdout_save.restore()
1759 stderr_save.restore()
1761 # capture stderr/stdout.
1762 out_stderr = open( stderr_f, "r").readlines()
1763 out_stdout = open( stdout_f, "r").readlines()
1766 os.remove( stderr_f )
1767 os.remove( stdout_f )
1769 return retval, out_stderr, out_stdout
1771 __all__ = ["Samfile",