1 # cython: embedsignature=True
3 # adds doc-strings for sphinx
5 import tempfile, os, sys, types, itertools, struct, ctypes
7 from python_string cimport PyString_FromStringAndSize, PyString_AS_STRING
8 from python_exc cimport PyErr_SetString
10 # defines imported from samtools
15 ## These are bits set in the flag.
16 ## have to put these definitions here, in csamtools.pxd they got ignored
17 ## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
19 ## @abstract the read is mapped in a proper pair */
20 DEF BAM_FPROPER_PAIR =2
21 ## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
23 ## @abstract the mate is unmapped */
25 ## @abstract the read is mapped to the reverse strand */
27 ## @abstract the mate is mapped to the reverse strand */
29 ## @abstract this is read1 */
31 ## @abstract this is read2 */
33 ## @abstract not primary alignment */
34 DEF BAM_FSECONDARY =256
35 ## @abstract QC failure */
37 ## @abstract optical or PCR duplicate */
41 DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
47 DEF BAM_CSOFT_CLIP = 4
48 DEF BAM_CHARD_CLIP = 5
51 #####################################################################
52 #####################################################################
53 #####################################################################
54 ## private factory methods
55 #####################################################################
56 cdef class AlignedRead
57 cdef makeAlignedRead( bam1_t * src):
58 '''enter src into AlignedRead.'''
61 # destroy dummy delegate created in constructor
62 # to prevent memory leak.
63 bam_destroy1(dest._delegate)
64 dest._delegate = bam_dup1(src)
67 cdef class PileupProxy
68 cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
78 cdef makePileupRead( bam_pileup1_t * src ):
79 '''fill a PileupRead object from a bam_pileup1_t * object.'''
82 dest._alignment = makeAlignedRead( src.b )
84 dest._indel = src.indel
85 dest._level = src.level
86 dest._is_del = src.is_del
87 dest._is_head = src.is_head
88 dest._is_tail = src.is_tail
91 #####################################################################
92 #####################################################################
93 #####################################################################
94 ## Generic callbacks for inserting python callbacks.
95 #####################################################################
96 cdef int fetch_callback( bam1_t *alignment, void *f):
97 '''callback for bam_fetch.
99 calls function in *f* with a new :class:`AlignedRead` object as parameter.
101 a = makeAlignedRead( alignment )
104 class PileupColumn(object):
105 '''A pileup column. A pileup column contains
106 all the reads that map to a certain target base.
109 chromosome ID as is defined in the header
111 the target base coordinate (0-based)
113 number of reads mapping to this column
115 list of reads (:class:`pysam.PileupRead`) aligned to this column
118 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
119 "\n" + "\n".join( map(str, self.pileups) )
121 cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
122 '''callback for pileup.
124 calls function in *f* with a new :class:`Pileup` object as parameter.
127 chromosome ID as is defined in the header
129 start coordinate of the alignment, 0-based
131 number of elements in pl array
145 for x from 0 <= x < n:
146 pileups.append( makePileupRead( &(pl[x]) ) )
151 cdef int pileup_fetch_callback( bam1_t *b, void *data):
152 '''callback for bam_fetch.
154 Fetches reads and submits them to pileup.
156 cdef bam_plbuf_t * buf
157 buf = <bam_plbuf_t*>data
158 bam_plbuf_push(b, buf)
166 self.stderr_h, self.stderr_f = tempfile.mkstemp()
167 self.stderr_save = Outs( sys.stderr.fileno() )
168 self.stderr_save.setfd( self.stderr_h )
171 self.stderr_save.restore()
172 if os.path.exists(self.stderr_f):
173 os.remove( self.stderr_f )
178 ######################################################################
179 ######################################################################
180 ######################################################################
181 # valid types for sam headers
182 VALID_HEADER_TYPES = { "HD" : dict,
188 # order of records within sam headers
189 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )
191 # type conversions within sam header records
192 VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
193 "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
194 "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
195 "PG" : { "ID" : str, "VN" : str, "CL" : str }, }
197 # output order of fields within records
198 VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
199 "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
200 "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
201 "PG" : ( "ID", "VN", "CL" ), }
204 ######################################################################
205 ######################################################################
206 ######################################################################
208 ######################################################################
210 '''*(filename, mode='r', template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
212 A *SAM* file. The file is automatically opened.
214 *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode so for binary
215 (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
216 Use ``h`` to output header information in text (:term:`TAM`) mode.
218 If ``b`` is present, it must immediately follow ``r`` or ``w``.
219 Currently valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``.
221 so to open a :term:`BAM` file for reading::
223 f=Samfile('ex1.bam','rb')
226 For writing, the header of a :term:`TAM` file/:term:`BAM` file can be constituted from several
229 1. If *template* is given, the header is copied from a another *Samfile* (*template* must be of type *Samfile*).
231 2. If *header* is given, the header is build from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). The second level is then a list of lines, with each line being a list of tag-value pairs.
233 3. If *text* is given, new header text is copied from raw text.
235 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
237 If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
238 access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
243 cdef samfile_t * samfile
245 cdef bam_index_t *index
246 # true if file is a bam file
248 # true if file is not on the local filesystem
250 # current read within iteration
255 def __cinit__(self, *args, **kwargs ):
258 self._open( *args, **kwargs )
260 # allocate memory for iterator
261 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
264 '''return true if samfile has been opened.'''
265 return self.samfile != NULL
267 def _hasIndex( self ):
268 '''return true if samfile has an existing (and opened) index.'''
269 return self.index != NULL
274 Samfile template = None,
275 referencenames = None,
276 referencelengths = None,
281 '''open a sam/bam file.
283 If _open is called on an existing bamfile, the current file will be
284 closed and a new file will be opened.
287 assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
289 # close a previously opened file
290 if self.samfile != NULL: self.close()
293 cdef bam_header_t * header_to_write
294 header_to_write = NULL
296 self.filename = filename
298 self.isbam = len(mode) > 1 and mode[1] == 'b'
300 self.isremote = strncmp(filename,"http:",5) == 0 or \
301 strncmp(filename,"ftp:",4) == 0
307 # open file for writing
309 # header structure (used for writing)
311 # copy header from another file
312 header_to_write = template.samfile.header
315 header_to_write = self._buildHeader( header )
318 # build header from a target names and lengths
319 assert referencenames and referencelengths, "either supply options `template`, `header` or both `refernencenames` and `referencelengths` for writing"
320 assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
322 # allocate and fill header
323 header_to_write = bam_header_init()
324 header_to_write.n_targets = len(referencenames)
326 for x in referencenames: n += len(x) + 1
327 header_to_write.target_name = <char**>calloc(n, sizeof(char*))
328 header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
329 for x from 0 <= x < header_to_write.n_targets:
330 header_to_write.target_len[x] = referencelengths[x]
331 name = referencenames[x]
332 header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
333 strncpy( header_to_write.target_name[x], name, len(name) )
338 header_to_write.l_text = strlen(ctext)
339 header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
340 memcpy( header_to_write.text, ctext, strlen(ctext) )
342 header_to_write.hash = NULL
343 header_to_write.rg2lib = NULL
345 # open file. Header gets written to file at the same time for bam files
346 # and sam files (in the latter case, the mode needs to be wh)
347 store = StderrStore()
348 self.samfile = samopen( filename, mode, header_to_write )
351 # bam_header_destroy takes care of cleaning up of all the members
352 if not template and header_to_write != NULL:
353 bam_header_destroy( header_to_write )
356 # open file for reading
357 if strncmp( filename, "-", 1) != 0 and \
358 not self.isremote and \
359 not os.path.exists( filename ):
360 raise IOError( "file `%s` not found" % filename)
362 store = StderrStore()
363 self.samfile = samopen( filename, mode, NULL )
366 if self.samfile == NULL:
367 raise IOError("could not open file `%s`" % filename )
369 # check for index and open if present
370 if mode[0] == "r" and self.isbam:
372 if not self.isremote:
373 if not os.path.exists(filename +".bai"):
376 # returns NULL if there is no index or index could not be opened
377 self.index = bam_index_load(filename)
378 if self.index == NULL:
379 raise IOError("error while opening index `%s` " % filename )
381 self.index = bam_index_load(filename)
382 if self.index == NULL:
383 raise IOError("error while opening index `%s` " % filename )
385 def getrname( self, tid ):
387 convert numerical :term:`tid` into :ref:`reference` name.'''
388 if not 0 <= tid < self.samfile.header.n_targets:
389 raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets )
390 return self.samfile.header.target_name[tid]
392 def _parseRegion( self,
397 '''parse region information.
399 raise Value for for invalid regions.
401 returns a tuple of region, tid, start and end. Region
402 is a valid samtools :term:`region` or None if the region
403 extends over the whole file.
405 Note that regions are 1-based, while start,end are python coordinates.
414 rtid = rstart = rend = 0
416 # translate to a region
418 if start != None and end != None:
419 region = "%s:%i-%i" % (reference, start+1, end)
424 store = StderrStore()
425 bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)
427 if rtid < 0: raise ValueError( "invalid region `%s`" % region )
428 if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
429 if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
430 if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
432 return region, rtid, rstart, rend
434 def seek( self, uint64_t offset, int where = 0):
435 '''move to current file to position *offset*'''
437 if not self._isOpen():
438 raise ValueError( "I/O operation on closed file" )
440 raise NotImplementedError("seek only available in bam files")
441 return bam_seek( self.samfile.x.bam, offset, where )
444 '''return current file position'''
446 raise NotImplementedError("seek only available in bam files")
448 return bam_tell( self.samfile.x.bam )
457 '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
459 fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
460 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
462 Without *reference* or *region* all reads will be fetched. The reads will be returned
463 ordered by reference sequence, which will not necessarily be the order within the file.
464 If *until_eof* is given, all reads from the current file position will be returned
465 *as they are sorted within the file*.
467 If only *reference* is set, all reads matching on *reference* will be fetched.
469 The method returns an iterator of type :class:`pysam.IteratorRow` unless
470 a *callback is provided. If *callback* is given, the callback will be executed
471 for each position within the :term:`region`. Note that callbacks currently work
472 only, if *region* or *reference* is given.
474 Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
475 an exception is raised.
481 if not self._isOpen():
482 raise ValueError( "I/O operation on closed file" )
484 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
487 if not until_eof and not self._hasIndex() and not self.isremote:
488 raise ValueError( "fetch called on bamfile without index" )
492 raise ValueError( "callback functionality requires a region/reference" )
493 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
494 return bam_fetch(self.samfile.x.bam,
503 return IteratorRow( self, rtid, rstart, rend )
506 return IteratorRowAll( self )
508 # return all targets by chaining the individual targets together.
509 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
513 for rtid from 0 <= rtid < self.nreferences:
514 i.append( IteratorRow( self, rtid, rstart, rend))
515 return itertools.chain( *i )
517 # check if header is present - otherwise sam_read1 aborts
518 # this happens if a bamfile is opened with mode 'r'
519 if self.samfile.header.n_targets == 0:
520 raise ValueError( "fetch called for samfile without header")
523 raise ValueError ("fetch for a region is not available for sam files" )
525 raise NotImplementedError( "callback not implemented yet" )
527 return IteratorRowAll( self )
529 def pileup( self, reference = None, start = None, end = None, region = None, callback = None ):
530 '''run a pileup within a :term:`region` using 0-based indexing. The region is specified by
531 :term:`reference`, *start* and *end*. Alternatively, a samtools *region* string can be supplied.
533 Without *reference* or *region* all reads will be fetched. The reads will be returned
534 ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
536 The method returns an iterator of type :class:`pysam.IteratorColumn` unless
537 a *callback is provided. If *callback* is given, the callback will be executed
538 for each position within the :term:`region`.
540 Note that samfiles do not allow random access. If *region* or *reference* are given,
541 an exception is raised.
545 *all* reads which overlap the region are returned. The first base returned will be the
546 first base of the first read *not* necessarily the first base of the region used in the query.
551 cdef bam_plbuf_t *buf
553 if not self._isOpen():
554 raise ValueError( "I/O operation on closed file" )
556 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
559 if not self._hasIndex(): raise ValueError( "no index available for pileup" )
563 raise ValueError( "callback functionality requires a region/reference" )
565 buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
566 bam_fetch(self.samfile.x.bam,
567 self.index, rtid, rstart, rend,
568 buf, pileup_fetch_callback )
571 bam_plbuf_push( NULL, buf)
572 bam_plbuf_destroy(buf)
575 return IteratorColumn( self, rtid, rstart, rend )
577 # return all targets by chaining the individual targets together.
581 for rtid from 0 <= rtid < self.nreferences:
582 i.append( IteratorColumn( self, rtid, rstart, rend))
583 return itertools.chain( *i )
586 raise NotImplementedError( "pileup of samfiles not implemented yet" )
590 if self.samfile != NULL:
591 samclose( self.samfile )
592 bam_index_destroy(self.index);
595 def __dealloc__( self ):
596 # remember: dealloc cannot call other methods
597 # note: no doc string
598 # note: __del__ is not called.
602 def write( self, AlignedRead read ):
603 '''(AlignedRead read )
604 write a single :class:`pysam.AlignedRead`..
606 return the number of bytes written.
608 return samwrite( self.samfile, read._delegate )
613 def __exit__(self, exc_type, exc_value, traceback):
617 property nreferences:
618 '''number of :term:`reference` sequences in the file.'''
620 return self.samfile.header.n_targets
623 """tuple with the names of :term:`reference` sequences."""
626 for x from 0 <= x < self.samfile.header.n_targets:
627 t.append( self.samfile.header.target_name[x] )
631 """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.reference`
635 for x from 0 <= x < self.samfile.header.n_targets:
636 t.append( self.samfile.header.target_len[x] )
640 '''full contents of the :term:`sam file` header as a string.'''
642 return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
645 '''header information within the :term:`sam file`. The records and fields are returned as
646 a two-level dictionary.
651 if self.samfile.header.text != NULL:
652 # convert to python string (note: call self.text to create 0-terminated string)
654 for line in t.split("\n"):
655 if not line.strip(): continue
656 assert line.startswith("@"), "header line without '@': '%s'" % line
657 fields = line[1:].split("\t")
659 assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)
663 if record not in result: result[record] = []
664 result[record].append( "\t".join( fields[1:] ) )
667 # the following is clumsy as generators do not work?
669 for field in fields[1:]:
670 key, value = field.split(":",1)
671 if key not in VALID_HEADER_FIELDS[record]:
672 raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
673 x[key] = VALID_HEADER_FIELDS[record][key](value)
675 if VALID_HEADER_TYPES[record] == dict:
677 raise ValueError( "multiple '%s' lines are not permitted" % record )
679 elif VALID_HEADER_TYPES[record] == list:
680 if record not in result: result[record] = []
681 result[record].append( x )
685 def _buildLine( self, fields, record ):
686 '''build a header line from *fields* dictionary for *record*'''
688 # TODO: add checking for field and sort order
689 line = ["@%s" % record ]
691 line.append( fields )
693 for key in VALID_HEADER_ORDER[record]:
695 line.append( "%s:%s" % (key, str(fields[key])))
696 return "\t".join( line )
698 cdef bam_header_t * _buildHeader( self, new_header ):
699 '''return a new header built from a dictionary in *new_header*.
701 This method inserts the text field, target_name and target_len.
706 # check if hash exists
708 # create new header and copy old data
709 cdef bam_header_t * dest
711 dest = bam_header_init()
713 for record in VALID_HEADERS:
714 if record in new_header:
715 ttype = VALID_HEADER_TYPES[record]
716 data = new_header[record]
717 if type( data ) != type( ttype() ):
718 raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
719 if type( data ) == types.DictType:
720 lines.append( self._buildLine( data, record ) )
722 for fields in new_header[record]:
723 lines.append( self._buildLine( fields, record ) )
725 text = "\n".join(lines) + "\n"
726 if dest.text != NULL: free( dest.text )
727 dest.text = <char*>calloc( len(text), sizeof(char))
728 dest.l_text = len(text)
729 strncpy( dest.text, text, dest.l_text )
732 if "SQ" in new_header:
734 for fields in new_header["SQ"]:
736 seqs.append( (fields["SN"], fields["LN"] ) )
738 raise KeyError( "incomplete sequence information in '%s'" % str(fields))
740 dest.n_targets = len(seqs)
741 dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
742 dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
744 for x from 0 <= x < dest.n_targets:
745 seqname, seqlen = seqs[x]
746 dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
747 strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
748 dest.target_len[x] = seqlen
755 cdef bam1_t * getCurrent( self ):
758 cdef int cnext(self):
759 '''cversion of iterator. Used by IteratorColumn'''
761 return samread(self.samfile, self.b)
764 """python version of next().
766 pyrex uses this non-standard name instead of next()
769 ret = samread(self.samfile, self.b)
771 return makeAlignedRead( self.b )
775 cdef class Fastafile:
778 A *FASTA* file. The file is automatically opened.
780 The file expects an indexed fasta file.
783 add automatic indexing.
784 add function to get sequence names.
788 # pointer to fastafile
789 cdef faidx_t * fastafile
791 def __cinit__(self, *args, **kwargs ):
792 self.fastafile = NULL
793 self._open( *args, **kwargs )
796 '''return true if samfile has been opened.'''
797 return self.fastafile != NULL
800 assert self.fastafile != NULL
801 return faidx_fetch_nseq(self.fastafile)
805 '''open an indexed fasta file.
807 This method expects an indexed fasta file.
810 # close a previously opened file
811 if self.fastafile != NULL: self.close()
812 self.filename = filename
813 self.fastafile = fai_load( filename )
815 if self.fastafile == NULL:
816 raise IOError("could not open file `%s`" % filename )
819 if self.fastafile != NULL:
820 fai_destroy( self.fastafile )
821 self.fastafile = NULL
829 '''*(reference = None, start = None, end = None, region = None)*
831 fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.
832 The region is specified by :term:`reference`, *start* and *end*.
834 If *reference* is given and *start* is None, the sequence from the
835 first base is returned. Similarly, if *end* is None, the sequence
836 until the last base is returned.
838 Alternatively, a samtools :term:`region` string can be supplied.
841 if not self._isOpen():
842 raise ValueError( "I/O operation on closed file" )
844 cdef int len, max_pos
849 if reference is None: raise ValueError( 'no sequence/region supplied.' )
850 if start is None: start = 0
851 if end is None: end = max_pos -1
853 if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
854 if start == end: return ""
855 # valid ranges are from 0 to 2^29-1
856 if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
857 if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
859 seq = faidx_fetch_seq(self.fastafile, reference,
863 # samtools adds a '\0' at the end
864 seq = fai_fetch( self.fastafile, region, &len )
876 ###########################################################################
877 ###########################################################################
878 ###########################################################################
879 ## turning callbacks elegantly into iterators is an unsolved problem, see the following threads:
880 ## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1
881 ## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html
882 ## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code
883 ## changes, the changes have to be manually entered.
884 cdef class IteratorRow:
885 """iterates over mapped reads in a region.
887 The samtools iterators assume that the file
888 position between iterations do not change.
889 As a consequence, no two iterators can work
890 on the same file. To permit this, each iterator
891 creates its own file handle by re-opening the
894 Note that the index will be shared between
895 samfile and the iterator.
898 cdef bam_iter_t iter # iterator state object
904 def __cinit__(self, Samfile samfile, int tid, int beg, int end ):
906 assert samfile._isOpen()
907 assert samfile._hasIndex()
909 # makes sure that samfile stays alive as long as the
911 self.samfile = samfile
913 if samfile.isbam: mode = "rb"
917 store = StderrStore()
918 self.fp = samopen( samfile.filename, mode, NULL )
923 self.iter = bam_iter_query(self.samfile.index,
932 cdef bam1_t * getCurrent( self ):
935 cdef int cnext(self):
936 '''cversion of iterator. Used by IteratorColumn'''
937 self.retval = bam_iter_read( self.fp.x.bam,
942 """python version of next().
945 if self.retval < 0: raise StopIteration
946 return makeAlignedRead( self.b )
948 def __dealloc__(self):
952 cdef class IteratorRowAll:
953 """iterates over all mapped reads
959 def __cinit__(self, Samfile samfile):
961 assert samfile._isOpen()
963 if samfile.isbam: mode = "rb"
966 # reopen the file to avoid iterator conflict
967 store = StderrStore()
968 self.fp = samopen( samfile.filename, mode, NULL )
971 # allocate memory for alignment
972 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
977 cdef bam1_t * getCurrent( self ):
980 cdef int cnext(self):
981 '''cversion of iterator. Used by IteratorColumn'''
983 return samread(self.fp, self.b)
986 """python version of next().
988 pyrex uses this non-standard name instead of next()
991 ret = samread(self.fp, self.b)
993 return makeAlignedRead( self.b )
997 def __dealloc__(self):
1001 ctypedef struct __iterdata:
1005 cdef int __advance( void * data, bam1_t * b ):
1007 d = <__iterdata*>data
1008 return bam_iter_read( d.fp, d.iter, b )
1010 cdef class IteratorColumn:
1011 '''iterates over columns.
1013 This iterator wraps the pileup functionality of samtools.
1015 For reasons of efficiency, the iterator returns the current
1016 pileup buffer. As this buffer is updated at every iteration,
1017 the contents of this iterator will change accordingly. Hence the conversion to
1018 a list will not produce the expected result::
1020 f = Samfile("file.bam", "rb")
1021 result = list( f.pileup() )
1023 Here, result will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
1024 but each object will contain the same information.
1026 If the results of several columns are required at the same time, the results
1027 need to be stored explicitely::
1029 result = [ x.pileups() for x in f.pileup() ]
1031 Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.
1035 # result of the last plbuf_push
1036 cdef IteratorRow iter
1040 cdef bam_pileup1_t * plp
1041 cdef bam_plp_t pileup_iter
1042 cdef __iterdata iterdata
1043 def __cinit__(self, Samfile samfile, int tid, int start, int end ):
1045 self.iter = IteratorRow( samfile, tid, start, end )
1046 self.iterdata.fp = samfile.samfile.x.bam
1047 self.iterdata.iter = self.iter.iter
1049 self.pileup_iter = bam_plp_init( &__advance, &self.iterdata )
1058 cdef int cnext(self):
1059 '''perform next iteration.
1061 self.plp = bam_plp_auto( self.pileup_iter,
1067 """python version of next().
1069 pyrex uses this non-standard name instead of next()
1073 raise ValueError("error during iteration" )
1075 if self.plp == NULL:
1078 return makePileupProxy( self.plp, self.tid, self.pos, self.n_plp )
1080 def __dealloc__(self):
1081 bam_plp_destroy(self.pileup_iter)
1083 cdef inline int32_t query_start(bam1_t *src) except -1:
1084 cdef uint32_t * cigar_p, op
1086 cdef uint32_t start_offset = 0
1088 if src.core.n_cigar:
1089 cigar_p = bam1_cigar(src);
1090 for k from 0 <= k < src.core.n_cigar:
1091 op = cigar_p[k] & BAM_CIGAR_MASK
1092 if op==BAM_CHARD_CLIP:
1093 if start_offset!=0 and start_offset!=src.core.l_qseq:
1094 PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
1096 elif op==BAM_CSOFT_CLIP:
1097 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
1104 cdef inline int32_t query_end(bam1_t *src) except -1:
1105 cdef uint32_t * cigar_p, op
1107 cdef uint32_t end_offset = src.core.l_qseq
1109 if src.core.n_cigar>1:
1110 cigar_p = bam1_cigar(src);
1111 for k from src.core.n_cigar > k >= 1:
1112 op = cigar_p[k] & BAM_CIGAR_MASK
1113 if op==BAM_CHARD_CLIP:
1114 if end_offset!=0 and end_offset!=src.core.l_qseq:
1115 PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
1117 elif op==BAM_CSOFT_CLIP:
1118 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
1123 end_offset = src.core.l_qseq
1128 cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
1132 cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
1134 if not src.core.l_qseq:
1137 seq = PyString_FromStringAndSize(NULL, end-start)
1138 s = PyString_AS_STRING(seq)
1141 for k from start <= k < end:
1142 # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
1143 # note: do not use string literal as it will be a python string
1144 s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
1149 cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
1158 qual = PyString_FromStringAndSize(NULL, end-start)
1159 q = PyString_AS_STRING(qual)
1161 for k from start <= k < end:
1162 ## equivalent to t[i] + 33 (see bam.c)
1163 q[k-start] = p[k] + 33
1167 cdef class AlignedRead:
1169 Class representing an aligned read. see SAM format specification for meaning of fields (http://samtools.sourceforge.net/).
1171 This class stores a handle to the samtools C-structure representing
1172 an aligned read. Member read access is forwarded to the C-structure
1173 and converted into python objects. This implementation should be fast,
1174 as only the data needed is converted.
1176 For write access, the C-structure is updated in-place. This is
1177 not the most efficient way to build BAM entries, as the variable
1178 length data is concatenated and thus needs to resized if
1179 a field is updated. Furthermore, the BAM entry might be
1180 in an inconsistent state. The :meth:`~validate` method can
1181 be used to check if an entry is consistent.
1183 One issue to look out for is that the sequence should always
1184 be set *before* the quality scores. Setting the sequence will
1185 also erase any quality scores that were set previously.
1190 def __cinit__( self ):
1192 self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
1193 # allocate some memory
1194 # If size is 0, calloc does not return a pointer that can be passed to free()
1195 # so allocate 40 bytes for a new read
1196 self._delegate.m_data = 40
1197 self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
1198 self._delegate.data_len = 0
1200 def __dealloc__(self):
1201 bam_destroy1(self._delegate)
1205 return "\t".join(map(str, (self.qname,
1216 def compare(self, AlignedRead other):
1217 '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
1225 # uncomment for debugging purposes
1226 # cdef unsigned char * oo, * tt
1227 # tt = <unsigned char*>(&t.core)
1228 # oo = <unsigned char*>(&o.core)
1229 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
1230 # tt = <unsigned char*>(t.data)
1231 # oo = <unsigned char*>(o.data)
1232 # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
1234 # Fast-path test for object identity
1238 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
1240 if retval: return retval
1241 retval = cmp(t.data_len, o.data_len)
1242 if retval: return retval
1243 return memcmp(t.data, o.data, t.data_len)
1245 # Disabled so long as __cmp__ is a special method
1247 return _Py_HashPointer(<void *>self)
1250 """the query name (None if not present)"""
1253 src = self._delegate
1254 if src.core.l_qname == 0: return None
1255 return <char *>bam1_qname( src )
1257 def __set__(self, qname ):
1258 if qname == None or len(qname) == 0: return
1263 src = self._delegate
1264 p = bam1_qname( src )
1266 # the qname is \0 terminated
1268 pysam_bam_update( src,
1273 src.core.l_qname = l
1275 # re-acquire pointer to location in memory
1276 # as it might have moved
1279 strncpy( p, qname, l )
1282 """the :term:`cigar` alignment (None if not present).
1285 cdef uint32_t * cigar_p
1290 src = self._delegate
1291 if src.core.n_cigar == 0: return None
1294 cigar_p = bam1_cigar(src);
1295 for k from 0 <= k < src.core.n_cigar:
1296 op = cigar_p[k] & BAM_CIGAR_MASK
1297 l = cigar_p[k] >> BAM_CIGAR_SHIFT
1298 cigar.append((op, l))
1301 def __set__(self, values ):
1302 if values == None or len(values) == 0: return
1310 src = self._delegate
1312 # get location of cigar string
1315 # create space for cigar data within src.data
1316 pysam_bam_update( src,
1317 src.core.n_cigar * 4,
1321 # length is number of cigar operations, not bytes
1322 src.core.n_cigar = len(values)
1324 # re-acquire pointer to location in memory
1325 # as it might have moved
1328 # insert cigar operations
1329 for op, l in values:
1330 p[k] = l << BAM_CIGAR_SHIFT | op
1333 ## setting the cigar string also updates the "bin" attribute
1334 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
1337 """read sequence bases, including :term:`soft clipped` bases (None if not present)"""
1342 src = self._delegate
1344 if src.core.l_qseq == 0: return None
1346 return get_seq_range(src, 0, src.core.l_qseq)
1348 def __set__(self,seq):
1349 # samtools manages sequence and quality length memory together
1350 # if no quality information is present, the first byte says 0xff.
1352 if seq == None or len(seq) == 0: return
1356 cdef int l, k, nbytes_new, nbytes_old
1358 src = self._delegate
1362 # as the sequence is stored in half-bytes, the total length (sequence
1363 # plus quality scores) is (l+1)/2 + l
1364 nbytes_new = (l+1)/2 + l
1365 nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
1366 # acquire pointer to location in memory
1370 pysam_bam_update( src,
1374 # re-acquire pointer to location in memory
1375 # as it might have moved
1377 for k from 0 <= k < nbytes_new: p[k] = 0
1378 # convert to C string
1380 for k from 0 <= k < l:
1381 p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
1384 p = bam1_qual( src )
1389 """read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
1395 src = self._delegate
1397 if src.core.l_qseq == 0: return None
1399 return get_qual_range(src, 0, src.core.l_qseq)
1401 def __set__(self,qual):
1402 # note that space is already allocated via the sequences
1408 src = self._delegate
1409 p = bam1_qual( src )
1410 if qual == None or len(qual) == 0:
1411 # if absent - set to 0xff
1415 # convert to C string
1418 if src.core.l_qseq != l:
1419 raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
1420 assert src.core.l_qseq == l
1421 for k from 0 <= k < l:
1422 p[k] = <uint8_t>q[k] - 33
1425 """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present)
1427 SAM/BAM files may included extra flanking bases sequences that were
1428 not part of the alignment. These bases may be the result of the
1429 Smith-Waterman or other algorithms, which may not require alignments
1430 that begin at the first residue or end at the last. In addition,
1431 extra sequencing adapters, multiplex identifiers, and low-quality bases that
1432 were not considered for alignment may have been retained."""
1436 cdef uint32_t start, end
1439 src = self._delegate
1441 if src.core.l_qseq == 0: return None
1443 start = query_start(src)
1444 end = query_end(src)
1446 return get_seq_range(src, start, end)
1449 """aligned query sequence quality values (None if not present)"""
1452 cdef uint32_t start, end
1455 src = self._delegate
1457 if src.core.l_qseq == 0: return None
1459 start = query_start(src)
1460 end = query_end(src)
1462 return get_qual_range(src, start, end)
1465 """start index of the aligned query portion of the sequence (0-based, inclusive)"""
1467 return query_start(self._delegate)
1470 """end index of the aligned query portion of the sequence (0-based, exclusive)"""
1472 return query_end(self._delegate)
1475 """Length of the aligned query sequence"""
1478 src = self._delegate
1479 return query_end(src)-query_start(src)
1482 """the tags in the AUX field.
1483 This property permits convenience access to
1484 the tags. Changes it the returned list will
1485 not update the tags automatically. Instead,
1486 the following is required for adding a
1489 read.tags = read.tags + [("RG",0)]
1498 src = self._delegate
1499 if src.l_aux == 0: return None
1503 ctag = <char*>calloc( 3, sizeof(char) )
1505 while s < (src.data + src.data_len):
1513 # convert type - is there a better way?
1517 # get type and value
1518 # how do I do char literal comparison in cython?
1519 # the code below works (i.e, is C comparison)
1522 value = <int>bam_aux2i(s)
1525 value = <int>bam_aux2i(s)
1528 value = <float>bam_aux2f(s)
1531 value = <double>bam_aux2d(s)
1534 value = <int>bam_aux2i(s)
1537 # there might a more efficient way
1538 # to convert a char into a string
1539 value = "%c" % <char>bam_aux2A(s)
1542 value = <char*>bam_aux2Z(s)
1543 # +1 for NULL terminated string
1550 result.append( (pytag, value) )
1555 def __set__(self, tags):
1559 cdef uint8_t * new_data
1560 cdef int guessed_size, control_size
1561 src = self._delegate
1562 cdef int max_size, size
1565 # map samtools code to python.struct code and byte size
1566 buffer = ctypes.create_string_buffer(max_size)
1569 for pytag, value in tags:
1571 if t == types.FloatType:
1573 elif t == types.IntType:
1575 if value >= -127: fmt, pytype = "<cccb", 'c'
1576 elif value >= -32767: fmt, pytype = "<ccch", 's'
1577 elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
1578 else: fmt, ctype = "<ccci", 'i'[0]
1580 if value <= 255: fmt, pytype = "<cccB", 'C'
1581 elif value <= 65535: fmt, pytype = "<cccH", 'S'
1582 elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
1583 else: fmt, pytype = "<cccI", 'I'
1585 # Note: hex strings (H) are not supported yet
1587 fmt, pytype = "<cccc", 'A'
1589 fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
1591 size = struct.calcsize(fmt)
1592 if offset + size > max_size:
1593 raise NotImplementedError("tags field too large")
1595 struct.pack_into( fmt,
1604 # delete the old data and allocate new
1605 pysam_bam_update( src,
1612 if offset == 0: return
1614 # get location of new data
1617 # check if there is direct path from buffer.raw to tmp
1620 memcpy( s, temp, offset )
1623 """properties flag"""
1624 def __get__(self): return self._delegate.core.flag
1625 def __set__(self, flag): self._delegate.core.flag = flag
1632 This field contains the index of the reference sequence
1633 in the sequence dictionary. To obtain the name
1634 of the reference sequence, use :meth:`pysam.Samfile.getrname()`
1637 def __get__(self): return self._delegate.core.tid
1638 def __set__(self, tid): self._delegate.core.tid = tid
1640 """0-based leftmost coordinate"""
1641 def __get__(self): return self._delegate.core.pos
1642 def __set__(self, pos):
1643 ## setting the cigar string also updates the "bin" attribute
1645 src = self._delegate
1646 if src.core.n_cigar:
1647 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
1649 src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
1650 self._delegate.core.pos = pos
1652 """properties bin"""
1653 def __get__(self): return self._delegate.core.bin
1654 def __set__(self, bin): self._delegate.core.bin = bin
1656 '''length of the read (read only). Returns 0 if not given.'''
1657 def __get__(self): return self._delegate.core.l_qseq
1659 '''aligned end position of the read (read only). Returns
1660 None if not available.'''
1663 src = self._delegate
1664 if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
1666 return bam_calend(&src.core, bam1_cigar(src))
1668 '''aligned length of the read (read only). Returns None if
1672 src = self._delegate
1673 if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
1675 return bam_calend(&src.core,
1676 bam1_cigar(src)) - \
1677 self._delegate.core.pos
1680 """mapping quality"""
1681 def __get__(self): return self._delegate.core.qual
1682 def __set__(self, qual): self._delegate.core.qual = qual
1684 """the :term:`reference` id of the mate """
1685 def __get__(self): return self._delegate.core.mtid
1686 def __set__(self, mtid): self._delegate.core.mtid = mtid
1688 """the position of the mate"""
1689 def __get__(self): return self._delegate.core.mpos
1690 def __set__(self, mpos): self._delegate.core.mpos = mpos
1692 """the insert size"""
1693 def __get__(self): return self._delegate.core.isize
1694 def __set__(self, isize): self._delegate.core.isize = isize
1696 """true if read is paired in sequencing"""
1697 def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
1698 def __set__(self,val):
1699 if val: self._delegate.core.flag |= BAM_FPAIRED
1700 else: self._delegate.core.flag &= ~BAM_FPAIRED
1701 property is_proper_pair:
1702 """true if read is mapped in a proper pair"""
1703 def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
1704 def __set__(self,val):
1705 if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
1706 else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
1707 property is_unmapped:
1708 """true if read itself is unmapped"""
1709 def __get__(self): return (self.flag & BAM_FUNMAP) != 0
1710 def __set__(self,val):
1711 if val: self._delegate.core.flag |= BAM_FUNMAP
1712 else: self._delegate.core.flag &= ~BAM_FUNMAP
1713 property mate_is_unmapped:
1714 """true if the mate is unmapped"""
1715 def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
1716 def __set__(self,val):
1717 if val: self._delegate.core.flag |= BAM_FMUNMAP
1718 else: self._delegate.core.flag &= ~BAM_FMUNMAP
1719 property is_reverse:
1720 """true if read is mapped to reverse strand"""
1721 def __get__(self):return (self.flag & BAM_FREVERSE) != 0
1722 def __set__(self,val):
1723 if val: self._delegate.core.flag |= BAM_FREVERSE
1724 else: self._delegate.core.flag &= ~BAM_FREVERSE
1725 property mate_is_reverse:
1726 """true is read is mapped to reverse strand"""
1727 def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
1728 def __set__(self,val):
1729 if val: self._delegate.core.flag |= BAM_FMREVERSE
1730 else: self._delegate.core.flag &= ~BAM_FMREVERSE
1732 """true if this is read1"""
1733 def __get__(self): return (self.flag & BAM_FREAD1) != 0
1734 def __set__(self,val):
1735 if val: self._delegate.core.flag |= BAM_FREAD1
1736 else: self._delegate.core.flag &= ~BAM_FREAD1
1738 """true if this is read2"""
1739 def __get__(self): return (self.flag & BAM_FREAD2) != 0
1740 def __set__(self,val):
1741 if val: self._delegate.core.flag |= BAM_FREAD2
1742 else: self._delegate.core.flag &= ~BAM_FREAD2
1743 property is_secondary:
1744 """true if not primary alignment"""
1745 def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
1746 def __set__(self,val):
1747 if val: self._delegate.core.flag |= BAM_FSECONDARY
1748 else: self._delegate.core.flag &= ~BAM_FSECONDARY
1750 """true if QC failure"""
1751 def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
1752 def __set__(self,val):
1753 if val: self._delegate.core.flag |= BAM_FQCFAIL
1754 else: self._delegate.core.flag &= ~BAM_FQCFAIL
1755 property is_duplicate:
1756 """ true if optical or PCR duplicate"""
1757 def __get__(self): return (self.flag & BAM_FDUP) != 0
1758 def __set__(self,val):
1759 if val: self._delegate.core.flag |= BAM_FDUP
1760 else: self._delegate.core.flag &= ~BAM_FDUP
1763 """retrieves optional data given a two-letter *tag*"""
1764 #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
1766 v = bam_aux_get(self._delegate, tag)
1767 if v == NULL: raise KeyError( "tag '%s' not present" % tag )
1769 if type == 'c' or type == 'C' or type == 's' or type == 'S' or type == 'i':
1770 return <int>bam_aux2i(v)
1772 return <float>bam_aux2f(v)
1774 return <double>bam_aux2d(v)
1776 # there might a more efficient way
1777 # to convert a char into a string
1778 return '%c' % <char>bam_aux2A(v)
1780 return <char*>bam_aux2Z(v)
1782 def fancy_str (self):
1783 """returns list of fieldnames/values in pretty format for debugging
1787 "tid": "Contig index",
1788 "pos": "Mapped position on contig",
1789 "mtid": "Contig index for mate pair",
1790 "mpos": "Position of mate pair",
1791 "isize": "Insert size",
1792 "flag": "Binary flag",
1793 "n_cigar": "Count of cigar entries",
1794 "cigar": "Cigar entries",
1795 "qual": "Mapping quality",
1796 "bin": "Bam index bin number",
1797 "l_qname": "Length of query name",
1798 "qname": "Query name",
1799 "l_qseq": "Length of query sequence",
1800 "qseq": "Query sequence",
1801 "bqual": "Quality scores",
1802 "l_aux": "Length of auxilary data",
1803 "m_data": "Maximum data length",
1804 "data_len": "Current data length",
1806 fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
1807 "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
1808 "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
1810 for f in fields_names_in_order:
1811 if not f in self.__dict__:
1813 ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
1815 for f in self.__dict__:
1816 if not f in field_names:
1817 ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
1820 cdef class PileupProxy:
1821 '''A pileup column. A pileup column contains
1822 all the reads that map to a certain target base.
1825 chromosome ID as is defined in the header
1827 the target base coordinate (0-based)
1829 number of reads mapping to this column
1831 list of reads (:class:`pysam.PileupRead`) aligned to this column
1833 This class is a proxy for results returned by the samtools pileup engine.
1834 If the underlying engine iterator advances, the results of this column
1837 cdef bam_pileup1_t * plp
1842 def __cinit__(self ):
1846 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
1848 "\n".join( map(str, self.pileups) )
1851 '''the chromosome ID as is defined in the header'''
1852 def __get__(self): return self.tid
1855 '''number of reads mapping to this column.'''
1856 def __get__(self): return self.n_pu
1857 def __set__(self, n): self.n_pu = n
1860 def __get__(self): return self.pos
1863 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
1867 # warning: there could be problems if self.n and self.buf are
1869 for x from 0 <= x < self.n_pu:
1870 pileups.append( makePileupRead( &(self.plp[x])) )
1873 cdef class PileupRead:
1874 '''A read aligned to a column.
1878 AlignedRead _alignment
1886 def __cinit__( self ):
1890 return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
1893 """a :class:`pysam.AlignedRead` object of the aligned read"""
1895 return self._alignment
1897 """position of the read base at the pileup site, 0-based"""
1901 """indel length; 0 for no indel, positive for ins and negative for del"""
1905 """1 iff the base on the padded read is a deletion"""
1910 return self._is_head
1913 return self._is_tail
1919 '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
1920 def __init__(self, id = 1):
1924 def setdevice(self, filename):
1925 '''open an existing file, like "/dev/null"'''
1926 fd = os.open(filename, os.O_WRONLY)
1929 def setfile(self, filename):
1930 '''open a new file.'''
1931 fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
1934 def setfd(self, fd):
1935 ofd = os.dup(self.id) # Save old stream on new unit.
1936 self.streams.append(ofd)
1937 sys.stdout.flush() # Buffered data goes to old stream.
1938 os.dup2(fd, self.id) # Open unit 1 on new stream.
1939 os.close(fd) # Close other unit (look out, caller.)
1942 '''restore previous output stream'''
1944 # the following was not sufficient, hence flush both stderr and stdout
1945 # os.fsync( self.id )
1948 os.dup2(self.streams[-1], self.id)
1949 os.close(self.streams[-1])
1950 del self.streams[-1]
1952 def _samtools_dispatch( method, args = () ):
1953 '''call ``method`` in samtools providing arguments in args.
1956 This method redirects stdout and stderr to capture it
1957 from samtools. If for some reason stdout/stderr disappears
1958 the reason might be in this method.
1961 The current implementation might only work on linux.
1964 This method captures stdout and stderr using temporary files,
1965 which are then read into memory in their entirety. This method
1966 is slow and might cause large memory overhead.
1968 See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
1969 on the topic of redirecting stderr/stdout.
1972 # note that debugging this module can be a problem
1973 # as stdout/stderr will not appear
1975 # redirect stderr and stdout to file
1977 # open files and redirect into it
1978 stderr_h, stderr_f = tempfile.mkstemp()
1979 stdout_h, stdout_f = tempfile.mkstemp()
1981 # patch for `samtools view`
1982 # samtools `view` closes stdout, from which I can not
1983 # recover. Thus redirect output to file with -o option.
1984 if method == "view":
1985 if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
1986 args = ( "-o", stdout_f ) + args
1988 stdout_save = Outs( sys.stdout.fileno() )
1989 stdout_save.setfd( stdout_h )
1990 stderr_save = Outs( sys.stderr.fileno() )
1991 stderr_save.setfd( stderr_h )
1993 # do the function call to samtools
1995 cdef int i, n, retval
1998 # allocate two more for first (dummy) argument (contains command)
1999 cargs = <char**>calloc( n+2, sizeof( char *) )
2000 cargs[0] = "samtools"
2002 for i from 0 <= i < n: cargs[i+2] = args[i]
2003 retval = pysam_dispatch(n+2, cargs)
2006 # restore stdout/stderr. This will also flush, so
2007 # needs to be before reading back the file contents
2008 stdout_save.restore()
2009 stderr_save.restore()
2011 # capture stderr/stdout.
2012 out_stderr = open( stderr_f, "r").readlines()
2013 out_stdout = open( stdout_f, "r").readlines()
2016 os.remove( stderr_f )
2017 os.remove( stdout_f )
2019 return retval, out_stderr, out_stdout
2021 __all__ = ["Samfile",