1 # cython: embedsignature=True
3 # adds doc-strings for sphinx
5 import tempfile, os, sys, types, itertools, struct, ctypes
7 from python_string cimport PyString_FromStringAndSize, PyString_AS_STRING
8 from python_exc cimport PyErr_SetString
# defines imported from samtools
## These are bits set in the flag.
## have to put these definitions here, in csamtools.pxd they got ignored
# NOTE(review): several companion DEF lines appear to be missing from this
# chunk (e.g. BAM_FPAIRED, BAM_FMUNMAP, BAM_FREVERSE, BAM_FMREVERSE,
# BAM_FREAD1, BAM_FREAD2, BAM_FQCFAIL, BAM_FDUP and BAM_CIGAR_SHIFT).
# BAM_CIGAR_MASK below references BAM_CIGAR_SHIFT, which is not defined in
# the visible text -- confirm against the full file.
## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
## @abstract the read is mapped in a proper pair */
# flag bit 0x2
DEF BAM_FPROPER_PAIR =2
## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
## @abstract the mate is unmapped */
## @abstract the read is mapped to the reverse strand */
## @abstract the mate is mapped to the reverse strand */
## @abstract this is read1 */
## @abstract this is read2 */
## @abstract not primary alignment */
# flag bit 0x100
DEF BAM_FSECONDARY =256
## @abstract QC failure */
## @abstract optical or PCR duplicate */
# low bits of a packed CIGAR element hold the operation code
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
# CIGAR operation codes used by query_start/query_end below
DEF BAM_CSOFT_CLIP = 4
DEF BAM_CHARD_CLIP = 5
51 #####################################################################
52 #####################################################################
53 #####################################################################
54 ## private factory methods
55 #####################################################################
cdef class AlignedRead

cdef makeAlignedRead( bam1_t * src):
    '''enter src into AlignedRead.'''
    # NOTE(review): the construction of `dest` (presumably
    # `cdef AlignedRead dest = AlignedRead()`) and the trailing
    # `return dest` are not visible in this chunk -- confirm.
    # destroy dummy delegate created in constructor
    # to prevent memory leak.
    bam_destroy1(dest._delegate)
    # deep-copy src so the returned object owns its own bam1_t
    dest._delegate = bam_dup1(src)
cdef class PileupProxy

cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
    # NOTE(review): the factory body is not visible in this chunk;
    # presumably it instantiates a PileupProxy, stores plp/tid/pos/n on it
    # and returns it -- confirm against the full file.
cdef makePileupRead( bam_pileup1_t * src ):
    '''fill a PileupRead object from a bam_pileup1_t * object.'''
    # NOTE(review): the construction of `dest` (a PileupRead) and the final
    # `return dest` are not visible in this chunk -- confirm.
    # wrap the underlying alignment in a python-level AlignedRead
    dest._alignment = makeAlignedRead( src.b )
    # copy the per-read pileup annotations
    dest._indel = src.indel
    dest._level = src.level
    dest._is_del = src.is_del
    dest._is_head = src.is_head
    dest._is_tail = src.is_tail
91 #####################################################################
92 #####################################################################
93 #####################################################################
94 ## Generic callbacks for inserting python callbacks.
95 #####################################################################
cdef int fetch_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch.

    calls function in *f* with a new :class:`AlignedRead` object as parameter.
    '''
    a = makeAlignedRead( alignment )
    # NOTE(review): the invocation of the python callable in *f* with `a`
    # (and the function's return value) is not visible in this chunk -- confirm.
class PileupColumn(object):
    '''A pileup column. A pileup column contains
    all the reads that map to a certain target base.

    tid
        chromosome ID as is defined in the header
    pos
        the target base coordinate (0-based)
    n
        number of reads mapping to this column
    pileups
        list of reads (:class:`pysam.PileupRead`) aligned to this column
    '''
    # NOTE(review): the `def __str__(self):` line is not visible in this
    # chunk; the statement below reads self.tid, self.pos, self.n and
    # self.pileups, which is how the attribute names above were confirmed.
    return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
        "\n" + "\n".join( map(str, self.pileups) )
cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
    '''callback for pileup.

    calls function in *f* with a new :class:`Pileup` object as parameter.

    tid
        chromosome ID as is defined in the header
    pos
        start coordinate of the alignment, 0-based
    n
        number of elements in pl array
    '''
    # NOTE(review): the initialisation `pileups = []`, the construction of a
    # PileupColumn, and the invocation of the callable in *f* are not visible
    # in this chunk -- confirm against the full file.
    # wrap each C pileup entry in a python-level PileupRead
    for x from 0 <= x < n:
        pileups.append( makePileupRead( &(pl[x]) ) )
cdef int pileup_fetch_callback( bam1_t *b, void *data):
    '''callback for bam_fetch.

    Fetches reads and submits them to pileup.
    '''
    cdef bam_plbuf_t * buf
    # *data* is the pileup buffer handed to bam_fetch
    buf = <bam_plbuf_t*>data
    # feed the read into samtools' pileup engine
    bam_plbuf_push(b, buf)
    # NOTE(review): a trailing `return 0` is not visible in this chunk -- confirm.
166 self.stderr_h, self.stderr_f = tempfile.mkstemp()
167 self.stderr_save = Outs( sys.stderr.fileno() )
168 self.stderr_save.setfd( self.stderr_h )
def readAndRelease( self ):
    '''restore stderr and return the lines captured since redirection.

    NOTE(review): the enclosing class definition (stderr-capturing helper)
    and the statements returning `lines` (plus the fallback when the temp
    file is absent) are not visible in this chunk -- confirm.
    '''
    # undo the fd redirection first so the temp file is complete
    self.stderr_save.restore()
    if os.path.exists(self.stderr_f):
        lines = open( self.stderr_f, "r" ).readlines()
        os.remove( self.stderr_f )
179 self.stderr_save.restore()
180 if os.path.exists(self.stderr_f):
181 os.remove( self.stderr_f )
186 ######################################################################
187 ######################################################################
188 ######################################################################
# valid types for sam headers: the "HD" record occurs at most once and is
# stored as a dict of fields; all other records may repeat and are stored as
# lists.  The record set matches VALID_HEADERS and the dict/list dispatch in
# the header-parsing code (`VALID_HEADER_TYPES[record] == dict` / `== list`).
# Fix: the original dict literal was truncated after the "HD" entry.
VALID_HEADER_TYPES = { "HD" : dict,
                       "SQ" : list,
                       "RG" : list,
                       "PG" : list,
                       "CO" : list }
# canonical output order of record types within a sam header
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# per-record tag -> conversion callable applied when parsing header fields
VALID_HEADER_FIELDS = {
    "HD": {"VN": str, "SO": str, "GO": str},
    "SQ": {"SN": str, "LN": int, "AS": str,
           "M5": str, "UR": str, "SP": str},
    "RG": {"ID": str, "SM": str, "LB": str,
           "DS": str, "PU": str, "PI": str,
           "CN": str, "DT": str, "PL": str},
    "PG": {"ID": str, "VN": str, "CL": str},
}

# per-record output order of tags when emitting header lines
VALID_HEADER_ORDER = {
    "HD": ("VN", "SO", "GO"),
    "SQ": ("SN", "LN", "AS", "M5", "UR", "SP"),
    "RG": ("ID", "SM", "LB", "DS", "PU", "PI", "CN", "DT", "PL"),
    "PG": ("ID", "VN", "CL"),
}
212 ######################################################################
213 ######################################################################
214 ######################################################################
216 ######################################################################
218 '''*(filename, mode='r', template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
220 A *SAM* file. The file is automatically opened.
222 *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode so for binary
223 (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
224 Use ``h`` to output header information in text (:term:`TAM`) mode.
226 If ``b`` is present, it must immediately follow ``r`` or ``w``.
227 Currently valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``.
229 so to open a :term:`BAM` file for reading::
231 f=Samfile('ex1.bam','rb')
234 For writing, the header of a :term:`TAM` file/:term:`BAM` file can be constituted from several
237 1. If *template* is given, the header is copied from a another *Samfile* (*template* must be of type *Samfile*).
239 2. If *header* is given, the header is build from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). The second level is then a list of lines, with each line being a list of tag-value pairs.
241 3. If *text* is given, new header text is copied from raw text.
243 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
245 If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
246 access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
# NOTE(review): the enclosing `cdef class Samfile:` line is not visible in
# this chunk; the declarations below are its C-level attributes.
# handle to the open sam/bam stream
cdef samfile_t * samfile
# BAM index handle; NULL when no index is loaded
cdef bam_index_t *index
# true if file is a bam file
# true if file is not on the local filesystem
# current read within iteration
# NOTE(review): the cdef lines for the isbam/isremote flags and the
# iteration buffer (presumably `cdef bam1_t * b`) are not visible -- confirm.
def __cinit__(self, *args, **kwargs ):
    # NOTE(review): initialisation of self.samfile / self.index to NULL
    # before opening is not visible in this chunk -- confirm.
    self._open( *args, **kwargs )

    # allocate memory for iterator
    self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
272 '''return true if samfile has been opened.'''
273 return self.samfile != NULL
def _hasIndex( self ):
    '''return true if samfile has an existing (and opened) index.'''
    # self.index is set by _open via bam_index_load; NULL means no index
    return self.index != NULL
282 Samfile template = None,
283 referencenames = None,
284 referencelengths = None,
289 '''open a sam/bam file.
291 If _open is called on an existing bamfile, the current file will be
292 closed and a new file will be opened.
295 assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
297 # close a previously opened file
298 if self.samfile != NULL: self.close()
301 cdef bam_header_t * header_to_write
302 header_to_write = NULL
304 self.filename = filename
306 self.isbam = len(mode) > 1 and mode[1] == 'b'
308 self.isremote = strncmp(filename,"http:",5) == 0 or \
309 strncmp(filename,"ftp:",4) == 0
315 # open file for writing
317 # header structure (used for writing)
319 # copy header from another file
320 header_to_write = template.samfile.header
323 header_to_write = self._buildHeader( header )
326 # build header from a target names and lengths
327 assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing"
328 assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
330 # allocate and fill header
331 header_to_write = bam_header_init()
332 header_to_write.n_targets = len(referencenames)
334 for x in referencenames: n += len(x) + 1
335 header_to_write.target_name = <char**>calloc(n, sizeof(char*))
336 header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
337 for x from 0 <= x < header_to_write.n_targets:
338 header_to_write.target_len[x] = referencelengths[x]
339 name = referencenames[x]
340 header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
341 strncpy( header_to_write.target_name[x], name, len(name) )
346 header_to_write.l_text = strlen(ctext)
347 header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
348 memcpy( header_to_write.text, ctext, strlen(ctext) )
350 header_to_write.hash = NULL
351 header_to_write.rg2lib = NULL
353 # open file. Header gets written to file at the same time for bam files
354 # and sam files (in the latter case, the mode needs to be wh)
355 store = StderrStore()
356 self.samfile = samopen( filename, mode, header_to_write )
359 # bam_header_destroy takes care of cleaning up of all the members
360 if not template and header_to_write != NULL:
361 bam_header_destroy( header_to_write )
364 # open file for reading
365 if strncmp( filename, "-", 1) != 0 and \
366 not self.isremote and \
367 not os.path.exists( filename ):
368 raise IOError( "file `%s` not found" % filename)
370 store = StderrStore()
371 self.samfile = samopen( filename, mode, NULL )
372 result = store.readAndRelease()
373 # test for specific messages as open also outputs status messages
374 # that can be ignored.
375 if "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n" in result:
376 raise ValueError( "invalid BAM binary header (is this a BAM file?)" )
377 elif '[samopen] no @SQ lines in the header.\n' in result:
378 raise ValueError( "no @SQ lines in the header (is this a SAM file?)")
380 if self.samfile == NULL:
381 raise IOError("could not open file `%s`" % filename )
383 # check for index and open if present
384 if mode[0] == "r" and self.isbam:
386 if not self.isremote:
387 if not os.path.exists(filename +".bai"):
390 # returns NULL if there is no index or index could not be opened
391 self.index = bam_index_load(filename)
392 if self.index == NULL:
393 raise IOError("error while opening index `%s` " % filename )
395 self.index = bam_index_load(filename)
396 if self.index == NULL:
397 raise IOError("error while opening index `%s` " % filename )
def getrname( self, tid ):
    '''convert numerical :term:`tid` into :ref:`reference` name.

    Raises ValueError if the file is closed or *tid* is out of range.
    '''
    if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
    # tid indexes into the header's target table
    if not 0 <= tid < self.samfile.header.n_targets:
        raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets )
    return self.samfile.header.target_name[tid]
def _parseRegion( self,
                  # NOTE(review): the remaining parameters (presumably
                  # reference = None, start = None, end = None,
                  # region = None) and the closing parenthesis are not
                  # visible in this chunk -- confirm.
    '''parse region information.

    raise ValueError for invalid regions.

    returns a tuple of region, tid, start and end. Region
    is a valid samtools :term:`region` or None if the region
    extends over the whole file.

    Note that regions are 1-based, while start,end are python coordinates.
    '''
    # This method's main objective is to translate from a reference to a tid.
    # For now, it calls bam_parse_region, which is clumsy. Might be worth
    # implementing it all in pysam (makes use of khash).
    # NOTE(review): declarations of rtid/rstart/rend, the definition of
    # max_pos used below, and the initial handling of a caller-supplied
    # *region* string are not visible in this chunk -- confirm.
    rtid = rstart = rend = 0

    # translate to a region
    if start != None and end != None:
        if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
        # build a 1-based samtools region from 0-based python coordinates
        region = "%s:%i-%i" % (reference, start+1, end)

    # this function might be called often (multiprocessing)
    # thus avoid using StderrStore, see issue 46.
    bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)
    if rtid < 0: raise ValueError( "invalid region `%s`" % region )
    if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
    if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
    if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )

    return region, rtid, rstart, rend
def seek( self, uint64_t offset, int where = 0):
    '''move to current file to position *offset*'''

    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )
    # NOTE(review): the guard preceding this raise (presumably
    # `if not self.isbam:`) is not visible in this chunk; as shown the
    # NotImplementedError would be unconditional -- confirm.
    raise NotImplementedError("seek only available in bam files")
    return bam_seek( self.samfile.x.bam, offset, where )
463 '''return current file position'''
464 if not self._isOpen():
465 raise ValueError( "I/O operation on closed file" )
467 raise NotImplementedError("seek only available in bam files")
469 return bam_tell( self.samfile.x.bam )
478 '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
480 fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
481 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
483 Without *reference* or *region* all reads will be fetched. The reads will be returned
484 ordered by reference sequence, which will not necessarily be the order within the file.
485 If *until_eof* is given, all reads from the current file position will be returned
486 *as they are sorted within the file*.
488 If only *reference* is set, all reads matching on *reference* will be fetched.
490 The method returns an iterator of type :class:`pysam.IteratorRow` unless
491 a *callback is provided. If *callback* is given, the callback will be executed
492 for each position within the :term:`region`. Note that callbacks currently work
493 only, if *region* or *reference* is given.
495 Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
496 an exception is raised.
502 if not self._isOpen():
503 raise ValueError( "I/O operation on closed file" )
505 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
508 if not until_eof and not self._hasIndex() and not self.isremote:
509 raise ValueError( "fetch called on bamfile without index" )
513 raise ValueError( "callback functionality requires a region/reference" )
514 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
515 return bam_fetch(self.samfile.x.bam,
524 return IteratorRow( self, rtid, rstart, rend )
527 return IteratorRowAll( self )
529 # return all targets by chaining the individual targets together.
530 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
534 for rtid from 0 <= rtid < self.nreferences:
535 i.append( IteratorRow( self, rtid, rstart, rend))
536 return itertools.chain( *i )
538 # check if header is present - otherwise sam_read1 aborts
539 # this happens if a bamfile is opened with mode 'r'
540 if self.samfile.header.n_targets == 0:
541 raise ValueError( "fetch called for samfile without header")
544 raise ValueError ("fetch for a region is not available for sam files" )
546 raise NotImplementedError( "callback not implemented yet" )
548 return IteratorRowAll( self )
def pileup( self, reference = None, start = None, end = None, region = None, callback = None ):
    '''run a pileup within a :term:`region` using 0-based indexing. The region is specified by
    :term:`reference`, *start* and *end*. Alternatively, a samtools *region* string can be supplied.

    Without *reference* or *region* all reads will be fetched. The reads will be returned
    ordered by :term:`reference` sequence, which will not necessarily be the order within the file.

    The method returns an iterator of type :class:`pysam.IteratorColumn` unless
    a *callback is provided. If *callback* is given, the callback will be executed
    for each position within the :term:`region`.

    Note that samfiles do not allow random access. If *region* or *reference* are given,
    an exception is raised.

    *all* reads which overlap the region are returned. The first base returned will be the
    first base of the first read *not* necessarily the first base of the region used in the query.
    '''
    # NOTE(review): declarations of rtid/rstart/rend and the branch structure
    # (`if self.isbam:` / `if region:` / `if callback:` / `else:` lines) are
    # not visible in this chunk; the nesting below must be confirmed against
    # the full file.
    cdef bam_plbuf_t *buf

    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )

    region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

    if not self._hasIndex(): raise ValueError( "no index available for pileup" )

    raise ValueError( "callback functionality requires a region/reference" )

    # callback path: drive samtools' pileup engine via bam_fetch
    buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
    bam_fetch(self.samfile.x.bam,
              self.index, rtid, rstart, rend,
              buf, pileup_fetch_callback )
    # flush remaining columns, then free the buffer
    bam_plbuf_push( NULL, buf)
    bam_plbuf_destroy(buf)

    return IteratorColumn( self, rtid, rstart, rend )

    # return all targets by chaining the individual targets together.
    # NOTE(review): the initialisation `i = []` is not visible in this chunk.
    for rtid from 0 <= rtid < self.nreferences:
        i.append( IteratorColumn( self, rtid, rstart, rend))
    return itertools.chain( *i )

    raise NotImplementedError( "pileup of samfiles not implemented yet" )
611 if self.samfile != NULL:
612 samclose( self.samfile )
613 bam_index_destroy(self.index);
def __dealloc__( self ):
    # remember: dealloc cannot call other methods
    # note: no doc string
    # note: __del__ is not called.
    # NOTE(review): the cleanup body (presumably closing self.samfile and
    # freeing self.b) is not visible in this chunk -- confirm.
def write( self, AlignedRead read ):
    '''(AlignedRead read )
    write a single :class:`pysam.AlignedRead`..

    return the number of bytes written.
    '''
    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )

    # samwrite returns the number of bytes written
    return samwrite( self.samfile, read._delegate )
def __exit__(self, exc_type, exc_value, traceback):
    # NOTE(review): the body is not visible in this chunk; presumably it
    # closes the file (context-manager protocol) -- confirm.
property nreferences:
    '''number of :term:`reference` sequences in the file.'''
    # NOTE(review): the `def __get__(self):` line is not visible in this chunk.
    if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
    return self.samfile.header.n_targets
648 """tuple with the names of :term:`reference` sequences."""
650 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
652 for x from 0 <= x < self.samfile.header.n_targets:
653 t.append( self.samfile.header.target_name[x] )
657 """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.reference`
660 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
662 for x from 0 <= x < self.samfile.header.n_targets:
663 t.append( self.samfile.header.target_len[x] )
667 '''full contents of the :term:`sam file` header as a string.'''
669 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
670 return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
673 '''header information within the :term:`sam file`. The records and fields are returned as
674 a two-level dictionary.
677 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
681 if self.samfile.header.text != NULL:
682 # convert to python string (note: call self.text to create 0-terminated string)
684 for line in t.split("\n"):
685 if not line.strip(): continue
686 assert line.startswith("@"), "header line without '@': '%s'" % line
687 fields = line[1:].split("\t")
689 assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)
693 if record not in result: result[record] = []
694 result[record].append( "\t".join( fields[1:] ) )
697 # the following is clumsy as generators do not work?
699 for field in fields[1:]:
700 key, value = field.split(":",1)
701 if key not in VALID_HEADER_FIELDS[record]:
702 raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
703 x[key] = VALID_HEADER_FIELDS[record][key](value)
705 if VALID_HEADER_TYPES[record] == dict:
707 raise ValueError( "multiple '%s' lines are not permitted" % record )
709 elif VALID_HEADER_TYPES[record] == list:
710 if record not in result: result[record] = []
711 result[record].append( x )
def _buildLine( self, fields, record ):
    '''build a header line from *fields* dictionary for *record*'''

    # TODO: add checking for field and sort order
    line = ["@%s" % record ]
    # NOTE(review): the branch distinguishing a plain-string *fields* from a
    # dict (an `if`/`else` around the next statements) is not visible in this
    # chunk -- as written, both the append and the loop would execute.
    line.append( fields )
    # emit known tags in their canonical order
    for key in VALID_HEADER_ORDER[record]:
        # NOTE(review): an `if key in fields:` guard appears to be missing
        # here; as written a partial *fields* dict would raise KeyError.
        line.append( "%s:%s" % (key, str(fields[key])))
    return "\t".join( line )
cdef bam_header_t * _buildHeader( self, new_header ):
    '''return a new header built from a dictionary in *new_header*.

    This method inserts the text field, target_name and target_len.
    '''
    # NOTE(review): the initialisation `lines = []` is not visible in this
    # chunk -- confirm.

    # check if hash exists

    # create new header and copy old data
    cdef bam_header_t * dest

    dest = bam_header_init()

    # emit records in their canonical order
    for record in VALID_HEADERS:
        if record in new_header:
            ttype = VALID_HEADER_TYPES[record]
            data = new_header[record]
            if type( data ) != type( ttype() ):
                raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
            if type( data ) == types.DictType:
                lines.append( self._buildLine( data, record ) )
            # NOTE(review): the `else:` introducing the list branch is not
            # visible in this chunk; the loop below handles list records.
                for fields in new_header[record]:
                    lines.append( self._buildLine( fields, record ) )

    text = "\n".join(lines) + "\n"
    if dest.text != NULL: free( dest.text )
    dest.text = <char*>calloc( len(text), sizeof(char))
    dest.l_text = len(text)
    strncpy( dest.text, text, dest.l_text )

    # collect the reference sequences from the SQ records
    # NOTE(review): the initialisation `seqs = []` and a try/except around
    # the SN/LN lookups (matching the KeyError below) are not visible in
    # this chunk -- confirm nesting.
    if "SQ" in new_header:
        for fields in new_header["SQ"]:
            seqs.append( (fields["SN"], fields["LN"] ) )
            raise KeyError( "incomplete sequence information in '%s'" % str(fields))

    dest.n_targets = len(seqs)
    dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
    dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )

    for x from 0 <= x < dest.n_targets:
        seqname, seqlen = seqs[x]
        # +1 for the trailing '\0'
        dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
        strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
        dest.target_len[x] = seqlen
    # NOTE(review): the trailing `return dest` is not visible in this chunk
    # -- confirm.
783 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
786 cdef bam1_t * getCurrent( self ):
cdef int cnext(self):
    '''cversion of iterator. Used by IteratorColumn'''
    # samread returns the number of bytes read; <= 0 signals EOF or error
    return samread(self.samfile, self.b)
795 """python version of next().
797 pyrex uses this non-standard name instead of next()
800 ret = samread(self.samfile, self.b)
802 return makeAlignedRead( self.b )
cdef class Fastafile:
    '''A *FASTA* file. The file is automatically opened.

    The file expects an indexed fasta file.

    TODO:
        add automatic indexing.
        add function to get sequence names.
    '''

    # pointer to fastafile
    cdef faidx_t * fastafile

    def __cinit__(self, *args, **kwargs ):
        self.fastafile = NULL
        self._open( *args, **kwargs )

    # NOTE(review): the `def` line for the following method (presumably
    # `_isOpen`) is not visible in this chunk.
        '''return true if samfile has been opened.'''
        return self.fastafile != NULL

    # NOTE(review): the `def __len__(self):` line is not visible in this
    # chunk; the body below returns the number of indexed sequences.
        if self.fastafile == NULL:
            raise ValueError( "calling len() on closed file" )

        return faidx_fetch_nseq(self.fastafile)

    # NOTE(review): the `def _open(self, filename):` line is not visible in
    # this chunk.
        '''open an indexed fasta file.

        This method expects an indexed fasta file.
        '''
        # close a previously opened file
        if self.fastafile != NULL: self.close()
        self.filename = filename
        # fai_load opens both the fasta file and its .fai index
        self.fastafile = fai_load( filename )

        if self.fastafile == NULL:
            raise IOError("could not open file `%s`" % filename )

    # NOTE(review): the `def close(self):` line is not visible in this chunk.
        if self.fastafile != NULL:
            fai_destroy( self.fastafile )
            self.fastafile = NULL

    # NOTE(review): the `def fetch(...)` line is not visible in this chunk.
        '''*(reference = None, start = None, end = None, region = None)*

        fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.
        The region is specified by :term:`reference`, *start* and *end*.

        If *reference* is given and *start* is None, the sequence from the
        first base is returned. Similarly, if *end* is None, the sequence
        until the last base is returned.

        Alternatively, a samtools :term:`region` string can be supplied.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        # NOTE(review): the assignment of max_pos (valid faidx coordinate
        # limit) and the `if region is None:` branch structure are not
        # visible in this chunk -- confirm nesting below.
        cdef int length, max_pos

        if reference is None: raise ValueError( 'no sequence/region supplied.' )
        if start is None: start = 0
        if end is None: end = max_pos -1

        if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
        if start == end: return ""
        # valid ranges are from 0 to 2^29-1
        if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
        if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
        seq = faidx_fetch_seq(self.fastafile,
        # NOTE(review): the remaining arguments to faidx_fetch_seq and the
        # region-string branch below are only partially visible.

            # samtools adds a '\0' at the end
            seq = fai_fetch( self.fastafile, region, &length )

        # copy the C buffer into a python string before it is freed
        py_seq = PyString_FromStringAndSize(seq, length)
911 ###########################################################################
912 ###########################################################################
913 ###########################################################################
914 ## turning callbacks elegantly into iterators is an unsolved problem, see the following threads:
915 ## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1
916 ## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html
917 ## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code
918 ## changes, the changes have to be manually entered.
cdef class IteratorRow:
    """iterates over mapped reads in a region.

    The samtools iterators assume that the file
    position between iterations do not change.
    As a consequence, no two iterators can work
    on the same file. To permit this, each iterator
    creates its own file handle by re-opening the
    file.

    Note that the index will be shared between
    samfile and the iterator.
    """

    cdef bam_iter_t iter # iterator state object
    # NOTE(review): further attribute declarations (fp, b, retval, samfile)
    # are not visible in this chunk -- confirm.

    def __cinit__(self, Samfile samfile, int tid, int beg, int end ):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if not samfile._hasIndex():
            raise ValueError( "no index available for pileup" )

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        if samfile.isbam: mode = "rb"
        # NOTE(review): the `else: mode = "r"` branch is not visible here.

        # re-open the file so this iterator has a private file position
        store = StderrStore()
        self.fp = samopen( samfile.filename, mode, NULL )
        # NOTE(review): the store release and allocation of self.b are not
        # visible in this chunk.

        self.iter = bam_iter_query(self.samfile.index,
        # NOTE(review): the remaining arguments (presumably tid, beg, end)
        # and the closing parenthesis are not visible in this chunk.

    cdef bam1_t * getCurrent( self ):
        # NOTE(review): body (presumably `return self.b`) not visible.

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        self.retval = bam_iter_read( self.fp.x.bam,
        # NOTE(review): the remaining arguments (presumably self.iter,
        # self.b) and the closing parenthesis are not visible in this chunk.

    # NOTE(review): the `def __next__(self):` line is not visible here.
        """python version of next().
        """
        # NOTE(review): the call to self.cnext() preceding this check is not
        # visible in this chunk.
        if self.retval < 0: raise StopIteration
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        # NOTE(review): cleanup body (closing self.fp, destroying self.b and
        # the bam iterator) is not visible in this chunk -- confirm.
cdef class IteratorRowAll:
    """iterates over all mapped reads
    """
    # NOTE(review): attribute declarations (fp, b) are not visible in this
    # chunk -- confirm.

    def __cinit__(self, Samfile samfile):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if samfile.isbam: mode = "rb"
        # NOTE(review): the `else: mode = "r"` branch is not visible here.

        # reopen the file to avoid iterator conflict
        store = StderrStore()
        self.fp = samopen( samfile.filename, mode, NULL )
        # NOTE(review): the store release is not visible in this chunk.

        # allocate memory for alignment
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

    cdef bam1_t * getCurrent( self ):
        # NOTE(review): body (presumably `return self.b`) not visible.

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # samread returns the number of bytes read; <= 0 means EOF/error
        return samread(self.fp, self.b)

    # NOTE(review): the `def __next__(self):` line is not visible here.
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        ret = samread(self.fp, self.b)
        # NOTE(review): the check of `ret` raising StopIteration is not
        # visible in this chunk.
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        # NOTE(review): closing self.fp is not visible in this chunk.
        bam_destroy1(self.b)
# carries the file handle and bam iterator into the C-level __advance callback
ctypedef struct __iterdata:
    # NOTE(review): the struct fields (fp and iter, read by __advance below)
    # are not visible in this chunk -- confirm.
cdef int __advance( void * data, bam1_t * b ):
    # feed the next read from the wrapped iterator into the pileup engine;
    # *data* is a __iterdata created by IteratorColumn.__cinit__
    d = <__iterdata*>data
    return bam_iter_read( d.fp, d.iter, b )
cdef class IteratorColumn:
    '''iterates over columns.

    This iterator wraps the pileup functionality of samtools.

    For reasons of efficiency, the iterator returns the current
    pileup buffer. As this buffer is updated at every iteration,
    the contents of this iterator will change accordingly. Hence the conversion to
    a list will not produce the expected result::

       f = Samfile("file.bam", "rb")
       result = list( f.pileup() )

    Here, result will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
    but each object will contain the same information.

    If the results of several columns are required at the same time, the results
    need to be stored explicitely::

       result = [ x.pileups() for x in f.pileup() ]

    Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.
    '''

    # result of the last plbuf_push
    cdef IteratorRow iter
    # NOTE(review): declarations of tid/pos/n_plp (used below) are not
    # visible in this chunk -- confirm.
    cdef bam_pileup1_t * plp
    cdef bam_plp_t pileup_iter
    cdef __iterdata iterdata

    def __cinit__(self, Samfile samfile, int tid, int start, int end ):
        # the row iterator supplies reads to the pileup engine via __advance
        self.iter = IteratorRow( samfile, tid, start, end )
        self.iterdata.fp = samfile.samfile.x.bam
        self.iterdata.iter = self.iter.iter

        self.pileup_iter = bam_plp_init( &__advance, &self.iterdata )

    cdef int cnext(self):
        '''perform next iteration.
        '''
        self.plp = bam_plp_auto( self.pileup_iter,
        # NOTE(review): the remaining arguments (presumably &self.tid,
        # &self.pos, &self.n_plp) and the closing parenthesis are not
        # visible in this chunk.

    # NOTE(review): the `def __next__(self):` line is not visible here.
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        # NOTE(review): the call to self.cnext() and the error-condition
        # check guarding this raise are not visible in this chunk.
        raise ValueError("error during iteration" )

        if self.plp == NULL:
            # NOTE(review): the `raise StopIteration` body is not visible.

        return makePileupProxy( self.plp, self.tid, self.pos, self.n_plp )

    def __dealloc__(self):
        # free the samtools pileup engine state
        bam_plp_destroy(self.pileup_iter)
cdef inline int32_t query_start(bam1_t *src) except -1:
    '''return the 0-based offset of the first aligned base within the query,
    derived from leading soft/hard clips in the CIGAR.'''
    cdef uint32_t * cigar_p, op
    # NOTE(review): the declaration of loop index `k` is not visible in this
    # chunk -- confirm.
    cdef uint32_t start_offset = 0

    if src.core.n_cigar:
        cigar_p = bam1_cigar(src);
        for k from 0 <= k < src.core.n_cigar:
            # low bits of each packed element hold the operation code
            op = cigar_p[k] & BAM_CIGAR_MASK
            if op==BAM_CHARD_CLIP:
                # hard clips are only legal at the ends of the read
                if start_offset!=0 and start_offset!=src.core.l_qseq:
                    PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
                    # NOTE(review): a `return -1` after setting the error is
                    # not visible in this chunk -- confirm.
            elif op==BAM_CSOFT_CLIP:
                # high bits hold the operation length
                start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
            # NOTE(review): the terminating `else: break` and the trailing
            # `return start_offset` are not visible in this chunk -- confirm.
cdef inline int32_t query_end(bam1_t *src) except -1:
    '''return the 0-based offset one past the last aligned base within the
    query, derived from trailing soft/hard clips in the CIGAR.'''
    cdef uint32_t * cigar_p, op
    # NOTE(review): the declaration of loop index `k` is not visible in this
    # chunk -- confirm.
    cdef uint32_t end_offset = src.core.l_qseq

    if src.core.n_cigar>1:
        cigar_p = bam1_cigar(src);
        # walk the CIGAR backwards over the trailing clip operations
        for k from src.core.n_cigar > k >= 1:
            op = cigar_p[k] & BAM_CIGAR_MASK
            if op==BAM_CHARD_CLIP:
                # hard clips are only legal at the ends of the read
                if end_offset!=0 and end_offset!=src.core.l_qseq:
                    PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
                    # NOTE(review): a `return -1` after setting the error is
                    # not visible in this chunk -- confirm.
            elif op==BAM_CSOFT_CLIP:
                end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
            # NOTE(review): the terminating `else: break` is not visible.

    # NOTE(review): the guard preceding this fallback (presumably
    # `if end_offset == 0:`) and the trailing `return end_offset` are not
    # visible in this chunk -- confirm.
        end_offset = src.core.l_qseq
cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
    '''return the read sequence for positions [start, end) decoded from the
    4-bit-packed representation in the bam1_t record.'''
    # NOTE(review): declarations of the locals (seq, s, p, k) are not
    # visible in this chunk -- confirm.
    cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"

    if not src.core.l_qseq:
        # NOTE(review): the body of this branch (presumably `return None`)
        # is not visible in this chunk -- confirm.

    # allocate the result string, then fill its buffer in place
    seq = PyString_FromStringAndSize(NULL, end-start)
    s = PyString_AS_STRING(seq)
    # NOTE(review): the assignment of `p` (presumably `p = bam1_seq(src)`)
    # is not visible in this chunk -- confirm.

    for k from start <= k < end:
        # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
        # note: do not use string literal as it will be a python string
        s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
    # NOTE(review): the trailing `return seq` is not visible -- confirm.
cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
    '''return the base qualities for positions [start, end) as an
    ASCII (phred+33) string.'''
    # NOTE(review): the declarations of locals (q, p, k, qual) and the
    # missing-quality guard are not visible in this chunk -- confirm.

    # allocate the result string, then fill its buffer in place
    qual = PyString_FromStringAndSize(NULL, end-start)
    q = PyString_AS_STRING(qual)

    for k from start <= k < end:
        ## equivalent to t[i] + 33 (see bam.c)
        q[k-start] = p[k] + 33
    # NOTE(review): the trailing `return qual` is not visible -- confirm.
cdef class AlignedRead:
    """
    Class representing an aligned read. See the SAM format specification
    for the meaning of fields (http://samtools.sourceforge.net/).

    This class stores a handle to the samtools C-structure representing
    an aligned read. Member read access is forwarded to the C-structure
    and converted into python objects. This implementation should be fast,
    as only the data needed is converted.

    For write access, the C-structure is updated in-place. This is
    not the most efficient way to build BAM entries, as the variable
    length data is concatenated and thus needs to be resized if
    a field is updated. Furthermore, the BAM entry might be
    in an inconsistent state. The :meth:`~validate` method can
    be used to check if an entry is consistent.

    One issue to look out for is that the sequence should always
    be set *before* the quality scores. Setting the sequence will
    also erase any quality scores that were set previously.
    """
def __cinit__( self ):
    # allocate the zero-initialised C-level bam1_t this object wraps
    self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
    # allocate some memory
    # If size is 0, calloc does not return a pointer that can be passed to free()
    # so allocate 40 bytes for a new read
    self._delegate.m_data = 40
    self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
    # no variable-length data stored yet
    self._delegate.data_len = 0
def __dealloc__(self):
    # frees both the bam1_t struct and its variable-length data buffer
    bam_destroy1(self._delegate)
    # tab-separated, SAM-like rendering of the read's fields, starting
    # with the query name (remaining tuple members elided in this view)
    return "\t".join(map(str, (self.qname,
def compare(self, AlignedRead other):
    '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
    # t and o presumably alias the underlying bam1_t records of self
    # and other (declarations elided) -- TODO confirm
    # uncomment for debugging purposes
    # cdef unsigned char * oo, * tt
    # tt = <unsigned char*>(&t.core)
    # oo = <unsigned char*>(&o.core)
    # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
    # tt = <unsigned char*>(t.data)
    # oo = <unsigned char*>(o.data)
    # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])

    # Fast-path test for object identity
    # compare the fixed-size core structs byte-wise first
    retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
    if retval: return retval
    # then the variable-length blocks: compare lengths before contents
    retval = cmp(t.data_len, o.data_len)
    if retval: return retval
    return memcmp(t.data, o.data, t.data_len)
# Disabled so long as __cmp__ is a special method
# hash by object identity (the pointer value of the Python object)
return _Py_HashPointer(<void *>self)
"""the query name (None if not present)"""
# getter: return the \0-terminated name stored at the head of the
# variable-length data block
src = self._delegate
if src.core.l_qname == 0: return None
return <char *>bam1_qname( src )

def __set__(self, qname ):
    # a None or empty name leaves the record unchanged
    if qname == None or len(qname) == 0: return
    src = self._delegate
    p = bam1_qname( src )
    # the qname is \0 terminated
    # resize the variable-length data so the new name fits; l presumably
    # includes the terminating \0 (its assignment is elided) -- confirm
    pysam_bam_update( src,
    src.core.l_qname = l
    # re-acquire pointer to location in memory
    # as it might have moved
    strncpy( p, qname, l )
"""the :term:`cigar` alignment (None if not present).
Exposed as a list of (operation, length) tuples."""
cdef uint32_t * cigar_p
# getter: decode each packed 32-bit CIGAR entry
src = self._delegate
if src.core.n_cigar == 0: return None
cigar_p = bam1_cigar(src);
for k from 0 <= k < src.core.n_cigar:
    # low bits carry the operation code, the remainder the length
    op = cigar_p[k] & BAM_CIGAR_MASK
    l = cigar_p[k] >> BAM_CIGAR_SHIFT
    cigar.append((op, l))

def __set__(self, values ):
    # a None or empty list leaves the record unchanged
    if values == None or len(values) == 0: return
    src = self._delegate
    # get location of cigar string
    # create space for cigar data within src.data
    pysam_bam_update( src,
                      src.core.n_cigar * 4,
    # length is number of cigar operations, not bytes
    src.core.n_cigar = len(values)
    # re-acquire pointer to location in memory
    # as it might have moved
    # insert cigar operations, re-packing (op, length) into 32 bits each
    for op, l in values:
        p[k] = l << BAM_CIGAR_SHIFT | op
    ## setting the cigar string also updates the "bin" attribute
    src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
"""read sequence bases, including :term:`soft clipped` bases (None if not present)"""
src = self._delegate
if src.core.l_qseq == 0: return None
# decode the whole stored sequence
return get_seq_range(src, 0, src.core.l_qseq)

def __set__(self,seq):
    # samtools manages sequence and quality length memory together
    # if no quality information is present, the first byte says 0xff.
    if seq == None or len(seq) == 0: return
    cdef int l, k, nbytes_new, nbytes_old
    src = self._delegate
    # as the sequence is stored in half-bytes, the total length (sequence
    # plus quality scores) is (l+1)/2 + l
    nbytes_new = (l+1)/2 + l
    nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
    # acquire pointer to location in memory
    pysam_bam_update( src,
    # re-acquire pointer to location in memory
    # as it might have moved
    # zero the (re)allocated region before OR-ing nibbles into it
    for k from 0 <= k < nbytes_new: p[k] = 0
    # convert to C string
    # pack two bases per byte, high nibble first
    for k from 0 <= k < l:
        p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
    # setting a new sequence invalidates previously stored qualities
    p = bam1_qual( src )
"""read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
src = self._delegate
if src.core.l_qseq == 0: return None
# qualities span the whole stored sequence
return get_qual_range(src, 0, src.core.l_qseq)
def __set__(self, qual):
    """Set base qualities from a phred+33 encoded string.

    Storage is shared with the sequence, so *qual* must have exactly
    the same length as the stored sequence.  Passing None or an empty
    string marks the qualities as absent.
    """
    # note that space is already allocated via the sequences
    src = self._delegate
    p = bam1_qual( src )
    if qual is None or len(qual) == 0:
        # if absent - set to 0xff: a leading 0xff byte is samtools'
        # marker for "no quality information present"
        p[0] = 0xff
        return
    # convert to C string
    q = qual
    l = len(qual)
    if src.core.l_qseq != l:
        raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
    # (a redundant `assert src.core.l_qseq == l` duplicating the check
    # above was removed)
    for k from 0 <= k < l:
        # store raw phred values; input characters are phred+33
        p[k] = <uint8_t>q[k] - 33
"""aligned portion of the read, excluding any flanking bases that were :term:`soft clipped` (None if not present)

SAM/BAM files may include extra flanking base sequences that were
not part of the alignment. These bases may be the result of the
Smith-Waterman or other algorithms, which may not require alignments
that begin at the first residue or end at the last. In addition,
extra sequencing adapters, multiplex identifiers, and low-quality bases that
were not considered for alignment may have been retained."""
cdef uint32_t start, end
src = self._delegate
if src.core.l_qseq == 0: return None
# trim the soft-clipped prefix/suffix indicated by the CIGAR
start = query_start(src)
end = query_end(src)
return get_seq_range(src, start, end)
"""aligned query sequence quality values (None if not present)"""
cdef uint32_t start, end
src = self._delegate
if src.core.l_qseq == 0: return None
# same clipping bounds as the `query` property
start = query_start(src)
end = query_end(src)
return get_qual_range(src, start, end)
"""start index of the aligned query portion of the sequence (0-based, inclusive)"""
# property qstart getter
return query_start(self._delegate)
"""end index of the aligned query portion of the sequence (0-based, exclusive)"""
# property qend getter
return query_end(self._delegate)
"""Length of the aligned query sequence"""
# property qlen getter: number of non-clipped query bases
src = self._delegate
return query_end(src)-query_start(src)
"""the tags in the AUX field.

This property permits convenience access to
the tags. Changes to the returned list will
not update the tags automatically. Instead,
the following is required for adding a
new tag::

    read.tags = read.tags + [("RG",0)]
"""
def __get__(self):
    """Return the optional (AUX) fields as a list of (tag, value)
    tuples, or None if the record carries no tags."""
    src = self._delegate
    if src.l_aux == 0: return None
    # walk the raw aux records: two tag bytes, one type byte, then a
    # value whose width depends on the type (cursor setup/advance is
    # elided in this view -- s presumably starts at bam1_aux(src))
    while s < (src.data + src.data_len):
        if auxtype in ('c', 'C'):
            value = <int>bam_aux2i(s)
        elif auxtype in ('s', 'S'):
            value = <int>bam_aux2i(s)
        elif auxtype in ('i', 'I'):
            # BUG FIX: 32-bit integer tags were cast with <float>,
            # silently converting integral tag values to floats;
            # bam_aux2i returns an integer, matching the c/C/s/S cases
            value = <int>bam_aux2i(s)
        elif auxtype == 'f':
            value = <float>bam_aux2f(s)
        elif auxtype == 'd':
            value = <double>bam_aux2d(s)
        elif auxtype == 'A':
            value = "%c" % <char>bam_aux2A(s)
        elif auxtype in ('Z', 'H'):
            value = <char*>bam_aux2Z(s)
            # +1 for NULL terminated string
        result.append( (auxtag, value) )
def __set__(self, tags):
    """Replace all AUX tags with *tags*, a list of (tag, value) tuples.

    Values are serialised into BAM binary form via the struct module;
    integers are stored in the narrowest BAM integer type that holds
    them, raising ValueError when out of the 32-bit range.
    """
    cdef uint8_t * new_data
    cdef int guessed_size, control_size
    cdef int max_size, size, offset

    src = self._delegate

    # map samtools code to python.struct code and byte size
    buffer = ctypes.create_string_buffer(max_size)

    for pytag, value in tags:
        if t == types.FloatType:
        elif t == types.IntType:
            # negative values: pick the narrowest *signed* BAM type
            if value >= -127: fmt, pytype = "<cccb", 'c'
            elif value >= -32767: fmt, pytype = "<ccch", 's'
            elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
            # BUG FIX: this branch assigned to `ctype` (a typo), so
            # `pytype` kept a stale value from an earlier branch; also
            # 'i'[0] is simply 'i'
            else: fmt, pytype = "<ccci", 'i'
            # non-negative values: narrowest *unsigned* BAM type
            if value <= 255: fmt, pytype = "<cccB", 'C'
            elif value <= 65535: fmt, pytype = "<cccH", 'S'
            elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
            else: fmt, pytype = "<cccI", 'I'
        # Note: hex strings (H) are not supported yet
            fmt, pytype = "<cccc", 'A'
            fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
        size = struct.calcsize(fmt)
        if offset + size > max_size:
            raise NotImplementedError("tags field too large")
        struct.pack_into( fmt,

    # delete the old data and allocate new
    # if offset == 0, the aux field will be
    pysam_bam_update( src,

    # copy data only if there is any
    # get location of new data
    # check if there is direct path from buffer.raw to tmp
    memcpy( s, temp, offset )
"""the raw SAM/BAM flag field"""
def __get__(self): return self._delegate.core.flag
def __set__(self, flag): self._delegate.core.flag = flag
This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`

def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid
"""0-based leftmost coordinate"""
def __get__(self): return self._delegate.core.pos
def __set__(self, pos):
    ## setting the position also requires updating the "bin" attribute
    src = self._delegate
    # BUG FIX: store the new coordinate *before* recomputing the bin;
    # previously the bin was derived from the stale position, leaving
    # the BAM index bin inconsistent with the read's actual location
    src.core.pos = pos
    if src.core.n_cigar:
        # aligned reads: bin covers [pos, alignment end)
        src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
    else:
        # no CIGAR: treat the read as a length-1 interval
        src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
"""the BAM index bin of the read"""
def __get__(self): return self._delegate.core.bin
def __set__(self, bin): self._delegate.core.bin = bin
'''length of the read (read only). Returns 0 if not given.'''
# property rlen getter
def __get__(self): return self._delegate.core.l_qseq
'''aligned end position of the read (read only). Returns
None if not available.'''
src = self._delegate
# unmapped reads and reads without a CIGAR have no aligned end
if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
return bam_calend(&src.core, bam1_cigar(src))
'''aligned length of the read (read only). Returns None if
not available.'''
# property alen getter: reference span = aligned end - start position
src = self._delegate
if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
return bam_calend(&src.core,
                  bam1_cigar(src)) - \
       self._delegate.core.pos
"""mapping quality"""
def __get__(self): return self._delegate.core.qual
def __set__(self, qual): self._delegate.core.qual = qual
"""the :term:`reference` id of the mate """
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid
"""the position of the mate (0-based)"""
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos
"""the insert size"""
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize
"""true if read is paired in sequencing"""
def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
def __set__(self,val):
    # set or clear the paired bit of the flag field
    if val: self._delegate.core.flag |= BAM_FPAIRED
    else: self._delegate.core.flag &= ~BAM_FPAIRED
property is_proper_pair:
    """true if read is mapped in a proper pair"""
    def __get__(self):
        # test the proper-pair bit of the FLAG field
        return bool(self.flag & BAM_FPROPER_PAIR)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FPROPER_PAIR
        else:
            self._delegate.core.flag &= ~BAM_FPROPER_PAIR
property is_unmapped:
    """true if read itself is unmapped"""
    def __get__(self): return (self.flag & BAM_FUNMAP) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FUNMAP
        else: self._delegate.core.flag &= ~BAM_FUNMAP
property mate_is_unmapped:
    """true if the mate is unmapped"""
    def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FMUNMAP
        else: self._delegate.core.flag &= ~BAM_FMUNMAP
property is_reverse:
    """true if read is mapped to reverse strand"""
    def __get__(self):return (self.flag & BAM_FREVERSE) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FREVERSE
        else: self._delegate.core.flag &= ~BAM_FREVERSE
property mate_is_reverse:
    """true if the mate is mapped to the reverse strand"""
    def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FMREVERSE
        else: self._delegate.core.flag &= ~BAM_FMREVERSE
"""true if this is read1"""
# property is_read1
def __get__(self): return (self.flag & BAM_FREAD1) != 0
def __set__(self,val):
    if val: self._delegate.core.flag |= BAM_FREAD1
    else: self._delegate.core.flag &= ~BAM_FREAD1
"""true if this is read2"""
# property is_read2
def __get__(self): return (self.flag & BAM_FREAD2) != 0
def __set__(self,val):
    if val: self._delegate.core.flag |= BAM_FREAD2
    else: self._delegate.core.flag &= ~BAM_FREAD2
property is_secondary:
    """true if not primary alignment"""
    def __get__(self):
        # secondary-alignment bit of the FLAG field
        return bool(self.flag & BAM_FSECONDARY)
    def __set__(self, val):
        if val:
            self._delegate.core.flag |= BAM_FSECONDARY
        else:
            self._delegate.core.flag &= ~BAM_FSECONDARY
"""true if QC failure"""
# property is_qcfail
def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
def __set__(self,val):
    if val: self._delegate.core.flag |= BAM_FQCFAIL
    else: self._delegate.core.flag &= ~BAM_FQCFAIL
property is_duplicate:
    """true if optical or PCR duplicate"""
    def __get__(self): return (self.flag & BAM_FDUP) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FDUP
        else: self._delegate.core.flag &= ~BAM_FDUP
"""retrieves optional data given a two-letter *tag*

Raises KeyError if the tag is not present."""
#see bam_aux.c: bam_aux_get() and bam_aux2i() etc
v = bam_aux_get(self._delegate, tag)
if v == NULL: raise KeyError( "tag '%s' not present" % tag )
# `type` presumably holds the aux type byte of the record
# (its assignment is elided) -- TODO confirm
if type == 'c' or type == 'C' or type == 's' or type == 'S' or type == 'i':
    return <int>bam_aux2i(v)
# float-typed tag
return <float>bam_aux2f(v)
# double-typed tag
return <double>bam_aux2d(v)
# there might a more efficient way
# to convert a char into a string
return '%c' % <char>bam_aux2A(v)
# string / hex-string tag
return <char*>bam_aux2Z(v)
def fancy_str (self):
    """returns list of fieldnames/values in pretty format for debugging

    NOTE(review): this relies on self.__dict__, which cdef extension
    types normally do not expose -- verify this method actually reports
    the C-level fields.
    """
    # human-readable description for each known field name
    "tid": "Contig index",
    "pos": "Mapped position on contig",
    "mtid": "Contig index for mate pair",
    "mpos": "Position of mate pair",
    "isize": "Insert size",
    "flag": "Binary flag",
    "n_cigar": "Count of cigar entries",
    "cigar": "Cigar entries",
    "qual": "Mapping quality",
    "bin": "Bam index bin number",
    "l_qname": "Length of query name",
    "qname": "Query name",
    "l_qseq": "Length of query sequence",
    "qseq": "Query sequence",
    "bqual": "Quality scores",
    "l_aux": "Length of auxilary data",
    "m_data": "Maximum data length",
    "data_len": "Current data length",
    # print the known fields first, in a fixed order
    fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
                             "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
                             "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
    for f in fields_names_in_order:
        if not f in self.__dict__:
        ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
    # then any remaining attributes that have no description
    for f in self.__dict__:
        if not f in field_names:
            ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
cdef class PileupProxy:
    '''A pileup column. A pileup column contains
    all the reads that map to a certain target base.

    tid
        chromosome ID as is defined in the header
    pos
        the target base coordinate (0-based)
    n
        number of reads mapping to this column
    pileups
        list of reads (:class:`pysam.PileupRead`) aligned to this column

    This class is a proxy for results returned by the samtools pileup engine.
    If the underlying engine iterator advances, the results of this column
    will become invalid.
    '''
    # raw pointer into the engine's current pileup buffer
    cdef bam_pileup1_t * plp

    def __cinit__(self ):

    # SAM-like textual rendering: header fields then one read per line
    return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
        "\n".join( map(str, self.pileups) )

    '''the chromosome ID as is defined in the header'''
    def __get__(self): return self.tid

    '''number of reads mapping to this column.'''
    def __get__(self): return self.n_pu
    def __set__(self, n): self.n_pu = n

    # property pos getter: 0-based target base coordinate
    def __get__(self): return self.pos

    '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
    # warning: there could be problems if self.n and self.buf are
    # out of sync with the engine's current state
    for x from 0 <= x < self.n_pu:
        pileups.append( makePileupRead( &(self.plp[x])) )
cdef class PileupRead:
    '''A read aligned to a column.

    Wraps one bam_pileup1_t entry; field access goes through the
    read-only properties below.
    '''
    AlignedRead _alignment

    def __cinit__( self ):

    # tab-separated rendering of all per-read pileup fields
    return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )

    """a :class:`pysam.AlignedRead` object of the aligned read"""
    return self._alignment
    """position of the read base at the pileup site, 0-based"""
    """indel length; 0 for no indel, positive for ins and negative for del"""
    """1 iff the base on the padded read is a deletion"""
    # property is_head getter
    return self._is_head
    # property is_tail getter
    return self._is_tail
'''Redirect a low-level output stream (by file descriptor) and restore
it later.  Based on
http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
def __init__(self, id = 1):
    # id is the file descriptor to manage (1 = stdout by default)

def setdevice(self, filename):
    '''open an existing file, like "/dev/null"'''
    fd = os.open(filename, os.O_WRONLY)

def setfile(self, filename):
    '''open a new file.'''
    # 0660: owner/group read-write (octal, Python 2 literal)
    fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);

def setfd(self, fd):
    '''redirect the managed descriptor to *fd*, remembering the old one.'''
    ofd = os.dup(self.id)      # Save old stream on new unit.
    self.streams.append(ofd)
    sys.stdout.flush()          # Buffered data goes to old stream.
    os.dup2(fd, self.id)        # Open unit 1 on new stream.
    os.close(fd)                # Close other unit (look out, caller.)

def restore(self):
    '''restore previous output stream'''
    # the following was not sufficient, hence flush both stderr and stdout
    # os.fsync( self.id )
    os.dup2(self.streams[-1], self.id)
    os.close(self.streams[-1])
    del self.streams[-1]
def _samtools_dispatch( method, args = () ):
    '''call ``method`` in samtools providing arguments in args.

    Returns (retval, stderr_lines, stdout_lines).

    This method redirects stdout and stderr to capture it
    from samtools. If for some reason stdout/stderr disappears
    the reason might be in this method.

    The current implementation might only work on linux.

    This method captures stdout and stderr using temporary files,
    which are then read into memory in their entirety. This method
    is slow and might cause large memory overhead.

    See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
    on the topic of redirecting stderr/stdout.
    '''
    # note that debugging this module can be a problem
    # as stdout/stderr will not appear

    # redirect stderr and stdout to file

    # open files and redirect into it
    stderr_h, stderr_f = tempfile.mkstemp()
    stdout_h, stdout_f = tempfile.mkstemp()

    # patch for `samtools view`
    # samtools `view` closes stdout, from which I can not
    # recover. Thus redirect output to file with -o option.
    if method == "view":
        if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
        args = ( "-o", stdout_f ) + args

    stdout_save = Outs( sys.stdout.fileno() )
    stdout_save.setfd( stdout_h )
    stderr_save = Outs( sys.stderr.fileno() )
    stderr_save.setfd( stderr_h )

    # do the function call to samtools
    cdef int i, n, retval

    # allocate two more for first (dummy) argument (contains command)
    # cargs[1] is presumably set to `method` on an elided line -- confirm
    cargs = <char**>calloc( n+2, sizeof( char *) )
    cargs[0] = "samtools"
    for i from 0 <= i < n: cargs[i+2] = args[i]
    retval = pysam_dispatch(n+2, cargs)

    # restore stdout/stderr. This will also flush, so
    # needs to be before reading back the file contents
    stdout_save.restore()
    stderr_save.restore()

    # capture stderr/stdout.
    # NOTE(review): these file handles are never closed explicitly;
    # consider closing them after readlines()
    out_stderr = open( stderr_f, "r").readlines()
    out_stdout = open( stdout_f, "r").readlines()

    # clean up the temporary files
    os.remove( stderr_f )
    os.remove( stdout_f )

    return retval, out_stderr, out_stdout
2053 __all__ = ["Samfile",