pysam/csamtools.pyx

   1 # cython: embedsignature=True
   2 # cython: profile=True
   3 # adds doc-strings for sphinx
   4 import tempfile
   5 import os
   6 import sys
   7 import types
   8 import itertools
   9 import struct
  10 import ctypes
  11 import collections
  12 import re
  13 import platform
  14 from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
  15 from cpython cimport PyErr_SetString
  16
  17 #from cpython.string cimport PyString_FromStringAndSize, PyString_AS_STRING
  18 #from cpython.exc    cimport PyErr_SetString, PyErr_NoMemory
  19
  20 # defines imported from samtools
  21 DEF SEEK_SET = 0
  22 DEF SEEK_CUR = 1
  23 DEF SEEK_END = 2
  24
  25 ## These are bits set in the flag.
  26 ## have to put these definitions here, in csamtools.pxd they got ignored
  27 ## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
  28 DEF BAM_FPAIRED       =1
  29 ## @abstract the read is mapped in a proper pair */
  30 DEF BAM_FPROPER_PAIR  =2
  31 ## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
  32 DEF BAM_FUNMAP        =4
  33 ## @abstract the mate is unmapped */
  34 DEF BAM_FMUNMAP       =8
  35 ## @abstract the read is mapped to the reverse strand */
  36 DEF BAM_FREVERSE      =16
  37 ## @abstract the mate is mapped to the reverse strand */
  38 DEF BAM_FMREVERSE     =32
  39 ## @abstract this is read1 */
  40 DEF BAM_FREAD1        =64
  41 ## @abstract this is read2 */
  42 DEF BAM_FREAD2       =128
  43 ## @abstract not primary alignment */
  44 DEF BAM_FSECONDARY   =256
  45 ## @abstract QC failure */
  46 DEF BAM_FQCFAIL      =512
  47 ## @abstract optical or PCR duplicate */
  48 DEF BAM_FDUP        =1024
  49
  50 DEF BAM_CIGAR_SHIFT=4
  51 DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
  52
  53 DEF BAM_CMATCH     = 0
  54 DEF BAM_CINS       = 1
  55 DEF BAM_CDEL       = 2
  56 DEF BAM_CREF_SKIP  = 3
  57 DEF BAM_CSOFT_CLIP = 4
  58 DEF BAM_CHARD_CLIP = 5
  59 DEF BAM_CPAD       = 6
  60
  61 #####################################################################
  62 # hard-coded constants
  63 cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
  64 cdef int max_pos = 2 << 29
  65
  66 # redirect stderr to 0
  67 _logfile = open(os.path.devnull, "w")
  68 pysam_set_stderr( PyFile_AsFile( _logfile ) )
  69
  70 #####################################################################
  71 #####################################################################
  72 #####################################################################
  73 ## private factory methods
  74 #####################################################################
  75 cdef class AlignedRead
  76 cdef makeAlignedRead(bam1_t * src):
  77     '''enter src into AlignedRead.'''
  78     cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
  79     dest._delegate = bam_dup1(src)
  80     return dest
  81
  82 cdef class PileupProxy
  83 cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
  84      cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
  85      dest.plp = plp
  86      dest.tid = tid
  87      dest.pos = pos
  88      dest.n = n
  89      return dest
  90
  91 cdef class PileupRead
  92 cdef makePileupRead( bam_pileup1_t * src ):
  93     '''fill a  PileupRead object from a bam_pileup1_t * object.'''
  94     cdef PileupRead dest = PileupRead.__new__(PileupRead)
  95     dest._alignment = makeAlignedRead( src.b )
  96     dest._qpos = src.qpos
  97     dest._indel = src.indel
  98     dest._level = src.level
  99     dest._is_del = src.is_del
 100     dest._is_head = src.is_head
 101     dest._is_tail = src.is_tail
 102     return dest
 103
 104 #####################################################################
 105 #####################################################################
 106 #####################################################################
 107 ## Generic callbacks for inserting python callbacks.
 108 #####################################################################
 109 cdef int fetch_callback( bam1_t *alignment, void *f):
 110     '''callback for bam_fetch.
 111
 112     calls function in *f* with a new :class:`AlignedRead` object as parameter.
 113     '''
 114     a = makeAlignedRead( alignment )
 115     (<object>f)(a)
 116
 117 class PileupColumn(object):
 118     '''A pileup column. A pileup column contains
 119     all the reads that map to a certain target base.
 120
 121     tid
 122         chromosome ID as is defined in the header
 123     pos
 124         the target base coordinate (0-based)
 125     n
 126         number of reads mapping to this column
 127     pileups
 128         list of reads (:class:`pysam.PileupRead`) aligned to this column
 129     '''
 130     def __str__(self):
 131         return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
 132             "\n" + "\n".join( map(str, self.pileups) )
 133
 134 cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
 135     '''callback for pileup.
 136
 137     calls function in *f* with a new :class:`Pileup` object as parameter.
 138
 139     tid
 140         chromosome ID as is defined in the header
 141     pos
 142         start coordinate of the alignment, 0-based
 143     n
 144         number of elements in pl array
 145     pl
 146         array of alignments
 147     data
 148         user provided data
 149     '''
 150
 151     p = PileupColumn()
 152     p.tid = tid
 153     p.pos = pos
 154     p.n = n
 155     pileups = []
 156
 157     cdef int x
 158     for x from 0 <= x < n:
 159         pileups.append( makePileupRead( &(pl[x]) ) )
 160     p.pileups = pileups
 161
 162     (<object>f)(p)
 163
 164 cdef int pileup_fetch_callback( bam1_t *b, void *data):
 165     '''callback for bam_fetch.
 166
 167     Fetches reads and submits them to pileup.
 168     '''
 169     cdef bam_plbuf_t * buf
 170     buf = <bam_plbuf_t*>data
 171     bam_plbuf_push(b, buf)
 172     return 0
 173
 174 class StderrStore():
 175     '''
 176     stderr is captured.
 177     '''
 178     def __init__(self):
 179         return
 180         self.stderr_h, self.stderr_f = tempfile.mkstemp()
 181         self.stderr_save = Outs( sys.stderr.fileno() )
 182         self.stderr_save.setfd( self.stderr_h )
 183
 184     def readAndRelease( self ):
 185         return []
 186         self.stderr_save.restore()
 187         lines = []
 188         if os.path.exists(self.stderr_f):
 189             lines = open( self.stderr_f, "r" ).readlines()
 190             os.remove( self.stderr_f )
 191         return lines
 192
 193     def release(self):
 194         return
 195         self.stderr_save.restore()
 196         if os.path.exists(self.stderr_f):
 197             os.remove( self.stderr_f )
 198
 199     def __del__(self):
 200         self.release()
 201
 202 class StderrStoreWindows():
 203     '''does nothing. stderr can't be redirected on windows'''
 204     def __init__(self): pass
 205     def readAndRelease(self): return []
 206     def release(self): pass
 207
 208 if platform.system()=='Windows':
 209     del StderrStore
 210     StderrStore = StderrStoreWindows
 211
 212
 213 ######################################################################
 214 ######################################################################
 215 ######################################################################
 216 # valid types for sam headers
 217 VALID_HEADER_TYPES = { "HD" : dict,
 218                        "SQ" : list,
 219                        "RG" : list,
 220                        "PG" : list,
 221                        "CO" : list }
 222
 223 # order of records within sam headers
 224 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )
 225
 226 # type conversions within sam header records
 227 VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
 228                         "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
 229                         "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
 230                         "PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }
 231
 232 # output order of fields within records
 233 VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
 234                        "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
 235                        "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
 236                        "PG" : ( "PN", "ID", "VN", "CL" ), }
 237
 238
 239 ######################################################################
 240 ######################################################################
 241 ######################################################################
 242 ## Public methods
 243 ######################################################################
 244 cdef class Fastafile:
 245     '''*(filename)*
 246
 247     A *FASTA* file. The file is automatically opened.
 248
 249     The file expects an indexed fasta file.
 250
 251     TODO:
 252         add automatic indexing.
 253         add function to get sequence names.
 254     '''
 255
 256     cdef char * _filename
 257     # pointer to fastafile
 258     cdef faidx_t * fastafile
 259
 260     def __cinit__(self, *args, **kwargs ):
 261         self.fastafile = NULL
 262         self._filename = NULL
 263         self._open( *args, **kwargs )
 264
 265     def _isOpen( self ):
 266         '''return true if samfile has been opened.'''
 267         return self.fastafile != NULL
 268
 269     def __len__(self):
 270         if self.fastafile == NULL:
 271             raise ValueError( "calling len() on closed file" )
 272
 273         return faidx_fetch_nseq(self.fastafile)
 274
 275     def _open( self,
 276                char * filename ):
 277         '''open an indexed fasta file.
 278
 279         This method expects an indexed fasta file.
 280         '''
 281
 282         # close a previously opened file
 283         if self.fastafile != NULL: self.close()
 284         if self._filename != NULL: free(self._filename)
 285         self._filename = strdup(filename)
 286         self.fastafile = fai_load( filename )
 287
 288         if self.fastafile == NULL:
 289             raise IOError("could not open file `%s`" % filename )
 290
 291     def close( self ):
 292         if self.fastafile != NULL:
 293             fai_destroy( self.fastafile )
 294             self.fastafile = NULL
 295
 296     def __dealloc__(self):
 297         self.close()
 298         if self._filename != NULL: free(self._filename)
 299
 300     property filename:
 301         '''number of :term:`filename` associated with this object.'''
 302         def __get__(self):
 303             if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
 304             return self._filename
 305
 306     def fetch( self,
 307                reference = None,
 308                start = None,
 309                end = None,
 310                region = None):
 311
 312         '''*(reference = None, start = None, end = None, region = None)*
 313
 314         fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.
 315
 316         The region is specified by :term:`reference`, *start* and *end*.
 317
 318         fetch returns an empty string if the region is out of range or addresses an unknown *reference*.
 319
 320         If *reference* is given and *start* is None, the sequence from the
 321         first base is returned. Similarly, if *end* is None, the sequence
 322         until the last base is returned.
 323
 324         Alternatively, a samtools :term:`region` string can be supplied.
 325         '''
 326
 327         if not self._isOpen():
 328             raise ValueError( "I/O operation on closed file" )
 329
 330         cdef int length
 331         cdef char * seq
 332
 333         if not region:
 334             if reference is None: raise ValueError( 'no sequence/region supplied.' )
 335             if start is None: start = 0
 336             if end is None: end = max_pos -1
 337
 338             if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
 339             if start == end: return ""
 340             # valid ranges are from 0 to 2^29-1
 341             if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
 342             if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
 343             # note: faidx_fetch_seq has a bug such that out-of-range access
 344             # always returns the last residue. Hence do not use faidx_fetch_seq,
 345             # but use fai_fetch instead
 346             # seq = faidx_fetch_seq(self.fastafile,
 347             #                       reference,
 348             #                       start,
 349             #                       end-1,
 350             #                       &length)
 351             region = "%s:%i-%i" % (reference, start+1, end)
 352             seq = fai_fetch( self.fastafile,
 353                              region,
 354                              &length )
 355         else:
 356             # samtools adds a '\0' at the end
 357             seq = fai_fetch( self.fastafile, region, &length )
 358
 359         # copy to python
 360         if seq == NULL:
 361             return ""
 362         else:
 363             try:
 364                 py_seq = PyString_FromStringAndSize(seq, length)
 365             finally:
 366                 free(seq)
 367
 368         return py_seq
 369
 370     cdef char * _fetch( self, char * reference, int start, int end, int * length ):
 371         '''fetch sequence for reference, start and end'''
 372
 373         return faidx_fetch_seq(self.fastafile,
 374                                reference,
 375                                start,
 376                                end-1,
 377                                length )
 378
 379 #------------------------------------------------------------------------
 380 #------------------------------------------------------------------------
 381 #------------------------------------------------------------------------
 382 cdef int count_callback( bam1_t *alignment, void *f):
 383      '''callback for bam_fetch - count number of reads.
 384      '''
 385      cdef int* counter = (<int*>f)
 386      counter[0] += 1;
 387
 388 ctypedef struct MateData:
 389      char * name
 390      bam1_t * mate
 391      uint32_t flag
 392
 393 #------------------------------------------------------------------------
 394 #------------------------------------------------------------------------
 395 #------------------------------------------------------------------------
 396 cdef int mate_callback( bam1_t *alignment, void *f):
 397      '''callback for bam_fetch = filter mate
 398      '''
 399      cdef MateData * d = (<MateData*>f)
 400      # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
 401      #        d.mate, d.name, bam1_qname(alignment),
 402      #        d.flag, alignment.core.flag, alignment.core.flag & d.flag)
 403
 404      if d.mate == NULL:
 405          # could be sped up by comparing the lengths of query strings first
 406          # using l_qname
 407          #
 408          # also, make sure that we get the other read by comparing
 409          # the flags
 410          if alignment.core.flag & d.flag != 0 and \
 411                  strcmp( bam1_qname( alignment ), d.name ) == 0:
 412              d.mate = bam_dup1( alignment )
 413
 414
 415 cdef class Samfile:
 416     '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
 417
 418     A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.
 419
 420     *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
 421     (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
 422     Use ``h`` to output header information in text (:term:`TAM`)  mode.
 423
 424     If ``b`` is present, it must immediately follow ``r`` or ``w``.
 425     Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
 426     a :term:`BAM` formatted file for reading, type::
 427
 428         f = pysam.Samfile('ex1.bam','rb')
 429
 430     If mode is not specified, we will try to auto-detect in the order 'r', 'rb', thus both the following
 431     should work::
 432
 433         f1 = pysam.Samfile('ex1.bam' )
 434         f2 = pysam.Samfile('ex1.bam' )
 435
 436     If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
 437     access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
 438
 439     For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
 440     sources (see also the samtools format specification):
 441
 442         1. If *template* is given, the header is copied from a another *Samfile*
 443            (*template* must be of type *Samfile*).
 444
 445         2. If *header* is given, the header is built from a multi-level dictionary. The first level
 446            are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
 447            being a list of tag-value pairs.
 448
 449         3. If *text* is given, new header text is copied from raw text.
 450
 451         4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
 452
 453     '''
 454
 455     def __cinit__(self, *args, **kwargs ):
 456         self.samfile = NULL
 457         self._filename = NULL
 458         self.isbam = False
 459         self._open( *args, **kwargs )
 460
 461         # allocate memory for iterator
 462         self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
 463
 464     def _isOpen( self ):
 465         '''return true if samfile has been opened.'''
 466         return self.samfile != NULL
 467
 468     def _hasIndex( self ):
 469         '''return true if samfile has an existing (and opened) index.'''
 470         return self.index != NULL
 471
 472     def _open( self,
 473                char * filename,
 474                mode = None,
 475                Samfile template = None,
 476                referencenames = None,
 477                referencelengths = None,
 478                text = None,
 479                header = None,
 480                port = None,
 481               ):
 482         '''open a sam/bam file.
 483
 484         If _open is called on an existing bamfile, the current file will be
 485         closed and a new file will be opened.
 486         '''
 487
 488         # read mode autodetection
 489         if mode is None:
 490             try:
 491                 self._open(filename, 'r', template=template,
 492                            referencenames=referencenames,
 493                            referencelengths=referencelengths,
 494                            text=text, header=header, port=port)
 495                 return
 496             except ValueError, msg:
 497                 pass
 498             self._open(filename, 'rb', template=template,
 499                        referencenames=referencenames,
 500                        referencelengths=referencelengths,
 501                        text=text, header=header, port=port)
 502             return
 503
 504         assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
 505         assert filename != NULL
 506
 507         # close a previously opened file
 508         if self.samfile != NULL: self.close()
 509         self.samfile = NULL
 510
 511         cdef bam_header_t * header_to_write
 512         header_to_write = NULL
 513
 514         if self._filename != NULL: free(self._filename )
 515         self._filename = strdup( filename )
 516
 517         self.isbam = len(mode) > 1 and mode[1] == 'b'
 518
 519         self.isremote = strncmp(filename,"http:",5) == 0 or \
 520             strncmp(filename,"ftp:",4) == 0
 521
 522         cdef char * ctext
 523         ctext = NULL
 524
 525         if mode[0] == 'w':
 526             # open file for writing
 527
 528             # header structure (used for writing)
 529             if template:
 530                 # copy header from another file
 531                 header_to_write = template.samfile.header
 532
 533             elif header:
 534                 header_to_write = self._buildHeader( header )
 535
 536             else:
 537                 # build header from a target names and lengths
 538                 assert referencenames and referencelengths, "either supply options `template`, `header` or  both `referencenames` and `referencelengths` for writing"
 539                 assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
 540
 541                 # allocate and fill header
 542                 header_to_write = bam_header_init()
 543                 header_to_write.n_targets = len(referencenames)
 544                 n = 0
 545                 for x in referencenames: n += len(x) + 1
 546                 header_to_write.target_name = <char**>calloc(n, sizeof(char*))
 547                 header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
 548                 for x from 0 <= x < header_to_write.n_targets:
 549                     header_to_write.target_len[x] = referencelengths[x]
 550                     name = referencenames[x]
 551                     header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
 552                     strncpy( header_to_write.target_name[x], name, len(name) )
 553
 554                 if text != None:
 555                     # copy without \0
 556                     ctext = text
 557                     header_to_write.l_text = strlen(ctext)
 558                     header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
 559                     memcpy( header_to_write.text, ctext, strlen(ctext) )
 560
 561                 header_to_write.hash = NULL
 562                 header_to_write.rg2lib = NULL
 563
 564             # open file. Header gets written to file at the same time for bam files
 565             # and sam files (in the latter case, the mode needs to be wh)
 566             store = StderrStore()
 567             self.samfile = samopen( filename, mode, header_to_write )
 568             store.release()
 569
 570             # bam_header_destroy takes care of cleaning up of all the members
 571             if not template and header_to_write != NULL:
 572                 bam_header_destroy( header_to_write )
 573
 574         elif mode[0] == "r":
 575             # open file for reading
 576             if strncmp( filename, "-", 1) != 0 and \
 577                     not self.isremote and \
 578                     not os.path.exists( filename ):
 579                 raise IOError( "file `%s` not found" % filename)
 580
 581             # try to detect errors
 582             self.samfile = samopen( filename, mode, NULL )
 583             if self.samfile == NULL:
 584                 raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)
 585
 586             if self.samfile.header == NULL:
 587                 raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode )
 588
 589             #disabled for autodetection to work
 590             if self.samfile.header.n_targets == 0:
 591                 raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)
 592
 593         if self.samfile == NULL:
 594             raise IOError("could not open file `%s`" % filename )
 595
 596         # check for index and open if present
 597         if mode[0] == "r" and self.isbam:
 598
 599             if not self.isremote:
 600                 if not os.path.exists(filename +".bai"):
 601                     self.index = NULL
 602                 else:
 603                     # returns NULL if there is no index or index could not be opened
 604                     self.index = bam_index_load(filename)
 605                     if self.index == NULL:
 606                         raise IOError("error while opening index `%s` " % filename )
 607             else:
 608                 self.index = bam_index_load(filename)
 609                 if self.index == NULL:
 610                     raise IOError("error while opening index `%s` " % filename )
 611
 612     def gettid( self, reference ):
 613         '''
 614         convert :term:`reference` name into numerical :term:`tid`
 615
 616         returns -1 if reference is not known.
 617         '''
 618         if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
 619         return pysam_reference2tid( self.samfile.header, reference )
 620
 621     def getrname( self, tid ):
 622         '''
 623         convert numerical :term:`tid` into :term:`reference` name.'''
 624         if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
 625         if not 0 <= tid < self.samfile.header.n_targets:
 626             raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
 627         return self.samfile.header.target_name[tid]
 628
 629     cdef char * _getrname( self, int tid ):
 630         '''
 631         convert numerical :term:`tid` into :term:`reference` name.'''
 632         if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
 633         if not 0 <= tid < self.samfile.header.n_targets:
 634             raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
 635         return self.samfile.header.target_name[tid]
 636
 637     def _parseRegion( self,
 638                       reference = None,
 639                       start = None,
 640                       end = None,
 641                       region = None ):
 642         '''
 643         parse region information.
 644
 645         raise ValueError for for invalid regions.
 646
 647         returns a tuple of flag, tid, start and end. Flag indicates
 648         whether some coordinates were supplied.
 649
 650         Note that regions are 1-based, while start,end are python coordinates.
 651         '''
 652         # This method's main objective is to translate from a reference to a tid.
 653         # For now, it calls bam_parse_region, which is clumsy. Might be worth
 654         # implementing it all in pysam (makes use of khash).
 655
 656         cdef int rtid
 657         cdef long long rstart
 658         cdef long long rend
 659
 660         rtid = -1
 661         rstart = 0
 662         rend = max_pos
 663         if start != None:
 664             try:
 665                 rstart = start
 666             except OverflowError:
 667                 raise ValueError( 'start out of range (%i)' % start )
 668
 669         if end != None:
 670             try:
 671                 rend = end
 672             except OverflowError:
 673                 raise ValueError( 'end out of range (%i)' % end )
 674
 675         if region:
 676             parts = re.split( "[:-]", region )
 677             reference = parts[0]
 678             if len(parts) >= 2: rstart = int(parts[1]) - 1
 679             if len(parts) >= 3: rend = int(parts[2])
 680
 681         if not reference: return 0, 0, 0, 0
 682
 683         rtid = self.gettid( reference )
 684         if rtid < 0: raise ValueError( "invalid reference `%s`" % reference )
 685         if rstart > rend: raise ValueError( 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend) )
 686         if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
 687         if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )
 688
 689         return 1, rtid, rstart, rend
 690
 691     def seek( self, uint64_t offset, int where = 0):
 692         '''
 693         move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
 694         '''
 695
 696         if not self._isOpen():
 697             raise ValueError( "I/O operation on closed file" )
 698         if not self.isbam:
 699             raise NotImplementedError("seek only available in bam files")
 700         return bam_seek( self.samfile.x.bam, offset, where )
 701
 702     def tell( self ):
 703         '''
 704         return current file position
 705         '''
 706         if not self._isOpen():
 707             raise ValueError( "I/O operation on closed file" )
 708         if not self.isbam:
 709             raise NotImplementedError("seek only available in bam files")
 710
 711         return bam_tell( self.samfile.x.bam )
 712
 713     def fetch( self,
 714                reference = None,
 715                start = None,
 716                end = None,
 717                region = None,
 718                callback = None,
 719                until_eof = False ):
 720         '''
 721         fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
 722         :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
 723         be supplied.
 724
 725         Without *reference* or *region* all reads will be fetched. The reads will be returned
 726         ordered by reference sequence, which will not necessarily be the order within the file.
 727         If *until_eof* is given, all reads from the current file position will be returned
 728         *in order as they are within the file*.
 729
 730         If only *reference* is set, all reads aligned to *reference* will be fetched.
 731
 732         The method returns an iterator of type :class:`pysam.IteratorRow` unless
 733         a *callback is provided. If *callback* is given, the callback will be executed
 734         for each position within the :term:`region`. Note that callbacks currently work
 735         only, if *region* or *reference* is given.
 736
 737         Note that a :term:`SAM` file does not allow random access. If *region* or *reference* are given,
 738         an exception is raised.
 739         '''
 740         cdef int rtid, rstart, rend, has_coord
 741
 742         if not self._isOpen():
 743             raise ValueError( "I/O operation on closed file" )
 744
 745         has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
 746
 747         if self.isbam:
 748             if not until_eof and not self._hasIndex() and not self.isremote:
 749                 raise ValueError( "fetch called on bamfile without index" )
 750
 751             if callback:
 752                 if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
 753                 if not self._hasIndex(): raise ValueError( "no index available for fetch" )
 754                 return bam_fetch(self.samfile.x.bam,
 755                                  self.index,
 756                                  rtid,
 757                                  rstart,
 758                                  rend,
 759                                  <void*>callback,
 760                                  fetch_callback )
 761             else:
 762                 if has_coord:
 763                     return IteratorRowRegion( self, rtid, rstart, rend )
 764                 else:
 765                     if until_eof:
 766                         return IteratorRowAll( self )
 767                     else:
 768                         return IteratorRowAllRefs(self)
 769         else:
 770             # check if header is present - otherwise sam_read1 aborts
 771             # this happens if a bamfile is opened with mode 'r'
 772             if self.samfile.header.n_targets == 0:
 773                 raise ValueError( "fetch called for samfile without header")
 774
 775             if region != None:
 776                 raise ValueError ("fetch for a region is not available for sam files" )
 777             if callback:
 778                 raise NotImplementedError( "callback not implemented yet" )
 779             else:
 780                 return IteratorRowAll( self )
 781
 782     def mate( self,
 783               AlignedRead read ):
 784         '''return the mate of :class:`AlignedRead` *read*.
 785
 786         Throws a ValueError if read is unpaired or the mate
 787         is unmapped.
 788
 789         .. note::
 790             Calling this method will change the file position.
 791             This might interfere with any iterators that have
 792             not re-opened the file.
 793
 794         '''
 795         cdef uint32_t flag = read._delegate.core.flag
 796
 797         if flag & BAM_FPAIRED == 0:
 798             raise ValueError( "read %s: is unpaired" % (read.qname))
 799         if flag & BAM_FMUNMAP != 0:
 800             raise ValueError( "mate %s: is unmapped" % (read.qname))
 801
 802         cdef MateData mate_data
 803
 804         mate_data.name = <char *>bam1_qname(read._delegate)
 805         mate_data.mate = NULL
 806         # xor flags to get the other mate
 807         cdef int x = BAM_FREAD1 + BAM_FREAD2
 808         mate_data.flag = ( flag ^ x) & x
 809
 810         bam_fetch(self.samfile.x.bam,
 811                   self.index,
 812                   read._delegate.core.mtid,
 813                   read._delegate.core.mpos,
 814                   read._delegate.core.mpos + 1,
 815                   <void*>&mate_data,
 816                   mate_callback )
 817
 818         if mate_data.mate == NULL:
 819             raise ValueError( "mate not found" )
 820
 821         cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
 822         dest._delegate = mate_data.mate
 823         return dest
 824
 825     def count( self,
 826                reference = None,
 827                start = None,
 828                end = None,
 829                region = None,
 830                until_eof = False ):
 831         '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
 832
 833         count  reads :term:`region` using 0-based indexing. The region is specified by
 834         :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
 835
 836         Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
 837         an exception is raised.
 838         '''
 839         cdef int rtid
 840         cdef int rstart
 841         cdef int rend
 842
 843         if not self._isOpen():
 844             raise ValueError( "I/O operation on closed file" )
 845
 846         region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
 847
 848         cdef int counter
 849         counter = 0;
 850
 851         if self.isbam:
 852             if not until_eof and not self._hasIndex() and not self.isremote:
 853                 raise ValueError( "fetch called on bamfile without index" )
 854
 855             if not region:
 856                 raise ValueError( "counting functionality requires a region/reference" )
 857             if not self._hasIndex(): raise ValueError( "no index available for fetch" )
 858             bam_fetch(self.samfile.x.bam,
 859                              self.index,
 860                              rtid,
 861                              rstart,
 862                              rend,
 863                              <void*>&counter,
 864                              count_callback )
 865             return counter
 866         else:
 867             raise ValueError ("count for a region is not available for sam files" )
 868
 869     def pileup( self,
 870                 reference = None,
 871                 start = None,
 872                 end = None,
 873                 region = None,
 874                 callback = None,
 875                 **kwargs ):
 876         '''
 877         perform a :term:`pileup` within a :term:`region`. The region is specified by
 878         :term:`reference`, *start* and *end* (using 0-based indexing).
 879         Alternatively, a samtools *region* string can be supplied.
 880
 881         Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
 882         ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
 883
 884         The method returns an iterator of type :class:`pysam.IteratorColumn` unless
 885         a *callback is provided. If a *callback* is given, the callback will be executed
 886         for each column within the :term:`region`.
 887
 888         Note that :term:`SAM` formatted files do not allow random access.
 889         In these files, if a *region* or *reference* are given an exception is raised.
 890
 891         Optional *kwargs* to the iterator:
 892
 893         stepper
 894            The stepper controlls how the iterator advances.
 895            Possible options for the stepper are
 896
 897            ``all``
 898               use all reads for pileup.
 899            ``samtools``
 900               same filter and read processing as in :term:`csamtools` pileup
 901
 902         fastafile
 903            A :class:`FastaFile` object
 904
 905          mask
 906            Skip all reads with bits set in mask.
 907
 908
 909         .. note::
 910
 911             *all* reads which overlap the region are returned. The first base returned will be the
 912             first base of the first read *not* necessarily the first base of the region used in the query.
 913
 914             The maximum number of reads considered for pileup is *8000*. This limit is set by
 915             :term:`csamtools`.
 916
 917         '''
 918         cdef int rtid, rstart, rend, has_coord
 919         cdef bam_plbuf_t *buf
 920
 921         if not self._isOpen():
 922             raise ValueError( "I/O operation on closed file" )
 923
 924         has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
 925
 926         if self.isbam:
 927             if not self._hasIndex(): raise ValueError( "no index available for pileup" )
 928
 929             if callback:
 930                 if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
 931
 932                 buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
 933                 bam_fetch(self.samfile.x.bam,
 934                           self.index, rtid, rstart, rend,
 935                           buf, pileup_fetch_callback )
 936
 937                 # finalize pileup
 938                 bam_plbuf_push( NULL, buf)
 939                 bam_plbuf_destroy(buf)
 940             else:
 941                 if has_coord:
 942                     return IteratorColumnRegion( self,
 943                                                  tid = rtid,
 944                                                  start = rstart,
 945                                                  end = rend,
 946                                                  **kwargs )
 947                 else:
 948                     return IteratorColumnAllRefs(self, **kwargs )
 949
 950         else:
 951             raise NotImplementedError( "pileup of samfiles not implemented yet" )
 952
 953     def close( self ):
 954         '''
 955         closes the :class:`pysam.Samfile`.'''
 956         if self.samfile != NULL:
 957             samclose( self.samfile )
 958             bam_index_destroy(self.index);
 959             self.samfile = NULL
 960
 961     def __dealloc__( self ):
 962         # remember: dealloc cannot call other methods
 963         # note: no doc string
 964         # note: __del__ is not called.
 965         self.close()
 966         bam_destroy1(self.b)
 967         if self._filename != NULL: free( self._filename )
 968
 969     cpdef int write( self, AlignedRead read ) except -1:
 970         '''
 971         write a single :class:`pysam.AlignedRead` to disk.
 972
 973         returns the number of bytes written.
 974         '''
 975         if not self._isOpen():
 976             return 0
 977
 978         return samwrite( self.samfile, read._delegate )
 979
 980     def __enter__(self):
 981         return self
 982
 983     def __exit__(self, exc_type, exc_value, traceback):
 984         self.close()
 985         return False
 986
 987     ###############################################################
 988     ###############################################################
 989     ###############################################################
 990     ## properties
 991     ###############################################################
 992     property filename:
 993         '''number of :term:`filename` associated with this object.'''
 994         def __get__(self):
 995             if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
 996             return self._filename
 997
 998     property nreferences:
 999         '''number of :term:`reference` sequences in the file.'''
1000         def __get__(self):
1001             if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1002             return self.samfile.header.n_targets
1003
1004     property references:
1005         """tuple with the names of :term:`reference` sequences."""
1006         def __get__(self):
1007             if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1008             t = []
1009             for x from 0 <= x < self.samfile.header.n_targets:
1010                 t.append( self.samfile.header.target_name[x] )
1011             return tuple(t)
1012
1013     property lengths:
1014         """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
1015         :attr:`pysam.Samfile.references`
1016         """
1017         def __get__(self):
1018             if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1019             t = []
1020             for x from 0 <= x < self.samfile.header.n_targets:
1021                 t.append( self.samfile.header.target_len[x] )
1022             return tuple(t)
1023
1024     property text:
1025         '''full contents of the :term:`sam file` header as a string.'''
1026         def __get__(self):
1027             if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1028             return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
1029
1030     property header:
1031         '''header information within the :term:`sam file`. The records and fields are returned as
1032         a two-level dictionary.
1033         '''
1034         def __get__(self):
1035             if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1036
1037             result = {}
1038
1039             if self.samfile.header.text != NULL:
1040                 # convert to python string (note: call self.text to create 0-terminated string)
1041                 t = self.text
1042                 for line in t.split("\n"):
1043                     if not line.strip(): continue
1044                     assert line.startswith("@"), "header line without '@': '%s'" % line
1045                     fields = line[1:].split("\t")
1046                     record = fields[0]
1047                     assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)
1048
1049                     # treat comments
1050                     if record == "CO":
1051                         if record not in result: result[record] = []
1052                         result[record].append( "\t".join( fields[1:] ) )
1053                         continue
1054
1055                     # the following is clumsy as generators do not work?
1056                     x = {}
1057                     for field in fields[1:]:
1058                         key, value = field.split(":",1)
1059                         # uppercase keys must be valid
1060                         # lowercase are permitted for user fields
1061                         if key in VALID_HEADER_FIELDS[record]:
1062                             x[key] = VALID_HEADER_FIELDS[record][key](value)
1063                         elif not key.isupper():
1064                             x[key] = value
1065                         else:
1066                             raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
1067
1068                     if VALID_HEADER_TYPES[record] == dict:
1069                         if record in result:
1070                             raise ValueError( "multiple '%s' lines are not permitted" % record )
1071                         result[record] = x
1072                     elif VALID_HEADER_TYPES[record] == list:
1073                         if record not in result: result[record] = []
1074                         result[record].append( x )
1075
1076             return result
1077
1078     def _buildLine( self, fields, record ):
1079         '''build a header line from *fields* dictionary for *record*'''
1080
1081         # TODO: add checking for field and sort order
1082         line = ["@%s" % record ]
1083         if record == "CO":
1084             line.append( fields )
1085         else:
1086             # write fields of the specification
1087             for key in VALID_HEADER_ORDER[record]:
1088                 if key in fields:
1089                     line.append( "%s:%s" % (key, str(fields[key])))
1090             # write user fields
1091             for key in fields:
1092                 if not key.isupper():
1093                     line.append( "%s:%s" % (key, str(fields[key])))
1094
1095         return "\t".join( line )
1096
1097     cdef bam_header_t * _buildHeader( self, new_header ):
1098         '''return a new header built from a dictionary in *new_header*.
1099
1100         This method inserts the text field, target_name and target_len.
1101         '''
1102
1103         lines = []
1104
1105         # check if hash exists
1106
1107         # create new header and copy old data
1108         cdef bam_header_t * dest
1109
1110         dest = bam_header_init()
1111
1112         for record in VALID_HEADERS:
1113             if record in new_header:
1114                 ttype = VALID_HEADER_TYPES[record]
1115                 data = new_header[record]
1116                 if type( data ) != type( ttype() ):
1117                     raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
1118                 if type( data ) == types.DictType:
1119                     lines.append( self._buildLine( data, record ) )
1120                 else:
1121                     for fields in new_header[record]:
1122                         lines.append( self._buildLine( fields, record ) )
1123
1124         text = "\n".join(lines) + "\n"
1125         if dest.text != NULL: free( dest.text )
1126         dest.text = <char*>calloc( len(text), sizeof(char))
1127         dest.l_text = len(text)
1128         strncpy( dest.text, text, dest.l_text )
1129
1130         # collect targets
1131         if "SQ" in new_header:
1132             seqs = []
1133             for fields in new_header["SQ"]:
1134                 try:
1135                     seqs.append( (fields["SN"], fields["LN"] ) )
1136                 except KeyError:
1137                     raise KeyError( "incomplete sequence information in '%s'" % str(fields))
1138
1139             dest.n_targets = len(seqs)
1140             dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
1141             dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
1142
1143             for x from 0 <= x < dest.n_targets:
1144                 seqname, seqlen = seqs[x]
1145                 dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
1146                 strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
1147                 dest.target_len[x] = seqlen
1148
1149         return dest
1150
1151     ###############################################################
1152     ###############################################################
1153     ###############################################################
1154     ## file-object like iterator access
1155     ## note: concurrent access will cause errors (see IteratorRow
1156     ## and reopen)
1157     ## Possible solutions: deprecate or open new file handle
1158     ###############################################################
1159     def __iter__(self):
1160         if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
1161         return self
1162
1163     cdef bam1_t * getCurrent( self ):
1164         return self.b
1165
1166     cdef int cnext(self):
1167         '''
1168         cversion of iterator. Used by :class:`pysam.Samfile.IteratorColumn`.
1169         '''
1170         cdef int ret
1171         return samread(self.samfile, self.b)
1172
1173     def __next__(self):
1174         """
1175         python version of next().
1176         """
1177         cdef int ret
1178         ret = samread(self.samfile, self.b)
1179         if (ret > 0):
1180             return makeAlignedRead( self.b )
1181         else:
1182             raise StopIteration
1183
1184 ##-------------------------------------------------------------------
1185 ##-------------------------------------------------------------------
1186 ##-------------------------------------------------------------------
1187 cdef class IteratorRow:
1188     '''abstract base class for iterators over mapped reads.
1189
1190     Various iterators implement different behaviours for wrapping around
1191     contig boundaries. Examples include:
1192
1193     :class:`pysam.IteratorRowRegion`
1194         iterate within a single contig and a defined region.
1195
1196     :class:`pysam.IteratorRowAll`
1197         iterate until EOF. This iterator will also include unmapped reads.
1198
1199     :class:`pysam.IteratorRowAllRefs`
1200         iterate over all reads in all reference sequences.
1201
1202     The method :meth:`Samfile.fetch` returns an IteratorRow.
1203     '''
1204     pass
1205
1206 cdef class IteratorRowRegion(IteratorRow):
1207     """*(Samfile samfile, int tid, int beg, int end, int reopen = True )*
1208
1209     iterate over mapped reads in a region.
1210
1211     By default, the file is re-openend to avoid conflicts between
1212     multiple iterators working on the same file. Set *reopen* = False
1213     to not re-open *samfile*.
1214
1215     The samtools iterators assume that the file
1216     position between iterations do not change.
1217     As a consequence, no two iterators can work
1218     on the same file. To permit this, each iterator
1219     creates its own file handle by re-opening the
1220     file.
1221
1222     Note that the index will be shared between
1223     samfile and the iterator.
1224     """
1225
1226     cdef bam_iter_t             iter # iterator state object
1227     cdef bam1_t *               b
1228     cdef int                    retval
1229     cdef Samfile                samfile
1230     cdef samfile_t              * fp
1231     # true if samfile belongs to this object
1232     cdef int owns_samfile
1233
1234     def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):
1235
1236         if not samfile._isOpen():
1237             raise ValueError( "I/O operation on closed file" )
1238
1239         if not samfile._hasIndex():
1240             raise ValueError( "no index available for iteration" )
1241
1242         # makes sure that samfile stays alive as long as the
1243         # iterator is alive
1244         self.samfile = samfile
1245
1246         if samfile.isbam: mode = "rb"
1247         else: mode = "r"
1248
1249         # reopen the file - note that this makes the iterator
1250         # slow and causes pileup to slow down significantly.
1251         if reopen:
1252             store = StderrStore()
1253             self.fp = samopen( samfile._filename, mode, NULL )
1254             store.release()
1255             assert self.fp != NULL
1256             self.owns_samfile = True
1257         else:
1258             self.fp = self.samfile.samfile
1259             self.owns_samfile = False
1260
1261         self.retval = 0
1262
1263         self.iter = bam_iter_query(self.samfile.index,
1264                                    tid,
1265                                    beg,
1266                                    end)
1267         self.b = bam_init1()
1268
1269     def __iter__(self):
1270         return self
1271
1272     cdef bam1_t * getCurrent( self ):
1273         return self.b
1274
1275     cdef int cnext(self):
1276         '''cversion of iterator. Used by IteratorColumn'''
1277         self.retval = bam_iter_read( self.fp.x.bam,
1278                                      self.iter,
1279                                      self.b)
1280
1281     def __next__(self):
1282         """python version of next().
1283         """
1284         self.cnext()
1285         if self.retval < 0: raise StopIteration
1286         return makeAlignedRead( self.b )
1287
1288     def __dealloc__(self):
1289         bam_destroy1(self.b)
1290         if self.owns_samfile: samclose( self.fp )
1291
1292 cdef class IteratorRowAll(IteratorRow):
1293     """*(Samfile samfile, int reopen = True)*
1294
1295     iterate over all reads in *samfile*
1296
1297     By default, the file is re-openend to avoid conflicts between
1298     multiple iterators working on the same file. Set *reopen* = False
1299     to not re-open *samfile*.
1300     """
1301
1302     cdef bam1_t * b
1303     cdef samfile_t * fp
1304     # true if samfile belongs to this object
1305     cdef int owns_samfile
1306
1307     def __cinit__(self, Samfile samfile, int reopen = True ):
1308
1309         if not samfile._isOpen():
1310             raise ValueError( "I/O operation on closed file" )
1311
1312         if samfile.isbam: mode = "rb"
1313         else: mode = "r"
1314
1315         # reopen the file to avoid iterator conflict
1316         if reopen:
1317             store = StderrStore()
1318             self.fp = samopen( samfile._filename, mode, NULL )
1319             store.release()
1320             assert self.fp != NULL
1321             self.owns_samfile = True
1322         else:
1323             self.fp = samfile.samfile
1324             self.owns_samfile = False
1325
1326         # allocate memory for alignment
1327         self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
1328
1329     def __iter__(self):
1330         return self
1331
1332     cdef bam1_t * getCurrent( self ):
1333         return self.b
1334
1335     cdef int cnext(self):
1336         '''cversion of iterator. Used by IteratorColumn'''
1337         cdef int ret
1338         return samread(self.fp, self.b)
1339
1340     def __next__(self):
1341         """python version of next().
1342
1343         pyrex uses this non-standard name instead of next()
1344         """
1345         cdef int ret
1346         ret = samread(self.fp, self.b)
1347         if (ret > 0):
1348             return makeAlignedRead( self.b )
1349         else:
1350             raise StopIteration
1351
1352     def __dealloc__(self):
1353         bam_destroy1(self.b)
1354         if self.owns_samfile: samclose( self.fp )
1355
1356 cdef class IteratorRowAllRefs(IteratorRow):
1357     """iterates over all mapped reads by chaining iterators over each reference
1358     """
1359     cdef Samfile     samfile
1360     cdef int         tid
1361     cdef IteratorRowRegion rowiter
1362
1363     def __cinit__(self, Samfile samfile):
1364         assert samfile._isOpen()
1365         if not samfile._hasIndex(): raise ValueError("no index available for fetch")
1366         self.samfile = samfile
1367         self.tid = -1
1368
1369     def nextiter(self):
1370         self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, 1<<29)
1371
1372     def __iter__(self):
1373         return self
1374
1375     def __next__(self):
1376         """python version of next().
1377
1378         pyrex uses this non-standard name instead of next()
1379         """
1380         # Create an initial iterator
1381         if self.tid==-1:
1382             if not self.samfile.nreferences:
1383                 raise StopIteration
1384             self.tid = 0
1385             self.nextiter()
1386
1387         while 1:
1388             self.rowiter.cnext()
1389
1390             # If current iterator is not exhausted, return aligned read
1391             if self.rowiter.retval>0:
1392                 return makeAlignedRead(self.rowiter.b)
1393
1394             self.tid += 1
1395
1396             # Otherwise, proceed to next reference or stop
1397             if self.tid<self.samfile.nreferences:
1398                 self.nextiter()
1399             else:
1400                 raise StopIteration
1401
1402 cdef class IteratorRowSelection(IteratorRow):
1403     """*(Samfile samfile)*
1404
1405     iterate over reads in *samfile* at a given list of file positions.
1406     """
1407
1408     cdef bam1_t * b
1409     cdef int current_pos
1410     cdef samfile_t * fp
1411     cdef positions
1412     # true if samfile belongs to this object
1413     cdef int owns_samfile
1414
1415     def __cinit__(self, Samfile samfile, positions, int reopen = True ):
1416
1417         if not samfile._isOpen():
1418             raise ValueError( "I/O operation on closed file" )
1419
1420         if not samfile._isOpen():
1421             raise ValueError( "I/O operation on closed file" )
1422
1423         assert samfile.isbam, "can only use this iterator on bam files"
1424         mode = "rb"
1425
1426         # reopen the file to avoid iterator conflict
1427         if reopen:
1428             store = StderrStore()
1429             self.fp = samopen( samfile._filename, mode, NULL )
1430             store.release()
1431             assert self.fp != NULL
1432             self.owns_samfile = True
1433         else:
1434             self.fp = samfile.samfile
1435             self.owns_samfile = False
1436
1437         # allocate memory for alignment
1438         self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
1439
1440         self.positions = positions
1441         self.current_pos = 0
1442
1443     def __iter__(self):
1444         return self
1445
1446     cdef bam1_t * getCurrent( self ):
1447         return self.b
1448
1449     cdef int cnext(self):
1450         '''cversion of iterator'''
1451
1452         # end iteration if out of positions
1453         if self.current_pos >= len(self.positions): return -1
1454
1455         bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
1456         self.current_pos += 1
1457         return samread(self.fp, self.b)
1458
1459     def __next__(self):
1460         """python version of next().
1461
1462         pyrex uses this non-standard name instead of next()
1463         """
1464
1465         cdef int ret = self.cnext()
1466         if (ret > 0):
1467             return makeAlignedRead( self.b )
1468         else:
1469             raise StopIteration
1470
1471     def __dealloc__(self):
1472         bam_destroy1(self.b)
1473         if self.owns_samfile: samclose( self.fp )
1474
1475 ##-------------------------------------------------------------------
1476 ##-------------------------------------------------------------------
1477 ##-------------------------------------------------------------------
1478 ctypedef struct __iterdata:
1479     samfile_t * samfile
1480     bam_iter_t iter
1481     faidx_t * fastafile
1482     int tid
1483     char * seq
1484     int seq_len
1485
1486 cdef int __advance_all( void * data, bam1_t * b ):
1487     '''advance without any read filtering.
1488     '''
1489     cdef __iterdata * d
1490     d = <__iterdata*>data
1491     return bam_iter_read( d.samfile.x.bam, d.iter, b )
1492
1493 cdef int __advance_snpcalls( void * data, bam1_t * b ):
1494     '''advance using same filter and read processing as in
1495     the samtools pileup.
1496     '''
1497     cdef __iterdata * d
1498     d = <__iterdata*>data
1499
1500     cdef int ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
1501     cdef int skip = 0
1502     cdef int q
1503     cdef int is_cns = 1
1504     cdef int is_nobaq = 0
1505     cdef int capQ_thres = 0
1506
1507     # reload sequence
1508     if d.fastafile != NULL and b.core.tid != d.tid:
1509         if d.seq != NULL: free(d.seq)
1510         d.tid = b.core.tid
1511         d.seq = faidx_fetch_seq(d.fastafile,
1512                                 d.samfile.header.target_name[d.tid],
1513                                 0, max_pos,
1514                                 &d.seq_len)
1515         if d.seq == NULL:
1516             raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
1517                                   (d.samfile.header.target_name[d.tid],
1518                                    d.tid))
1519
1520
1521     while ret >= 0:
1522
1523         skip = 0
1524
1525         # realign read - changes base qualities
1526         if d.seq != NULL and is_cns and not is_nobaq: bam_prob_realn( b, d.seq )
1527
1528         if d.seq != NULL and capQ_thres > 10:
1529             q = bam_cap_mapQ(b, d.seq, capQ_thres)
1530             if q < 0: skip = 1
1531             elif b.core.qual > q: b.core.qual = q
1532         if b.core.flag & BAM_FUNMAP: skip = 1
1533         elif b.core.flag & 1 and not b.core.flag & 2: skip = 1
1534
1535         if not skip: break
1536         # additional filters
1537
1538         ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
1539
1540     return ret
1541
1542 cdef class IteratorColumn:
1543     '''abstract base class for iterators over columns.
1544
1545     IteratorColumn objects wrap the pileup functionality of samtools.
1546
1547     For reasons of efficiency, the iterator points to the current
1548     pileup buffer. The pileup buffer is updated at every iteration.
1549     This might cause some unexpected behavious. For example,
1550     consider the conversion to a list::
1551
1552        f = Samfile("file.bam", "rb")
1553        result = list( f.pileup() )
1554
1555     Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
1556     but each object in ``result`` will contain the same information.
1557
1558     The desired behaviour can be achieved by list comprehension::
1559
1560        result = [ x.pileups() for x in f.pileup() ]
1561
1562     ``result`` will be a list of ``n`` lists of objects of type :class:`PileupRead`.
1563
1564     If the iterator is associated with a :class:`Fastafile` using the :meth:`addReference`
1565     method, then the iterator will export the current sequence via the methods :meth:`getSequence`
1566     and :meth:`seq_len`.
1567
1568     Optional kwargs to the iterator
1569
1570     stepper
1571        The stepper controlls how the iterator advances.
1572        Possible options for the stepper are
1573
1574        all
1575            use all reads for pileup.
1576        samtools
1577            same filter and read processing as in :term:`csamtools` pileup
1578     fastafile
1579        A :class:`FastaFile` object
1580     mask
1581        Skip all reads with bits set in mask.
1582
1583
1584     '''
1585
1586     # result of the last plbuf_push
1587     cdef IteratorRowRegion iter
1588     cdef int tid
1589     cdef int pos
1590     cdef int n_plp
1591     cdef int mask
1592     cdef const_bam_pileup1_t_ptr plp
1593     cdef bam_plp_t pileup_iter
1594     cdef __iterdata iterdata
1595     cdef Samfile samfile
1596     cdef Fastafile fastafile
1597     cdef stepper
1598
1599     def __cinit__( self, Samfile samfile, **kwargs ):
1600         self.samfile = samfile
1601         self.mask = kwargs.get("mask", BAM_DEF_MASK )
1602         self.fastafile = kwargs.get( "fastafile", None )
1603         self.stepper = kwargs.get( "stepper", None )
1604         self.iterdata.seq = NULL
1605         self.tid = 0
1606         self.pos = 0
1607         self.n_plp = 0
1608         self.plp = NULL
1609         self.pileup_iter = <bam_plp_t>NULL
1610
1611     def __iter__(self):
1612         return self
1613
1614     cdef int cnext(self):
1615         '''perform next iteration.
1616
1617         This method is analogous to the samtools bam_plp_auto method.
1618         It has been re-implemented to permit for filtering.
1619         '''
1620         self.plp = bam_plp_auto( self.pileup_iter,
1621                                  &self.tid,
1622                                  &self.pos,
1623                                  &self.n_plp )
1624
1625     cdef char * getSequence( self ):
1626         '''return current reference sequence underlying the iterator.
1627         '''
1628         return self.iterdata.seq
1629
1630     property seq_len:
1631         '''current sequence length.'''
1632         def __get__(self): return self.iterdata.seq_len
1633
1634     def addReference( self, Fastafile fastafile ):
1635        '''
1636        add reference sequences in *fastafile* to iterator.'''
1637        self.fastafile = fastafile
1638        if self.iterdata.seq != NULL: free(self.iterdata.seq)
1639        self.iterdata.tid = -1
1640        self.iterdata.fastafile = self.fastafile.fastafile
1641
1642     def hasReference( self ):
1643         '''
1644         return true if iterator is associated with a reference'''
1645         return self.fastafile
1646
1647     cdef setMask( self, mask ):
1648         '''set masking flag in iterator.
1649
1650         reads with bits set in *mask* will be skipped.
1651         '''
1652         self.mask = mask
1653         bam_plp_set_mask( self.pileup_iter, self.mask )
1654
1655     cdef setupIteratorData( self,
1656                             int tid,
1657                             int start,
1658                             int end,
1659                             int reopen = 0 ):
1660         '''setup the iterator structure'''
1661
1662         self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen )
1663         self.iterdata.samfile = self.samfile.samfile
1664         self.iterdata.iter = self.iter.iter
1665         self.iterdata.seq = NULL
1666         self.iterdata.tid = -1
1667
1668         if self.fastafile != None:
1669             self.iterdata.fastafile = self.fastafile.fastafile
1670         else:
1671             self.iterdata.fastafile = NULL
1672
1673         if self.stepper == None or self.stepper == "all":
1674             self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
1675         elif self.stepper == "samtools":
1676             self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
1677         else:
1678             raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)
1679
1680         bam_plp_set_mask( self.pileup_iter, self.mask )
1681
1682     cdef reset( self, tid, start, end ):
1683         '''reset iterator position.
1684
1685         This permits using the iterator multiple times without
1686         having to incur the full set-up costs.
1687         '''
1688         self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
1689         self.iterdata.iter = self.iter.iter
1690
1691         # invalidate sequence if different tid
1692         if self.tid != tid:
1693             if self.iterdata.seq != NULL: free( self.iterdata.seq )
1694             self.iterdata.seq = NULL
1695             self.iterdata.tid = -1
1696
1697         # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
1698         bam_plp_reset(self.pileup_iter)
1699
1700     def __dealloc__(self):
1701         # reset in order to avoid memory leak messages for iterators that have
1702         # not been fully consumed
1703         if self.pileup_iter != <bam_plp_t>NULL:
1704             bam_plp_reset(self.pileup_iter)
1705             bam_plp_destroy(self.pileup_iter)
1706             self.pileup_iter = <bam_plp_t>NULL
1707
1708         if self.iterdata.seq != NULL:
1709             free(self.iterdata.seq)
1710             self.iterdata.seq = NULL
1711
1712 cdef class IteratorColumnRegion(IteratorColumn):
1713     '''iterates over a region only.
1714     '''
1715     def __cinit__(self, Samfile samfile,
1716                   int tid = 0,
1717                   int start = 0,
1718                   int end = max_pos,
1719                   **kwargs ):
1720
1721         # initialize iterator
1722         self.setupIteratorData( tid, start, end, 1 )
1723
1724     def __next__(self):
1725         """python version of next().
1726         """
1727
1728         while 1:
1729             self.cnext()
1730             if self.n_plp < 0:
1731                 raise ValueError("error during iteration" )
1732
1733             if self.plp == NULL:
1734                 raise StopIteration
1735
1736             return makePileupProxy( <bam_pileup1_t*>self.plp,
1737                                      self.tid,
1738                                      self.pos,
1739                                      self.n_plp )
1740
1741 cdef class IteratorColumnAllRefs(IteratorColumn):
1742     """iterates over all columns by chaining iterators over each reference
1743     """
1744
1745     def __cinit__(self,
1746                   Samfile samfile,
1747                   **kwargs ):
1748
1749         # no iteration over empty files
1750         if not samfile.nreferences: raise StopIteration
1751
1752         # initialize iterator
1753         self.setupIteratorData( self.tid, 0, max_pos, 1 )
1754
1755     def __next__(self):
1756         """python version of next().
1757         """
1758
1759         while 1:
1760             self.cnext()
1761
1762             if self.n_plp < 0:
1763                 raise ValueError("error during iteration" )
1764
1765             # return result, if within same reference
1766             if self.plp != NULL:
1767                 return makePileupProxy( <bam_pileup1_t*>self.plp,
1768                                          self.tid,
1769                                          self.pos,
1770                                          self.n_plp )
1771
1772             # otherwise, proceed to next reference or stop
1773             self.tid += 1
1774             if self.tid < self.samfile.nreferences:
1775                 self.setupIteratorData( self.tid, 0, max_pos, 0 )
1776             else:
1777                 raise StopIteration
1778
1779 ##-------------------------------------------------------------------
1780 ##-------------------------------------------------------------------
1781 ##-------------------------------------------------------------------
1782 cdef inline int32_t query_start(bam1_t *src) except -1:
1783     cdef uint32_t * cigar_p, op
1784     cdef uint32_t k
1785     cdef uint32_t start_offset = 0
1786
1787     if src.core.n_cigar:
1788         cigar_p = bam1_cigar(src);
1789         for k from 0 <= k < src.core.n_cigar:
1790             op = cigar_p[k] & BAM_CIGAR_MASK
1791             if op==BAM_CHARD_CLIP:
1792                 if start_offset!=0 and start_offset!=src.core.l_qseq:
1793                     PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
1794                     return -1
1795             elif op==BAM_CSOFT_CLIP:
1796                 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
1797             else:
1798                 break
1799
1800     return start_offset
1801
1802 ##-------------------------------------------------------------------
1803 ##-------------------------------------------------------------------
1804 ##-------------------------------------------------------------------
1805 cdef inline int32_t query_end(bam1_t *src) except -1:
1806     cdef uint32_t * cigar_p, op
1807     cdef uint32_t k
1808     cdef uint32_t end_offset = src.core.l_qseq
1809
1810     if src.core.n_cigar>1:
1811         cigar_p = bam1_cigar(src);
1812         for k from src.core.n_cigar > k >= 1:
1813             op = cigar_p[k] & BAM_CIGAR_MASK
1814             if op==BAM_CHARD_CLIP:
1815                 if end_offset!=0 and end_offset!=src.core.l_qseq:
1816                     PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
1817                     return -1
1818             elif op==BAM_CSOFT_CLIP:
1819                 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
1820             else:
1821                 break
1822
1823     if end_offset==0:
1824         end_offset = src.core.l_qseq
1825
1826     return end_offset
1827
1828
1829 cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
1830     cdef uint8_t * p
1831     cdef uint32_t k
1832     cdef char * s
1833
1834     if not src.core.l_qseq:
1835         return None
1836
1837     seq = PyString_FromStringAndSize(NULL, end-start)
1838     s   = PyString_AS_STRING(seq)
1839     p   = bam1_seq(src)
1840
1841     for k from start <= k < end:
1842         # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
1843         # note: do not use string literal as it will be a python string
1844         s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
1845
1846     return seq
1847
1848
1849 cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
1850     cdef uint8_t * p
1851     cdef uint32_t k
1852     cdef char * q
1853
1854     p = bam1_qual(src)
1855     if p[0] == 0xff:
1856         return None
1857
1858     qual = PyString_FromStringAndSize(NULL, end-start)
1859     q    = PyString_AS_STRING(qual)
1860
1861     for k from start <= k < end:
1862         ## equivalent to t[i] + 33 (see bam.c)
1863         q[k-start] = p[k] + 33
1864
1865     return qual
1866
1867 cdef class AlignedRead:
1868     '''
1869     Class representing an aligned read. see SAM format specification for
1870     the meaning of fields (http://samtools.sourceforge.net/).
1871
1872     This class stores a handle to the samtools C-structure representing
1873     an aligned read. Member read access is forwarded to the C-structure
1874     and converted into python objects. This implementation should be fast,
1875     as only the data needed is converted.
1876
1877     For write access, the C-structure is updated in-place. This is
1878     not the most efficient way to build BAM entries, as the variable
1879     length data is concatenated and thus needs to resized if
1880     a field is updated. Furthermore, the BAM entry might be
1881     in an inconsistent state. The :meth:`~validate` method can
1882     be used to check if an entry is consistent.
1883
1884     One issue to look out for is that the sequence should always
1885     be set *before* the quality scores. Setting the sequence will
1886     also erase any quality scores that were set previously.
1887     '''
1888
1889     # Now only called when instances are created from Python
1890     def __init__(self):
1891         # see bam_init1
1892         self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
1893         # allocate some memory
1894         # If size is 0, calloc does not return a pointer that can be passed to free()
1895         # so allocate 40 bytes for a new read
1896         self._delegate.m_data = 40
1897         self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
1898         self._delegate.data_len = 0
1899
1900     def __dealloc__(self):
1901         bam_destroy1(self._delegate)
1902
1903     def __str__(self):
1904         """return string representation of alignment.
1905
1906         The representation is an approximate :term:`sam` format.
1907
1908         An aligned read might not be associated with a :term:`Samfile`.
1909         As a result :term:`tid` is shown instead of the reference name.
1910
1911         Similarly, the tags field is returned in its parsed state.
1912         """
1913         # sam-parsing is done in sam.c/bam_format1_core which
1914         # requires a valid header.
1915         return "\t".join(map(str, (self.qname,
1916                                    self.flag,
1917                                    self.rname,
1918                                    self.pos,
1919                                    self.mapq,
1920                                    self.cigar,
1921                                    self.mrnm,
1922                                    self.mpos,
1923                                    self.rlen,
1924                                    self.seq,
1925                                    self.qual,
1926                                    self.tags )))
1927
1928     def compare(self, AlignedRead other):
1929         '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
1930
1931         cdef int retval, x
1932         cdef bam1_t *t, *o
1933
1934         t = self._delegate
1935         o = other._delegate
1936
1937         # uncomment for debugging purposes
1938         # cdef unsigned char * oo, * tt
1939         # tt = <unsigned char*>(&t.core)
1940         # oo = <unsigned char*>(&o.core)
1941         # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
1942         # tt = <unsigned char*>(t.data)
1943         # oo = <unsigned char*>(o.data)
1944         # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
1945
1946         # Fast-path test for object identity
1947         if t==o:
1948             return 0
1949
1950         retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
1951
1952         if retval: return retval
1953         retval = cmp(t.data_len, o.data_len)
1954         if retval: return retval
1955         return memcmp(t.data, o.data, t.data_len)
1956
1957     # Disabled so long as __cmp__ is a special method
1958     def __hash__(self):
1959         return _Py_HashPointer(<void *>self)
1960
1961     property qname:
1962         """the query name (None if not present)"""
1963         def __get__(self):
1964             cdef bam1_t * src
1965             src = self._delegate
1966             if src.core.l_qname == 0: return None
1967             return <char *>bam1_qname( src )
1968
1969         def __set__(self, qname ):
1970             if qname == None or len(qname) == 0: return
1971             cdef bam1_t * src
1972             cdef int l
1973             cdef char * p
1974
1975             src = self._delegate
1976             p = bam1_qname( src )
1977
1978             # the qname is \0 terminated
1979             l = len(qname) + 1
1980             pysam_bam_update( src,
1981                               src.core.l_qname,
1982                               l,
1983                               <uint8_t*>p )
1984
1985             src.core.l_qname = l
1986
1987             # re-acquire pointer to location in memory
1988             # as it might have moved
1989             p = bam1_qname(src)
1990
1991             strncpy( p, qname, l )
1992
1993     property cigar:
1994         """the :term:`cigar` alignment (None if not present).
1995         """
1996         def __get__(self):
1997             cdef uint32_t * cigar_p
1998             cdef bam1_t * src
1999             cdef op, l, cigar
2000             cdef int k
2001
2002             src = self._delegate
2003             if src.core.n_cigar == 0: return None
2004
2005             cigar = []
2006             cigar_p = bam1_cigar(src);
2007             for k from 0 <= k < src.core.n_cigar:
2008                 op = cigar_p[k] & BAM_CIGAR_MASK
2009                 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2010                 cigar.append((op, l))
2011             return cigar
2012
2013         def __set__(self, values ):
2014             if values == None or len(values) == 0: return
2015             cdef uint32_t * p
2016             cdef bam1_t * src
2017             cdef op, l
2018             cdef int k
2019
2020             k = 0
2021
2022             src = self._delegate
2023
2024             # get location of cigar string
2025             p = bam1_cigar(src)
2026
2027             # create space for cigar data within src.data
2028             pysam_bam_update( src,
2029                               src.core.n_cigar * 4,
2030                               len(values) * 4,
2031                               <uint8_t*>p )
2032
2033             # length is number of cigar operations, not bytes
2034             src.core.n_cigar = len(values)
2035
2036             # re-acquire pointer to location in memory
2037             # as it might have moved
2038             p = bam1_cigar(src)
2039
2040             # insert cigar operations
2041             for op, l in values:
2042                 p[k] = l << BAM_CIGAR_SHIFT | op
2043                 k += 1
2044
2045             ## setting the cigar string also updates the "bin" attribute
2046             src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
2047
2048     property seq:
2049         """read sequence bases, including :term:`soft clipped` bases (None if not present)"""
2050         def __get__(self):
2051             cdef bam1_t * src
2052             cdef char * s
2053             src = self._delegate
2054
2055             if src.core.l_qseq == 0: return None
2056
2057             return get_seq_range(src, 0, src.core.l_qseq)
2058
2059         def __set__(self,seq):
2060             # samtools manages sequence and quality length memory together
2061             # if no quality information is present, the first byte says 0xff.
2062
2063             if seq == None or len(seq) == 0: return
2064             cdef bam1_t * src
2065             cdef uint8_t * p
2066             cdef char * s
2067             cdef int l, k, nbytes_new, nbytes_old
2068
2069             src = self._delegate
2070
2071             l = len(seq)
2072
2073             # as the sequence is stored in half-bytes, the total length (sequence
2074             # plus quality scores) is (l+1)/2 + l
2075             nbytes_new = (l+1)/2 + l
2076             nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
2077             # acquire pointer to location in memory
2078             p = bam1_seq( src )
2079             src.core.l_qseq = l
2080
2081             pysam_bam_update( src,
2082                               nbytes_old,
2083                               nbytes_new,
2084                               p)
2085             # re-acquire pointer to location in memory
2086             # as it might have moved
2087             p = bam1_seq( src )
2088             for k from 0 <= k < nbytes_new: p[k] = 0
2089             # convert to C string
2090             s = seq
2091             for k from 0 <= k < l:
2092                 p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
2093
2094             # erase qualities
2095             p = bam1_qual( src )
2096             p[0] = 0xff
2097
2098
2099     property qual:
2100         """read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
2101         def __get__(self):
2102
2103             cdef bam1_t * src
2104             cdef char * q
2105
2106             src = self._delegate
2107
2108             if src.core.l_qseq == 0: return None
2109
2110             return get_qual_range(src, 0, src.core.l_qseq)
2111
2112         def __set__(self,qual):
2113             # note that space is already allocated via the sequences
2114             cdef bam1_t * src
2115             cdef uint8_t * p
2116             cdef char * q
2117             cdef int k
2118
2119             src = self._delegate
2120             p = bam1_qual( src )
2121             if qual == None or len(qual) == 0:
2122                 # if absent - set to 0xff
2123                 p[0] = 0xff
2124                 return
2125             cdef int l
2126             # convert to C string
2127             q = qual
2128             l = len(qual)
2129             if src.core.l_qseq != l:
2130                 raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
2131             assert src.core.l_qseq == l
2132             for k from 0 <= k < l:
2133                 p[k] = <uint8_t>q[k] - 33
2134
2135     property query:
2136         """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present)
2137
2138         SAM/BAM files may included extra flanking bases sequences that were
2139         not part of the alignment.  These bases may be the result of the
2140         Smith-Waterman or other algorithms, which may not require alignments
2141         that begin at the first residue or end at the last.  In addition,
2142         extra sequencing adapters, multiplex identifiers, and low-quality bases that
2143         were not considered for alignment may have been retained."""
2144
2145         def __get__(self):
2146             cdef bam1_t * src
2147             cdef uint32_t start, end
2148             cdef char * s
2149
2150             src = self._delegate
2151
2152             if src.core.l_qseq == 0: return None
2153
2154             start = query_start(src)
2155             end   = query_end(src)
2156
2157             return get_seq_range(src, start, end)
2158
2159     property qqual:
2160         """aligned query sequence quality values (None if not present)"""
2161         def __get__(self):
2162             cdef bam1_t * src
2163             cdef uint32_t start, end
2164             cdef char * q
2165
2166             src = self._delegate
2167
2168             if src.core.l_qseq == 0: return None
2169
2170             start = query_start(src)
2171             end   = query_end(src)
2172
2173             return get_qual_range(src, start, end)
2174
2175     property qstart:
2176         """start index of the aligned query portion of the sequence (0-based, inclusive)"""
2177         def __get__(self):
2178             return query_start(self._delegate)
2179
2180     property qend:
2181         """end index of the aligned query portion of the sequence (0-based, exclusive)"""
2182         def __get__(self):
2183             return query_end(self._delegate)
2184
2185     property qlen:
2186         """Length of the aligned query sequence"""
2187         def __get__(self):
2188             cdef bam1_t * src
2189             src = self._delegate
2190             return query_end(src)-query_start(src)
2191
2192     property tags:
2193         """the tags in the AUX field.
2194
2195         This property permits convenience access to
2196         the tags. Changes it the returned list will
2197         not update the tags automatically. Instead,
2198         the following is required for adding a
2199         new tag::
2200
2201             read.tags = read.tags + [("RG",0)]
2202
2203         """
2204         def __get__(self):
2205             cdef char * ctag
2206             cdef bam1_t * src
2207             cdef uint8_t * s
2208             cdef char auxtag[3]
2209             cdef char auxtype
2210
2211             src = self._delegate
2212             if src.l_aux == 0: return None
2213
2214             s = bam1_aux( src )
2215             result = []
2216             auxtag[2] = 0
2217             while s < (src.data + src.data_len):
2218                 # get tag
2219                 auxtag[0] = s[0]
2220                 auxtag[1] = s[1]
2221                 s += 2
2222                 auxtype = s[0]
2223
2224                 if auxtype in ('c', 'C'):
2225                     value = <int>bam_aux2i(s)
2226                     s += 1
2227                 elif auxtype in ('s', 'S'):
2228                     value = <int>bam_aux2i(s)
2229                     s += 2
2230                 elif auxtype in ('i', 'I'):
2231                     value = <float>bam_aux2i(s)
2232                     s += 4
2233                 elif auxtype == 'f':
2234                     value = <float>bam_aux2f(s)
2235                     s += 4
2236                 elif auxtype == 'd':
2237                     value = <double>bam_aux2d(s)
2238                     s += 8
2239                 elif auxtype == 'A':
2240                     value = "%c" % <char>bam_aux2A(s)
2241                     s += 1
2242                 elif auxtype in ('Z', 'H'):
2243                     value = <char*>bam_aux2Z(s)
2244                     # +1 for NULL terminated string
2245                     s += len(value) + 1
2246                  #
2247                 s += 1
2248
2249                 result.append( (auxtag, value) )
2250
2251             return result
2252
2253         def __set__(self, tags):
2254             cdef char * ctag
2255             cdef bam1_t * src
2256             cdef uint8_t * s
2257             cdef uint8_t * new_data
2258             cdef char * temp
2259             cdef int guessed_size, control_size
2260             cdef int max_size, size, offset
2261
2262             src = self._delegate
2263             max_size = 4000
2264             offset = 0
2265
2266             if tags != None:
2267
2268                 # map samtools code to python.struct code and byte size
2269                 buffer = ctypes.create_string_buffer(max_size)
2270
2271                 for pytag, value in tags:
2272                     t = type(value)
2273                     if t == types.FloatType:
2274                         fmt, pytype = "<cccf", 'f'
2275                     elif t == types.IntType:
2276                         if value < 0:
2277                             if value >= -127: fmt, pytype = "<cccb", 'c'
2278                             elif value >= -32767: fmt, pytype = "<ccch", 's'
2279                             elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2280                             else: fmt, pytype = "<ccci", 'i'[0]
2281                         else:
2282                             if value <= 255: fmt, pytype = "<cccB", 'C'
2283                             elif value <= 65535: fmt, pytype = "<cccH", 'S'
2284                             elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
2285                             else: fmt, pytype = "<cccI", 'I'
2286                     else:
2287                         # Note: hex strings (H) are not supported yet
2288                         if len(value) == 1:
2289                             fmt, pytype = "<cccc", 'A'
2290                         else:
2291                             fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
2292
2293                     size = struct.calcsize(fmt)
2294                     if offset + size > max_size:
2295                         raise NotImplementedError("tags field too large")
2296
2297                     struct.pack_into( fmt,
2298                                       buffer,
2299                                       offset,
2300                                       pytag[0],
2301                                       pytag[1],
2302                                       pytype,
2303                                       value )
2304                     offset += size
2305
2306             # delete the old data and allocate new
2307             # if offset == 0, the aux field will be
2308             # empty
2309             pysam_bam_update( src,
2310                               src.l_aux,
2311                               offset,
2312                               bam1_aux( src ) )
2313
2314             src.l_aux = offset
2315
2316             # copy data only if there is any
2317             if offset != 0:
2318
2319                 # get location of new data
2320                 s = bam1_aux( src )
2321
2322                 # check if there is direct path from buffer.raw to tmp
2323                 temp = buffer.raw
2324                 memcpy( s, temp, offset )
2325
2326     property flag:
2327         """properties flag"""
2328         def __get__(self): return self._delegate.core.flag
2329         def __set__(self, flag): self._delegate.core.flag = flag
2330
2331     property rname:
2332         """
2333         :term:`target` ID
2334
2335         DEPRECATED from pysam-0.4 - use tid in the future.
2336         The rname field caused a lot of confusion as it returns
2337         the :term:`target` ID instead of the reference sequence
2338         name.
2339
2340         .. note::
2341
2342             This field contains the index of the reference sequence
2343             in the sequence dictionary. To obtain the name
2344             of the reference sequence, use :meth:`pysam.Samfile.getrname()`
2345
2346         """
2347         def __get__(self): return self._delegate.core.tid
2348         def __set__(self, tid): self._delegate.core.tid = tid
2349
2350     property tid:
2351         """
2352         :term:`target` ID
2353
2354         .. note::
2355
2356             This field contains the index of the reference sequence
2357             in the sequence dictionary. To obtain the name
2358             of the reference sequence, use :meth:`pysam.Samfile.getrname()`
2359
2360         """
2361         def __get__(self): return self._delegate.core.tid
2362         def __set__(self, tid): self._delegate.core.tid = tid
2363
2364     property pos:
2365         """0-based leftmost coordinate"""
2366         def __get__(self): return self._delegate.core.pos
2367         def __set__(self, pos):
2368             ## setting the cigar string also updates the "bin" attribute
2369             cdef bam1_t * src
2370             src = self._delegate
2371             if src.core.n_cigar:
2372                 src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
2373             else:
2374                 src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
2375             self._delegate.core.pos = pos
2376     property bin:
2377         """properties bin"""
2378         def __get__(self): return self._delegate.core.bin
2379         def __set__(self, bin): self._delegate.core.bin = bin
2380     property rlen:
2381         '''length of the read (read only). Returns 0 if not given.'''
2382         def __get__(self): return self._delegate.core.l_qseq
2383     property aend:
2384         '''aligned end position of the read (read only).  Returns
2385         None if not available.'''
2386         def __get__(self):
2387             cdef bam1_t * src
2388             src = self._delegate
2389             if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
2390                 return None
2391             return bam_calend(&src.core, bam1_cigar(src))
2392     property alen:
2393         '''aligned length of the read (read only).  Returns None if
2394         not available.'''
2395         def __get__(self):
2396             cdef bam1_t * src
2397             src = self._delegate
2398             if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
2399                 return None
2400             return bam_calend(&src.core,
2401                                bam1_cigar(src)) - \
2402                                self._delegate.core.pos
2403
2404     property mapq:
2405         """mapping quality"""
2406         def __get__(self): return self._delegate.core.qual
2407         def __set__(self, qual): self._delegate.core.qual = qual
2408     property mrnm:
2409         """the :term:`reference` id of the mate """
2410         def __get__(self): return self._delegate.core.mtid
2411         def __set__(self, mtid): self._delegate.core.mtid = mtid
2412     property mpos:
2413         """the position of the mate"""
2414         def __get__(self): return self._delegate.core.mpos
2415         def __set__(self, mpos): self._delegate.core.mpos = mpos
2416     property isize:
2417         """the insert size"""
2418         def __get__(self): return self._delegate.core.isize
2419         def __set__(self, isize): self._delegate.core.isize = isize
2420     property is_paired:
2421         """true if read is paired in sequencing"""
2422         def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
2423         def __set__(self,val):
2424             if val: self._delegate.core.flag |= BAM_FPAIRED
2425             else: self._delegate.core.flag &= ~BAM_FPAIRED
2426     property is_proper_pair:
2427         """true if read is mapped in a proper pair"""
2428         def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
2429         def __set__(self,val):
2430             if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
2431             else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
2432     property is_unmapped:
2433         """true if read itself is unmapped"""
2434         def __get__(self): return (self.flag & BAM_FUNMAP) != 0
2435         def __set__(self,val):
2436             if val: self._delegate.core.flag |= BAM_FUNMAP
2437             else: self._delegate.core.flag &= ~BAM_FUNMAP
2438     property mate_is_unmapped:
2439         """true if the mate is unmapped"""
2440         def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
2441         def __set__(self,val):
2442             if val: self._delegate.core.flag |= BAM_FMUNMAP
2443             else: self._delegate.core.flag &= ~BAM_FMUNMAP
2444     property is_reverse:
2445         """true if read is mapped to reverse strand"""
2446         def __get__(self): return (self.flag & BAM_FREVERSE) != 0
2447         def __set__(self,val):
2448             if val: self._delegate.core.flag |= BAM_FREVERSE
2449             else: self._delegate.core.flag &= ~BAM_FREVERSE
2450     property mate_is_reverse:
2451         """true is read is mapped to reverse strand"""
2452         def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
2453         def __set__(self,val):
2454             if val: self._delegate.core.flag |= BAM_FMREVERSE
2455             else: self._delegate.core.flag &= ~BAM_FMREVERSE
2456     property is_read1:
2457         """true if this is read1"""
2458         def __get__(self): return (self.flag & BAM_FREAD1) != 0
2459         def __set__(self,val):
2460             if val: self._delegate.core.flag |= BAM_FREAD1
2461             else: self._delegate.core.flag &= ~BAM_FREAD1
2462     property is_read2:
2463         """true if this is read2"""
2464         def __get__(self): return (self.flag & BAM_FREAD2) != 0
2465         def __set__(self,val):
2466             if val: self._delegate.core.flag |= BAM_FREAD2
2467             else: self._delegate.core.flag &= ~BAM_FREAD2
2468     property is_secondary:
2469         """true if not primary alignment"""
2470         def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
2471         def __set__(self,val):
2472             if val: self._delegate.core.flag |= BAM_FSECONDARY
2473             else: self._delegate.core.flag &= ~BAM_FSECONDARY
2474     property is_qcfail:
2475         """true if QC failure"""
2476         def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
2477         def __set__(self,val):
2478             if val: self._delegate.core.flag |= BAM_FQCFAIL
2479             else: self._delegate.core.flag &= ~BAM_FQCFAIL
2480     property is_duplicate:
2481         """true if optical or PCR duplicate"""
2482         def __get__(self): return (self.flag & BAM_FDUP) != 0
2483         def __set__(self,val):
2484             if val: self._delegate.core.flag |= BAM_FDUP
2485             else: self._delegate.core.flag &= ~BAM_FDUP
2486     property positions:
2487         """a list of reference positions that this read aligns to."""
2488         def __get__(self):
2489             cdef uint32_t k, i, pos
2490             cdef int op
2491             cdef uint32_t * cigar_p
2492             cdef bam1_t * src
2493
2494             result = []
2495             src = self._delegate
2496             if src.core.n_cigar == 0: return []
2497
2498             pos = src.core.pos
2499
2500             cigar_p = bam1_cigar(src)
2501             for k from 0 <= k < src.core.n_cigar:
2502                 op = cigar_p[k] & BAM_CIGAR_MASK
2503                 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2504                 if op == BAM_CMATCH:
2505                     for i from pos <= i < pos + l:
2506                         result.append( i )
2507
2508                 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2509                     pos += l
2510
2511             return result
2512     def overlap( self, uint32_t start, uint32_t end ):
2513         """return number of bases on reference overlapping *start* and *end*
2514         """
2515         cdef uint32_t k, i, pos, overlap
2516         cdef int op, o
2517         cdef uint32_t * cigar_p
2518         cdef bam1_t * src
2519
2520         overlap = 0
2521
2522         src = self._delegate
2523         if src.core.n_cigar == 0: return 0
2524         pos = src.core.pos
2525         o = 0
2526
2527         cigar_p = bam1_cigar(src)
2528         for k from 0 <= k < src.core.n_cigar:
2529             op = cigar_p[k] & BAM_CIGAR_MASK
2530             l = cigar_p[k] >> BAM_CIGAR_SHIFT
2531
2532             if op == BAM_CMATCH:
2533                 o = min( pos + l, end) - max( pos, start )
2534                 if o > 0: overlap += o
2535
2536             if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2537                 pos += l
2538
2539         return overlap
2540
2541     def opt(self, tag):
2542         """retrieves optional data given a two-letter *tag*"""
2543         #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
2544         cdef uint8_t * v
2545         v = bam_aux_get(self._delegate, tag)
2546         if v == NULL: raise KeyError( "tag '%s' not present" % tag )
2547         type = chr(v[0])
2548         if type == 'c' or type == 'C' or type == 's' or type == 'S':
2549             return <int>bam_aux2i(v)
2550         elif type == 'i' or type == 'I':
2551             return <int32_t>bam_aux2i(v)
2552         elif type == 'f' or type == 'F':
2553             return <float>bam_aux2f(v)
2554         elif type == 'd' or type == 'D':
2555             return <double>bam_aux2d(v)
2556         elif type == 'A':
2557             # there might a more efficient way
2558             # to convert a char into a string
2559             return '%c' % <char>bam_aux2A(v)
2560         elif type == 'Z':
2561             return <char*>bam_aux2Z(v)
2562
2563     def fancy_str (self):
2564         """returns list of fieldnames/values in pretty format for debugging
2565         """
2566         ret_string = []
2567         field_names = {
2568            "tid":           "Contig index",
2569            "pos":           "Mapped position on contig",
2570            "mtid":          "Contig index for mate pair",
2571            "mpos":          "Position of mate pair",
2572            "isize":         "Insert size",
2573            "flag":          "Binary flag",
2574            "n_cigar":       "Count of cigar entries",
2575            "cigar":         "Cigar entries",
2576            "qual":          "Mapping quality",
2577            "bin":           "Bam index bin number",
2578            "l_qname":       "Length of query name",
2579            "qname":         "Query name",
2580            "l_qseq":        "Length of query sequence",
2581            "qseq":          "Query sequence",
2582            "bqual":         "Quality scores",
2583            "l_aux":         "Length of auxilary data",
2584            "m_data":        "Maximum data length",
2585            "data_len":      "Current data length",
2586            }
2587         fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
2588                                  "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
2589                                  "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
2590
2591         for f in fields_names_in_order:
2592             if not f in self.__dict__:
2593                 continue
2594             ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
2595
2596         for f in self.__dict__:
2597             if not f in field_names:
2598                 ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
2599         return ret_string
2600
2601 cdef class PileupProxy:
2602     '''A pileup column. A pileup column contains
2603     all the reads that map to a certain target base.
2604
2605     tid
2606         chromosome ID as is defined in the header
2607     pos
2608         the target base coordinate (0-based)
2609     n
2610         number of reads mapping to this column
2611     pileups
2612         list of reads (:class:`pysam.PileupRead`) aligned to this column
2613
2614     This class is a proxy for results returned by the samtools pileup engine.
2615     If the underlying engine iterator advances, the results of this column
2616     will change.
2617     '''
2618     cdef bam_pileup1_t * plp
2619     cdef int tid
2620     cdef int pos
2621     cdef int n_pu
2622
2623     def __init__(self):
2624         raise TypeError("This class cannot be instantiated from Python")
2625
2626     def __str__(self):
2627         return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
2628             "\n" +\
2629             "\n".join( map(str, self.pileups) )
2630
2631     property tid:
2632         '''the chromosome ID as is defined in the header'''
2633         def __get__(self): return self.tid
2634
2635     property n:
2636         '''number of reads mapping to this column.'''
2637         def __get__(self): return self.n_pu
2638         def __set__(self, n): self.n_pu = n
2639
2640     property pos:
2641         def __get__(self): return self.pos
2642
2643     property pileups:
2644         '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
2645         def __get__(self):
2646             cdef int x
2647             pileups = []
2648             # warning: there could be problems if self.n and self.buf are
2649             # out of sync.
2650             for x from 0 <= x < self.n_pu:
2651                 pileups.append( makePileupRead( &(self.plp[x])) )
2652             return pileups
2653
2654 cdef class PileupRead:
2655     '''A read aligned to a column.
2656     '''
2657
2658     cdef:
2659          AlignedRead _alignment
2660          int32_t  _qpos
2661          int _indel
2662          int _level
2663          uint32_t _is_del
2664          uint32_t _is_head
2665          uint32_t _is_tail
2666
2667     def __init__(self):
2668         raise TypeError("This class cannot be instantiated from Python")
2669
2670     def __str__(self):
2671         return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
2672
2673     property alignment:
2674         """a :class:`pysam.AlignedRead` object of the aligned read"""
2675         def __get__(self):
2676             return self._alignment
2677     property qpos:
2678         """position of the read base at the pileup site, 0-based"""
2679         def __get__(self):
2680             return self._qpos
2681     property indel:
2682         """indel length; 0 for no indel, positive for ins and negative for del"""
2683         def __get__(self):
2684             return self._indel
2685     property is_del:
2686         """1 iff the base on the padded read is a deletion"""
2687         def __get__(self):
2688             return self._is_del
2689     property is_head:
2690         def __get__(self):
2691             return self._is_head
2692     property is_tail:
2693         def __get__(self):
2694             return self._is_tail
2695     property level:
2696         def __get__(self):
2697             return self._level
2698
2699 class Outs:
2700     '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
2701     def __init__(self, id = 1):
2702         self.streams = []
2703         self.id = id
2704
2705     def setdevice(self, filename):
2706         '''open an existing file, like "/dev/null"'''
2707         fd = os.open(filename, os.O_WRONLY)
2708         self.setfd(fd)
2709
2710     def setfile(self, filename):
2711         '''open a new file.'''
2712         fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
2713         self.setfd(fd)
2714
2715     def setfd(self, fd):
2716         ofd = os.dup(self.id)      #  Save old stream on new unit.
2717         self.streams.append(ofd)
2718         sys.stdout.flush()          #  Buffered data goes to old stream.
2719         sys.stderr.flush()          #  Buffered data goes to old stream.
2720         os.dup2(fd, self.id)        #  Open unit 1 on new stream.
2721         os.close(fd)                #  Close other unit (look out, caller.)
2722
2723     def restore(self):
2724         '''restore previous output stream'''
2725         if self.streams:
2726             # the following was not sufficient, hence flush both stderr and stdout
2727             # os.fsync( self.id )
2728             sys.stdout.flush()
2729             sys.stderr.flush()
2730             os.dup2(self.streams[-1], self.id)
2731             os.close(self.streams[-1])
2732             del self.streams[-1]
2733
2734 def _samtools_dispatch( method,
2735                         args = (),
2736                         catch_stdout = True,
2737                         catch_stderr = False,
2738                         ):
2739     '''call ``method`` in samtools providing arguments in args.
2740
2741     .. note::
2742        This method redirects stdout and stderr to capture it
2743        from samtools. If for some reason stdout/stderr disappears
2744        the reason might be in this method.
2745
2746     .. note::
2747        The current implementation might only work on linux.
2748
2749     .. note::
2750        This method captures stdout and stderr using temporary files,
2751        which are then read into memory in their entirety. This method
2752        is slow and might cause large memory overhead.
2753
2754     See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
2755     on the topic of redirecting stderr/stdout.
2756     '''
2757
2758     # note that debugging this module can be a problem
2759     # as stdout/stderr will not appear
2760
2761     # some special cases
2762     if method == "index":
2763         if not os.path.exists( args[0] ):
2764             raise IOError( "No such file or directory: '%s'" % args[0] )
2765
2766     # redirect stderr and stdout to file
2767     if catch_stderr:
2768         stderr_h, stderr_f = tempfile.mkstemp()
2769         stderr_save = Outs( sys.stderr.fileno() )
2770         stderr_save.setfd( stderr_h )
2771
2772     if catch_stdout:
2773         stdout_h, stdout_f = tempfile.mkstemp()
2774         stdout_save = Outs( sys.stdout.fileno() )
2775         stdout_save.setfd( stdout_h )
2776
2777         # patch for `samtools view`
2778         # samtools `view` closes stdout, from which I can not
2779         # recover. Thus redirect output to file with -o option.
2780         if method == "view":
2781             if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
2782             args = ( "-o", stdout_f ) + args
2783
2784
2785     # do the function call to samtools
2786     cdef char ** cargs
2787     cdef int i, n, retval
2788
2789     n = len(args)
2790     # allocate two more for first (dummy) argument (contains command)
2791     cargs = <char**>calloc( n+2, sizeof( char *) )
2792     cargs[0] = "samtools"
2793     cargs[1] = method
2794     for i from 0 <= i < n: cargs[i+2] = args[i]
2795     retval = pysam_dispatch(n+2, cargs)
2796     free( cargs )
2797
2798     # restore stdout/stderr. This will also flush, so
2799     # needs to be before reading back the file contents
2800     if catch_stdout:
2801         stdout_save.restore()
2802         out_stdout = open( stdout_f, "r").readlines()
2803         os.remove( stdout_f )
2804     else:
2805         out_stdout = []
2806
2807     if catch_stderr:
2808         stderr_save.restore()
2809         out_stderr = open( stderr_f, "r").readlines()
2810         os.remove( stderr_f )
2811     else:
2812         out_stderr = []
2813
2814     return retval, out_stderr, out_stdout
2815
2816 cdef class SNPCall:
2817     '''the results of a SNP call.'''
2818     cdef int _tid
2819     cdef int _pos
2820     cdef char _reference_base
2821     cdef char _genotype
2822     cdef int _consensus_quality
2823     cdef int _snp_quality
2824     cdef int _rms_mapping_quality
2825     cdef int _coverage
2826
2827     property tid:
2828         '''the chromosome ID as is defined in the header'''
2829         def __get__(self):
2830             return self._tid
2831
2832     property pos:
2833        '''nucleotide position of SNP.'''
2834        def __get__(self): return self._pos
2835
2836     property reference_base:
2837        '''reference base at pos. ``N`` if no reference sequence supplied.'''
2838        def __get__(self): return PyString_FromStringAndSize( &self._reference_base, 1 )
2839
2840     property genotype:
2841        '''the genotype called.'''
2842        def __get__(self): return PyString_FromStringAndSize( &self._genotype, 1 )
2843
2844     property consensus_quality:
2845        '''the genotype quality (Phred-scaled).'''
2846        def __get__(self): return self._consensus_quality
2847
2848     property snp_quality:
2849        '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
2850        def __get__(self): return self._snp_quality
2851
2852     property mapping_quality:
2853        '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
2854        def __get__(self): return self._rms_mapping_quality
2855
2856     property coverage:
2857        '''coverage or read depth - the number of reads involved in the call.'''
2858        def __get__(self): return self._coverage
2859
2860     def __str__(self):
2861
2862         return "\t".join( map(str, (
2863                     self.tid,
2864                     self.pos,
2865                     self.reference_base,
2866                     self.genotype,
2867                     self.consensus_quality,
2868                     self.snp_quality,
2869                     self.mapping_quality,
2870                     self.coverage ) ) )
2871
2872
2873 cdef class SNPCallerBase:
2874     '''Base class for SNP callers.
2875
2876     *min_baseQ*
2877        minimum base quality (possibly capped by BAQ)
2878     *capQ_threshold*
2879        coefficient for adjusting mapQ of poor mappings
2880     *theta*
2881        theta in maq consensus calling model
2882     *n_haplotypes*
2883        number of haplotypes in the sample
2884     *het_rate*
2885        prior of a difference between two haplotypes
2886     '''
2887
2888     cdef bam_maqcns_t * c
2889     cdef IteratorColumn iter
2890
2891     def __cinit__(self,
2892                   IteratorColumn iterator_column,
2893                   **kwargs ):
2894
2895         self.iter = iterator_column
2896         self.c =  bam_maqcns_init()
2897
2898         # set the default parameterization according to
2899         # samtools
2900
2901         # new default mode for samtools >0.1.10
2902         self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
2903
2904         self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
2905         # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
2906         self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
2907         self.c.het_rate = kwargs.get( "het_rate", 0.001 )
2908         self.c.theta = kwargs.get( "theta", 0.83 )
2909
2910         if self.c.errmod != BAM_ERRMOD_MAQ2:
2911             self.c.theta += 0.02
2912
2913         # call prepare AFTER setting parameters
2914         bam_maqcns_prepare( self.c )
2915
2916     def __dealloc__(self):
2917         bam_maqcns_destroy( self.c )
2918
2919     cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
2920         '''debugging output.'''
2921
2922         pysam_dump_glf( g, self.c );
2923         print ""
2924         for x in range(self.iter.n_plp):
2925             print "--> read %i %s %i" % (x,
2926                                          bam1_qname(self.iter.plp[x].b),
2927                                          self.iter.plp[x].qpos,
2928                                          )
2929
2930         print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
2931             % (self.iter.pos,
2932                cns,
2933                self.c.q_r,
2934                self.iter.n_plp,
2935                self.iter.n_plp,
2936                rb,
2937                cns >> 8 & 0xff,
2938                cns >> 16 & 0xff,
2939                cns & 0xff,
2940                cns >> 28,
2941                )
2942
2943         printf("-------------------------------------\n");
2944         sys.stdout.flush()
2945
2946 cdef class IteratorSNPCalls( SNPCallerBase ):
2947     """*(IteratorColumn iterator)*
2948
2949     call SNPs within a region.
2950
2951     *iterator* is a pileup iterator. SNPs will be called
2952     on all positions returned by this iterator.
2953
2954     This caller is fast if SNPs are called over large continuous
2955     regions. It is slow, if instantiated frequently and in random
2956     order as the sequence will have to be reloaded.
2957
2958     """
2959
2960     def __cinit__(self,
2961                   IteratorColumn iterator_column,
2962                   **kwargs ):
2963
2964         assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
2965
2966     def __iter__(self):
2967         return self
2968
2969     def __next__(self):
2970         """python version of next().
2971         """
2972
2973         # the following code was adapted from bam_plcmd.c:pileup_func()
2974         self.iter.cnext()
2975
2976         if self.iter.n_plp < 0:
2977             raise ValueError("error during iteration" )
2978
2979         if self.iter.plp == NULL:
2980            raise StopIteration
2981
2982         cdef char * seq = self.iter.getSequence()
2983         cdef int seq_len = self.iter.seq_len
2984
2985         assert seq != NULL
2986
2987         # reference base
2988         if self.iter.pos >= seq_len:
2989             raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
2990
2991         cdef int rb = seq[self.iter.pos]
2992         cdef uint32_t cns
2993         cdef glf1_t * g
2994
2995         g = bam_maqcns_glfgen( self.iter.n_plp,
2996                                self.iter.plp,
2997                                bam_nt16_table[rb],
2998                                self.c )
2999
3000         if pysam_glf_depth( g ) == 0:
3001             cns = 0xfu << 28 | 0xf << 24
3002         else:
3003             cns = glf2cns(g, <int>(self.c.q_r + .499))
3004
3005         free(g)
3006
3007         cdef SNPCall call
3008
3009         call = SNPCall()
3010         call._tid = self.iter.tid
3011         call._pos = self.iter.pos
3012         call._reference_base = rb
3013         call._genotype = bam_nt16_rev_table[cns>>28]
3014         call._consensus_quality = cns >> 8 & 0xff
3015         call._snp_quality = cns & 0xff
3016         call._rms_mapping_quality = cns >> 16&0xff
3017         call._coverage = self.iter.n_plp
3018
3019         return call
3020
3021 cdef class SNPCaller( SNPCallerBase ):
3022     '''*(IteratorColumn iterator_column )*
3023
3024     The samtools SNP caller.
3025
3026     This object will call SNPs in *samfile* against the reference
3027     sequence in *fasta*.
3028
3029     This caller is fast for calling few SNPs in selected regions.
3030
3031     It is slow, if called over large genomic regions.
3032     '''
3033
3034
3035     def __cinit__(self,
3036                   IteratorColumn iterator_column,
3037                   **kwargs ):
3038
3039         pass
3040
3041     def call(self, reference, int pos ):
3042         """call a snp on chromosome *reference*
3043         and position *pos*.
3044
3045         returns a :class:`SNPCall` object.
3046         """
3047
3048         cdef int tid = self.iter.samfile.gettid( reference )
3049
3050         self.iter.reset( tid, pos, pos + 1 )
3051
3052         while 1:
3053             self.iter.cnext()
3054
3055             if self.iter.n_plp < 0:
3056                 raise ValueError("error during iteration" )
3057
3058             if self.iter.plp == NULL:
3059                 raise ValueError( "no reads in region - no call" )
3060
3061             if self.iter.pos == pos: break
3062
3063         cdef char * seq = self.iter.getSequence()
3064         cdef int seq_len = self.iter.seq_len
3065
3066         assert seq != NULL
3067
3068         # reference base
3069         if self.iter.pos >= seq_len:
3070             raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3071
3072         cdef int rb = seq[self.iter.pos]
3073         cdef uint32_t cns
3074         cdef glf1_t * g
3075
3076         g = bam_maqcns_glfgen( self.iter.n_plp,
3077                                self.iter.plp,
3078                                bam_nt16_table[rb],
3079                                self.c )
3080
3081
3082         if pysam_glf_depth( g ) == 0:
3083             cns = 0xfu << 28 | 0xf << 24
3084         else:
3085             cns = glf2cns(g, <int>(self.c.q_r + .499))
3086
3087         free(g)
3088
3089         cdef SNPCall call
3090
3091         call = SNPCall()
3092         call._tid = self.iter.tid
3093         call._pos = self.iter.pos
3094         call._reference_base = rb
3095         call._genotype = bam_nt16_rev_table[cns>>28]
3096         call._consensus_quality = cns >> 8 & 0xff
3097         call._snp_quality = cns & 0xff
3098         call._rms_mapping_quality = cns >> 16&0xff
3099         call._coverage = self.iter.n_plp
3100
3101         return call
3102
3103 cdef class IndelCall:
3104     '''the results of an indel call.'''
3105     cdef int _tid
3106     cdef int _pos
3107     cdef int _coverage
3108     cdef int _rms_mapping_quality
3109     cdef bam_maqindel_ret_t * _r
3110
3111     def __cinit__(self):
3112         #assert r != NULL
3113         #self._r = r
3114         pass
3115
3116     property tid:
3117         '''the chromosome ID as is defined in the header'''
3118         def __get__(self):
3119             return self._tid
3120
3121     property pos:
3122        '''nucleotide position of SNP.'''
3123        def __get__(self): return self._pos
3124
3125     property genotype:
3126        '''the genotype called.'''
3127        def __get__(self):
3128            if self._r.gt == 0:
3129                s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3130                return "%s/%s" % (s,s)
3131            elif self._r.gt == 1:
3132                s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3133                return "%s/%s" % (s,s)
3134            else:
3135                return "%s/%s" % (self.first_allele, self.second_allele )
3136
3137     property consensus_quality:
3138        '''the genotype quality (Phred-scaled).'''
3139        def __get__(self): return self._r.q_cns
3140
3141     property snp_quality:
3142        '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
3143        def __get__(self): return self._r.q_ref
3144
3145     property mapping_quality:
3146        '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
3147        def __get__(self): return self._rms_mapping_quality
3148
3149     property coverage:
3150        '''coverage or read depth - the number of reads involved in the call.'''
3151        def __get__(self): return self._coverage
3152
3153     property first_allele:
3154        '''sequence of first allele.'''
3155        def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3156
3157     property second_allele:
3158        '''sequence of second allele.'''
3159        def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3160
3161     property reads_first:
3162        '''reads supporting first allele.'''
3163        def __get__(self): return self._r.cnt1
3164
3165     property reads_second:
3166        '''reads supporting first allele.'''
3167        def __get__(self): return self._r.cnt2
3168
3169     property reads_diff:
3170        '''reads supporting first allele.'''
3171        def __get__(self): return self._r.cnt_anti
3172
3173     def __str__(self):
3174
3175         return "\t".join( map(str, (
3176                     self.tid,
3177                     self.pos,
3178                     self.genotype,
3179                     self.consensus_quality,
3180                     self.snp_quality,
3181                     self.mapping_quality,
3182                     self.coverage,
3183                     self.first_allele,
3184                     self.second_allele,
3185                     self.reads_first,
3186                     self.reads_second,
3187                     self.reads_diff ) ) )
3188
3189     def __dealloc__(self ):
3190         bam_maqindel_ret_destroy(self._r)
3191
3192 cdef class IndelCallerBase:
3193     '''Base class for SNP callers.
3194
3195     *min_baseQ*
3196        minimum base quality (possibly capped by BAQ)
3197     *capQ_threshold*
3198        coefficient for adjusting mapQ of poor mappings
3199     *theta*
3200        theta in maq consensus calling model
3201     *n_haplotypes*
3202        number of haplotypes in the sample
3203     *het_rate*
3204        prior of a difference between two haplotypes
3205     '''
3206
3207     cdef bam_maqindel_opt_t * options
3208     cdef IteratorColumn iter
3209     cdef int cap_mapQ
3210     cdef int max_depth
3211
3212     def __cinit__(self,
3213                   IteratorColumn iterator_column,
3214                   **kwargs ):
3215
3216
3217         self.iter = iterator_column
3218
3219         assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
3220
3221         self.options = bam_maqindel_opt_init()
3222
3223         # set the default parameterization according to
3224         # samtools
3225
3226         self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
3227         self.options.q_indel = kwargs.get( "q_indel", 40 )
3228         self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
3229         self.max_depth = kwargs.get( "max_depth", 1024 )
3230
3231     def __dealloc__(self):
3232         free( self.options )
3233
3234     def _call( self ):
3235
3236         cdef char * seq = self.iter.getSequence()
3237         cdef int seq_len = self.iter.seq_len
3238
3239         assert seq != NULL
3240
3241         # reference base
3242         if self.iter.pos >= seq_len:
3243             raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3244
3245         cdef bam_maqindel_ret_t * r
3246
3247         cdef int m = min( self.max_depth, self.iter.n_plp )
3248
3249         # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
3250         #        m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
3251         #        self.options.indel_err, self.options.ambi_thres );
3252
3253         r = bam_maqindel(m,
3254                          self.iter.pos,
3255                          self.options,
3256                          self.iter.plp,
3257                          seq,
3258                          0,
3259                          NULL)
3260
3261         if r == NULL: return None
3262
3263         cdef IndelCall call
3264         call = IndelCall()
3265         call._r = r
3266         call._tid = self.iter.tid
3267         call._pos = self.iter.pos
3268         call._coverage = self.iter.n_plp
3269
3270         cdef uint64_t rms_aux = 0
3271         cdef int i = 0
3272         cdef bam_pileup1_t * p
3273         cdef int tmp
3274
3275         for i from 0 <= i < self.iter.n_plp:
3276             p = self.iter.plp + i
3277             if p.b.core.qual < self.cap_mapQ:
3278                 tmp = p.b.core.qual
3279             else:
3280                 tmp = self.cap_mapQ
3281             rms_aux += tmp * tmp
3282
3283         call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
3284
3285         return call
3286
3287 cdef class IndelCaller( IndelCallerBase ):
3288     '''*(IteratorColumn iterator_column )*
3289
3290     The samtools SNP caller.
3291
3292     This object will call SNPs in *samfile* against the reference
3293     sequence in *fasta*.
3294
3295     This caller is fast for calling few SNPs in selected regions.
3296
3297     It is slow, if called over large genomic regions.
3298     '''
3299
3300     def __cinit__(self,
3301                   IteratorColumn iterator_column,
3302                   **kwargs ):
3303
3304         pass
3305
3306     def call(self, reference, int pos ):
3307         """call a snp on chromosome *reference*
3308         and position *pos*.
3309
3310         returns a :class:`SNPCall` object or None, if no indel call could be made.
3311         """
3312
3313         cdef int tid = self.iter.samfile.gettid( reference )
3314
3315         self.iter.reset( tid, pos, pos + 1 )
3316
3317         while 1:
3318             self.iter.cnext()
3319
3320             if self.iter.n_plp < 0:
3321                 raise ValueError("error during iteration" )
3322
3323             if self.iter.plp == NULL:
3324                 raise ValueError( "no reads in region - no call" )
3325
3326             if self.iter.pos == pos: break
3327
3328         return self._call()
3329
3330 cdef class IteratorIndelCalls( IndelCallerBase ):
3331     """*(IteratorColumn iterator)*
3332
3333     call indels within a region.
3334
3335     *iterator* is a pileup iterator. SNPs will be called
3336     on all positions returned by this iterator.
3337
3338     This caller is fast if SNPs are called over large continuous
3339     regions. It is slow, if instantiated frequently and in random
3340     order as the sequence will have to be reloaded.
3341
3342     """
3343
3344     def __cinit__(self,
3345                   IteratorColumn iterator_column,
3346                   **kwargs ):
3347         pass
3348
3349
3350     def __iter__(self):
3351         return self
3352
3353     def __next__(self):
3354         """python version of next().
3355         """
3356
3357         # the following code was adapted from bam_plcmd.c:pileup_func()
3358         self.iter.cnext()
3359
3360         if self.iter.n_plp < 0:
3361             raise ValueError("error during iteration" )
3362
3363         if self.iter.plp == NULL:
3364            raise StopIteration
3365
3366         return self._call()
3367
3368
3369
3370 cdef class IndexedReads:
3371     """index a bamfile by read.
3372
3373     The index is kept in memory.
3374
3375     By default, the file is re-openend to avoid conflicts if
3376     multiple operators work on the same file. Set *reopen* = False
3377     to not re-open *samfile*.
3378     """
3379
3380     cdef Samfile samfile
3381     cdef samfile_t * fp
3382     cdef index
3383     # true if samfile belongs to this object
3384     cdef int owns_samfile
3385
3386     def __init__(self, Samfile samfile, int reopen = True ):
3387         self.samfile = samfile
3388
3389         if samfile.isbam: mode = "rb"
3390         else: mode = "r"
3391
3392         # reopen the file - note that this makes the iterator
3393         # slow and causes pileup to slow down significantly.
3394         if reopen:
3395             store = StderrStore()
3396             self.fp = samopen( samfile._filename, mode, NULL )
3397             store.release()
3398             assert self.fp != NULL
3399             self.owns_samfile = True
3400         else:
3401             self.fp = samfile.samfile
3402             self.owns_samfile = False
3403
3404         assert samfile.isbam, "can only IndexReads on bam files"
3405
3406     def build( self ):
3407         '''build index.'''
3408
3409         self.index = collections.defaultdict( list )
3410
3411         # this method will start indexing from the current file position
3412         # if you decide
3413         cdef int ret = 1
3414         cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
3415
3416         cdef uint64_t pos
3417
3418         while ret > 0:
3419             pos = bam_tell( self.fp.x.bam )
3420             ret = samread( self.fp, b)
3421             if ret > 0:
3422                 qname = bam1_qname( b )
3423                 self.index[qname].append( pos )
3424
3425         bam_destroy1( b )
3426
3427     def find( self, qname ):
3428         if qname in self.index:
3429             return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
3430         else:
3431             raise KeyError( "read %s not found" % qname )
3432
3433     def __dealloc__(self):
3434         if self.owns_samfile: samclose( self.fp )
3435
3436 __all__ = ["Samfile",
3437            "Fastafile",
3438            "IteratorRow",
3439            "IteratorColumn",
3440            "AlignedRead",
3441            "PileupColumn",
3442            "PileupProxy",
3443            "PileupRead",
3444            "IteratorSNPCalls",
3445            "SNPCaller",
3446            "IndelCaller",
3447            "IteratorIndelCalls",
3448            "IndexedReads" ]
3449
3450
3451