1 # cython: embedsignature=True
3 # adds doc-strings for sphinx
14 from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
15 from cpython cimport PyErr_SetString
17 #from cpython.string cimport PyString_FromStringAndSize, PyString_AS_STRING
18 #from cpython.exc cimport PyErr_SetString, PyErr_NoMemory
20 # defines imported from samtools
## These are bits set in the flag.
## have to put these definitions here, in csamtools.pxd they got ignored
## (values mirror the FLAG bit definitions of the SAM specification)
## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
## @abstract the read is mapped in a proper pair */
DEF BAM_FPROPER_PAIR =2
## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
## @abstract the mate is unmapped */
## @abstract the read is mapped to the reverse strand */
## @abstract the mate is mapped to the reverse strand */
## @abstract this is read1 */
## @abstract this is read2 */
## @abstract not primary alignment */
DEF BAM_FSECONDARY =256
## @abstract QC failure */
## @abstract optical or PCR duplicate */

# CIGAR encoding: low BAM_CIGAR_SHIFT bits of each op hold the op code
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
DEF BAM_CSOFT_CLIP = 4
DEF BAM_CHARD_CLIP = 5
#####################################################################
# hard-coded constants
# reverse lookup table from samtools' 4-bit nucleotide encoding to the
# IUPAC base character
cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
# upper bound for genomic coordinates.
# NOTE(review): 2 << 29 equals 2**30, while range checks elsewhere in this
# file speak of "0 to 2^29-1" -- confirm which limit is intended.
cdef int max_pos = 2 << 29

# redirect stderr to 0
# send samtools' C-level stderr output to the null device so that library
# messages do not clutter the caller's terminal
_logfile = open(os.path.devnull, "w")
pysam_set_stderr( PyFile_AsFile( _logfile ) )
#####################################################################
#####################################################################
#####################################################################
## private factory methods
#####################################################################
cdef class AlignedRead

cdef makeAlignedRead(bam1_t * src):
    '''enter src into AlignedRead.'''
    # construct without running __init__, then take ownership of a
    # duplicated copy of *src* (bam_dup1) so the returned object is
    # independent of the caller's buffer
    cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
    dest._delegate = bam_dup1(src)
cdef class PileupProxy

cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):
    # factory: build a PileupProxy (bypassing __init__) for the pileup
    # buffer *plp* at reference *tid*, position *pos*, covering *n* reads
    cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
cdef makePileupRead( bam_pileup1_t * src ):
    '''fill a PileupRead object from a bam_pileup1_t * object.'''
    cdef PileupRead dest = PileupRead.__new__(PileupRead)
    # the aligned read is copied (see makeAlignedRead), the remaining
    # fields are plain value copies from the C struct
    dest._alignment = makeAlignedRead( src.b )
    dest._indel = src.indel
    dest._level = src.level
    dest._is_del = src.is_del
    dest._is_head = src.is_head
    dest._is_tail = src.is_tail
#####################################################################
#####################################################################
#####################################################################
## Generic callbacks for inserting python callbacks.
#####################################################################
cdef int fetch_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch.

    calls function in *f* with a new :class:`AlignedRead` object as parameter.
    '''
    # wrap the C record in a python object before handing it to *f*
    a = makeAlignedRead( alignment )
class PileupColumn(object):
    '''A pileup column. A pileup column contains
    all the reads that map to a certain target base.

    tid
        chromosome ID as is defined in the header
    pos
        the target base coordinate (0-based)
    n
        number of reads mapping to this column
    pileups
        list of reads (:class:`pysam.PileupRead`) aligned to this column
    '''
        # one tab-separated summary line followed by one line per read
        return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
            "\n" + "\n".join( map(str, self.pileups) )
cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
    '''callback for pileup.

    calls function in *f* with a new :class:`Pileup` object as parameter.

    tid
        chromosome ID as is defined in the header
    pos
        start coordinate of the alignment, 0-based
    n
        number of elements in pl array
    '''
    # wrap each entry of the C pileup array into a python PileupRead
    for x from 0 <= x < n:
        pileups.append( makePileupRead( &(pl[x]) ) )
cdef int pileup_fetch_callback( bam1_t *b, void *data):
    '''callback for bam_fetch.

    Fetches reads and submits them to pileup.
    '''
    cdef bam_plbuf_t * buf
    buf = <bam_plbuf_t*>data
    # feed the read into samtools' pileup buffer; columns are emitted
    # through the pileup callback registered on the buffer
    bam_plbuf_push(b, buf)
        # capture stderr in a temporary file and remember the original
        # stderr fd so it can be restored later
        self.stderr_h, self.stderr_f = tempfile.mkstemp()
        self.stderr_save = Outs( sys.stderr.fileno() )
        self.stderr_save.setfd( self.stderr_h )

    def readAndRelease( self ):
        # restore the original stderr, then collect and discard the capture file
        self.stderr_save.restore()
        if os.path.exists(self.stderr_f):
            # NOTE(review): file handle is never closed explicitly; this
            # relies on refcounting to close it after readlines()
            lines = open( self.stderr_f, "r" ).readlines()
            os.remove( self.stderr_f )

        # restore stderr and throw away any captured output
        self.stderr_save.restore()
        if os.path.exists(self.stderr_f):
            os.remove( self.stderr_f )
class StderrStoreWindows():
    '''No-op stand-in for StderrStore.

    stderr cannot be redirected on Windows, so every operation here
    does nothing and readAndRelease() reports no captured output.
    '''

    def __init__(self):
        pass

    def readAndRelease(self):
        # nothing was captured, so there is nothing to report
        return []

    def release(self):
        pass
# stderr redirection does not work on Windows; substitute the no-op
# implementation there
if platform.system()=='Windows':
    StderrStore = StderrStoreWindows
######################################################################
######################################################################
######################################################################
# valid types for sam headers
# a record type mapped to dict may occur at most once in a header;
# types mapped to list may repeat (see the `header` property)
VALID_HEADER_TYPES = { "HD" : dict,

# order of records within sam headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )

# type conversions within sam header records
VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
                        "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
                        "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
                        "PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }

# output order of fields within records
VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
                       "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
                       "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
                       "PG" : ( "PN", "ID", "VN", "CL" ), }
######################################################################
######################################################################
######################################################################
######################################################################
cdef class Fastafile:
    '''
    A *FASTA* file. The file is automatically opened.

    The file expects an indexed fasta file.

    TODO:
        add automatic indexing.
        add function to get sequence names.
    '''
    # name of the file; owned by this object (strdup'ed in _open)
    cdef char * _filename
    # pointer to fastafile
    cdef faidx_t * fastafile

    def __cinit__(self, *args, **kwargs ):
        self.fastafile = NULL
        self._filename = NULL
        self._open( *args, **kwargs )

        '''return true if samfile has been opened.'''
        return self.fastafile != NULL

        if self.fastafile == NULL:
            raise ValueError( "calling len() on closed file" )
        # number of sequences in the index
        return faidx_fetch_nseq(self.fastafile)

        '''open an indexed fasta file.

        This method expects an indexed fasta file.
        '''
        # close a previously opened file
        if self.fastafile != NULL: self.close()
        if self._filename != NULL: free(self._filename)
        self._filename = strdup(filename)
        self.fastafile = fai_load( filename )

        if self.fastafile == NULL:
            raise IOError("could not open file `%s`" % filename )

        if self.fastafile != NULL:
            fai_destroy( self.fastafile )
            self.fastafile = NULL

    def __dealloc__(self):
        if self._filename != NULL: free(self._filename)

        '''number of :term:`filename` associated with this object.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return self._filename

        '''*(reference = None, start = None, end = None, region = None)*

        fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing.

        The region is specified by :term:`reference`, *start* and *end*.

        fetch returns an empty string if the region is out of range or addresses an unknown *reference*.

        If *reference* is given and *start* is None, the sequence from the
        first base is returned. Similarly, if *end* is None, the sequence
        until the last base is returned.

        Alternatively, a samtools :term:`region` string can be supplied.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if reference is None: raise ValueError( 'no sequence/region supplied.' )
        if start is None: start = 0
        if end is None: end = max_pos -1

        if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
        if start == end: return ""
        # valid ranges are from 0 to 2^29-1
        if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
        if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )

        # note: faidx_fetch_seq has a bug such that out-of-range access
        # always returns the last residue. Hence do not use faidx_fetch_seq,
        # but use fai_fetch instead
        # seq = faidx_fetch_seq(self.fastafile,
        # samtools region strings are 1-based, hence start+1
        region = "%s:%i-%i" % (reference, start+1, end)
        seq = fai_fetch( self.fastafile,
        # samtools adds a '\0' at the end
        seq = fai_fetch( self.fastafile, region, &length )
        # copy the C buffer into a python string of the reported length
        py_seq = PyString_FromStringAndSize(seq, length)

    cdef char * _fetch( self, char * reference, int start, int end, int * length ):
        '''fetch sequence for reference, start and end'''
        return faidx_fetch_seq(self.fastafile,
#------------------------------------------------------------------------
#------------------------------------------------------------------------
#------------------------------------------------------------------------
cdef int count_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch - count number of reads.
    '''
    # *f* points at the caller's int counter
    cdef int* counter = (<int*>f)

# communication structure between mate() and mate_callback()
ctypedef struct MateData:

#------------------------------------------------------------------------
#------------------------------------------------------------------------
#------------------------------------------------------------------------
cdef int mate_callback( bam1_t *alignment, void *f):
    '''callback for bam_fetch = filter mate

    stores a duplicate of the first alignment whose name and flag match
    the MateData passed via *f*.
    '''
    cdef MateData * d = (<MateData*>f)
    # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
    #        d.mate, d.name, bam1_qname(alignment),
    #        d.flag, alignment.core.flag, alignment.core.flag & d.flag)

    # could be sped up by comparing the lengths of query strings first

    # also, make sure that we get the other read by comparing
    if alignment.core.flag & d.flag != 0 and \
            strcmp( bam1_qname( alignment ), d.name ) == 0:
        d.mate = bam_dup1( alignment )
    '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None,
    add_sq_text = False )*

    A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.

    *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
    (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
    Use ``h`` to output header information in text (:term:`TAM`) mode.

    If ``b`` is present, it must immediately follow ``r`` or ``w``.
    Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
    a :term:`BAM` formatted file for reading, type::

        f = pysam.Samfile('ex1.bam','rb')

    If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following

        f1 = pysam.Samfile('ex1.bam' )
        f2 = pysam.Samfile('ex1.sam' )

    If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
    access to reads via :meth:`fetch` and :meth:`pileup` is disabled.

    For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
    sources (see also the samtools format specification):

        1. If *template* is given, the header is copied from a another *Samfile*
           (*template* must be of type *Samfile*).

        2. If *header* is given, the header is built from a multi-level dictionary. The first level
           are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
           being a list of tag-value pairs.

        3. If *text* is given, new header text is copied from raw text.

        4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
           By default, 'SQ' and 'LN' tags will be added to the header text. This option can be
           changed by unsetting the flag *add_sq_text*.
    '''

    def __cinit__(self, *args, **kwargs ):
        self._filename = NULL
        self.isstream = False
        self._open( *args, **kwargs )

        # allocate memory for iterator
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

        '''return true if samfile has been opened.'''
        return self.samfile != NULL

    def _hasIndex( self ):
        '''return true if samfile has an existing (and opened) index.'''
        return self.index != NULL
                     Samfile template = None,
                     referencenames = None,
                     referencelengths = None,
        '''open a sam/bam file.

        If _open is called on an existing bamfile, the current file will be
        closed and a new file will be opened.
        '''
        # read mode autodetection: try binary ('rb') first, fall back to text ('r')
            self._open(filename, 'rb', template=template,
                       referencenames=referencenames,
                       referencelengths=referencelengths,
                       text=text, header=header, port=port)
            except ValueError, msg:
                self._open(filename, 'r', template=template,
                           referencenames=referencenames,
                           referencelengths=referencelengths,
                           text=text, header=header, port=port)

        assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode
        assert filename != NULL

        # close a previously opened file
        if self.samfile != NULL: self.close()

        cdef bam_header_t * header_to_write
        header_to_write = NULL

        if self._filename != NULL: free(self._filename )
        self._filename = strdup( filename )
        # reading from stdin/stdout is signalled by the filename "-"
        self.isstream = strcmp( filename, "-" ) == 0

        self.isbam = len(mode) > 1 and mode[1] == 'b'

        self.isremote = strncmp(filename,"http:",5) == 0 or \
            strncmp(filename,"ftp:",4) == 0

        # open file for writing

            # header structure (used for writing)

            # copy header from another file
                header_to_write = template.samfile.header

                header_to_write = self._buildHeader( header )

                # build header from a target names and lengths
                assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing"
                assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"

                # allocate and fill header
                header_to_write = bam_header_init()
                header_to_write.n_targets = len(referencenames)
                for x in referencenames: n += len(x) + 1
                # NOTE(review): the arrays below are sized with n (summed name
                # lengths), not n_targets -- over-allocates; confirm intent
                header_to_write.target_name = <char**>calloc(n, sizeof(char*))
                header_to_write.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
                for x from 0 <= x < header_to_write.n_targets:
                    header_to_write.target_len[x] = referencelengths[x]
                    name = referencenames[x]
                    header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
                    strncpy( header_to_write.target_name[x], name, len(name) )

                # Optionally, if there is no text, add a SAM compatible header to output
                if text is None and add_sq_text:
                    for x from 0 <= x < header_to_write.n_targets:
                        text += "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] )

                header_to_write.l_text = strlen(ctext)
                # NOTE(review): the text buffer is strlen(ctext) bytes, i.e.
                # without a trailing '\0'; l_text records the same length
                header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
                memcpy( header_to_write.text, ctext, strlen(ctext) )

                header_to_write.hash = NULL
                header_to_write.rg2lib = NULL

            # open file. Header gets written to file at the same time for bam files
            # and sam files (in the latter case, the mode needs to be wh)
            store = StderrStore()
            self.samfile = samopen( filename, mode, header_to_write )

            # bam_header_destroy takes care of cleaning up of all the members
            if not template and header_to_write != NULL:
                bam_header_destroy( header_to_write )

        # open file for reading
            if strncmp( filename, "-", 1) != 0 and \
                    not self.isremote and \
                    not os.path.exists( filename ):
                raise IOError( "file `%s` not found" % filename)

            # try to detect errors
            self.samfile = samopen( filename, mode, NULL )
            if self.samfile == NULL:
                raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)

            if self.samfile.header == NULL:
                raise ValueError( "file does not have valid header (mode='%s') - is it SAM/BAM format?" % mode )

            #disabled for autodetection to work
            # needs to be disabled so that reading from sam-files without headers works
            #if self.samfile.header.n_targets == 0:
            #    raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)

        if self.samfile == NULL:
            raise IOError("could not open file `%s`" % filename )

        # check for index and open if present
        if mode[0] == "r" and self.isbam:
            if not self.isremote:
                if not os.path.exists(filename +".bai"):
                # returns NULL if there is no index or index could not be opened
                self.index = bam_index_load(filename)
                if self.index == NULL:
                    raise IOError("error while opening index `%s` " % filename )
                self.index = bam_index_load(filename)
                if self.index == NULL:
                    raise IOError("error while opening index `%s` " % filename )

            # remember where records start so reset() can return here
            if not self.isstream:
                self.start_offset = bam_tell( self.samfile.x.bam )
    def gettid( self, reference ):
        '''
        convert :term:`reference` name into numerical :term:`tid`

        returns -1 if reference is not known.
        '''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return pysam_reference2tid( self.samfile.header, reference )

    def getrname( self, tid ):
        '''
        convert numerical :term:`tid` into :term:`reference` name.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not 0 <= tid < self.samfile.header.n_targets:
            raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
        return self.samfile.header.target_name[tid]

    cdef char * _getrname( self, int tid ):
        '''
        convert numerical :term:`tid` into :term:`reference` name.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not 0 <= tid < self.samfile.header.n_targets:
            raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
        return self.samfile.header.target_name[tid]

    def _parseRegion( self,
        '''
        parse region information.

        raise ValueError for invalid regions.

        returns a tuple of flag, tid, start and end. Flag indicates
        whether some coordinates were supplied.

        Note that regions are 1-based, while start,end are python coordinates.
        '''
        # This method's main objective is to translate from a reference to a tid.
        # For now, it calls bam_parse_region, which is clumsy. Might be worth
        # implementing it all in pysam (makes use of khash).

        cdef long long rstart
        except OverflowError:
            raise ValueError( 'start out of range (%i)' % start )
        except OverflowError:
            raise ValueError( 'end out of range (%i)' % end )
        # region strings look like "ref:start-end"; split into components
        parts = re.split( "[:-]", region )
        if len(parts) >= 2: rstart = int(parts[1]) - 1
        if len(parts) >= 3: rend = int(parts[2])

        if not reference: return 0, 0, 0, 0

        rtid = self.gettid( reference )
        if rtid < 0: raise ValueError( "invalid reference `%s`" % reference )
        if rstart > rend: raise ValueError( 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend) )
        if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
        if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )

        return 1, rtid, rstart, rend
        '''reset file position to beginning of read section.'''
        return self.seek( self.start_offset, 0 )

    def seek( self, uint64_t offset, int where = 0):
        '''
        move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )
            raise NotImplementedError("seek only available in bam files")
            # NOTE(review): message typo, "no available" -> "not available"
            raise OSError("seek no available in streams")
        return bam_seek( self.samfile.x.bam, offset, where )

        '''
        return current file position
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )
            raise NotImplementedError("seek only available in bam files")
        return bam_tell( self.samfile.x.bam )
        fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
        :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
        be supplied.

        Without *reference* or *region* all mapped reads will be fetched. The reads will be returned
        ordered by reference sequence, which will not necessarily be the order within the file.

        If *until_eof* is given, all reads from the current file position will be returned
        in order as they are within the file. Using this option will also fetch unmapped reads.

        If only *reference* is set, all reads aligned to *reference* will be fetched.

        The method returns an iterator of type :class:`pysam.IteratorRow` unless
        a *callback is provided. If *callback* is given, the callback will be executed
        for each position within the :term:`region`. Note that callbacks currently work
        only, if *region* or *reference* is given.

        Note that a :term:`SAM` file does not allow random access. If *region* or *reference* are given,
        an exception is raised.

        cdef int rtid, rstart, rend, has_coord

        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

        # streams cannot be re-opened
        if self.isstream: reopen = False

        if not until_eof and not self._hasIndex() and not self.isremote:
            raise ValueError( "fetch called on bamfile without index" )

        if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
        if not self._hasIndex(): raise ValueError( "no index available for fetch" )
        return bam_fetch(self.samfile.x.bam,
        return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen )
        return IteratorRowAll( self, reopen=reopen )
        # AH: check - reason why no reopen for AllRefs?
        return IteratorRowAllRefs(self ) # , reopen=reopen )
        # check if header is present - otherwise sam_read1 aborts
        # this happens if a bamfile is opened with mode 'r'
        raise ValueError ("fetching by region is not available for sam files" )
        if self.samfile.header.n_targets == 0:
            raise ValueError( "fetch called for samfile without header")
        raise NotImplementedError( "callback not implemented yet" )
        return IteratorRowAll( self, reopen=reopen )
        '''return the mate of :class:`AlignedRead` *read*.

        Throws a ValueError if read is unpaired or the mate

        Calling this method will change the file position.
        This might interfere with any iterators that have
        not re-opened the file.
        '''
        cdef uint32_t flag = read._delegate.core.flag

        if flag & BAM_FPAIRED == 0:
            raise ValueError( "read %s: is unpaired" % (read.qname))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError( "mate %s: is unmapped" % (read.qname))

        cdef MateData mate_data

        mate_data.name = <char *>bam1_qname(read._delegate)
        mate_data.mate = NULL
        # xor flags to get the other mate
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        mate_data.flag = ( flag ^ x) & x

        # search the mate's coordinates for a matching read
        bam_fetch(self.samfile.x.bam,
                  read._delegate.core.mtid,
                  read._delegate.core.mpos,
                  read._delegate.core.mpos + 1,
        if mate_data.mate == NULL:
            raise ValueError( "mate not found" )

        # wrap the duplicated C record; the new object owns it
        cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
        dest._delegate = mate_data.mate
        '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*

        count reads :term:`region` using 0-based indexing. The region is specified by
        :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.

        Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
        an exception is raised.
        '''
        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

        if not until_eof and not self._hasIndex() and not self.isremote:
            raise ValueError( "fetch called on bamfile without index" )

            raise ValueError( "counting functionality requires a region/reference" )
        if not self._hasIndex(): raise ValueError( "no index available for fetch" )
        # counting happens in the C-level callback, which increments an int
        bam_fetch(self.samfile.x.bam,
        raise ValueError ("count for a region is not available for sam files" )
        perform a :term:`pileup` within a :term:`region`. The region is specified by
        :term:`reference`, *start* and *end* (using 0-based indexing).
        Alternatively, a samtools *region* string can be supplied.

        Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
        ordered by :term:`reference` sequence, which will not necessarily be the order within the file.

        The method returns an iterator of type :class:`pysam.IteratorColumn` unless
        a *callback is provided. If a *callback* is given, the callback will be executed
        for each column within the :term:`region`.

        Note that :term:`SAM` formatted files do not allow random access.
        In these files, if a *region* or *reference* are given an exception is raised.

        Optional *kwargs* to the iterator:

            The stepper controls how the iterator advances.
            Possible options for the stepper are

                use all reads for pileup.

                same filter and read processing as in :term:`csamtools` pileup

            A :class:`FastaFile` object

            Skip all reads with bits set in mask.

            Maximum read depth permitted. The default limit is *8000*.

            *all* reads which overlap the region are returned. The first base returned will be the
            first base of the first read *not* necessarily the first base of the region used in the query.

        cdef int rtid, rstart, rend, has_coord
        cdef bam_plbuf_t *buf

        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

        if not self._hasIndex(): raise ValueError( "no index available for pileup" )

        if not has_coord: raise ValueError( "callback functionality requires a region/reference" )

        # feed all reads in the region into samtools' pileup engine; columns
        # are delivered to *callback* via pileup_callback
        buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
        bam_fetch(self.samfile.x.bam,
                  self.index, rtid, rstart, rend,
                  buf, pileup_fetch_callback )

        # flush the columns still buffered in the pileup engine
        bam_plbuf_push( NULL, buf)
        bam_plbuf_destroy(buf)

        return IteratorColumnRegion( self,
        return IteratorColumnAllRefs(self, **kwargs )

        raise NotImplementedError( "pileup of samfiles not implemented yet" )
        closes the :class:`pysam.Samfile`.'''
        if self.samfile != NULL:
            samclose( self.samfile )
            bam_index_destroy(self.index);

    def __dealloc__( self ):
        # remember: dealloc cannot call other methods
        # note: no doc string
        # note: __del__ is not called.
        if self._filename != NULL: free( self._filename )

    cpdef int write( self, AlignedRead read ) except -1:
        '''
        write a single :class:`pysam.AlignedRead` to disk.

        returns the number of bytes written.
        '''
        if not self._isOpen():
        return samwrite( self.samfile, read._delegate )

    # context-manager protocol: close the file on block exit
    def __enter__(self):
    def __exit__(self, exc_type, exc_value, traceback):
    ###############################################################
    ###############################################################
    ###############################################################

    ###############################################################
        '''number of :term:`filename` associated with this object.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return self._filename

    property nreferences:
        '''number of :term:`reference` sequences in the file.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        return self.samfile.header.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences."""
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        for x from 0 <= x < self.samfile.header.n_targets:
            t.append( self.samfile.header.target_name[x] )

        """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
        :attr:`pysam.Samfile.references`
        """
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        for x from 0 <= x < self.samfile.header.n_targets:
            t.append( self.samfile.header.target_len[x] )

        """total number of mapped reads in file.
        """
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" )
        cdef uint32_t total = 0
        # sum the per-reference mapped counts recorded in the index
        for tid from 0 <= tid < self.samfile.header.n_targets:
            total += pysam_get_mapped( self.index, tid )

        """total number of unmapped reads in file.
        """
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" )
        cdef uint32_t total = 0
        for tid from 0 <= tid < self.samfile.header.n_targets:
            total += pysam_get_unmapped( self.index, tid )
        # get unmapped reads without coordinates
        total += pysam_get_unmapped( self.index, -1 )

        '''full contents of the :term:`sam file` header as a string.'''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
        # header text is not '\0'-terminated; use the recorded length
        return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
        '''header information within the :term:`sam file`. The records and fields are returned as
        a two-level dictionary.
        '''
        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )

        if self.samfile.header.text != NULL:
            # convert to python string (note: call self.text to create 0-terminated string)
            for line in t.split("\n"):
                if not line.strip(): continue
                assert line.startswith("@"), "header line without '@': '%s'" % line
                fields = line[1:].split("\t")
                assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)

                if record not in result: result[record] = []
                result[record].append( "\t".join( fields[1:] ) )

                # the following is clumsy as generators do not work?
                for field in fields[1:]:
                    key, value = field.split(":",1)
                    # uppercase keys must be valid
                    # lowercase are permitted for user fields
                    if key in VALID_HEADER_FIELDS[record]:
                        x[key] = VALID_HEADER_FIELDS[record][key](value)
                    elif not key.isupper():
                    raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )

                # dict-typed records may appear only once; list-typed repeat
                if VALID_HEADER_TYPES[record] == dict:
                    if record in result:
                        raise ValueError( "multiple '%s' lines are not permitted" % record )
                elif VALID_HEADER_TYPES[record] == list:
                    if record not in result: result[record] = []
                    result[record].append( x )

    def _buildLine( self, fields, record ):
        '''build a header line from *fields* dictionary for *record*'''

        # TODO: add checking for field and sort order
        line = ["@%s" % record ]
        line.append( fields )

        # write fields of the specification
        for key in VALID_HEADER_ORDER[record]:
            line.append( "%s:%s" % (key, str(fields[key])))
            # lowercase (user-defined) fields are appended afterwards
            if not key.isupper():
                line.append( "%s:%s" % (key, str(fields[key])))

        return "\t".join( line )
    cdef bam_header_t * _buildHeader( self, new_header ):
        '''return a new header built from a dictionary in *new_header*.

        This method inserts the text field, target_name and target_len.
        '''
        # check if hash exists

        # create new header and copy old data
        cdef bam_header_t * dest

        dest = bam_header_init()

        # serialize records in the canonical order
        for record in VALID_HEADERS:
            if record in new_header:
                ttype = VALID_HEADER_TYPES[record]
                data = new_header[record]
                if type( data ) != type( ttype() ):
                    raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
                # NOTE(review): types.DictType is Python-2 only
                if type( data ) == types.DictType:
                    lines.append( self._buildLine( data, record ) )
                    for fields in new_header[record]:
                        lines.append( self._buildLine( fields, record ) )

        text = "\n".join(lines) + "\n"
        if dest.text != NULL: free( dest.text )
        dest.text = <char*>calloc( len(text), sizeof(char))
        dest.l_text = len(text)
        strncpy( dest.text, text, dest.l_text )

        # collect the reference sequences from the 'SQ' records
        if "SQ" in new_header:
            for fields in new_header["SQ"]:
                seqs.append( (fields["SN"], fields["LN"] ) )
                raise KeyError( "incomplete sequence information in '%s'" % str(fields))

            dest.n_targets = len(seqs)
            dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
            dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )

            for x from 0 <= x < dest.n_targets:
                seqname, seqlen = seqs[x]
                dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
                strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
                dest.target_len[x] = seqlen
1208 ###############################################################
1209 ###############################################################
1210 ###############################################################
1211 ## file-object like iterator access
1212 ## note: concurrent access will cause errors (see IteratorRow
1214 ## Possible solutions: deprecate or open new file handle
1215 ###############################################################
# NOTE(review): this span is the tail of Samfile's file-object-like iterator
# protocol (__iter__ / getCurrent / cnext / next); several `def` headers,
# docstring delimiters and bodies are elided in this listing.
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
# SAM text files need header target information to resolve reference names
if not self.isbam and self.samfile.header.n_targets == 0:
    raise NotImplementedError( "can not iterate over samfile without header")

cdef bam1_t * getCurrent( self ):

cdef int cnext(self):
    # cversion of iterator. Used by :class:`pysam.Samfile.IteratorColumn`.
    # returns the samread() status code (>= 0 on success)
    return samread(self.samfile, self.b)

# python version of next().
ret = samread(self.samfile, self.b)
return makeAlignedRead( self.b )
1243 ##-------------------------------------------------------------------
1244 ##-------------------------------------------------------------------
1245 ##-------------------------------------------------------------------
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`Samfile.fetch` returns an IteratorRow.
    '''
cdef class IteratorRowRegion(IteratorRow):
    """*(Samfile samfile, int tid, int beg, int end, int reopen = True )*

    iterate over mapped reads in a region.

    By default, the file is re-openend to avoid conflicts between
    multiple iterators working on the same file. Set *reopen* = False
    to not re-open *samfile*.

    The samtools iterators assume that the file
    position between iterations do not change.
    As a consequence, no two iterators can work
    on the same file. To permit this, each iterator
    creates its own file handle by re-opening the
    file.

    Note that the index will be shared between
    samfile and the iterator.
    """

    cdef bam_iter_t iter  # iterator state object
    cdef Samfile samfile
    # true if samfile belongs to this object (i.e. was re-opened here)
    cdef int owns_samfile

    def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if not samfile._hasIndex():
            raise ValueError( "no index available for iteration" )

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        if samfile.isbam: mode = "rb"

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        # NOTE(review): the `if reopen:` / `else:` branch structure is elided
        # in this listing; only one of the two fp assignments below runs.
        store = StderrStore()
        self.fp = samopen( samfile._filename, mode, NULL )
        assert self.fp != NULL
        self.owns_samfile = True

        self.fp = self.samfile.samfile
        self.owns_samfile = False

        # query the shared index for the requested region
        self.iter = bam_iter_query(self.samfile.index,
                                   tid, beg, end)
        self.b = bam_init1()

    cdef bam1_t * getCurrent( self ):

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        self.retval = bam_iter_read( self.fp.x.bam,
                                     self.iter, self.b)

    def __next__(self):
        """python version of next().
        """
        if self.retval < 0: raise StopIteration
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        bam_destroy1(self.b)
        # only close the handle we opened ourselves
        if self.owns_samfile: samclose( self.fp )
cdef class IteratorRowAll(IteratorRow):
    """*(Samfile samfile, int reopen = True)*

    iterate over all reads in *samfile*

    By default, the file is re-openend to avoid conflicts between
    multiple iterators working on the same file. Set *reopen* = False
    to not re-open *samfile*.
    """

    # cdef samfile_t * fp
    # # true if samfile belongs to this object
    # cdef int owns_samfile

    def __cinit__(self, Samfile samfile, int reopen = True ):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        if samfile.isbam: mode = "rb"

        # reopen the file to avoid iterator conflict
        # NOTE(review): the `if reopen:` / `else:` branches are elided in
        # this listing; only one of the fp assignments below executes.
        store = StderrStore()
        self.fp = samopen( samfile._filename, mode, NULL )
        assert self.fp != NULL
        self.owns_samfile = True

        self.fp = samfile.samfile
        self.owns_samfile = False

        # allocate memory for alignment
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

    cdef bam1_t * getCurrent( self ):

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # status code from samread; < 0 signals EOF/error
        return samread(self.fp, self.b)

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        ret = samread(self.fp, self.b)
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        bam_destroy1(self.b)
        if self.owns_samfile: samclose( self.fp )
cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each reference
    """
    cdef Samfile samfile
    cdef IteratorRowRegion rowiter

    def __cinit__(self, Samfile samfile):
        assert samfile._isOpen()
        if not samfile._hasIndex(): raise ValueError("no index available for fetch")
        self.samfile = samfile
        # per-reference region iterator covering the whole contig
        # (1<<29 is the samtools maximum coordinate)
        self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, 1<<29)

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        # Create an initial iterator
        if not self.samfile.nreferences:
            raise StopIteration

        self.rowiter.cnext()

        # If current iterator is not exhausted, return aligned read
        if self.rowiter.retval>0:
            return makeAlignedRead(self.rowiter.b)

        # Otherwise, proceed to next reference or stop
        if self.tid<self.samfile.nreferences:
cdef class IteratorRowSelection(IteratorRow):
    """*(Samfile samfile)*

    iterate over reads in *samfile* at a given list of file positions.
    """

    cdef int current_pos
    # true if samfile belongs to this object
    cdef int owns_samfile

    def __cinit__(self, Samfile samfile, positions, int reopen = True ):

        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        # NOTE(review): this _isOpen() check is an exact duplicate of the one
        # above — one of the two should be removed.
        if not samfile._isOpen():
            raise ValueError( "I/O operation on closed file" )

        assert samfile.isbam, "can only use this iterator on bam files"

        # reopen the file to avoid iterator conflict
        # NOTE(review): `mode` is used below but its assignment is not
        # visible in this listing — presumably `mode = "rb"`; confirm.
        store = StderrStore()
        self.fp = samopen( samfile._filename, mode, NULL )
        assert self.fp != NULL
        self.owns_samfile = True

        self.fp = samfile.samfile
        self.owns_samfile = False

        # allocate memory for alignment
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))

        self.positions = positions
        self.current_pos = 0

    cdef bam1_t * getCurrent( self ):

    cdef int cnext(self):
        '''cversion of iterator'''

        # end iteration if out of positions
        if self.current_pos >= len(self.positions): return -1

        # jump directly to the stored virtual file offset, then read
        bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
        self.current_pos += 1
        return samread(self.fp, self.b)

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef int ret = self.cnext()
        return makeAlignedRead( self.b )

    def __dealloc__(self):
        bam_destroy1(self.b)
        if self.owns_samfile: samclose( self.fp )
1534 ##-------------------------------------------------------------------
1535 ##-------------------------------------------------------------------
1536 ##-------------------------------------------------------------------
# shared state passed (as void*) into the bam_plp_* callback functions
ctypedef struct __iterdata:

cdef int __advance_all( void * data, bam1_t * b ):
    '''advance without any read filtering.
    '''
    d = <__iterdata*>data
    # delegate straight to the region iterator; no filtering applied
    return bam_iter_read( d.samfile.x.bam, d.iter, b )
cdef int __advance_snpcalls( void * data, bam1_t * b ):
    '''advance using same filter and read processing as in
    the samtools pileup.
    '''
    d = <__iterdata*>data

    cdef int ret = bam_iter_read( d.samfile.x.bam, d.iter, b )

    # NOTE(review): these are effectively hard-coded feature switches
    # (BAQ on, no mapQ capping); the elided lines may set them differently.
    cdef int is_nobaq = 0
    cdef int capQ_thres = 0

    # lazily (re-)fetch the reference sequence when the read's tid changes
    if d.fastafile != NULL and b.core.tid != d.tid:
        if d.seq != NULL: free(d.seq)
        d.seq = faidx_fetch_seq(d.fastafile,
                                d.samfile.header.target_name[d.tid],
                                0, max_pos, &d.seq_len)
        raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
                              (d.samfile.header.target_name[d.tid],
                               d.tid))

    # realign read - changes base qualities (BAQ, as in samtools)
    if d.seq != NULL and is_cns and not is_nobaq: bam_prob_realn( b, d.seq )

    # optionally cap mapping quality based on alignment quality
    if d.seq != NULL and capQ_thres > 10:
        q = bam_cap_mapQ(b, d.seq, capQ_thres)
        elif b.core.qual > q: b.core.qual = q
    # skip unmapped reads and paired reads that are not in a proper pair
    if b.core.flag & BAM_FUNMAP: skip = 1
    elif b.core.flag & 1 and not b.core.flag & 2: skip = 1

    # additional filters
    # fetch the next read and loop again while the current one is skipped
    ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behavious. For example,
    consider the conversion to a list::

       f = Samfile("file.bam", "rb")
       result = list( f.pileup() )

    Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
    but each object in ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

       result = [ x.pileups() for x in f.pileup() ]

    ``result`` will be a list of ``n`` lists of objects of type :class:`PileupRead`.

    If the iterator is associated with a :class:`Fastafile` using the :meth:`addReference`
    method, then the iterator will export the current sequence via the methods :meth:`getSequence`
    and :meth:`seq_len`.

    Optional kwargs to the iterator:

    stepper
       The stepper controlls how the iterator advances.
       Possible options for the stepper are

       "all"
          use all reads for pileup.
       "samtools"
          same filter and read processing as in :term:`csamtools` pileup
    fastafile
       A :class:`FastaFile` object
    mask
       Skip all reads with bits set in mask.
    max_depth
       maximum read depth. The default is 8000.
    '''

    # result of the last plbuf_push
    cdef IteratorRowRegion iter
    cdef const_bam_pileup1_t_ptr plp
    cdef bam_plp_t pileup_iter
    cdef __iterdata iterdata
    cdef Samfile samfile
    cdef Fastafile fastafile

    def __cinit__( self, Samfile samfile, **kwargs ):
        self.samfile = samfile
        self.mask = kwargs.get("mask", BAM_DEF_MASK )
        self.fastafile = kwargs.get( "fastafile", None )
        self.stepper = kwargs.get( "stepper", None )
        self.max_depth = kwargs.get( "max_depth", 8000 )
        self.iterdata.seq = NULL
        # NULL until setupIteratorData() builds the pileup engine
        self.pileup_iter = <bam_plp_t>NULL

    cdef int cnext(self):
        '''perform next iteration.

        This method is analogous to the samtools bam_plp_auto method.
        It has been re-implemented to permit for filtering.
        '''
        self.plp = bam_plp_auto( self.pileup_iter,
                                 &self.tid, &self.pos, &self.n_plp )

    cdef char * getSequence( self ):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self): return self.iterdata.seq_len

    def addReference( self, Fastafile fastafile ):
        '''
        add reference sequences in *fastafile* to iterator.'''
        self.fastafile = fastafile
        # invalidate any cached sequence; it belonged to the old reference
        if self.iterdata.seq != NULL: free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def hasReference( self ):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef setMask( self, mask ):
        '''set masking flag in iterator.

        reads with bits set in *mask* will be skipped.
        '''
        bam_plp_set_mask( self.pileup_iter, self.mask )

    cdef setupIteratorData( self,
                            int tid,
                            int start,
                            int end,
                            int reopen = 0 ):
        '''setup the iterator structure'''

        self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen )
        self.iterdata.samfile = self.samfile.samfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1

        if self.fastafile != None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # select the advance callback according to the stepper option
        if self.stepper == None or self.stepper == "all":
            self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
        elif self.stepper == "samtools":
            self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
        else:
            raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        bam_plp_set_maxcnt( self.pileup_iter, self.max_depth )
        bam_plp_set_mask( self.pileup_iter, self.mask )

    cdef reset( self, tid, start, end ):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.iterdata.seq != NULL: free( self.iterdata.seq )
        self.iterdata.seq = NULL
        self.iterdata.tid = -1

        # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
        bam_plp_reset(self.pileup_iter)

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators that have
        # not been fully consumed
        if self.pileup_iter != <bam_plp_t>NULL:
            bam_plp_reset(self.pileup_iter)
            bam_plp_destroy(self.pileup_iter)
            self.pileup_iter = <bam_plp_t>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL
cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self, Samfile samfile,
                  int tid,
                  int start,
                  int end,
                  **kwargs ):

        # initialize iterator; always reopen (last arg = 1) so concurrent
        # iterators on the same Samfile do not interfere
        self.setupIteratorData( tid, start, end, 1 )

    def __next__(self):
        """python version of next().
        """
        # negative n_plp signals an error inside the pileup engine
        raise ValueError("error during iteration" )

        if self.plp == NULL:
            raise StopIteration

        return makePileupProxy( <bam_pileup1_t*>self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp )
cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  Samfile samfile,
                  **kwargs ):

        # no iteration over empty files
        if not samfile.nreferences: raise StopIteration

        # initialize iterator over the first reference, full length
        self.setupIteratorData( self.tid, 0, max_pos, 1 )

    def __next__(self):
        """python version of next().
        """
        # negative n_plp signals an error inside the pileup engine
        raise ValueError("error during iteration" )

        # return result, if within same reference
        if self.plp != NULL:
            return makePileupProxy( <bam_pileup1_t*>self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp )

        # otherwise, proceed to next reference or stop
        if self.tid < self.samfile.nreferences:
            self.setupIteratorData( self.tid, 0, max_pos, 0 )
1844 ##-------------------------------------------------------------------
1845 ##-------------------------------------------------------------------
1846 ##-------------------------------------------------------------------
cdef inline int32_t query_start(bam1_t *src) except -1:
    # Return the 0-based index of the first aligned (non-clipped) base of
    # the query sequence, scanning CIGAR from the 5' end.
    cdef uint32_t * cigar_p, op
    cdef uint32_t start_offset = 0

    if src.core.n_cigar:
        cigar_p = bam1_cigar(src);
        for k from 0 <= k < src.core.n_cigar:
            op = cigar_p[k] & BAM_CIGAR_MASK
            if op==BAM_CHARD_CLIP:
                # hard clips are only legal at the ends of the alignment
                if start_offset!=0 and start_offset!=src.core.l_qseq:
                    PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
                    # NOTE(review): a `return -1` presumably follows here
                    # (elided) — `except -1` needs it for the error to surface.
            elif op==BAM_CSOFT_CLIP:
                # soft-clipped bases are present in seq but not aligned
                start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
1867 ##-------------------------------------------------------------------
1868 ##-------------------------------------------------------------------
1869 ##-------------------------------------------------------------------
cdef inline int32_t query_end(bam1_t *src) except -1:
    # Return the 0-based exclusive end index of the aligned (non-clipped)
    # part of the query, scanning CIGAR from the 3' end.
    cdef uint32_t * cigar_p, op
    cdef uint32_t end_offset = src.core.l_qseq

    # NOTE(review): `n_cigar>1` means a single-op CIGAR (e.g. "10S") skips
    # clip handling entirely, and the `k >= 1` loop never examines op 0 —
    # presumably intentional (op 0 is handled by query_start), but verify.
    if src.core.n_cigar>1:
        cigar_p = bam1_cigar(src);
        for k from src.core.n_cigar > k >= 1:
            op = cigar_p[k] & BAM_CIGAR_MASK
            if op==BAM_CHARD_CLIP:
                # hard clips are only legal at the ends of the alignment
                if end_offset!=0 and end_offset!=src.core.l_qseq:
                    PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
                    # NOTE(review): a `return -1` presumably follows (elided)
            elif op==BAM_CSOFT_CLIP:
                end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT

    # fallback when no clipping information applied
    end_offset = src.core.l_qseq
cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
    # Decode the 4-bit packed sequence of *src* into an ASCII string for
    # query positions [start, end).
    if not src.core.l_qseq:
        return None

    seq = PyString_FromStringAndSize(NULL, end-start)
    s   = PyString_AS_STRING(seq)

    for k from start <= k < end:
        # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
        # note: do not use string literal as it will be a python string
        # two bases per byte; high nibble first
        s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
    # Build the phred+33 ASCII quality string for query positions
    # [start, end); qualities are stored raw (no offset) in the BAM record.
    qual = PyString_FromStringAndSize(NULL, end-start)
    q    = PyString_AS_STRING(qual)

    for k from start <= k < end:
        ## equivalent to t[i] + 33 (see bam.c)
        q[k-start] = p[k] + 33
cdef class AlignedRead:
    '''
    Class representing an aligned read. see SAM format specification for
    the meaning of fields (http://samtools.sourceforge.net/).

    This class stores a handle to the samtools C-structure representing
    an aligned read. Member read access is forwarded to the C-structure
    and converted into python objects. This implementation should be fast,
    as only the data needed is converted.

    For write access, the C-structure is updated in-place. This is
    not the most efficient way to build BAM entries, as the variable
    length data is concatenated and thus needs to resized if
    a field is updated. Furthermore, the BAM entry might be
    in an inconsistent state. The :meth:`~validate` method can
    be used to check if an entry is consistent.

    One issue to look out for is that the sequence should always
    be set *before* the quality scores. Setting the sequence will
    also erase any quality scores that were set previously.
    '''
# Now only called when instances are created from Python
# (i.e. __cinit__; reads coming from a file use makeAlignedRead instead)
self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
# allocate some memory
# If size is 0, calloc does not return a pointer that can be passed to free()
# so allocate 40 bytes for a new read
self._delegate.m_data = 40
self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
self._delegate.data_len = 0

def __dealloc__(self):
    # frees both the bam1_t struct and its variable-length data block
    bam_destroy1(self._delegate)
"""return string representation of alignment.

The representation is an approximate :term:`sam` format.

An aligned read might not be associated with a :term:`Samfile`.
As a result :term:`tid` is shown instead of the reference name.

Similarly, the tags field is returned in its parsed state.
"""
# sam-parsing is done in sam.c/bam_format1_core which
# requires a valid header.
return "\t".join(map(str, (self.qname,

def compare(self, AlignedRead other):
    '''return -1,0,1, if contents in this are binary <,=,> to *other*'''

    # uncomment for debugging purposes
    # cdef unsigned char * oo, * tt
    # tt = <unsigned char*>(&t.core)
    # oo = <unsigned char*>(&o.core)
    # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
    # tt = <unsigned char*>(t.data)
    # oo = <unsigned char*>(o.data)
    # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])

    # Fast-path test for object identity
    # compare fixed-length core fields first, then the variable data block
    retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))

    if retval: return retval
    # NOTE(review): `cmp()` is Python-2 only; a Py3 port needs
    # (t.data_len > o.data_len) - (t.data_len < o.data_len).
    retval = cmp(t.data_len, o.data_len)
    if retval: return retval
    return memcmp(t.data, o.data, t.data_len)

# Disabled so long as __cmp__ is a special method
# (identity-based hash, matching default object semantics)
return _Py_HashPointer(<void *>self)
"""the query name (None if not present)"""
def __get__(self):
    src = self._delegate
    if src.core.l_qname == 0: return None
    return <char *>bam1_qname( src )

def __set__(self, qname ):
    # silently ignore empty names (keeps existing name untouched)
    if qname == None or len(qname) == 0: return
    src = self._delegate
    p = bam1_qname( src )

    # the qname is \0 terminated
    # resize the variable-data block in place to fit the new name
    pysam_bam_update( src,
                      src.core.l_qname,
                      l,
                      <uint8_t*>p )
    src.core.l_qname = l

    # re-acquire pointer to location in memory
    # as it might have moved
    strncpy( p, qname, l )
"""the :term:`cigar` alignment (None if not present).
"""
def __get__(self):
    cdef uint32_t * cigar_p
    src = self._delegate
    if src.core.n_cigar == 0: return None

    # decode packed 32-bit CIGAR ops into (operation, length) tuples
    cigar_p = bam1_cigar(src);
    for k from 0 <= k < src.core.n_cigar:
        op = cigar_p[k] & BAM_CIGAR_MASK
        l = cigar_p[k] >> BAM_CIGAR_SHIFT
        cigar.append((op, l))

def __set__(self, values ):
    # silently ignore empty values (keeps existing cigar untouched)
    if values == None or len(values) == 0: return
    src = self._delegate

    # get location of cigar string

    # create space for cigar data within src.data
    pysam_bam_update( src,
                      src.core.n_cigar * 4,
                      len(values) * 4,
                      <uint8_t*>p )

    # length is number of cigar operations, not bytes
    src.core.n_cigar = len(values)

    # re-acquire pointer to location in memory
    # as it might have moved

    # insert cigar operations (re-packing (op, l) tuples)
    for op, l in values:
        p[k] = l << BAM_CIGAR_SHIFT | op

    ## setting the cigar string also updates the "bin" attribute
    src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
"""read sequence bases, including :term:`soft clipped` bases (None if not present)"""
def __get__(self):
    src = self._delegate

    if src.core.l_qseq == 0: return None

    return get_seq_range(src, 0, src.core.l_qseq)

def __set__(self,seq):
    # samtools manages sequence and quality length memory together
    # if no quality information is present, the first byte says 0xff.
    if seq == None or len(seq) == 0: return

    cdef int l, k, nbytes_new, nbytes_old

    src = self._delegate

    # as the sequence is stored in half-bytes, the total length (sequence
    # plus quality scores) is (l+1)/2 + l
    nbytes_new = (l+1)/2 + l
    nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
    # acquire pointer to location in memory

    # resize the variable-data block for the new seq+qual payload
    pysam_bam_update( src,
                      nbytes_old,
                      nbytes_new,
                      p )

    # re-acquire pointer to location in memory
    # as it might have moved
    # zero the payload before packing the 4-bit encoded bases
    for k from 0 <= k < nbytes_new: p[k] = 0
    # convert to C string
    for k from 0 <= k < l:
        p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)

    # erase qualities: setting seq invalidates any previous qual values
    p = bam1_qual( src )
"""read sequence base qualities, including :term:`soft clipped` bases (None if not present)"""
def __get__(self):
    src = self._delegate

    if src.core.l_qseq == 0: return None

    return get_qual_range(src, 0, src.core.l_qseq)

def __set__(self,qual):
    # note that space is already allocated via the sequences
    src = self._delegate
    p = bam1_qual( src )
    if qual == None or len(qual) == 0:
        # if absent - set to 0xff
        return

    # convert to C string; qual length must match the stored sequence
    if src.core.l_qseq != l:
        raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
    # NOTE(review): this assert is redundant — the raise above already
    # guarantees equality; it can be removed.
    assert src.core.l_qseq == l
    # store raw phred values (input is phred+33 ASCII)
    for k from 0 <= k < l:
        p[k] = <uint8_t>q[k] - 33
"""aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present)

SAM/BAM files may included extra flanking bases sequences that were
not part of the alignment.  These bases may be the result of the
Smith-Waterman or other algorithms, which may not require alignments
that begin at the first residue or end at the last.  In addition,
extra sequencing adapters, multiplex identifiers, and low-quality bases that
were not considered for alignment may have been retained."""
def __get__(self):
    cdef uint32_t start, end
    src = self._delegate

    if src.core.l_qseq == 0: return None

    # clip boundaries are derived from the CIGAR soft-clip operations
    start = query_start(src)
    end   = query_end(src)

    return get_seq_range(src, start, end)

"""aligned query sequence quality values (None if not present)"""
def __get__(self):
    cdef uint32_t start, end
    src = self._delegate

    if src.core.l_qseq == 0: return None

    start = query_start(src)
    end   = query_end(src)

    return get_qual_range(src, start, end)

"""start index of the aligned query portion of the sequence (0-based, inclusive)"""
def __get__(self):
    return query_start(self._delegate)

"""end index of the aligned query portion of the sequence (0-based, exclusive)"""
def __get__(self):
    return query_end(self._delegate)

"""Length of the aligned query sequence"""
def __get__(self):
    src = self._delegate
    return query_end(src)-query_start(src)
"""the tags in the AUX field.

This property permits convenience access to
the tags. Changes it the returned list will
not update the tags automatically. Instead,
the following is required for adding a
new tag::

    read.tags = read.tags + [("RG",0)]

This method will happily write the same tag
multiple times.
"""
def __get__(self):
    src = self._delegate
    if src.l_aux == 0: return []

    # walk the raw aux byte stream: 2-byte tag, 1-byte type, then value
    while s < (src.data + src.data_len):
        if auxtype in ('c', 'C'):
            value = <int>bam_aux2i(s)
        elif auxtype in ('s', 'S'):
            value = <int>bam_aux2i(s)
        elif auxtype in ('i', 'I'):
            # NOTE(review): BUG — 'i'/'I' are *integer* aux types but the
            # value is cast to <float>, silently converting ints to floats.
            # Should be <int>bam_aux2i(s), matching the 'c'/'s' cases above.
            value = <float>bam_aux2i(s)
        elif auxtype == 'f':
            value = <float>bam_aux2f(s)
        elif auxtype == 'd':
            value = <double>bam_aux2d(s)
        elif auxtype == 'A':
            value = "%c" % <char>bam_aux2A(s)
        elif auxtype in ('Z', 'H'):
            value = <char*>bam_aux2Z(s)
            # +1 for NULL terminated string

        result.append( (auxtag, value) )
def __set__(self, tags):
    cdef uint8_t * new_data
    cdef int guessed_size, control_size
    cdef int max_size, size, offset

    src = self._delegate

    # map samtools code to python.struct code and byte size
    buffer = ctypes.create_string_buffer(max_size)

    for pytag, value in tags:
        # choose the narrowest BAM integer type that can hold *value*
        if t == types.FloatType:
            fmt, pytype = "<cccf", 'f'
        elif t == types.IntType:
            # NOTE(review): only *lower* bounds are checked on this signed
            # path — e.g. value=200 selects 'c' (signed byte) and overflows.
            # Upper-bound checks (<=127, <=32767) are needed as well.
            if value >= -127: fmt, pytype = "<cccb", 'c'
            elif value >= -32767: fmt, pytype = "<ccch", 's'
            elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
            # NOTE(review): `'i'[0]` is just 'i' — the indexing is a no-op.
            else: fmt, pytype = "<ccci", 'i'[0]
        # unsigned path (elided branch header)
        if value <= 255: fmt, pytype = "<cccB", 'C'
        elif value <= 65535: fmt, pytype = "<cccH", 'S'
        elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
        else: fmt, pytype = "<cccI", 'I'

        # Note: hex strings (H) are not supported yet
        fmt, pytype = "<cccc", 'A'
        # strings are packed with their trailing NUL byte
        fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'

        size = struct.calcsize(fmt)
        if offset + size > max_size:
            raise NotImplementedError("tags field too large")

        struct.pack_into( fmt,
                          buffer,
                          offset )

    # delete the old data and allocate new
    # if offset == 0, the aux field will be
    # empty
    pysam_bam_update( src,
                      src.l_aux,
                      offset,
                      bam1_aux( src ) )

    # copy data only if there is any

    # get location of new data

    # check if there is direct path from buffer.raw to tmp
    memcpy( s, temp, offset )
"""properties flag"""
# raw SAM FLAG bitfield; see the BAM_F* constants
def __get__(self): return self._delegate.core.flag
def __set__(self, flag): self._delegate.core.flag = flag

"""
DEPRECATED from pysam-0.4 - use tid in the future.
The rname field caused a lot of confusion as it returns
the :term:`target` ID instead of the reference sequence
name.

This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
"""
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid

"""
This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
"""
# `tid` is the preferred, non-deprecated accessor for the same field
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid
"""0-based leftmost coordinate"""
def __get__(self): return self._delegate.core.pos
def __set__(self, pos):
    ## setting the position also updates the "bin" attribute.
    ## FIX(review): assign the new position *before* computing the bin;
    ## the original computed bam_reg2bin from the stale src.core.pos and
    ## only assigned the new pos afterwards, leaving core.bin based on
    ## the previous coordinate.
    cdef bam1_t * src
    src = self._delegate
    src.core.pos = pos
    if src.core.n_cigar:
        # with a CIGAR, bin covers [pos, alignment end)
        src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
    else:
        # without a CIGAR, treat the read as length 1 for binning
        src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
"""properties bin"""
# BAI index bin of the alignment (computed via bam_reg2bin)
def __get__(self): return self._delegate.core.bin
def __set__(self, bin): self._delegate.core.bin = bin

'''length of the read (read only). Returns 0 if not given.'''
def __get__(self): return self._delegate.core.l_qseq

'''aligned end position of the read on the reference genome.  Returns
None if not available.'''
def __get__(self):
    src = self._delegate
    # unmapped reads / missing CIGAR have no defined end position
    if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
        return None
    return bam_calend(&src.core, bam1_cigar(src))

'''aligned length of the read on the reference genome.  Returns None if
not available.'''
def __get__(self):
    src = self._delegate
    if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
        return None
    # reference span = CIGAR-derived end minus start position
    return bam_calend(&src.core,
                       bam1_cigar(src)) - \
                       self._delegate.core.pos
"""mapping quality"""
def __get__(self): return self._delegate.core.qual
def __set__(self, qual): self._delegate.core.qual = qual

"""the :term:`reference` id of the mate
deprecated, use RNEXT instead.
"""
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid

"""the :term:`reference` id of the mate """
# preferred accessor for the same core.mtid field as the deprecated mrnm
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid

"""the position of the mate
deprecated, use PNEXT instead."""
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos

"""the position of the mate"""
# preferred accessor for the same core.mpos field as the deprecated mpos
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos

"""the insert size
deprecated: use tlen instead"""
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize

"""the insert size"""
# preferred accessor for the same core.isize field as the deprecated isize
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize
"""true if read is paired in sequencing"""
def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
def __set__(self,val):
    if val: self._delegate.core.flag |= BAM_FPAIRED
    else: self._delegate.core.flag &= ~BAM_FPAIRED

property is_proper_pair:
    """true if read is mapped in a proper pair"""
    def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
        else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR

property is_unmapped:
    """true if read itself is unmapped"""
    def __get__(self): return (self.flag & BAM_FUNMAP) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FUNMAP
        else: self._delegate.core.flag &= ~BAM_FUNMAP

property mate_is_unmapped:
    """true if the mate is unmapped"""
    def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FMUNMAP
        else: self._delegate.core.flag &= ~BAM_FMUNMAP

property is_reverse:
    """true if read is mapped to reverse strand"""
    def __get__(self): return (self.flag & BAM_FREVERSE) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FREVERSE
        else: self._delegate.core.flag &= ~BAM_FREVERSE

property mate_is_reverse:
    """true if the mate is mapped to the reverse strand"""
    def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
    def __set__(self,val):
        if val: self._delegate.core.flag |= BAM_FMREVERSE
        else: self._delegate.core.flag &= ~BAM_FMREVERSE
2541 """true if this is read1"""
2542 def __get__(self): return (self.flag & BAM_FREAD1) != 0
2543 def __set__(self,val):
2544 if val: self._delegate.core.flag |= BAM_FREAD1
2545 else: self._delegate.core.flag &= ~BAM_FREAD1
2547 """true if this is read2"""
2548 def __get__(self): return (self.flag & BAM_FREAD2) != 0
2549 def __set__(self,val):
2550 if val: self._delegate.core.flag |= BAM_FREAD2
2551 else: self._delegate.core.flag &= ~BAM_FREAD2
2552 property is_secondary:
2553 """true if not primary alignment"""
2554 def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
2555 def __set__(self,val):
2556 if val: self._delegate.core.flag |= BAM_FSECONDARY
2557 else: self._delegate.core.flag &= ~BAM_FSECONDARY
2559 """true if QC failure"""
2560 def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
2561 def __set__(self,val):
2562 if val: self._delegate.core.flag |= BAM_FQCFAIL
2563 else: self._delegate.core.flag &= ~BAM_FQCFAIL
2564 property is_duplicate:
2565 """true if optical or PCR duplicate"""
2566 def __get__(self): return (self.flag & BAM_FDUP) != 0
2567 def __set__(self,val):
2568 if val: self._delegate.core.flag |= BAM_FDUP
2569 else: self._delegate.core.flag &= ~BAM_FDUP
2571 """a list of reference positions that this read aligns to."""
2573 cdef uint32_t k, i, pos
2575 cdef uint32_t * cigar_p
2579 src = self._delegate
2580 if src.core.n_cigar == 0: return []
2584 cigar_p = bam1_cigar(src)
2585 for k from 0 <= k < src.core.n_cigar:
2586 op = cigar_p[k] & BAM_CIGAR_MASK
2587 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2588 if op == BAM_CMATCH:
2589 for i from pos <= i < pos + l:
2592 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2597 def overlap( self, uint32_t start, uint32_t end ):
2598 """return number of aligned bases of read overlapping the interval *start* and *end*
2599 on the reference sequence.
2601 cdef uint32_t k, i, pos, overlap
2603 cdef uint32_t * cigar_p
2608 src = self._delegate
2609 if src.core.n_cigar == 0: return 0
2613 cigar_p = bam1_cigar(src)
2614 for k from 0 <= k < src.core.n_cigar:
2615 op = cigar_p[k] & BAM_CIGAR_MASK
2616 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2618 if op == BAM_CMATCH:
2619 o = min( pos + l, end) - max( pos, start )
2620 if o > 0: overlap += o
2622 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
2628 """retrieves optional data given a two-letter *tag*"""
2629 #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
2631 v = bam_aux_get(self._delegate, tag)
2632 if v == NULL: raise KeyError( "tag '%s' not present" % tag )
2634 if type == 'c' or type == 'C' or type == 's' or type == 'S':
2635 return <int>bam_aux2i(v)
2636 elif type == 'i' or type == 'I':
2637 return <int32_t>bam_aux2i(v)
2638 elif type == 'f' or type == 'F':
2639 return <float>bam_aux2f(v)
2640 elif type == 'd' or type == 'D':
2641 return <double>bam_aux2d(v)
2643 # there might a more efficient way
2644 # to convert a char into a string
2645 return '%c' % <char>bam_aux2A(v)
2647 return <char*>bam_aux2Z(v)
2649 def fancy_str (self):
2650 """returns list of fieldnames/values in pretty format for debugging
2654 "tid": "Contig index",
2655 "pos": "Mapped position on contig",
2656 "mtid": "Contig index for mate pair",
2657 "mpos": "Position of mate pair",
2658 "isize": "Insert size",
2659 "flag": "Binary flag",
2660 "n_cigar": "Count of cigar entries",
2661 "cigar": "Cigar entries",
2662 "qual": "Mapping quality",
2663 "bin": "Bam index bin number",
2664 "l_qname": "Length of query name",
2665 "qname": "Query name",
2666 "l_qseq": "Length of query sequence",
2667 "qseq": "Query sequence",
2668 "bqual": "Quality scores",
2669 "l_aux": "Length of auxilary data",
2670 "m_data": "Maximum data length",
2671 "data_len": "Current data length",
2673 fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
2674 "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
2675 "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
2677 for f in fields_names_in_order:
2678 if not f in self.__dict__:
2680 ret_string.append("%-30s %-10s= %s" % (field_names[f], "(" + f + ")", self.__getattribute__(f)))
2682 for f in self.__dict__:
2683 if not f in field_names:
2684 ret_string.append("%-30s %-10s= %s" % (f, "", self.__getattribute__(f)))
2687 cdef class PileupProxy:
2688 '''A pileup column. A pileup column contains
2689 all the reads that map to a certain target base.
2692 chromosome ID as is defined in the header
2694 the target base coordinate (0-based)
2696 number of reads mapping to this column
2698 list of reads (:class:`pysam.PileupRead`) aligned to this column
2700 This class is a proxy for results returned by the samtools pileup engine.
2701 If the underlying engine iterator advances, the results of this column
2704 cdef bam_pileup1_t * plp
2710 raise TypeError("This class cannot be instantiated from Python")
2713 return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
2715 "\n".join( map(str, self.pileups) )
2718 '''the chromosome ID as is defined in the header'''
2719 def __get__(self): return self.tid
2722 '''number of reads mapping to this column.'''
2723 def __get__(self): return self.n_pu
2724 def __set__(self, n): self.n_pu = n
2727 def __get__(self): return self.pos
2730 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
2734 # warning: there could be problems if self.n and self.buf are
2736 for x from 0 <= x < self.n_pu:
2737 pileups.append( makePileupRead( &(self.plp[x])) )
2740 cdef class PileupRead:
2741 '''A read aligned to a column.
2745 AlignedRead _alignment
2754 raise TypeError("This class cannot be instantiated from Python")
2757 return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
2760 """a :class:`pysam.AlignedRead` object of the aligned read"""
2762 return self._alignment
2764 """position of the read base at the pileup site, 0-based"""
2768 """indel length; 0 for no indel, positive for ins and negative for del"""
2772 """1 iff the base on the padded read is a deletion"""
2777 return self._is_head
2780 return self._is_tail
2786 '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
2787 def __init__(self, id = 1):
2791 def setdevice(self, filename):
2792 '''open an existing file, like "/dev/null"'''
2793 fd = os.open(filename, os.O_WRONLY)
2796 def setfile(self, filename):
2797 '''open a new file.'''
2798 fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
    def setfd(self, fd):
        '''redirect the stream on unit *self.id* to the already-open file
        descriptor *fd*.

        The current descriptor on *self.id* is duplicated and pushed onto
        ``self.streams`` so a later ``restore()`` can undo the redirection.
        *fd* itself is closed after being duplicated onto *self.id*, so the
        caller must not use it afterwards.

        NOTE(review): flushes both sys.stdout and sys.stderr regardless of
        which unit *self.id* refers to — presumably to make sure no buffered
        data ends up on the new stream.
        '''
        ofd = os.dup(self.id)      # Save old stream on new unit.
        self.streams.append(ofd)
        sys.stdout.flush()         # Buffered data goes to old stream.
        sys.stderr.flush()         # Buffered data goes to old stream.
        os.dup2(fd, self.id)       # Open unit 1 on new stream.
        os.close(fd)               # Close other unit (look out, caller.)
2810 '''restore previous output stream'''
2812 # the following was not sufficient, hence flush both stderr and stdout
2813 # os.fsync( self.id )
2816 os.dup2(self.streams[-1], self.id)
2817 os.close(self.streams[-1])
2818 del self.streams[-1]
2820 def _samtools_dispatch( method,
2822 catch_stdout = True,
2823 catch_stderr = False,
2825 '''call ``method`` in samtools providing arguments in args.
2828 This method redirects stdout and (optionally) stderr to capture it
2829 from samtools. If for some reason stdout/stderr disappears
2830 the reason might be in this method.
2833 The current implementation might only work on linux.
2836 This method captures stdout and stderr using temporary files,
2837 which are then read into memory in their entirety. This method
2838 is slow and might cause large memory overhead.
2840 See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
2841 on the topic of redirecting stderr/stdout.
2844 # note that debugging this module can be a problem
2845 # as stdout/stderr will not appear on the terminal
2847 # some special cases
2848 if method == "index":
2849 if not os.path.exists( args[0] ):
2850 raise IOError( "No such file or directory: '%s'" % args[0] )
2852 # redirect stderr and stdout to file
2854 stderr_h, stderr_f = tempfile.mkstemp()
2855 stderr_save = Outs( sys.stderr.fileno() )
2856 stderr_save.setfd( stderr_h )
2859 stdout_h, stdout_f = tempfile.mkstemp()
2860 stdout_save = Outs( sys.stdout.fileno() )
2861 stdout_save.setfd( stdout_h )
2863 # patch for `samtools view`
2864 # samtools `view` closes stdout, from which I can not
2865 # recover. Thus redirect output to file with -o option.
2866 if method == "view":
2867 if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
2868 args = ( "-o", stdout_f ) + args
2870 # do the function call to samtools
2872 cdef int i, n, retval
2875 # allocate two more for first (dummy) argument (contains command)
2876 cargs = <char**>calloc( n+2, sizeof( char *) )
2877 cargs[0] = "samtools"
2879 for i from 0 <= i < n: cargs[i+2] = args[i]
2881 retval = pysam_dispatch(n+2, cargs)
2884 # restore stdout/stderr. This will also flush, so
2885 # needs to be before reading back the file contents
2887 stdout_save.restore()
2888 out_stdout = open( stdout_f, "r").readlines()
2889 os.remove( stdout_f )
2894 stderr_save.restore()
2895 out_stderr = open( stderr_f, "r").readlines()
2896 os.remove( stderr_f )
2900 return retval, out_stderr, out_stdout
2903 '''the results of a SNP call.'''
2906 cdef char _reference_base
2908 cdef int _consensus_quality
2909 cdef int _snp_quality
2910 cdef int _rms_mapping_quality
2914 '''the chromosome ID as is defined in the header'''
2919 '''nucleotide position of SNP.'''
2920 def __get__(self): return self._pos
2922 property reference_base:
2923 '''reference base at pos. ``N`` if no reference sequence supplied.'''
2924 def __get__(self): return PyString_FromStringAndSize( &self._reference_base, 1 )
2927 '''the genotype called.'''
2928 def __get__(self): return PyString_FromStringAndSize( &self._genotype, 1 )
2930 property consensus_quality:
2931 '''the genotype quality (Phred-scaled).'''
2932 def __get__(self): return self._consensus_quality
2934 property snp_quality:
2935 '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
2936 def __get__(self): return self._snp_quality
2938 property mapping_quality:
2939 '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
2940 def __get__(self): return self._rms_mapping_quality
2943 '''coverage or read depth - the number of reads involved in the call.'''
2944 def __get__(self): return self._coverage
2948 return "\t".join( map(str, (
2951 self.reference_base,
2953 self.consensus_quality,
2955 self.mapping_quality,
2959 # cdef class SNPCallerBase:
2960 # '''Base class for SNP callers.
2963 # minimum base quality (possibly capped by BAQ)
2965 # coefficient for adjusting mapQ of poor mappings
2967 # theta in maq consensus calling model
2969 # number of haplotypes in the sample
2971 # prior of a difference between two haplotypes
2974 # cdef bam_maqcns_t * c
2975 # cdef IteratorColumn iter
2977 # def __cinit__(self,
2978 # IteratorColumn iterator_column,
2981 # self.iter = iterator_column
2982 # self.c = bam_maqcns_init()
2984 # # set the default parameterization according to
2987 # # new default mode for samtools >0.1.10
2988 # self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
2990 # self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
2991 # # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
2992 # self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
2993 # self.c.het_rate = kwargs.get( "het_rate", 0.001 )
2994 # self.c.theta = kwargs.get( "theta", 0.83 )
2996 # if self.c.errmod != BAM_ERRMOD_MAQ2:
2997 # self.c.theta += 0.02
2999 # # call prepare AFTER setting parameters
3000 # bam_maqcns_prepare( self.c )
3002 # def __dealloc__(self):
3003 # bam_maqcns_destroy( self.c )
3005 # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
3006 # '''debugging output.'''
3008 # pysam_dump_glf( g, self.c );
3010 # for x in range(self.iter.n_plp):
3011 # print "--> read %i %s %i" % (x,
3012 # bam1_qname(self.iter.plp[x].b),
3013 # self.iter.plp[x].qpos,
3016 # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
3029 # printf("-------------------------------------\n");
3030 # sys.stdout.flush()
3032 # cdef class IteratorSNPCalls( SNPCallerBase ):
3033 # """*(IteratorColumn iterator)*
3035 # call SNPs within a region.
3037 # *iterator* is a pileup iterator. SNPs will be called
3038 # on all positions returned by this iterator.
3040 # This caller is fast if SNPs are called over large continuous
3041 # regions. It is slow, if instantiated frequently and in random
3042 # order as the sequence will have to be reloaded.
3046 # def __cinit__(self,
3047 # IteratorColumn iterator_column,
3050 # assert self.iter.hasReference(), "IteratorSNPCalls requires an pileup iterator with reference sequence"
3052 # def __iter__(self):
3055 # def __next__(self):
3056 # """python version of next().
3059 # # the following code was adapted from bam_plcmd.c:pileup_func()
3062 # if self.iter.n_plp < 0:
3063 # raise ValueError("error during iteration" )
3065 # if self.iter.plp == NULL:
3066 # raise StopIteration
3068 # cdef char * seq = self.iter.getSequence()
3069 # cdef int seq_len = self.iter.seq_len
3071 # assert seq != NULL
3074 # if self.iter.pos >= seq_len:
3075 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3077 # cdef int rb = seq[self.iter.pos]
3081 # g = bam_maqcns_glfgen( self.iter.n_plp,
3083 # bam_nt16_table[rb],
3086 # if pysam_glf_depth( g ) == 0:
3087 # cns = 0xfu << 28 | 0xf << 24
3089 # cns = glf2cns(g, <int>(self.c.q_r + .499))
3096 # call._tid = self.iter.tid
3097 # call._pos = self.iter.pos
3098 # call._reference_base = rb
3099 # call._genotype = bam_nt16_rev_table[cns>>28]
3100 # call._consensus_quality = cns >> 8 & 0xff
3101 # call._snp_quality = cns & 0xff
3102 # call._rms_mapping_quality = cns >> 16&0xff
3103 # call._coverage = self.iter.n_plp
3107 # cdef class SNPCaller( SNPCallerBase ):
3108 # '''*(IteratorColumn iterator_column )*
3110 # The samtools SNP caller.
3112 # This object will call SNPs in *samfile* against the reference
3113 # sequence in *fasta*.
3115 # This caller is fast for calling few SNPs in selected regions.
3117 # It is slow, if called over large genomic regions.
3121 # def __cinit__(self,
3122 # IteratorColumn iterator_column,
3127 # def call(self, reference, int pos ):
3128 # """call a snp on chromosome *reference*
3129 # and position *pos*.
3131 # returns a :class:`SNPCall` object.
3134 # cdef int tid = self.iter.samfile.gettid( reference )
3136 # self.iter.reset( tid, pos, pos + 1 )
3141 # if self.iter.n_plp < 0:
3142 # raise ValueError("error during iteration" )
3144 # if self.iter.plp == NULL:
3145 # raise ValueError( "no reads in region - no call" )
3147 # if self.iter.pos == pos: break
3149 # cdef char * seq = self.iter.getSequence()
3150 # cdef int seq_len = self.iter.seq_len
3152 # assert seq != NULL
3155 # if self.iter.pos >= seq_len:
3156 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3158 # cdef int rb = seq[self.iter.pos]
3162 # # g = bam_maqcns_glfgen( self.iter.n_plp,
3164 # # bam_nt16_table[rb],
3168 # # if pysam_glf_depth( g ) == 0:
3169 # # cns = 0xfu << 28 | 0xf << 24
3171 # # cns = glf2cns(g, <int>(self.c.q_r + .499))
3178 # call._tid = self.iter.tid
3179 # call._pos = self.iter.pos
3180 # call._reference_base = rb
3181 # call._genotype = bam_nt16_rev_table[cns>>28]
3182 # call._consensus_quality = cns >> 8 & 0xff
3183 # call._snp_quality = cns & 0xff
3184 # call._rms_mapping_quality = cns >> 16&0xff
3185 # call._coverage = self.iter.n_plp
3189 # cdef class IndelCall:
3190 # '''the results of an indel call.'''
3193 # cdef int _coverage
3194 # cdef int _rms_mapping_quality
3195 # cdef bam_maqindel_ret_t * _r
3197 # def __cinit__(self):
3203 # '''the chromosome ID as is defined in the header'''
3204 # def __get__(self):
3208 # '''nucleotide position of SNP.'''
3209 # def __get__(self): return self._pos
3211 # property genotype:
3212 # '''the genotype called.'''
3213 # def __get__(self):
3214 # if self._r.gt == 0:
3215 # s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3216 # return "%s/%s" % (s,s)
3217 # elif self._r.gt == 1:
3218 # s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3219 # return "%s/%s" % (s,s)
3221 # return "%s/%s" % (self.first_allele, self.second_allele )
3223 # property consensus_quality:
3224 # '''the genotype quality (Phred-scaled).'''
3225 # def __get__(self): return self._r.q_cns
3227 # property snp_quality:
3228 # '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
3229 # def __get__(self): return self._r.q_ref
3231 # property mapping_quality:
3232 # '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
3233 # def __get__(self): return self._rms_mapping_quality
3235 # property coverage:
3236 # '''coverage or read depth - the number of reads involved in the call.'''
3237 # def __get__(self): return self._coverage
3239 # property first_allele:
3240 # '''sequence of first allele.'''
3241 # def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
3243 # property second_allele:
3244 # '''sequence of second allele.'''
3245 # def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
3247 # property reads_first:
3248 # '''reads supporting first allele.'''
3249 # def __get__(self): return self._r.cnt1
3251 # property reads_second:
3252 #             '''reads supporting second allele.'''
3253 # def __get__(self): return self._r.cnt2
3255 # property reads_diff:
3256 # '''reads supporting first allele.'''
3257 # def __get__(self): return self._r.cnt_anti
3259 # def __str__(self):
3261 # return "\t".join( map(str, (
3265 # self.consensus_quality,
3267 # self.mapping_quality,
3269 # self.first_allele,
3270 # self.second_allele,
3272 # self.reads_second,
3273 # self.reads_diff ) ) )
3275 # def __dealloc__(self ):
3276 # bam_maqindel_ret_destroy(self._r)
3278 # cdef class IndelCallerBase:
3279 # '''Base class for SNP callers.
3282 # minimum base quality (possibly capped by BAQ)
3284 # coefficient for adjusting mapQ of poor mappings
3286 # theta in maq consensus calling model
3288 # number of haplotypes in the sample
3290 # prior of a difference between two haplotypes
3293 # cdef bam_maqindel_opt_t * options
3294 # cdef IteratorColumn iter
3296 # cdef int max_depth
3298 # def __cinit__(self,
3299 # IteratorColumn iterator_column,
3303 # self.iter = iterator_column
3305 # assert iterator_column.hasReference(), "IndelCallerBase requires an pileup iterator with reference sequence"
3307 # self.options = bam_maqindel_opt_init()
3309 # # set the default parameterization according to
3312 # self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
3313 # self.options.q_indel = kwargs.get( "q_indel", 40 )
3314 # self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
3315 # self.max_depth = kwargs.get( "max_depth", 1024 )
3317 # def __dealloc__(self):
3318 # free( self.options )
3320 # def _call( self ):
3322 # cdef char * seq = self.iter.getSequence()
3323 # cdef int seq_len = self.iter.seq_len
3325 # assert seq != NULL
3328 # if self.iter.pos >= seq_len:
3329 # raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
3331 # cdef bam_maqindel_ret_t * r
3333 # cdef int m = min( self.max_depth, self.iter.n_plp )
3335 # # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
3336 # # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
3337 # # self.options.indel_err, self.options.ambi_thres );
3339 # r = bam_maqindel(m,
3347 # if r == NULL: return None
3349 # cdef IndelCall call
3350 # call = IndelCall()
3352 # call._tid = self.iter.tid
3353 # call._pos = self.iter.pos
3354 # call._coverage = self.iter.n_plp
3356 # cdef uint64_t rms_aux = 0
3358 # cdef bam_pileup1_t * p
3361 # for i from 0 <= i < self.iter.n_plp:
3362 # p = self.iter.plp + i
3363 # if p.b.core.qual < self.cap_mapQ:
3364 # tmp = p.b.core.qual
3366 # tmp = self.cap_mapQ
3367 # rms_aux += tmp * tmp
3369 # call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
3373 # cdef class IndelCaller( IndelCallerBase ):
3374 # '''*(IteratorColumn iterator_column )*
3376 # The samtools SNP caller.
3378 # This object will call SNPs in *samfile* against the reference
3379 # sequence in *fasta*.
3381 # This caller is fast for calling few SNPs in selected regions.
3383 # It is slow, if called over large genomic regions.
3386 # def __cinit__(self,
3387 # IteratorColumn iterator_column,
3392 # def call(self, reference, int pos ):
3393 # """call a snp on chromosome *reference*
3394 # and position *pos*.
3396 # returns a :class:`SNPCall` object or None, if no indel call could be made.
3399 # cdef int tid = self.iter.samfile.gettid( reference )
3401 # self.iter.reset( tid, pos, pos + 1 )
3406 # if self.iter.n_plp < 0:
3407 # raise ValueError("error during iteration" )
3409 # if self.iter.plp == NULL:
3410 # raise ValueError( "no reads in region - no call" )
3412 # if self.iter.pos == pos: break
3414 # return self._call()
3416 # cdef class IteratorIndelCalls( IndelCallerBase ):
3417 # """*(IteratorColumn iterator)*
3419 # call indels within a region.
3421 # *iterator* is a pileup iterator. SNPs will be called
3422 # on all positions returned by this iterator.
3424 # This caller is fast if SNPs are called over large continuous
3425 # regions. It is slow, if instantiated frequently and in random
3426 # order as the sequence will have to be reloaded.
3430 # def __cinit__(self,
3431 # IteratorColumn iterator_column,
3436 # def __iter__(self):
3439 # def __next__(self):
3440 # """python version of next().
3443 # # the following code was adapted from bam_plcmd.c:pileup_func()
3446 # if self.iter.n_plp < 0:
3447 # raise ValueError("error during iteration" )
3449 # if self.iter.plp == NULL:
3450 # raise StopIteration
3452 # return self._call()
3456 cdef class IndexedReads:
3457 """index a bamfile by read.
3459 The index is kept in memory.
3461 By default, the file is re-openend to avoid conflicts if
3462 multiple operators work on the same file. Set *reopen* = False
3463 to not re-open *samfile*.
3466 cdef Samfile samfile
3469 # true if samfile belongs to this object
3470 cdef int owns_samfile
3472 def __init__(self, Samfile samfile, int reopen = True ):
3473 self.samfile = samfile
3475 if samfile.isbam: mode = "rb"
3478 # reopen the file - note that this makes the iterator
3479 # slow and causes pileup to slow down significantly.
3481 store = StderrStore()
3482 self.fp = samopen( samfile._filename, mode, NULL )
3484 assert self.fp != NULL
3485 self.owns_samfile = True
3487 self.fp = samfile.samfile
3488 self.owns_samfile = False
3490 assert samfile.isbam, "can only IndexReads on bam files"
3495 self.index = collections.defaultdict( list )
3497 # this method will start indexing from the current file position
3500 cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
3505 pos = bam_tell( self.fp.x.bam )
3506 ret = samread( self.fp, b)
3508 qname = bam1_qname( b )
3509 self.index[qname].append( pos )
3513 def find( self, qname ):
3514 if qname in self.index:
3515 return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
3517 raise KeyError( "read %s not found" % qname )
3519 def __dealloc__(self):
3520 if self.owns_samfile: samclose( self.fp )
3522 __all__ = ["Samfile",
3530 # "IteratorSNPCalls",
3533 # "IteratorIndelCalls",