# cython: embedsignature=True
+# cython: profile=True
# adds doc-strings for sphinx
-
-import tempfile, os, sys, types, itertools, struct, ctypes
-
+import tempfile
+import os
+import sys
+import types
+import itertools
+import struct
+import ctypes
+import collections
+import re
+import platform
+import warnings
+from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize
+from cpython.version cimport PY_MAJOR_VERSION
+
+########################################################################
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+cdef from_string_and_size(char* s, size_t length):
+ if PY_MAJOR_VERSION < 3:
+ return s[:length]
+ else:
+ return s[:length].decode("ascii")
+
+# filename encoding (copied from lxml.etree.pyx)
+cdef str _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = 'ascii'
+
+#cdef char* _C_FILENAME_ENCODING
+#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
+
+cdef bytes _my_encodeFilename(object filename):
+ u"""Make sure a filename is 8-bit encoded (or None).
+ """
+ if filename is None:
+ return None
+ elif PyBytes_Check(filename):
+ return filename
+ elif PyUnicode_Check(filename):
+ return filename.encode(_FILENAME_ENCODING)
+ else:
+ raise TypeError, u"Argument must be string or unicode."
+
+
+cdef bytes _force_bytes(object s):
+ u"""convert string or unicode object to bytes, assuming ascii encoding.
+ """
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif s is None:
+ return None
+ elif PyBytes_Check(s):
+ return s
+ elif PyUnicode_Check(s):
+ return s.encode('ascii')
+ else:
+ raise TypeError, u"Argument must be string, bytes or unicode."
+
+cdef inline bytes _force_cmdline_bytes(object s):
+ return _force_bytes(s)
+
+cdef _charptr_to_str(char* s):
+ if PY_MAJOR_VERSION < 3:
+ return s
+ else:
+ return s.decode("ascii")
+
+cdef _force_str(object s):
+ """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
+ if s is None:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif PyBytes_Check(s):
+ return s.decode('ascii')
+ else:
+ # assume unicode
+ return s
+########################################################################
+########################################################################
+########################################################################
+## Constants and global variables
+########################################################################
# defines imported from samtools
DEF SEEK_SET = 0
DEF SEEK_CUR = 1
## @abstract optical or PCR duplicate */
DEF BAM_FDUP =1024
+#####################################################################
+# CIGAR operations
DEF BAM_CIGAR_SHIFT=4
DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
+DEF BAM_CMATCH = 0
+DEF BAM_CINS = 1
+DEF BAM_CDEL = 2
+DEF BAM_CREF_SKIP = 3
+DEF BAM_CSOFT_CLIP = 4
+DEF BAM_CHARD_CLIP = 5
+DEF BAM_CPAD = 6
+DEF BAM_CEQUAL = 7
+DEF BAM_CDIFF = 8
+
+cdef char* CODE2CIGAR= "MIDNSHP=X"
+if IS_PYTHON3:
+ CIGAR2CODE = dict( [y,x] for x,y in enumerate( CODE2CIGAR) )
+else:
+ CIGAR2CODE = dict( [ord(y),x] for x,y in enumerate( CODE2CIGAR) )
+CIGAR_REGEX = re.compile( "([MIDNSHP=X])(\d+)" )
+
+#####################################################################
+## set pysam stderr to /dev/null
+pysam_unset_stderr()
+
+#####################################################################
+# hard-coded constants
+cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
+cdef int max_pos = 2 << 29
+
#####################################################################
#####################################################################
#####################################################################
## private factory methods
#####################################################################
cdef class AlignedRead
-cdef makeAlignedRead( bam1_t * src):
+cdef makeAlignedRead(bam1_t * src):
'''enter src into AlignedRead.'''
- cdef AlignedRead dest
- dest = AlignedRead()
- # destroy dummy delegate created in constructor
- # to prevent memory leak.
- pysam_bam_destroy1(dest._delegate)
+ cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
dest._delegate = bam_dup1(src)
return dest
cdef class PileupProxy
-cdef makePileupProxy( bam_plbuf_t * buf, int n ):
- cdef PileupProxy dest
- dest = PileupProxy()
- dest.buf = buf
+cdef makePileupProxy( bam_pileup1_t ** plp, int tid, int pos, int n ):
+ cdef PileupProxy dest = PileupProxy.__new__(PileupProxy)
+ dest.plp = plp
+ dest.tid = tid
+ dest.pos = pos
dest.n = n
return dest
cdef class PileupRead
cdef makePileupRead( bam_pileup1_t * src ):
'''fill a PileupRead object from a bam_pileup1_t * object.'''
- cdef PileupRead dest
- dest = PileupRead()
+ cdef PileupRead dest = PileupRead.__new__(PileupRead)
dest._alignment = makeAlignedRead( src.b )
dest._qpos = src.qpos
dest._indel = src.indel
dest._is_tail = src.is_tail
return dest
+cdef convertBinaryTagToList( uint8_t * s ):
+ """return byte size, number of values, and a list of the values in s."""
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
+ # get byte size
+ auxtype = s[0]
+ byte_size = bam_aux_type2size( auxtype )
+ s += 1
+ # get number of values in array
+ nvalues = (<int32_t*>s)[0]
+ s += 4
+ # get values
+ values = []
+ if auxtype == 'c':
+ for x from 0 <= x < nvalues:
+ values.append((<int8_t*>s)[0])
+ s += 1
+ elif auxtype == 'C':
+ for x from 0 <= x < nvalues:
+ values.append((<uint8_t*>s)[0])
+ s += 1
+ elif auxtype == 's':
+ for x from 0 <= x < nvalues:
+ values.append((<int16_t*>s)[0])
+ s += 2
+ elif auxtype == 'S':
+ for x from 0 <= x < nvalues:
+ values.append((<uint16_t*>s)[0])
+ s += 2
+ elif auxtype == 'i':
+ for x from 0 <= x < nvalues:
+ values.append((<int32_t*>s)[0])
+ s += 4
+ elif auxtype == 'I':
+ for x from 0 <= x < nvalues:
+ values.append((<uint32_t*>s)[0])
+ s += 4
+ elif auxtype == 'f':
+ for x from 0 <= x < nvalues:
+ values.append((<float*>s)[0])
+ s += 4
+
+ return byte_size, nvalues, values
+
#####################################################################
#####################################################################
#####################################################################
## Generic callbacks for inserting python callbacks.
#####################################################################
cdef int fetch_callback( bam1_t *alignment, void *f):
- '''callback for bam_fetch.
-
+ '''callback for bam_fetch.
+
calls function in *f* with a new :class:`AlignedRead` object as parameter.
'''
a = makeAlignedRead( alignment )
(<object>f)(a)
-class PileupColumn(object):
- '''A pileup column. A pileup column contains
+class PileupColumn(object):
+ '''A pileup column. A pileup column contains
all the reads that map to a certain target base.
- tid
- chromosome ID as is defined in the header
- pos
- the target base coordinate (0-based)
- n
- number of reads mapping to this column
- pileups
- list of reads (:class:`pysam.PileupRead`) aligned to this column
- '''
- def __str__(self):
+ tid
+ chromosome ID as is defined in the header
+ pos
+ the target base coordinate (0-based)
+ n
+ number of reads mapping to this column
+ pileups
+ list of reads (:class:`pysam.PileupRead`) aligned to this column
+ '''
+ def __str__(self):
return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
"\n" + "\n".join( map(str, self.pileups) )
p.n = n
pileups = []
+ cdef int x
for x from 0 <= x < n:
pileups.append( makePileupRead( &(pl[x]) ) )
p.pileups = pileups
-
+
(<object>f)(p)
cdef int pileup_fetch_callback( bam1_t *b, void *data):
- '''callback for bam_fetch.
+ '''callback for bam_fetch.
Fetches reads and submits them to pileup.
'''
class StderrStore():
'''
- stderr is captured.
+ stderr is captured.
'''
def __init__(self):
+ return
self.stderr_h, self.stderr_f = tempfile.mkstemp()
self.stderr_save = Outs( sys.stderr.fileno() )
self.stderr_save.setfd( self.stderr_h )
-
+
+ def readAndRelease( self ):
+ return []
+ self.stderr_save.restore()
+ lines = []
+ if os.path.exists(self.stderr_f):
+ lines = open( self.stderr_f, "r" ).readlines()
+ os.remove( self.stderr_f )
+ return lines
+
def release(self):
+ return
self.stderr_save.restore()
if os.path.exists(self.stderr_f):
os.remove( self.stderr_f )
def __del__(self):
self.release()
+class StderrStoreWindows():
+ '''does nothing. stderr can't be redirected on windows'''
+ def __init__(self): pass
+ def readAndRelease(self): return []
+ def release(self): pass
+
+if platform.system()=='Windows':
+ del StderrStore
+ StderrStore = StderrStoreWindows
+
+
######################################################################
######################################################################
######################################################################
# valid types for sam headers
-VALID_HEADER_TYPES = { "HD" : dict,
- "SQ" : list,
- "RG" : list,
- "PG" : list,
+VALID_HEADER_TYPES = { "HD" : dict,
+ "SQ" : list,
+ "RG" : list,
+ "PG" : list,
"CO" : list }
# order of records within sam headers
# type conversions within sam header records
VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
"SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
- "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
- "PG" : { "ID" : str, "VN" : str, "CL" : str }, }
+ "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str,
+ "CN" : str, "DT" : str, "PL" : str, "FO" : str, "KS" : str },
+ "PG" : { "PN" : str, "ID" : str, "VN" : str, "CL" : str }, }
# output order of fields within records
VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
"SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
- "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
- "PG" : ( "ID", "VN", "CL" ), }
+ "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL", "FO", "KS" ),
+ "PG" : ( "PN", "ID", "VN", "CL" ), }
+
######################################################################
######################################################################
######################################################################
## Public methods
######################################################################
-cdef class Samfile:
- '''*(filename, mode='r', template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
-
- A *SAM* file. The file is automatically opened.
-
- *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode so for binary
- (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
- Use ``h`` to output header information in text (:term:`TAM`) mode.
+cdef class Fastafile:
+ '''*(filename)*
- If ``b`` is present, it must immediately follow ``r`` or ``w``.
- Currently valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``.
-
- so to open a :term:`BAM` file for reading::
+ A *FASTA* file. The file is automatically opened.
+
+ The file expects an indexed fasta file.
+
+ TODO:
+ add automatic indexing.
+ add function to get sequence names.
+ '''
- f=Samfile('ex1.bam','rb')
+ def __cinit__(self, *args, **kwargs ):
+ self.fastafile = NULL
+ self._filename = NULL
+ self._open( *args, **kwargs )
+
+ def _isOpen( self ):
+ '''return true if samfile has been opened.'''
+ return self.fastafile != NULL
+ def __len__(self):
+ if self.fastafile == NULL:
+ raise ValueError( "calling len() on closed file" )
- For writing, the header of a :term:`TAM` file/:term:`BAM` file can be constituted from several
- sources:
+ return faidx_fetch_nseq(self.fastafile)
- 1. If *template* is given, the header is copied from a another *Samfile* (*template* must be of type *Samfile*).
+ def _open( self,
+ filename ):
+ '''open an indexed fasta file.
- 2. If *header* is given, the header is build from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). The second level is then a list of lines, with each line being a list of tag-value pairs.
+ This method expects an indexed fasta file.
+ '''
- 3. If *text* is given, new header text is copied from raw text.
+ # close a previously opened file
+ if self.fastafile != NULL: self.close()
+ if self._filename != NULL: free(self._filename)
+ filename = _my_encodeFilename(filename)
+ self._filename = strdup(filename)
+ self.fastafile = fai_load( filename )
- 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
+ if self.fastafile == NULL:
+ raise IOError("could not open file `%s`" % filename )
+
+ def close( self ):
+ if self.fastafile != NULL:
+ fai_destroy( self.fastafile )
+ self.fastafile = NULL
+
+ def __dealloc__(self):
+ self.close()
+ if self._filename != NULL: free(self._filename)
+
+ property filename:
+ '''the :term:`filename` associated with this object.'''
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ return self._filename
+
+ def fetch( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None):
+
+ '''*(reference = None, start = None, end = None, region = None)*
+
+ fetch the sequence in a :term:`region` using 0-based indexing.
+
+ The region is specified by :term:`reference`, *start* and *end*.
+
+ fetch returns an empty string if the region is out of range or addresses an unknown *reference*.
+
+ If *reference* is given and *start* is None, the sequence from the
+ first base is returned. Similarly, if *end* is None, the sequence
+ until the last base is returned.
+
+ Alternatively, a samtools :term:`region` string can be supplied.
+ '''
+
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ cdef int length
+ cdef char * seq
+
+ if not region:
+ if reference is None: raise ValueError( 'no sequence/region supplied.' )
+ if start is None: start = 0
+ if end is None: end = max_pos -1
+
+ if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
+ if start == end: return b""
+ # valid ranges are from 0 to 2^29-1
+ if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
+ if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
+ # note: faidx_fetch_seq has a bug such that out-of-range access
+ # always returns the last residue. Hence do not use faidx_fetch_seq,
+ # but use fai_fetch instead
+ # seq = faidx_fetch_seq(self.fastafile,
+ # reference,
+ # start,
+ # end-1,
+ # &length)
+ region = "%s:%i-%i" % (reference, start+1, end)
+ if PY_MAJOR_VERSION >= 3:
+ region = region.encode('ascii')
+ seq = fai_fetch( self.fastafile,
+ region,
+ &length )
+ else:
+ # samtools adds a '\0' at the end
+ seq = fai_fetch( self.fastafile, region, &length )
+
+ # copy to python
+ if seq == NULL:
+ return b""
+ else:
+ try:
+ py_seq = seq[:length]
+ finally:
+ free(seq)
+
+ return py_seq
+
+ cdef char * _fetch( self, char * reference, int start, int end, int * length ):
+ '''fetch sequence for reference, start and end'''
+
+ return faidx_fetch_seq(self.fastafile,
+ reference,
+ start,
+ end-1,
+ length )
+
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+cdef int count_callback( bam1_t *alignment, void *f):
+ '''callback for bam_fetch - count number of reads.
+ '''
+ cdef int* counter = (<int*>f)
+ counter[0] += 1;
+
+ctypedef struct MateData:
+ char * name
+ bam1_t * mate
+ uint32_t flag
+
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+#------------------------------------------------------------------------
+cdef int mate_callback( bam1_t *alignment, void *f):
+ '''callback for bam_fetch - filter mate
+ '''
+ cdef MateData * d = (<MateData*>f)
+ # printf("mate = %p, name1 = %s, name2=%s\t%i\t%i\t%i\n",
+ # d.mate, d.name, bam1_qname(alignment),
+ # d.flag, alignment.core.flag, alignment.core.flag & d.flag)
+
+ if d.mate == NULL:
+ # could be sped up by comparing the lengths of query strings first
+ # using l_qname
+ #
+ # also, make sure that we get the other read by comparing
+ # the flags
+ if alignment.core.flag & d.flag != 0 and \
+ strcmp( bam1_qname( alignment ), d.name ) == 0:
+ d.mate = bam_dup1( alignment )
+
+
+cdef class Samfile:
+ '''*(filename, mode=None, template = None, referencenames = None, referencelengths = None, text = NULL, header = None,
+ add_sq_text = False, check_header = True, check_sq = True )*
+
+ A :term:`SAM`/:term:`BAM` formatted file. The file is automatically opened.
+
+ *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode (:term:`SAM`). For binary
+ (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output.
+ Use ``h`` to output header information in text (:term:`TAM`) mode.
+
+ If ``b`` is present, it must immediately follow ``r`` or ``w``.
+ Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. For instance, to open
+ a :term:`BAM` formatted file for reading, type::
+
+ f = pysam.Samfile('ex1.bam','rb')
+
+ If mode is not specified, we will try to auto-detect in the order 'rb', 'r', thus both the following
+ should work::
+
+ f1 = pysam.Samfile('ex1.bam' )
+ f2 = pysam.Samfile('ex1.sam' )
If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
- '''
- cdef char * filename
- # pointer to samfile
- cdef samfile_t * samfile
- # pointer to index
- cdef bam_index_t *index
- # true if file is a bam file
- cdef int isbam
+ For writing, the header of a :term:`SAM` file/:term:`BAM` file can be constituted from several
+ sources (see also the samtools format specification):
+
+ 1. If *template* is given, the header is copied from a another *Samfile*
+ (*template* must be of type *Samfile*).
+
+ 2. If *header* is given, the header is built from a multi-level dictionary. The first level
+ are the four types ('HD', 'SQ', ...). The second level are a list of lines, with each line
+ being a list of tag-value pairs. The header is constructed first from all the defined fields,
+ followed by user tags in alphabetical order.
+
+ 3. If *text* is given, new header text is copied from raw text.
+
+ 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists.
+ By default, 'SQ' and 'LN' tags will be added to the header text. This option can be
+ changed by unsetting the flag *add_sq_text*.
- # current read within iteration
- cdef bam1_t * b
+ By default, if a file is opened in mode 'r', it is checked for a valid header
+ (*check_header* = True) and a definition of chromosome names (*check_sq* = True).
+
+ '''
def __cinit__(self, *args, **kwargs ):
self.samfile = NULL
+ self._filename = NULL
self.isbam = False
+ self.isstream = False
self._open( *args, **kwargs )
# allocate memory for iterator
'''return true if samfile has an existing (and opened) index.'''
return self.index != NULL
- def _open( self,
- char * filename,
- mode ='r',
+ def _open( self,
+ filename,
+ mode = None,
Samfile template = None,
referencenames = None,
referencelengths = None,
- char * text = NULL,
+ text = None,
header = None,
+ port = None,
+ add_sq_text = True,
+ check_header = True,
+ check_sq = True,
):
'''open a sam/bam file.
closed and a new file will be opened.
'''
- assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
+ # read mode autodetection
+ if mode is None:
+ try:
+ self._open(filename, 'rb', template=template,
+ referencenames=referencenames,
+ referencelengths=referencelengths,
+ text=text, header=header, port=port,
+ check_header=check_header,
+ check_sq=check_sq)
+ return
+ except ValueError, msg:
+ pass
+
+ self._open(filename, 'r', template=template,
+ referencenames=referencenames,
+ referencelengths=referencelengths,
+ text=text, header=header, port=port,
+ check_header=check_header,
+ check_sq=check_sq)
+ return
+
+ assert mode in ( "r","w","rb","wb", "wh", "wbu", "rU" ), "invalid file opening mode `%s`" % mode
+
+ #assert filename != NULL
# close a previously opened file
if self.samfile != NULL: self.close()
cdef bam_header_t * header_to_write
header_to_write = NULL
- self.filename = filename
+ if self._filename != NULL: free(self._filename )
+ filename = _my_encodeFilename(filename)
+ cdef bytes bmode = mode.encode('ascii')
+ #cdef char* cfilename
+ #cfilename = filename.encode(_FILENAME_ENCODING)
+ self._filename = strdup(filename)
+ self.isstream = strcmp( filename, "-" ) == 0
self.isbam = len(mode) > 1 and mode[1] == 'b'
+ self.isremote = strncmp(filename,"http:",5) == 0 or \
+ strncmp(filename,"ftp:",4) == 0
+
+ cdef char * ctext
+ ctext = NULL
+
if mode[0] == 'w':
# open file for writing
-
+
# header structure (used for writing)
if template:
# copy header from another file
else:
# build header from a target names and lengths
- assert referencenames and referencelengths, "either supply options `template`, `header` or both `refernencenames` and `referencelengths` for writing"
+ assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing"
assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
# allocate and fill header
+ referencenames = [ _force_bytes(ref) for ref in referencenames ]
header_to_write = bam_header_init()
header_to_write.n_targets = len(referencenames)
n = 0
header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
strncpy( header_to_write.target_name[x], name, len(name) )
- if text != NULL:
+ # Optionally, if there is no text, add a SAM compatible header to output
+ # file.
+ if text is None and add_sq_text:
+ text = []
+ for x from 0 <= x < header_to_write.n_targets:
+ text.append( "@SQ\tSN:%s\tLN:%s\n" % (referencenames[x], referencelengths[x] ) )
+ text = ''.join(text)
+
+ if text != None:
# copy without \0
- header_to_write.l_text = strlen(text)
- header_to_write.text = <char*>calloc( strlen(text), sizeof(char) )
- memcpy( header_to_write.text, text, strlen(text) )
+ text = _force_bytes(text)
+ ctext = text
+ header_to_write.l_text = strlen(ctext)
+ header_to_write.text = <char*>calloc( strlen(ctext), sizeof(char) )
+ memcpy( header_to_write.text, ctext, strlen(ctext) )
header_to_write.hash = NULL
header_to_write.rg2lib = NULL
-
+
# open file. Header gets written to file at the same time for bam files
# and sam files (in the latter case, the mode needs to be wh)
store = StderrStore()
- self.samfile = samopen( filename, mode, header_to_write )
+ self.samfile = samopen( filename, bmode, header_to_write )
store.release()
# bam_header_destroy takes care of cleaning up of all the members
elif mode[0] == "r":
# open file for reading
- if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ):
+ if strncmp( filename, "-", 1) != 0 and \
+ not self.isremote and \
+ not os.path.exists( filename ):
raise IOError( "file `%s` not found" % filename)
- store = StderrStore()
- self.samfile = samopen( filename, mode, NULL )
- store.release()
+ # try to detect errors
+ self.samfile = samopen( filename, bmode, NULL )
+ if self.samfile == NULL:
+ raise ValueError( "could not open file (mode='%s') - is it SAM/BAM format?" % mode)
+
+ # bam files require a valid header
+ if self.isbam:
+ if self.samfile.header == NULL:
+ raise ValueError( "file does not have valid header (mode='%s') - is it BAM format?" % mode )
+ else:
+ # in sam files it is optional (samfile full of unmapped reads)
+ if check_header and self.samfile.header == NULL:
+ raise ValueError( "file does not have valid header (mode='%s') - is it SAM format?" % mode )
+
+ # disabled for autodetection to work
+ # needs to be disabled so that reading from sam-files without headers works
+ if check_sq and self.samfile.header.n_targets == 0:
+ raise ValueError( "file header is empty (mode='%s') - is it SAM/BAM format?" % mode)
if self.samfile == NULL:
raise IOError("could not open file `%s`" % filename )
+ # check for index and open if present
if mode[0] == "r" and self.isbam:
- if not os.path.exists(filename + ".bai"):
- self.index = NULL
+
+ if not self.isremote:
+ if not os.path.exists(filename + b".bai"):
+ self.index = NULL
+ else:
+ # returns NULL if there is no index or index could not be opened
+ self.index = bam_index_load(filename)
+ if self.index == NULL:
+ raise IOError("error while opening index `%s` " % filename )
else:
- # returns NULL if there is no index or index could not be opened
self.index = bam_index_load(filename)
if self.index == NULL:
raise IOError("error while opening index `%s` " % filename )
+ if not self.isstream:
+ self.start_offset = bam_tell( self.samfile.x.bam )
+
+ def gettid( self, reference ):
+ '''
+ convert :term:`reference` name into numerical :term:`tid`
+
+ returns -1 if reference is not known.
+ '''
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ reference = _force_bytes(reference)
+ return pysam_reference2tid( self.samfile.header, reference )
+
def getrname( self, tid ):
- '''(tid )
- convert numerical :term:`tid` into :ref:`reference` name.'''
+ '''
+ convert numerical :term:`tid` into :term:`reference` name.'''
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
if not 0 <= tid < self.samfile.header.n_targets:
- raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets )
+ raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
+ return _charptr_to_str(self.samfile.header.target_name[tid])
+
+ cdef char * _getrname( self, int tid ): # TODO unused
+ '''
+ convert numerical :term:`tid` into :term:`reference` name.'''
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not 0 <= tid < self.samfile.header.n_targets:
+ raise ValueError( "tid %i out of range 0<=tid<%i" % (tid, self.samfile.header.n_targets ) )
return self.samfile.header.target_name[tid]
- def _parseRegion( self,
- reference = None,
- start = None,
- end = None,
+ def _parseRegion( self,
+ reference = None,
+ start = None,
+ end = None,
region = None ):
- '''parse region information.
+ '''
+ parse region information.
- raise Value for for invalid regions.
+ raise ValueError for invalid regions.
- returns a tuple of region, tid, start and end. Region
- is a valid samtools :term:`region` or None if the region
- extends over the whole file.
+ returns a tuple of flag, tid, start and end. Flag indicates
+ whether some coordinates were supplied.
Note that regions are 1-based, while start,end are python coordinates.
'''
-
+ # This method's main objective is to translate from a reference to a tid.
+ # For now, it calls bam_parse_region, which is clumsy. Might be worth
+ # implementing it all in pysam (makes use of khash).
+
cdef int rtid
- cdef int rstart
- cdef int rend
- cdef int max_pos
- max_pos = 2 << 29
+ cdef long long rstart
+ cdef long long rend
+
+ rtid = -1
+ rstart = 0
+ rend = max_pos
+ if start != None:
+ try:
+ rstart = start
+ except OverflowError:
+ raise ValueError( 'start out of range (%i)' % start )
+
+ if end != None:
+ try:
+ rend = end
+ except OverflowError:
+ raise ValueError( 'end out of range (%i)' % end )
- rtid = rstart = rend = 0
+ if region:
+ region = _force_str(region)
+ parts = re.split( "[:-]", region )
+ reference = parts[0]
+ if len(parts) >= 2: rstart = int(parts[1]) - 1
+ if len(parts) >= 3: rend = int(parts[2])
- # translate to a region
- if reference:
- if start != None and end != None:
- region = "%s:%i-%i" % (reference, start+1, end)
- else:
- region = reference
+ if not reference: return 0, 0, 0, 0
- if region:
- store = StderrStore()
- bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)
- store.release()
- if rtid < 0: raise ValueError( "invalid region `%s`" % region )
- if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
- if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
- if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
-
- return region, rtid, rstart, rend
-
- def fetch( self,
- reference = None,
- start = None,
- end = None,
- region = None,
+ rtid = self.gettid( reference )
+ if rtid < 0: raise ValueError( "invalid reference `%s`" % reference )
+ if rstart > rend: raise ValueError( 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend) )
+ if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
+ if not 0 <= rend <= max_pos: raise ValueError( 'end out of range (%i)' % rend )
+
+ return 1, rtid, rstart, rend
+
+ def reset( self ):
+ '''reset file position to beginning of read section.'''
+ return self.seek( self.start_offset, 0 )
+
+ def seek( self, uint64_t offset, int where = 0):
+ '''
+ move file pointer to position *offset*, see :meth:`pysam.Samfile.tell`.
+ '''
+
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+ if not self.isbam:
+ raise NotImplementedError("seek only available in bam files")
+ if self.isstream:
+ raise OSError("seek no available in streams")
+
+ return bam_seek( self.samfile.x.bam, offset, where )
+
+ def tell( self ):
+ '''
+ return current file position
+ '''
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+ if not self.isbam:
+ raise NotImplementedError("seek only available in bam files")
+
+ return bam_tell( self.samfile.x.bam )
+
+ def fetch( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
callback = None,
until_eof = False ):
- '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
-
- fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
+ '''
+ fetch aligned reads in a :term:`region` using 0-based indexing. The region is specified by
+ :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can
+ be supplied.
- Without *reference* or *region* all reads will be fetched. The reads will be returned
+ Without *reference* or *region* all mapped reads will be fetched. The reads will be returned
ordered by reference sequence, which will not necessarily be the order within the file.
+
If *until_eof* is given, all reads from the current file position will be returned
- *as they are sorted within the file*.
-
- If only *reference* is set, all reads matching on *reference* will be fetched.
+ in order as they are within the file. Using this option will also fetch unmapped reads.
+
+ If only *reference* is set, all reads aligned to *reference* will be fetched.
The method returns an iterator of type :class:`pysam.IteratorRow` unless
- a *callback is provided. If *callback* is given, the callback will be executed
+ a *callback is provided. If *callback* is given, the callback will be executed
for each position within the :term:`region`. Note that callbacks currently work
only, if *region* or *reference* is given.
- Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
+ Note that a :term:`SAM` file does not allow random access. If *region* or *reference* are given,
an exception is raised.
'''
- cdef int rtid
- cdef int rstart
- cdef int rend
+ cdef int rtid, rstart, rend, has_coord
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
- region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+ has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+
+ if self.isstream: reopen = False
+ else: reopen = True
if self.isbam:
+ if not until_eof and not self._hasIndex() and not self.isremote:
+ raise ValueError( "fetch called on bamfile without index" )
+
if callback:
- if not region:
- raise ValueError( "callback functionality requires a region/reference" )
+ if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
if not self._hasIndex(): raise ValueError( "no index available for fetch" )
- return bam_fetch(self.samfile.x.bam,
- self.index, rtid, rstart, rend, <void*>callback, fetch_callback )
+ return bam_fetch(self.samfile.x.bam,
+ self.index,
+ rtid,
+ rstart,
+ rend,
+ <void*>callback,
+ fetch_callback )
else:
- if region:
- return IteratorRow( self, rtid, rstart, rend )
+ if has_coord:
+ return IteratorRowRegion( self, rtid, rstart, rend, reopen=reopen )
else:
if until_eof:
- return IteratorRowAll( self )
+ return IteratorRowAll( self, reopen=reopen )
else:
- # return all targets by chaining the individual targets together.
- if not self._hasIndex(): raise ValueError( "no index available for fetch" )
- i = []
- rstart = 0
- rend = 1<<29
- for rtid from 0 <= rtid < self.nreferences:
- i.append( IteratorRow( self, rtid, rstart, rend))
- return itertools.chain( *i )
- else:
- if region != None:
- raise ValueError ("fetch for a region is not available for sam files" )
+ # AH: check - reason why no reopen for AllRefs?
+ return IteratorRowAllRefs(self ) # , reopen=reopen )
+ else:
+ if has_coord:
+ raise ValueError ("fetching by region is not available for sam files" )
+
if callback:
raise NotImplementedError( "callback not implemented yet" )
- else:
- return IteratorRowAll( self )
- def pileup( self, reference = None, start = None, end = None, region = None, callback = None ):
- '''run a pileup within a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools *region* string can be supplied.
+ if self.samfile.header == NULL:
+ raise ValueError( "fetch called for samfile without header")
- Without *reference* or *region* all reads will be fetched. The reads will be returned
- ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
+ # check if targets are defined
+ # give warning, sam_read1 segfaults
+ if self.samfile.header.n_targets == 0:
+ warnings.warn( "fetch called for samfile without header")
+
+ return IteratorRowAll( self, reopen=reopen )
- The method returns an iterator of type :class:`pysam.IteratorColumn` unless
- a *callback is provided. If *callback* is given, the callback will be executed
- for each position within the :term:`region`.
+ def mate( self,
+ AlignedRead read ):
+ '''return the mate of :class:`AlignedRead` *read*.
- Note that samfiles do not allow random access. If *region* or *reference* are given,
- an exception is raised.
-
- .. Note::
+ Throws a ValueError if read is unpaired or the mate
+ is unmapped.
- *all* reads which overlap the region are returned. The first base returned will be the
- first base of the first read *not* necessarily the first base of the region used in the query.
+ .. note::
+ Calling this method will change the file position.
+ This might interfere with any iterators that have
+ not re-opened the file.
+
+ '''
+ cdef uint32_t flag = read._delegate.core.flag
+
+ if flag & BAM_FPAIRED == 0:
+ raise ValueError( "read %s: is unpaired" % (read.qname))
+ if flag & BAM_FMUNMAP != 0:
+ raise ValueError( "mate %s: is unmapped" % (read.qname))
+
+ cdef MateData mate_data
+
+ mate_data.name = <char *>bam1_qname(read._delegate)
+ mate_data.mate = NULL
+ # xor flags to get the other mate
+ cdef int x = BAM_FREAD1 + BAM_FREAD2
+ mate_data.flag = ( flag ^ x) & x
+
+ bam_fetch(self.samfile.x.bam,
+ self.index,
+ read._delegate.core.mtid,
+ read._delegate.core.mpos,
+ read._delegate.core.mpos + 1,
+ <void*>&mate_data,
+ mate_callback )
+
+ if mate_data.mate == NULL:
+ raise ValueError( "mate not found" )
+
+ cdef AlignedRead dest = AlignedRead.__new__(AlignedRead)
+ dest._delegate = mate_data.mate
+ return dest
+
+ def count( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
+ until_eof = False ):
+ '''*(reference = None, start = None, end = None, region = None, until_eof = False)*
+
+ count reads in a :term:`region` using 0-based indexing. The region is specified by
+ :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
+
+ Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
+ an exception is raised.
'''
cdef int rtid
cdef int rstart
cdef int rend
- cdef bam_plbuf_t *buf
if not self._isOpen():
raise ValueError( "I/O operation on closed file" )
region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
-
+
+ cdef int counter
+ counter = 0;
+
+ if self.isbam:
+ if not until_eof and not self._hasIndex() and not self.isremote:
+ raise ValueError( "fetch called on bamfile without index" )
+
+ if not region:
+ raise ValueError( "counting functionality requires a region/reference" )
+ if not self._hasIndex(): raise ValueError( "no index available for fetch" )
+ bam_fetch(self.samfile.x.bam,
+ self.index,
+ rtid,
+ rstart,
+ rend,
+ <void*>&counter,
+ count_callback )
+ return counter
+ else:
+ raise ValueError ("count for a region is not available for sam files" )
+
+ def pileup( self,
+ reference = None,
+ start = None,
+ end = None,
+ region = None,
+ callback = None,
+ **kwargs ):
+ '''
+ perform a :term:`pileup` within a :term:`region`. The region is specified by
+ :term:`reference`, *start* and *end* (using 0-based indexing).
+ Alternatively, a samtools *region* string can be supplied.
+
+ Without *reference* or *region* all reads will be used for the pileup. The reads will be returned
+ ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
+
+ The method returns an iterator of type :class:`pysam.IteratorColumn` unless
+ a *callback is provided. If a *callback* is given, the callback will be executed
+ for each column within the :term:`region`.
+
+ Note that :term:`SAM` formatted files do not allow random access.
+ In these files, if a *region* or *reference* are given an exception is raised.
+
+ Optional *kwargs* to the iterator:
+
+ stepper
+ The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
+ ``all``
+ use all reads for pileup.
+ ``samtools``
+ same filter and read processing as in :term:`csamtools` pileup
+
+ fastafile
+ A :class:`FastaFile` object
+
+ mask
+ Skip all reads with bits set in mask if mask=True.
+
+ max_depth
+ Maximum read depth permitted. The default limit is *8000*.
+
+ truncate
+ By default, the samtools pileup engine outputs all reads overlapping a region (see note below).
+ If truncate is True and a region is given, only output columns in the exact region
+ specified.
+
+ .. note::
+
+ *all* reads which overlap the region are returned. The first base returned will be the
+ first base of the first read *not* necessarily the first base of the region used in the query.
+
+ '''
+ cdef int rtid, rstart, rend, has_coord
+ cdef bam_plbuf_t *buf
+
+ if not self._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+
if self.isbam:
if not self._hasIndex(): raise ValueError( "no index available for pileup" )
if callback:
- if not region:
- raise ValueError( "callback functionality requires a region/reference" )
+ if not has_coord: raise ValueError( "callback functionality requires a region/reference" )
buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
- bam_fetch(self.samfile.x.bam,
- self.index, rtid, rstart, rend,
+ bam_fetch(self.samfile.x.bam,
+ self.index, rtid, rstart, rend,
buf, pileup_fetch_callback )
-
+
# finalize pileup
bam_plbuf_push( NULL, buf)
bam_plbuf_destroy(buf)
else:
- if region:
- return IteratorColumn( self, rtid, rstart, rend )
+ if has_coord:
+ return IteratorColumnRegion( self,
+ tid = rtid,
+ start = rstart,
+ end = rend,
+ **kwargs )
else:
- # return all targets by chaining the individual targets together.
- i = []
- rstart = 0
- rend = 1<<29
- for rtid from 0 <= rtid < self.nreferences:
- i.append( IteratorColumn( self, rtid, rstart, rend))
- return itertools.chain( *i )
+ return IteratorColumnAllRefs(self, **kwargs )
else:
raise NotImplementedError( "pileup of samfiles not implemented yet" )
def close( self ):
- '''closes file.'''
+ '''
+ closes the :class:`pysam.Samfile`.'''
if self.samfile != NULL:
samclose( self.samfile )
bam_index_destroy(self.index);
self.samfile = NULL
def __dealloc__( self ):
- '''clean up.'''
# remember: dealloc cannot call other methods
- # Note that __del__ is not called.
+ # note: no doc string
+ # note: __del__ is not called.
self.close()
- pysam_bam_destroy1(self.b)
+ bam_destroy1(self.b)
+ if self._filename != NULL: free( self._filename )
- def write( self, AlignedRead read ):
- '''(AlignedRead read )
- write a single :class:`pysam.AlignedRead`..
+ cpdef int write( self, AlignedRead read ) except -1:
+ '''
+ write a single :class:`pysam.AlignedRead` to disk.
- return the number of bytes written.
+ returns the number of bytes written.
'''
+ if not self._isOpen():
+ return 0
+
return samwrite( self.samfile, read._delegate )
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## properties
+ ###############################################################
+ property filename:
+ '''the :term:`filename` associated with this object.'''
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ return self._filename
+
property nreferences:
'''number of :term:`reference` sequences in the file.'''
def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
return self.samfile.header.n_targets
property references:
"""tuple with the names of :term:`reference` sequences."""
- def __get__(self):
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
t = []
for x from 0 <= x < self.samfile.header.n_targets:
- t.append( self.samfile.header.target_name[x] )
+ t.append( _charptr_to_str(self.samfile.header.target_name[x]) )
return tuple(t)
property lengths:
- """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.reference`
+ """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as
+ :attr:`pysam.Samfile.references`
"""
- def __get__(self):
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
t = []
for x from 0 <= x < self.samfile.header.n_targets:
t.append( self.samfile.header.target_len[x] )
return tuple(t)
+ property mapped:
+ """total number of mapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.mapped only available in bam files" )
+
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_mapped( self.index, tid )
+ return total
+
+ property unmapped:
+ """total number of unmapped reads in file.
+ """
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam: raise AttributeError( "Samfile.unmapped only available in bam files" )
+ cdef int tid
+ cdef uint32_t total = 0
+ for tid from 0 <= tid < self.samfile.header.n_targets:
+ total += pysam_get_unmapped( self.index, tid )
+ # get unmapped reads without coordinates
+ total += pysam_get_unmapped( self.index, -1 )
+ return total
+
property text:
'''full contents of the :term:`sam file` header as a string.'''
def __get__(self):
- # create a temporary 0-terminated copy
- cdef char * t
- t = <char*>calloc( self.samfile.header.l_text + 1, sizeof(char) )
- memcpy( t, self.samfile.header.text, self.samfile.header.l_text )
- result = t
- free(t)
- return result
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ return from_string_and_size(self.samfile.header.text, self.samfile.header.l_text)
property header:
- '''header information within the :term:`sam file`. The records and fields are returned as
+ '''header information within the :term:`sam file`. The records and fields are returned as
a two-level dictionary.
'''
def __get__(self):
- result = {}
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ result = {}
+
if self.samfile.header.text != NULL:
# convert to python string (note: call self.text to create 0-terminated string)
t = self.text
x = {}
for field in fields[1:]:
key, value = field.split(":",1)
- if key not in VALID_HEADER_FIELDS[record]:
+ # uppercase keys must be valid
+ # lowercase are permitted for user fields
+ if key in VALID_HEADER_FIELDS[record]:
+ x[key] = VALID_HEADER_FIELDS[record][key](value)
+ elif not key.isupper():
+ x[key] = value
+ else:
raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
- x[key] = VALID_HEADER_FIELDS[record][key](value)
if VALID_HEADER_TYPES[record] == dict:
if record in result:
if record not in result: result[record] = []
result[record].append( x )
+ # if there are no SQ lines in the header, add the reference names
+ # from the information in the bam file.
+ # Background: c-samtools keeps the textual part of the header separate from
+ # the list of reference names and lengths. Thus, if a header contains only
+ # SQ lines, the SQ information is not part of the textual header and thus
+ # are missing from the output. See issue 84.
+ if "SQ" not in result:
+ sq = []
+ for ref, length in zip( self.references, self.lengths ):
+ sq.append( {'LN': length, 'SN': ref } )
+ result["SQ"] = sq
+
return result
def _buildLine( self, fields, record ):
# TODO: add checking for field and sort order
line = ["@%s" % record ]
+ # comment
if record == "CO":
line.append( fields )
+ # user tags
+ elif record.islower():
+ for key in sorted(fields):
+ line.append( "%s:%s" % (key, str(fields[key])))
+ # defined tags
else:
+ # write fields of the specification
for key in VALID_HEADER_ORDER[record]:
if key in fields:
line.append( "%s:%s" % (key, str(fields[key])))
- return "\t".join( line )
+ # write user fields
+ for key in fields:
+ if not key.isupper():
+ line.append( "%s:%s" % (key, str(fields[key])))
+
+ return "\t".join( line )
cdef bam_header_t * _buildHeader( self, new_header ):
'''return a new header built from a dictionary in *new_header*.
cdef bam_header_t * dest
dest = bam_header_init()
-
+
+ # first: defined tags
for record in VALID_HEADERS:
if record in new_header:
ttype = VALID_HEADER_TYPES[record]
data = new_header[record]
if type( data ) != type( ttype() ):
raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
- if type( data ) == types.DictType:
+ if type( data ) is dict:
lines.append( self._buildLine( data, record ) )
else:
for fields in new_header[record]:
lines.append( self._buildLine( fields, record ) )
+ # then: user tags (lower case), sorted alphabetically
+ for record, data in sorted(new_header.items()):
+ if record in VALID_HEADERS: continue
+ if type( data ) is dict:
+ lines.append( self._buildLine( data, record ) )
+ else:
+ for fields in new_header[record]:
+ lines.append( self._buildLine( fields, record ) )
+
text = "\n".join(lines) + "\n"
if dest.text != NULL: free( dest.text )
dest.text = <char*>calloc( len(text), sizeof(char))
dest.l_text = len(text)
- strncpy( dest.text, text, dest.l_text )
+ cdef bytes btext = text.encode('ascii')
+ strncpy( dest.text, btext, dest.l_text )
+ cdef bytes bseqname
# collect targets
if "SQ" in new_header:
seqs = []
seqs.append( (fields["SN"], fields["LN"] ) )
except KeyError:
raise KeyError( "incomplete sequence information in '%s'" % str(fields))
-
+
dest.n_targets = len(seqs)
dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
-
+
for x from 0 <= x < dest.n_targets:
seqname, seqlen = seqs[x]
dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
- strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
+ bseqname = seqname.encode('ascii')
+ strncpy( dest.target_name[x], bseqname, len(seqname) + 1 )
dest.target_len[x] = seqlen
return dest
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## file-object like iterator access
+ ## note: concurrent access will cause errors (see IteratorRow
+ ## and reopen)
+ ## Possible solutions: deprecate or open new file handle
+ ###############################################################
def __iter__(self):
- return self
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ if not self.isbam and self.samfile.header.n_targets == 0:
+ raise NotImplementedError( "can not iterate over samfile without header")
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
+ '''
+ cversion of iterator. Used by :class:`pysam.Samfile.IteratorColumn`.
+ '''
cdef int ret
return samread(self.samfile, self.b)
- def __next__(self):
- """python version of next().
-
- pyrex uses this non-standard name instead of next()
+ def __next__(self):
+ """
+ python version of next().
"""
cdef int ret
ret = samread(self.samfile, self.b)
else:
raise StopIteration
-cdef class Fastafile:
- '''*(filename)*
-
- A *FASTA* file. The file is automatically opened.
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef class IteratorRow:
+ '''abstract base class for iterators over mapped reads.
- The file expects an indexed fasta file.
+ Various iterators implement different behaviours for wrapping around
+ contig boundaries. Examples include:
- TODO:
- add automatic indexing.
- add function to get sequence names.
- '''
+ :class:`pysam.IteratorRowRegion`
+ iterate within a single contig and a defined region.
- cdef char * filename
- # pointer to fastafile
- cdef faidx_t * fastafile
+ :class:`pysam.IteratorRowAll`
+ iterate until EOF. This iterator will also include unmapped reads.
- def __cinit__(self, *args, **kwargs ):
- self.fastafile = NULL
- self._open( *args, **kwargs )
+ :class:`pysam.IteratorRowAllRefs`
+ iterate over all reads in all reference sequences.
- def _isOpen( self ):
- '''return true if samfile has been opened.'''
- return self.fastafile != NULL
+ The method :meth:`Samfile.fetch` returns an IteratorRow.
+ '''
+ pass
- def _open( self,
- char * filename ):
- '''open an indexed fasta file.
- This method expects an indexed fasta file.
- '''
+cdef class IteratorRowRegion(IteratorRow):
+ """*(Samfile samfile, int tid, int beg, int end, int reopen = True )*
- # close a previously opened file
- if self.fastafile != NULL: self.close()
- self.filename = filename
- self.fastafile = fai_load( filename )
+ iterate over mapped reads in a region.
- if self.fastafile == NULL:
- raise IOError("could not open file `%s`" % filename )
+ By default, the file is re-opened to avoid conflicts between
+ multiple iterators working on the same file. Set *reopen* = False
+ to not re-open *samfile*.
- def close( self ):
- if self.fastafile != NULL:
- fai_destroy( self.fastafile )
- self.fastafile = NULL
+ The samtools iterators assume that the file
+ position between iterations do not change.
+ As a consequence, no two iterators can work
+ on the same file. To permit this, each iterator
+ creates its own file handle by re-opening the
+ file.
- def fetch( self,
- reference = None,
- start = None,
- end = None,
- region = None):
-
- '''*(reference = None, start = None, end = None, region = None)*
-
- fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
- :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
- '''
-
- if not self._isOpen():
+ Note that the index will be shared between
+ samfile and the iterator.
+ """
+
+ def __cinit__(self, Samfile samfile, int tid, int beg, int end, int reopen = True ):
+
+ if not samfile._isOpen():
raise ValueError( "I/O operation on closed file" )
- cdef int len, max_pos
- cdef char * seq
- max_pos = 2 << 29
+ if not samfile._hasIndex():
+ raise ValueError( "no index available for iteration" )
- if not region:
- if reference == None: raise ValueError( 'no sequence/region supplied.' )
- if start == None and end == None:
- region = "%s" % str(reference)
- elif start == None or end == None:
- raise ValueError( 'only start or only end of region supplied' )
- else:
- if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
- # valid ranges are from 0 to 2^29-1
- if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
- if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
- region = "%s:%i-%i" % (reference, start+1, end )
-
- # samtools adds a '\0' at the end
- seq = fai_fetch( self.fastafile, region, &len )
- # copy to python
- result = seq
- # clean up
- free(seq)
-
- return result
+ # makes sure that samfile stays alive as long as the
+ # iterator is alive
+ self.samfile = samfile
-## turning callbacks elegantly into iterators is an unsolved problem, see the following threads:
-## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1
-## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html
-## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code
-## changes, the changes have to be manually entered.
+ if samfile.isbam: mode = b"rb"
+ else: mode = b"r"
-cdef class IteratorRow:
- """iterates over mapped reads in a region.
+ # reopen the file - note that this makes the iterator
+ # slow and causes pileup to slow down significantly.
+ if reopen:
+ store = StderrStore()
+ self.fp = samopen( samfile._filename, mode, NULL )
+ store.release()
+ assert self.fp != NULL
+ self.owns_samfile = True
+ else:
+ self.fp = self.samfile.samfile
+ self.owns_samfile = False
+
+ self.retval = 0
+
+ self.iter = bam_iter_query(self.samfile.index,
+ tid,
+ beg,
+ end)
+ self.b = bam_init1()
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ self.retval = bam_iter_read( self.fp.x.bam,
+ self.iter,
+ self.b)
+
+ def __next__(self):
+ """python version of next().
+ """
+ self.cnext()
+ if self.retval < 0: raise StopIteration
+ return makeAlignedRead( self.b )
+
+ def __dealloc__(self):
+ bam_destroy1(self.b)
+ bam_iter_destroy( self.iter )
+ if self.owns_samfile: samclose( self.fp )
+
+cdef class IteratorRowAll(IteratorRow):
+ """*(Samfile samfile, int reopen = True)*
+
+ iterate over all reads in *samfile*
+
+ By default, the file is re-opened to avoid conflicts between
+ multiple iterators working on the same file. Set *reopen* = False
+ to not re-open *samfile*.
"""
-
- cdef bam_fetch_iterator_t* bam_iter # iterator state object
- cdef bam1_t * b
- cdef error_msg
- cdef int error_state
- cdef Samfile samfile
- def __cinit__(self, Samfile samfile, int tid, int beg, int end ):
- self.bam_iter = NULL
- assert samfile._isOpen()
- assert samfile._hasIndex()
-
- # makes sure that samfile stays alive as long as the
- # iterator is alive.
- self.samfile = samfile
+ def __cinit__(self, Samfile samfile, int reopen = True ):
- # parse the region
- self.error_state = 0
- self.error_msg = None
+ if not samfile._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ if samfile.isbam: mode = b"rb"
+ else: mode = b"r"
+
+ # reopen the file to avoid iterator conflict
+ if reopen:
+ store = StderrStore()
+ self.fp = samopen( samfile._filename, mode, NULL )
+ store.release()
+ assert self.fp != NULL
+ self.owns_samfile = True
+ else:
+ self.fp = samfile.samfile
+ self.owns_samfile = False
- cdef bamFile fp
- fp = samfile.samfile.x.bam
- self.bam_iter = bam_init_fetch_iterator(fp, samfile.index, tid, beg, end)
+ # allocate memory for alignment
+ self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
def __iter__(self):
- return self
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
'''cversion of iterator. Used by IteratorColumn'''
- self.b = bam_fetch_iterate(self.bam_iter)
- if self.b == NULL: return 0
- return 1
+ return samread(self.fp, self.b)
- def __next__(self):
+ def __next__(self):
"""python version of next().
pyrex uses this non-standard name instead of next()
"""
- if self.error_state:
- raise ValueError( self.error_msg)
-
- self.b = bam_fetch_iterate(self.bam_iter)
- if self.b != NULL:
+ cdef int ret
+ ret = samread(self.fp, self.b)
+ if (ret > 0):
return makeAlignedRead( self.b )
else:
raise StopIteration
def __dealloc__(self):
- '''remember: dealloc cannot call other methods!'''
- if self.bam_iter:
- bam_cleanup_fetch_iterator(self.bam_iter)
-
-cdef class IteratorRowAll:
- """iterates over all mapped reads
- """
+ bam_destroy1(self.b)
+ if self.owns_samfile: samclose( self.fp )
- cdef bam1_t * b
- cdef samfile_t * fp
+cdef class IteratorRowAllRefs(IteratorRow):
+ """iterates over all mapped reads by chaining iterators over each reference
+ """
def __cinit__(self, Samfile samfile):
-
assert samfile._isOpen()
+ if not samfile._hasIndex(): raise ValueError("no index available for fetch")
+ self.samfile = samfile
+ self.tid = -1
+
+ def nextiter(self):
+ self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, 1<<29)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ """python version of next().
+
+ pyrex uses this non-standard name instead of next()
+ """
+ # Create an initial iterator
+ if self.tid==-1:
+ if not self.samfile.nreferences:
+ raise StopIteration
+ self.tid = 0
+ self.nextiter()
+
+ while 1:
+ self.rowiter.cnext()
+
+ # If current iterator is not exhausted, return aligned read
+ if self.rowiter.retval>0:
+ return makeAlignedRead(self.rowiter.b)
+
+ self.tid += 1
+
+ # Otherwise, proceed to next reference or stop
+ if self.tid<self.samfile.nreferences:
+ self.nextiter()
+ else:
+ raise StopIteration
+
+cdef class IteratorRowSelection(IteratorRow):
+ """*(Samfile samfile)*
+
+ iterate over reads in *samfile* at a given list of file positions.
+ """
+
+ def __cinit__(self, Samfile samfile, positions, int reopen = True ):
+
+ if not samfile._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ if not samfile._isOpen():
+ raise ValueError( "I/O operation on closed file" )
+
+ assert samfile.isbam, "can only use this iterator on bam files"
+ mode = b"rb"
- self.fp = samfile.samfile
+ # reopen the file to avoid iterator conflict
+ if reopen:
+ store = StderrStore()
+ self.fp = samopen( samfile._filename, mode, NULL )
+ store.release()
+ assert self.fp != NULL
+ self.owns_samfile = True
+ else:
+ self.fp = samfile.samfile
+ self.owns_samfile = False
# allocate memory for alignment
self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
+ self.positions = positions
+ self.current_pos = 0
+
def __iter__(self):
- return self
+ return self
cdef bam1_t * getCurrent( self ):
return self.b
cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- cdef int ret
+ '''cversion of iterator'''
+
+ # end iteration if out of positions
+ if self.current_pos >= len(self.positions): return -1
+
+ bam_seek( self.fp.x.bam, self.positions[self.current_pos], 0 )
+ self.current_pos += 1
return samread(self.fp, self.b)
- def __next__(self):
+ def __next__(self):
"""python version of next().
pyrex uses this non-standard name instead of next()
"""
- cdef int ret
- ret = samread(self.fp, self.b)
+
+ cdef int ret = self.cnext()
if (ret > 0):
return makeAlignedRead( self.b )
else:
raise StopIteration
def __dealloc__(self):
- '''remember: dealloc cannot call other methods!'''
- pysam_bam_destroy1(self.b)
-
+ bam_destroy1(self.b)
+ if self.owns_samfile: samclose( self.fp )
+
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef int __advance_all( void * data, bam1_t * b ):
+ '''advance without any read filtering.
+ '''
+ cdef __iterdata * d
+ d = <__iterdata*>data
+ return bam_iter_read( d.samfile.x.bam, d.iter, b )
+
+cdef int __advance_snpcalls( void * data, bam1_t * b ):
+ '''advance using same filter and read processing as in
+ the samtools pileup.
+ '''
+ cdef __iterdata * d
+ d = <__iterdata*>data
+
+ cdef int ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
+ cdef int skip = 0
+ cdef int q
+ cdef int is_cns = 1
+ cdef int is_nobaq = 0
+ cdef int capQ_thres = 0
+
+ # reload sequence
+ if d.fastafile != NULL and b.core.tid != d.tid:
+ if d.seq != NULL: free(d.seq)
+ d.tid = b.core.tid
+ d.seq = faidx_fetch_seq(d.fastafile,
+ d.samfile.header.target_name[d.tid],
+ 0, max_pos,
+ &d.seq_len)
+ if d.seq == NULL:
+ raise ValueError( "reference sequence for '%s' (tid=%i) not found" % \
+ (d.samfile.header.target_name[d.tid],
+ d.tid))
+
+
+ while ret >= 0:
+
+ skip = 0
+
+ # realign read - changes base qualities
+ if d.seq != NULL and is_cns and not is_nobaq: bam_prob_realn( b, d.seq )
+
+ if d.seq != NULL and capQ_thres > 10:
+ q = bam_cap_mapQ(b, d.seq, capQ_thres)
+ if q < 0: skip = 1
+ elif b.core.qual > q: b.core.qual = q
+ if b.core.flag & BAM_FUNMAP: skip = 1
+ elif b.core.flag & 1 and not b.core.flag & 2: skip = 1
+
+ if not skip: break
+ # additional filters
+
+ ret = bam_iter_read( d.samfile.x.bam, d.iter, b )
+
+ return ret
+
cdef class IteratorColumn:
- '''iterates over columns.
+ '''abstract base class for iterators over columns.
+
+ IteratorColumn objects wrap the pileup functionality of samtools.
+
+ For reasons of efficiency, the iterator points to the current
+ pileup buffer. The pileup buffer is updated at every iteration.
+ This might cause some unexpected behaviour. For example,
+ consider the conversion to a list::
- This iterator wraps the pileup functionality of samtools.
-
- For reasons of efficiency, the iterator returns the current
- pileup buffer. As this buffer is updated at every iteration,
- the contents of this iterator will change accordingly. Hence the conversion to
- a list will not produce the expected result::
-
f = Samfile("file.bam", "rb")
result = list( f.pileup() )
- Here, result will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
- but each object will contain the same information.
-
- If the results of several columns are required at the same time, the results
- need to be stored explicitely::
+ Here, ``result`` will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
+ but each object in ``result`` will contain the same information.
+
+ The desired behaviour can be achieved by list comprehension::
result = [ x.pileups() for x in f.pileup() ]
- Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.
+ ``result`` will be a list of ``n`` lists of objects of type :class:`PileupRead`.
- '''
- cdef bam_plbuf_t *buf
+ If the iterator is associated with a :class:`Fastafile` using the :meth:`addReference`
+ method, then the iterator will export the current sequence via the methods :meth:`getSequence`
+ and :meth:`seq_len`.
+
+ Optional kwargs to the iterator
+
+ stepper
+ The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
+ all
+ use all reads for pileup.
+ samtools
+ same filter and read processing as in :term:`csamtools` pileup
- # check if first iteration
- cdef int notfirst
- # result of the last plbuf_push
- cdef int n_pu
- cdef int eof
- cdef IteratorRow iter
+ The default is to use "all" if no stepper is given.
- def __cinit__(self, Samfile samfile, int tid, int start, int end ):
+ fastafile
+ A :class:`FastaFile` object
+ mask
+ Skip all reads with bits set in mask.
+ max_depth
+ maximum read depth. The default is 8000.
+ '''
+
+ def __cinit__( self, Samfile samfile, **kwargs ):
+ self.samfile = samfile
+ self.mask = kwargs.get("mask", BAM_DEF_MASK )
+ self.fastafile = kwargs.get( "fastafile", None )
+ self.stepper = kwargs.get( "stepper", None )
+ self.max_depth = kwargs.get( "max_depth", 8000 )
+ self.iterdata.seq = NULL
+ self.tid = 0
+ self.pos = 0
+ self.n_plp = 0
+ self.plp = NULL
+ self.pileup_iter = <bam_plp_t>NULL
- self.iter = IteratorRow( samfile, tid, start, end )
- self.buf = bam_plbuf_init(NULL, NULL )
- self.n_pu = 0
- self.eof = 0
def __iter__(self):
- return self
+ return self
cdef int cnext(self):
'''perform next iteration.
-
- return 1 if there is a buffer to emit. Return 0 for end of iteration.
+
+ This method is analogous to the samtools bam_plp_auto method.
+ It has been re-implemented to permit for filtering.
+ '''
+ self.plp = bam_plp_auto( self.pileup_iter,
+ &self.tid,
+ &self.pos,
+ &self.n_plp )
+
+ cdef char * getSequence( self ):
+ '''return current reference sequence underlying the iterator.
+ '''
+ return self.iterdata.seq
+
+ property seq_len:
+ '''current sequence length.'''
+ def __get__(self): return self.iterdata.seq_len
+
+ def addReference( self, Fastafile fastafile ):
+ '''
+ add reference sequences in *fastafile* to iterator.'''
+ self.fastafile = fastafile
+ if self.iterdata.seq != NULL: free(self.iterdata.seq)
+ self.iterdata.tid = -1
+ self.iterdata.fastafile = self.fastafile.fastafile
+
+ def hasReference( self ):
+ '''
+ return true if iterator is associated with a reference'''
+ return self.fastafile
+
+ cdef setMask( self, mask ):
+ '''set masking flag in iterator.
+
+ reads with bits set in *mask* will be skipped.
'''
+ self.mask = mask
+ bam_plp_set_mask( self.pileup_iter, self.mask )
+
+ cdef setupIteratorData( self,
+ int tid,
+ int start,
+ int end,
+ int reopen = 0 ):
+ '''setup the iterator structure'''
+
+ self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen )
+ self.iterdata.samfile = self.samfile.samfile
+ self.iterdata.iter = self.iter.iter
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
+
+ if self.fastafile != None:
+ self.iterdata.fastafile = self.fastafile.fastafile
+ else:
+ self.iterdata.fastafile = NULL
- cdef int retval1, retval2
+ if self.stepper == None or self.stepper == "all":
+ self.pileup_iter = bam_plp_init( &__advance_all, &self.iterdata )
+ elif self.stepper == "samtools":
+ self.pileup_iter = bam_plp_init( &__advance_snpcalls, &self.iterdata )
+ else:
+ raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper)
- # pysam bam_plbuf_push returns:
- # 1: if buf is full and can be emitted
- # 0: if b has been added
- # -1: if there was an error
+ if self.max_depth:
+ bam_plp_set_maxcnt( self.pileup_iter, self.max_depth )
- # check if previous plbuf was incomplete. If so, continue within
- # the loop and yield if necessary
- if self.n_pu > 0:
- self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1)
- if self.n_pu > 0: return 1
+ bam_plp_set_mask( self.pileup_iter, self.mask )
- if self.eof: return 0
+ cdef reset( self, tid, start, end ):
+ '''reset iterator position.
- # get next alignments and submit until plbuf indicates that
- # an new column has finished
- while self.n_pu == 0:
- retval1 = self.iter.cnext()
- # wrap up if no more input
- if retval1 == 0:
- self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0)
- self.eof = 1
- return self.n_pu
+ This permits using the iterator multiple times without
+ having to incur the full set-up costs.
+ '''
+ self.iter = IteratorRowRegion( self.samfile, tid, start, end, reopen = 0 )
+ self.iterdata.iter = self.iter.iter
- # submit to plbuf
- self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0)
- if self.n_pu < 0: raise ValueError( "error while iterating" )
+ # invalidate sequence if different tid
+ if self.tid != tid:
+ if self.iterdata.seq != NULL: free( self.iterdata.seq )
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
- # plbuf has yielded
- return 1
+ # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
+ bam_plp_reset(self.pileup_iter)
- def __next__(self):
+ def __dealloc__(self):
+ # reset in order to avoid memory leak messages for iterators that have
+ # not been fully consumed
+ if self.pileup_iter != <bam_plp_t>NULL:
+ bam_plp_reset(self.pileup_iter)
+ bam_plp_destroy(self.pileup_iter)
+ self.pileup_iter = <bam_plp_t>NULL
+
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.seq = NULL
+
+cdef class IteratorColumnRegion(IteratorColumn):
+ '''iterates over a region only.
+ '''
+ def __cinit__(self, Samfile samfile,
+ int tid = 0,
+ int start = 0,
+ int end = max_pos,
+ int truncate = False,
+ **kwargs ):
+
+ # initialize iterator
+ self.setupIteratorData( tid, start, end, 1 )
+ self.start = start
+ self.end = end
+ self.truncate = truncate
+
+ def __next__(self):
"""python version of next().
+ """
- pyrex uses this non-standard name instead of next()
+ while 1:
+ self.cnext()
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.plp == NULL:
+ raise StopIteration
+
+ if self.truncate:
+ if self.start < self.pos: continue
+ if self.pos >= self.end: raise StopIteration
+
+ return makePileupProxy( &self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp )
+
+cdef class IteratorColumnAllRefs(IteratorColumn):
+ """iterates over all columns by chaining iterators over each reference
+ """
+
+ def __cinit__(self,
+ Samfile samfile,
+ **kwargs ):
+
+ # no iteration over empty files
+ if not samfile.nreferences: raise StopIteration
+
+ # initialize iterator
+ self.setupIteratorData( self.tid, 0, max_pos, 1 )
+
+ def __next__(self):
+ """python version of next().
"""
- cdef int ret
- ret = self.cnext()
- cdef bam_pileup1_t * pl
- if ret > 0 :
- return makePileupProxy( self.buf, self.n_pu )
- else:
- raise StopIteration
+ while 1:
+ self.cnext()
- def __dealloc__(self):
- bam_plbuf_destroy(self.buf);
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ # return result, if within same reference
+ if self.plp != NULL:
+ return makePileupProxy( &self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp )
+
+ # otherwise, proceed to next reference or stop
+ self.tid += 1
+ if self.tid < self.samfile.nreferences:
+ self.setupIteratorData( self.tid, 0, max_pos, 0 )
+ else:
+ raise StopIteration
+
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef inline int32_t query_start(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p, op
+ cdef uint32_t k
+ cdef uint32_t start_offset = 0
+
+ if src.core.n_cigar:
+ cigar_p = bam1_cigar(src);
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op==BAM_CHARD_CLIP:
+ if start_offset!=0 and start_offset!=src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op==BAM_CSOFT_CLIP:
+ start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ return start_offset
+
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+##-------------------------------------------------------------------
+cdef inline int32_t query_end(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p, op
+ cdef uint32_t k
+ cdef uint32_t end_offset = src.core.l_qseq
+
+ if src.core.n_cigar>1:
+ cigar_p = bam1_cigar(src);
+ for k from src.core.n_cigar > k >= 1:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op==BAM_CHARD_CLIP:
+ if end_offset!=0 and end_offset!=src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op==BAM_CSOFT_CLIP:
+ end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ if end_offset==0:
+ end_offset = src.core.l_qseq
+
+ return end_offset
+
+
+cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end):
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * s
+
+ if not src.core.l_qseq:
+ return None
+
+ seq = PyBytes_FromStringAndSize(NULL, end - start)
+ s = <char*>seq
+ p = bam1_seq(src)
+
+ for k from start <= k < end:
+ # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
+ # note: do not use string literal as it will be a python string
+ s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf]
+
+ return seq
+
+
+cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end):
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * q
+
+ p = bam1_qual(src)
+ if p[0] == 0xff:
+ return None
+
+ qual = PyBytes_FromStringAndSize(NULL, end - start)
+ q = <char*>qual
+
+ for k from start <= k < end:
+ ## equivalent to t[i] + 33 (see bam.c)
+ q[k-start] = p[k] + 33
+
+ return qual
cdef class AlignedRead:
'''
- Class representing an aligned read. see SAM format specification for meaning of fields (http://samtools.sourceforge.net/).
+ Class representing an aligned read. see SAM format specification for
+ the meaning of fields (http://samtools.sourceforge.net/).
This class stores a handle to the samtools C-structure representing
an aligned read. Member read access is forwarded to the C-structure
One issue to look out for is that the sequence should always
be set *before* the quality scores. Setting the sequence will
also erase any quality scores that were set previously.
+
+ In Python 3, the fields containing sequence and quality
+ (seq, query, qual and qqual) data are of type bytes. Other
+ string data, such as the qname field and strings in the
+ tags tuple, is represented as unicode strings. On assignment,
+ both bytes and unicode objects are allowed, but unicode strings
+ must contain only ASCII characters.
'''
- cdef:
- bam1_t * _delegate
- def __cinit__( self ):
+ # Now only called when instances are created from Python
+ def __init__(self):
# see bam_init1
self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
- # allocate some memory
+ # allocate some memory
# If size is 0, calloc does not return a pointer that can be passed to free()
# so allocate 40 bytes for a new read
self._delegate.m_data = 40
self._delegate.data_len = 0
def __dealloc__(self):
- '''clear up memory.'''
- pysam_bam_destroy1(self._delegate)
-
+ bam_destroy1(self._delegate)
+
def __str__(self):
- """todo"""
+ """return string representation of alignment.
+
+ The representation is an approximate :term:`sam` format.
+
+ An aligned read might not be associated with a :term:`Samfile`.
+ As a result :term:`tid` is shown instead of the reference name.
+
+ Similarly, the tags field is returned in its parsed state.
+ """
+ # sam-parsing is done in sam.c/bam_format1_core which
+ # requires a valid header.
+ if sys.version_info[0] < 3:
+ seq = self.seq
+ qual = self.qual
+ else:
+ seq = self.seq.decode('ascii')
+ qual = self.qual.decode('ascii')
return "\t".join(map(str, (self.qname,
+ self.flag,
self.rname,
self.pos,
- self.cigar,
- self.qual,
- self.flag,
- self.seq,
self.mapq,
- self.tags)))
-
-
- def __cmp__(self, AlignedRead other):
- '''return true, if contents in this are binary equal to ``other``.'''
+ self.cigar,
+ self.mrnm,
+ self.mpos,
+ self.rlen,
+ seq,
+ qual,
+ self.tags )))
+
+ def compare(self, AlignedRead other):
+ '''return -1,0,1, if contents in this are binary <,=,> to *other*'''
+
cdef int retval, x
cdef bam1_t *t, *o
+
t = self._delegate
o = other._delegate
# oo = <unsigned char*>(o.data)
# for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
- retval = memcmp( &t.core,
- &o.core,
- sizeof( bam1_core_t ))
+ # Fast-path test for object identity
+ if t==o:
+ return 0
+
+ retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
if retval: return retval
- retval = cmp( t.data_len, o.data_len)
+ retval = (t.data_len > o.data_len) - (t.data_len < o.data_len) # cmp(t.data_len, o.data_len)
if retval: return retval
- return memcmp( t.data,
- o.data,
- sizeof( t.data_len ))
+ return memcmp(t.data, o.data, t.data_len)
+
+ # Disabled so long as __cmp__ is a special method
+ def __hash__(self):
+ return _Py_HashPointer(<void *>self)
property qname:
"""the query name (None if not present)"""
def __get__(self):
- cdef bam1_t * src
+ cdef bam1_t * src
src = self._delegate
if src.core.l_qname == 0: return None
- return <char *>pysam_bam1_qname( src )
+ return _charptr_to_str(<char *>bam1_qname( src ))
def __set__(self, qname ):
if qname == None or len(qname) == 0: return
- cdef bam1_t * src
- cdef int l
+ qname = _force_bytes(qname)
+ cdef bam1_t * src
+ cdef int l
cdef char * p
- src = self._delegate
- p = pysam_bam1_qname( src )
+ src = self._delegate
+ p = bam1_qname( src )
# the qname is \0 terminated
l = len(qname) + 1
- pysam_bam_update( src,
- src.core.l_qname,
- l,
+ pysam_bam_update( src,
+ src.core.l_qname,
+ l,
<uint8_t*>p )
src.core.l_qname = l
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_qname(src)
+ p = bam1_qname(src)
strncpy( p, qname, l )
-
+
property cigar:
- """the :term:`cigar` alignment (None if not present).
+ """the :term:`cigar` alignment (None if not present). The alignment
+ is returned as a list of operations. The operations are:
+
+ +-----+--------------+-----+
+ |M |BAM_CMATCH |0 |
+ +-----+--------------+-----+
+ |I |BAM_CINS |1 |
+ +-----+--------------+-----+
+ |D |BAM_CDEL |2 |
+ +-----+--------------+-----+
+ |N |BAM_CREF_SKIP |3 |
+ +-----+--------------+-----+
+ |S |BAM_CSOFT_CLIP|4 |
+ +-----+--------------+-----+
+ |H |BAM_CHARD_CLIP|5 |
+ +-----+--------------+-----+
+ |P |BAM_CPAD |6 |
+ +-----+--------------+-----+
+ |= |BAM_CEQUAL |7 |
+ +-----+--------------+-----+
+ |X |BAM_CDIFF |8 |
+ +-----+--------------+-----+
+
"""
def __get__(self):
cdef uint32_t * cigar_p
- cdef bam1_t * src
+ cdef bam1_t * src
cdef op, l, cigar
+ cdef int k
+
src = self._delegate
if src.core.n_cigar == 0: return None
-
+
cigar = []
- cigar_p = pysam_bam1_cigar(src);
+ cigar_p = bam1_cigar(src);
for k from 0 <= k < src.core.n_cigar:
op = cigar_p[k] & BAM_CIGAR_MASK
l = cigar_p[k] >> BAM_CIGAR_SHIFT
def __set__(self, values ):
if values == None or len(values) == 0: return
cdef uint32_t * p
- cdef bam1_t * src
+ cdef bam1_t * src
cdef op, l
cdef int k
src = self._delegate
# get location of cigar string
- p = pysam_bam1_cigar(src)
+ p = bam1_cigar(src)
# create space for cigar data within src.data
- pysam_bam_update( src,
+ pysam_bam_update( src,
src.core.n_cigar * 4,
- len(values) * 4,
- p )
-
- # length is number of cigar operations, not bytes
+ len(values) * 4,
+ <uint8_t*>p )
+
+ # length is number of cigar operations, not bytes
src.core.n_cigar = len(values)
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_cigar(src)
+ p = bam1_cigar(src)
# insert cigar operations
for op, l in values:
## setting the cigar string also updates the "bin" attribute
src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
+ property cigarstring:
+ '''the :term:`cigar` alignment as a string.
+
+ Returns the empty string if not present.
+ '''
+ def __get__(self):
+ c = self.cigar
+ if c == None: return ""
+ else: return "".join([ "%c%i" % (CODE2CIGAR[x],y) for x,y in c])
+
+ def __set__(self, cigar):
+ if cigar == None or len(cigar) == 0: self.cigar = []
+ parts = CIGAR_REGEX.findall( cigar )
+ self.cigar = [ (CIGAR2CODE[ord(x)], int(y)) for x,y in parts ]
+
property seq:
- """the query sequence (None if not present)"""
+ """read sequence bases, including :term:`soft clipped` bases (None if not present).
+
+ In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
def __get__(self):
cdef bam1_t * src
- cdef uint8_t * p
cdef char * s
src = self._delegate
- bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
- ## parse qseq (bam1_seq)
+
if src.core.l_qseq == 0: return None
- s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char))
- p = pysam_bam1_seq( src )
- for k from 0 <= k < src.core.l_qseq:
- ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
- s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)]
- retval=s
- free(s)
- return retval
+ return get_seq_range(src, 0, src.core.l_qseq)
def __set__(self,seq):
# samtools manages sequence and quality length memory together
# if no quality information is present, the first byte says 0xff.
-
+
if seq == None or len(seq) == 0: return
+ seq = _force_bytes(seq)
cdef bam1_t * src
- cdef uint8_t * p
+ cdef uint8_t * p
cdef char * s
- src = self._delegate
cdef int l, k, nbytes_new, nbytes_old
+ src = self._delegate
+
l = len(seq)
-
+
# as the sequence is stored in half-bytes, the total length (sequence
# plus quality scores) is (l+1)/2 + l
nbytes_new = (l+1)/2 + l
nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
# acquire pointer to location in memory
- p = pysam_bam1_seq( src )
+ p = bam1_seq( src )
src.core.l_qseq = l
- pysam_bam_update( src,
+ pysam_bam_update( src,
nbytes_old,
nbytes_new,
p)
# re-acquire pointer to location in memory
# as it might have moved
- p = pysam_bam1_seq( src )
+ p = bam1_seq( src )
for k from 0 <= k < nbytes_new: p[k] = 0
# convert to C string
s = seq
p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
# erase qualities
- p = pysam_bam1_qual( src )
+ p = bam1_qual( src )
p[0] = 0xff
+
property qual:
- """the base quality (None if not present)"""
+ """read sequence base qualities, including :term:`soft clipped` bases (None if not present).
+
+ In Python 3, this property is of type bytes and assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient."""
def __get__(self):
- cdef bam1_t * src
- cdef uint8_t * p
+
+ cdef bam1_t * src
cdef char * q
+
src = self._delegate
- if src.core.l_qseq == 0: return None
- p = pysam_bam1_qual( src )
- if p[0] == 0xff: return None
+ if src.core.l_qseq == 0: return None
- q = < char *>calloc(src.core.l_qseq + 1 , sizeof(char))
- for k from 0 <= k < src.core.l_qseq:
- ## equivalent to t[i] + 33 (see bam.c)
- q[k] = p[k] + 33
- # convert to python string
- retval=q
- # clean up
- free(q)
- return retval
+ return get_qual_range(src, 0, src.core.l_qseq)
def __set__(self,qual):
# note that space is already allocated via the sequences
cdef bam1_t * src
cdef uint8_t * p
- cdef char * q
+ cdef char * q
+ cdef int k
+
src = self._delegate
- p = pysam_bam1_qual( src )
+ p = bam1_qual( src )
if qual == None or len(qual) == 0:
# if absent - set to 0xff
p[0] = 0xff
return
+ qual = _force_bytes(qual)
cdef int l
# convert to C string
q = qual
for k from 0 <= k < l:
p[k] = <uint8_t>q[k] - 33
+ property query:
+ """aligned portion of the read, excluding any flanking bases that were :term:`soft clipped` (None if not present).
+
+ In Python 3, this property is of type bytes. Assigning a unicode string to it consisting of ASCII characters only will work, but is inefficient.
+
+ SAM/BAM files may include extra flanking base sequences that were
+ not part of the alignment. These bases may be the result of the
+ Smith-Waterman or other algorithms, which may not require alignments
+ that begin at the first residue or end at the last. In addition,
+ extra sequencing adapters, multiplex identifiers, and low-quality bases that
+ were not considered for alignment may have been retained."""
+
+ def __get__(self):
+ cdef bam1_t * src
+ cdef uint32_t start, end
+ cdef char * s
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0: return None
+
+ start = query_start(src)
+ end = query_end(src)
+
+ return get_seq_range(src, start, end)
+
+ property qqual:
+ """aligned query sequence quality values (None if not present). This property is read-only.
+
+ In Python 3, this property is of type bytes."""
+ def __get__(self):
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0: return None
+
+ start = query_start(src)
+ end = query_end(src)
+
+ return get_qual_range(src, start, end)
+
+ property qstart:
+ """start index of the aligned query portion of the sequence (0-based, inclusive)"""
+ def __get__(self):
+ return query_start(self._delegate)
+
+ property qend:
+ """end index of the aligned query portion of the sequence (0-based, exclusive)"""
+ def __get__(self):
+ return query_end(self._delegate)
+
+ property qlen:
+ """Length of the aligned query sequence"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ return query_end(src)-query_start(src)
+
property tags:
- """the tags in the AUX field."""
+ """the tags in the AUX field.
+
+ This property permits convenience access to
+ the tags. Changes it the returned list will
+ not update the tags automatically. Instead,
+ the following is required for adding a
+ new tag::
+
+ read.tags = read.tags + [("RG",0)]
+
+
+ This method will happily write the same tag
+ multiple times.
+ """
def __get__(self):
cdef char * ctag
cdef bam1_t * src
cdef uint8_t * s
- cdef char tpe
-
+ cdef char auxtag[3]
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
src = self._delegate
- if src.l_aux == 0: return None
-
- s = pysam_bam1_aux( src )
+ if src.l_aux == 0: return []
+ s = bam1_aux( src )
result = []
- ctag = <char*>calloc( 3, sizeof(char) )
- cdef int x
+ auxtag[2] = 0
while s < (src.data + src.data_len):
# get tag
- ctag[0] = s[0]
- ctag[1] = s[1]
- pytag = ctag
-
+ auxtag[0] = s[0]
+ auxtag[1] = s[1]
s += 2
-
- # convert type - is there a better way?
- ctag[0] = s[0]
- ctag[1] = 0
- pytype = ctag
- # get type and value
- # how do I do char literal comparison in cython?
- # the code below works (i.e, is C comparison)
- tpe = toupper(s[0])
- if tpe == 'S'[0]:
- value = <int>bam_aux2i(s)
+ auxtype = s[0]
+ if auxtype in ('c', 'C'):
+ value = <int>bam_aux2i(s)
+ s += 1
+ elif auxtype in ('s', 'S'):
+ value = <int>bam_aux2i(s)
s += 2
- elif tpe == 'I'[0]:
- value = <int>bam_aux2i(s)
+ elif auxtype in ('i', 'I'):
+ value = <int32_t>bam_aux2i(s)
s += 4
- elif tpe == 'F'[0]:
+ elif auxtype == 'f':
value = <float>bam_aux2f(s)
s += 4
- elif tpe == 'D'[0]:
+ elif auxtype == 'd':
value = <double>bam_aux2d(s)
s += 8
- elif tpe == 'C'[0]:
- value = <int>bam_aux2i(s)
- s += 1
- elif tpe == 'A'[0]:
- # there might a more efficient way
- # to convert a char into a string
+ elif auxtype == 'A':
value = "%c" % <char>bam_aux2A(s)
s += 1
- elif tpe == 'Z'[0]:
- value = <char*>bam_aux2Z(s)
+ elif auxtype in ('Z', 'H'):
+ value = _charptr_to_str(<char*>bam_aux2Z(s))
# +1 for NULL terminated string
s += len(value) + 1
+ elif auxtype == 'B':
+ s += 1
+ byte_size, nvalues, value = convertBinaryTagToList( s )
+ # 5 for 1 char and 1 int
+ s += 5 + ( nvalues * byte_size) - 1
- # skip over type
s += 1
- # ignore pytype
- result.append( (pytag, value) )
+ result.append( (_charptr_to_str(auxtag), value) )
- free( ctag )
return result
def __set__(self, tags):
- cdef char * ctag
cdef bam1_t * src
cdef uint8_t * s
cdef uint8_t * new_data
- cdef int guessed_size, control_size
+ cdef char * temp
+
src = self._delegate
- cdef int max_size, size
- max_size = 4000
-
- # map samtools code to python.struct code and byte size
- buffer = ctypes.create_string_buffer(max_size)
-
- offset = 0
- for pytag, value in tags:
- t = type(value)
- if t == types.FloatType:
- fmt = "<cccf"
- elif t == types.IntType:
- if value < 0:
- if value >= -127: fmt, pytype = "<cccb", 'c'
- elif value >= -32767: fmt, pytype = "<ccch", 's'
- elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, ctype = "<ccci", 'i'[0]
- else:
- if value <= 255: fmt, pytype = "<cccB", 'C'
- elif value <= 65535: fmt, pytype = "<cccH", 'S'
- elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
- else: fmt, pytype = "<cccI", 'I'
- else:
- # Note: hex strings (H) are not supported yet
- if len(value) == 1:
- fmt, pytype = "<cccc", 'A'
- else:
- fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
- size = struct.calcsize(fmt)
- if offset + size > max_size:
- raise NotImplementedError("tags field too large")
+ fmts, args = ["<"], []
+
+ if tags != None:
+
+ # map samtools code to python.struct code and byte size
+ for pytag, value in tags:
+ if not type(pytag) is bytes:
+ pytag = pytag.encode('ascii')
+ t = type(value)
+
+ if t is tuple or t is list:
+ # binary tags - treat separately
+ pytype = 'B'
+ # get data type - first value determines type
+ if type(value[0]) is float:
+ datafmt, datatype = "f", "f"
+ else:
+ mi, ma = min(value), max(value)
+ absmax = max( abs(mi), abs(ma) )
+ # signed ints
+ if mi < 0:
+ if mi >= -127: datafmt, datatype = "b", 'c'
+ elif mi >= -32767: datafmt, datatype = "h", 's'
+ elif mi < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % mi )
+ else: datafmt, datatype = "i", 'i'
+
+ # unsigned ints
+ else:
+ if absmax <= 255: datafmt, datatype = "B", 'C'
+ elif absmax <= 65535: datafmt, datatype = "H", 'S'
+ elif absmax > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % absmax )
+ else: datafmt, datatype = "I", 'I'
+
+ datafmt = "2sccI%i%s" % (len(value), datafmt)
+ args.extend( [pytag[:2],
+ pytype.encode('ascii'),
+ datatype.encode('ascii'),
+ len(value)] + list(value) )
+ fmts.append( datafmt )
+ continue
+
+ if t is float:
+ fmt, pytype = "2scf", 'f'
+ elif t is int:
+ # negative values
+ if value < 0:
+ if value >= -127: fmt, pytype = "2scb", 'c'
+ elif value >= -32767: fmt, pytype = "2sch", 's'
+ elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+ else: fmt, pytype = "2sci", 'i'
+ # positive values
+ else:
+ if value <= 255: fmt, pytype = "2scB", 'C'
+ elif value <= 65535: fmt, pytype = "2scH", 'S'
+ elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+ else: fmt, pytype = "2scI", 'I'
+ else:
+ # Note: hex strings (H) are not supported yet
+ if t is not bytes:
+ value = value.encode('ascii')
+ if len(value) == 1:
+ fmt, pytype = "2scc", 'A'
+ else:
+ fmt, pytype = "2sc%is" % (len(value)+1), 'Z'
+
+ args.extend( [pytag[:2],
+ pytype.encode('ascii'),
+ value ] )
+
+ fmts.append( fmt )
+ fmt = "".join(fmts)
+ total_size = struct.calcsize(fmt)
+ buffer = ctypes.create_string_buffer(total_size)
struct.pack_into( fmt,
buffer,
- offset,
- pytag[0],
- pytag[1],
- pytype,
- value )
- offset += size
-
- # delete the old data and allocate new
- pysam_bam_update( src,
+ 0,
+ *args )
+
+ # delete the old data and allocate new space.
+ # If total_size == 0, the aux field will be
+ # empty
+ pysam_bam_update( src,
src.l_aux,
- offset,
- pysam_bam1_aux( src ) )
-
- src.l_aux = offset
+ total_size,
+ bam1_aux( src ) )
- if offset == 0: return
+ src.l_aux = total_size
- # get location of new data
- s = pysam_bam1_aux( src )
-
- # check if there is direct path from buffer.raw to tmp
- cdef char * temp
- temp = buffer.raw
- memcpy( s, temp, offset )
+ # copy data only if there is any
+ if total_size != 0:
+
+ # get location of new data
+ s = bam1_aux( src )
+
+ # check if there is direct path from buffer.raw to tmp
+ temp = buffer.raw
+ memcpy( s, temp, total_size )
- property flag:
+ property flag:
"""properties flag"""
def __get__(self): return self._delegate.core.flag
def __set__(self, flag): self._delegate.core.flag = flag
- property rname:
+
+ property rname:
"""
:term:`target` ID
+ DEPRECATED from pysam-0.4 - use tid in the future.
+ The rname field caused a lot of confusion as it returns
+ the :term:`target` ID instead of the reference sequence
+ name.
+
.. note::
- This field contains the index of the reference sequence
+ This field contains the index of the reference sequence
in the sequence dictionary. To obtain the name
of the reference sequence, use :meth:`pysam.Samfile.getrname()`
"""
def __get__(self): return self._delegate.core.tid
def __set__(self, tid): self._delegate.core.tid = tid
- property pos:
+
+ property tid:
+ """
+ :term:`target` ID
+
+ .. note::
+
+ This field contains the index of the reference sequence
+ in the sequence dictionary. To obtain the name
+ of the reference sequence, use :meth:`pysam.Samfile.getrname()`
+
+ """
+ def __get__(self): return self._delegate.core.tid
+ def __set__(self, tid): self._delegate.core.tid = tid
+
+ property pos:
"""0-based leftmost coordinate"""
def __get__(self): return self._delegate.core.pos
- def __set__(self, pos):
+ def __set__(self, pos):
## setting the cigar string also updates the "bin" attribute
cdef bam1_t * src
src = self._delegate
if src.core.n_cigar:
- src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) )
+ src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) )
else:
src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
self._delegate.core.pos = pos
- property bin:
+ property bin:
"""properties bin"""
def __get__(self): return self._delegate.core.bin
def __set__(self, bin): self._delegate.core.bin = bin
property rlen:
'''length of the read (read only). Returns 0 if not given.'''
def __get__(self): return self._delegate.core.l_qseq
- property mapq:
+ property aend:
+ '''aligned reference position of the read on the reference genome.
+
+ aend points to one past the last aligned residue.
+ Returns None if not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
+ return None
+ return bam_calend(&src.core, bam1_cigar(src))
+
+ property alen:
+ '''aligned length of the read on the reference genome. Returns None if
+ not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0:
+ return None
+ return bam_calend(&src.core,
+ bam1_cigar(src)) - \
+ self._delegate.core.pos
+
+ property mapq:
"""mapping quality"""
def __get__(self): return self._delegate.core.qual
def __set__(self, qual): self._delegate.core.qual = qual
+
property mrnm:
- """the :term:`reference` id of the mate """
+ """the :term:`reference` id of the mate
+ deprecated, use RNEXT instead.
+ """
+ def __get__(self): return self._delegate.core.mtid
+ def __set__(self, mtid): self._delegate.core.mtid = mtid
+ property rnext:
+ """the :term:`reference` id of the mate """
def __get__(self): return self._delegate.core.mtid
def __set__(self, mtid): self._delegate.core.mtid = mtid
- property mpos:
+ property mpos:
+ """the position of the mate
+ deprecated, use PNEXT instead."""
+ def __get__(self): return self._delegate.core.mpos
+ def __set__(self, mpos): self._delegate.core.mpos = mpos
+ property pnext:
"""the position of the mate"""
def __get__(self): return self._delegate.core.mpos
def __set__(self, mpos): self._delegate.core.mpos = mpos
- property isize:
+ property isize:
+ """the insert size
+ deprecated: use tlen instead"""
+ def __get__(self): return self._delegate.core.isize
+ def __set__(self, isize): self._delegate.core.isize = isize
+ property tlen:
"""the insert size"""
def __get__(self): return self._delegate.core.isize
def __set__(self, isize): self._delegate.core.isize = isize
- property is_paired:
+ property is_paired:
"""true if read is paired in sequencing"""
def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FPAIRED
else: self._delegate.core.flag &= ~BAM_FPAIRED
property is_proper_pair:
"""true if read is mapped in a proper pair"""
def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
property is_unmapped:
"""true if read itself is unmapped"""
def __get__(self): return (self.flag & BAM_FUNMAP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FUNMAP
else: self._delegate.core.flag &= ~BAM_FUNMAP
- property mate_is_unmapped:
- """true if the mate is unmapped"""
+ property mate_is_unmapped:
+ """true if the mate is unmapped"""
def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FMUNMAP
else: self._delegate.core.flag &= ~BAM_FMUNMAP
property is_reverse:
"""true if read is mapped to reverse strand"""
- def __get__(self):return (self.flag & BAM_FREVERSE) != 0
- def __set__(self,val):
+ def __get__(self): return (self.flag & BAM_FREVERSE) != 0
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREVERSE
else: self._delegate.core.flag &= ~BAM_FREVERSE
property mate_is_reverse:
"""true is read is mapped to reverse strand"""
def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FMREVERSE
else: self._delegate.core.flag &= ~BAM_FMREVERSE
- property is_read1:
+ property is_read1:
"""true if this is read1"""
def __get__(self): return (self.flag & BAM_FREAD1) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREAD1
else: self._delegate.core.flag &= ~BAM_FREAD1
property is_read2:
"""true if this is read2"""
def __get__(self): return (self.flag & BAM_FREAD2) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FREAD2
else: self._delegate.core.flag &= ~BAM_FREAD2
property is_secondary:
"""true if not primary alignment"""
def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FSECONDARY
else: self._delegate.core.flag &= ~BAM_FSECONDARY
property is_qcfail:
"""true if QC failure"""
def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FQCFAIL
else: self._delegate.core.flag &= ~BAM_FQCFAIL
property is_duplicate:
- """ true if optical or PCR duplicate"""
+ """true if optical or PCR duplicate"""
def __get__(self): return (self.flag & BAM_FDUP) != 0
- def __set__(self,val):
+ def __set__(self,val):
if val: self._delegate.core.flag |= BAM_FDUP
else: self._delegate.core.flag &= ~BAM_FDUP
-
+ property positions:
+ """a list of reference positions that this read aligns to."""
+ def __get__(self):
+ cdef uint32_t k, i, pos
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+ if src.core.n_cigar == 0: return []
+
+ result = []
+ pos = src.core.pos
+ cigar_p = bam1_cigar(src)
+
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH:
+ for i from pos <= i < pos + l:
+ result.append( i )
+
+ if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return result
+
+ property aligned_pairs:
+ """a list of aligned read and reference positions.
+
+ Unaligned positions are marked by None.
+ """
+ def __get__(self):
+ cdef uint32_t k, i, pos, qpos
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+ if src.core.n_cigar == 0: return []
+
+ result = []
+ pos = src.core.pos
+ qpos = 0
+ cigar_p = bam1_cigar(src)
+
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH:
+ for i from pos <= i < pos + l:
+ result.append( (qpos, i) )
+ qpos += 1
+ pos += l
+
+ elif op == BAM_CINS:
+ for i from pos <= i < pos + l:
+ result.append( (qpos, None) )
+ qpos += 1
+
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ for i from pos <= i < pos + l:
+ result.append( (None, i) )
+ pos += l
+
+ return result
+
+
+ def overlap( self, uint32_t start, uint32_t end ):
+ """return number of aligned bases of read overlapping the interval *start* and *end*
+ on the reference sequence.
+ """
+ cdef uint32_t k, i, pos, overlap
+ cdef int op, o
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ overlap = 0
+
+ src = self._delegate
+ if src.core.n_cigar == 0: return 0
+ pos = src.core.pos
+ o = 0
+
+ cigar_p = bam1_cigar(src)
+ for k from 0 <= k < src.core.n_cigar:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH:
+ o = min( pos + l, end) - max( pos, start )
+ if o > 0: overlap += o
+
+ if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return overlap
+
def opt(self, tag):
"""retrieves optional data given a two-letter *tag*"""
- #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
+ #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
cdef uint8_t * v
- v = bam_aux_get(self._delegate, tag)
+ cdef int nvalues
+ btag = _force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
if v == NULL: raise KeyError( "tag '%s' not present" % tag )
- type = chr(v[0])
- if type == 'c' or type == 'C' or type == 's' or type == 'S' or type == 'i':
- return <int>bam_aux2i(v)
- elif type == 'f':
+ auxtype = chr(v[0])
+ if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
+ return <int>bam_aux2i(v)
+ elif auxtype == 'i' or auxtype == 'I':
+ return <int32_t>bam_aux2i(v)
+ elif auxtype == 'f' or auxtype == 'F':
return <float>bam_aux2f(v)
- elif type == 'd':
+ elif auxtype == 'd' or auxtype == 'D':
return <double>bam_aux2d(v)
- elif type == 'A':
+ elif auxtype == 'A':
# there might a more efficient way
# to convert a char into a string
return '%c' % <char>bam_aux2A(v)
- elif type == 'Z':
- return <char*>bam_aux2Z(v)
-
+ elif auxtype == 'Z':
+ return _charptr_to_str(<char*>bam_aux2Z(v))
+ elif auxtype == 'B':
+ bytesize, nvalues, values = convertBinaryTagToList( v + 1 )
+ return values
+ else:
+ raise ValueError("unknown auxiliary type '%s'" % auxtype)
+
+
def fancy_str (self):
"""returns list of fieldnames/values in pretty format for debugging
"""
"m_data": "Maximum data length",
"data_len": "Current data length",
}
- fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
- "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
+ fields_names_in_order = ["tid", "pos", "mtid", "mpos", "isize", "flag",
+ "n_cigar", "cigar", "qual", "bin", "l_qname", "qname",
"l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
-
+
for f in fields_names_in_order:
if not f in self.__dict__:
continue
pos
the target base coordinate (0-based)
n
- number of reads mapping to this column
+ number of reads mapping to this column
pileups
list of reads (:class:`pysam.PileupRead`) aligned to this column
If the underlying engine iterator advances, the results of this column
will change.
'''
- cdef bam_plbuf_t * buf
- cdef int n_pu
-
- def __cinit__(self ):
- pass
+ def __init__(self):
+ raise TypeError("This class cannot be instantiated from Python")
def __str__(self):
return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
property tid:
'''the chromosome ID as is defined in the header'''
- def __get__(self): return pysam_get_tid( self.buf )
+ def __get__(self): return self.tid
property n:
'''number of reads mapping to this column.'''
def __set__(self, n): self.n_pu = n
property pos:
- def __get__(self): return pysam_get_pos( self.buf )
+ def __get__(self): return self.pos
property pileups:
'''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
def __get__(self):
- cdef bam_pileup1_t * pl
- pl = pysam_get_pileup( self.buf )
+ cdef int x
pileups = []
+
+ if self.plp[0] == NULL:
+ raise ValueError("PileupProxy accessed after iterator finished")
+
# warning: there could be problems if self.n and self.buf are
# out of sync.
for x from 0 <= x < self.n_pu:
- pileups.append( makePileupRead( &pl[x]) )
+ pileups.append( makePileupRead( &(self.plp[0][x])) )
return pileups
cdef class PileupRead:
'''A read aligned to a column.
'''
- cdef:
- AlignedRead _alignment
- int32_t _qpos
- int _indel
- int _level
- uint32_t _is_del
- uint32_t _is_head
- uint32_t _is_tail
-
- def __cinit__( self ):
- pass
+ def __init__(self):
+ raise TypeError("This class cannot be instantiated from Python")
def __str__(self):
return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
-
+
property alignment:
"""a :class:`pysam.AlignedRead` object of the aligned read"""
def __get__(self):
ofd = os.dup(self.id) # Save old stream on new unit.
self.streams.append(ofd)
sys.stdout.flush() # Buffered data goes to old stream.
+ sys.stderr.flush() # Buffered data goes to old stream.
os.dup2(fd, self.id) # Open unit 1 on new stream.
os.close(fd) # Close other unit (look out, caller.)
-
+
def restore(self):
'''restore previous output stream'''
if self.streams:
os.close(self.streams[-1])
del self.streams[-1]
-def _samtools_dispatch( method, args = () ):
+def _samtools_dispatch( method,
+ args = (),
+ catch_stdout = True ):
'''call ``method`` in samtools providing arguments in args.
.. note::
- This method redirects stdout and stderr to capture it
- from samtools. If for some reason stdout/stderr disappears
+ This method redirects stdout to capture it
+ from samtools. If for some reason stdout disappears
the reason might be in this method.
.. note::
The current implementation might only work on linux.
-
- .. note::
- This method captures stdout and stderr using temporary files,
+
+ .. note::
+ This method captures stdout and stderr using temporary files,
which are then read into memory in their entirety. This method
- is slow and might cause large memory overhead.
+ is slow and might cause large memory overhead.
See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
on the topic of redirecting stderr/stdout.
'''
# note that debugging this module can be a problem
- # as stdout/stderr will not appear
+ # as stdout/stderr will not appear on the terminal
+
+ # some special cases
+ if method == "index":
+ if not os.path.exists( args[0] ):
+ raise IOError( "No such file or directory: '%s'" % args[0] )
# redirect stderr and stdout to file
-
- # open files and redirect into it
stderr_h, stderr_f = tempfile.mkstemp()
- stdout_h, stdout_f = tempfile.mkstemp()
-
- # patch for `samtools view`
- # samtools `view` closes stdout, from which I can not
- # recover. Thus redirect output to file with -o option.
- if method == "view":
- if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
- args = ( "-o", stdout_f ) + args
-
- stdout_save = Outs( sys.stdout.fileno() )
- stdout_save.setfd( stdout_h )
- stderr_save = Outs( sys.stderr.fileno() )
- stderr_save.setfd( stderr_h )
+ pysam_set_stderr( stderr_h )
+
+ if catch_stdout:
+ stdout_h, stdout_f = tempfile.mkstemp()
+ try:
+ stdout_save = Outs( sys.stdout.fileno() )
+ stdout_save.setfd( stdout_h )
+ except AttributeError:
+ # stdout has already been redirected
+ catch_stdout = False
+
+ # patch for `samtools view`
+ # samtools `view` closes stdout, from which I can not
+ # recover. Thus redirect output to file with -o option.
+ if method == "view":
+ if "-o" in args: raise ValueError("option -o is forbidden in samtools view")
+ args = ( "-o", stdout_f ) + args
# do the function call to samtools
cdef char ** cargs
cdef int i, n, retval
n = len(args)
+ method = _force_cmdline_bytes(method)
+ args = [ _force_cmdline_bytes(a) for a in args ]
+
# allocate two more for first (dummy) argument (contains command)
cargs = <char**>calloc( n+2, sizeof( char *) )
cargs[0] = "samtools"
cargs[1] = method
for i from 0 <= i < n: cargs[i+2] = args[i]
+
retval = pysam_dispatch(n+2, cargs)
free( cargs )
-
+
# restore stdout/stderr. This will also flush, so
# needs to be before reading back the file contents
- stdout_save.restore()
- stderr_save.restore()
+ if catch_stdout:
+ stdout_save.restore()
+ try:
+ with open( stdout_f, "r") as inf:
+ out_stdout = inf.readlines()
+ except UnicodeDecodeError:
+ with open( stdout_f, "rb") as inf:
+ # read binary output
+ out_stdout = inf.read()
+ os.remove( stdout_f )
+ else:
+ out_stdout = []
+
+ # get error messages
+ pysam_unset_stderr()
+ try:
+ with open( stderr_f, "r") as inf:
+ out_stderr = inf.readlines()
+ except UnicodeDecodeError:
+ with open( stderr_f, "rb") as inf:
+ # read binary output
+ out_stderr = inf.read()
+ os.remove( stderr_f )
+ else:
+ out_stderr = []
- # capture stderr/stdout.
- out_stderr = open( stderr_f, "r").readlines()
- out_stdout = open( stdout_f, "r").readlines()
+ return retval, out_stderr, out_stdout
- # clean up files
- os.remove( stderr_f )
- os.remove( stdout_f )
+cdef class SNPCall:
+ '''the results of a SNP call.'''
+ cdef int _tid
+ cdef int _pos
+ cdef char _reference_base
+ cdef char _genotype
+ cdef int _consensus_quality
+ cdef int _snp_quality
+ cdef int _rms_mapping_quality
+ cdef int _coverage
- return retval, out_stderr, out_stdout
+ property tid:
+ '''the chromosome ID as is defined in the header'''
+ def __get__(self):
+ return self._tid
+
+ property pos:
+ '''nucleotide position of SNP.'''
+ def __get__(self): return self._pos
+
+ property reference_base:
+ '''reference base at pos. ``N`` if no reference sequence supplied.'''
+ def __get__(self): return from_string_and_size( &self._reference_base, 1 )
+
+ property genotype:
+ '''the genotype called.'''
+ def __get__(self): return from_string_and_size( &self._genotype, 1 )
+
+ property consensus_quality:
+ '''the genotype quality (Phred-scaled).'''
+ def __get__(self): return self._consensus_quality
+
+ property snp_quality:
+ '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+ def __get__(self): return self._snp_quality
+
+ property mapping_quality:
+ '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+ def __get__(self): return self._rms_mapping_quality
+
+ property coverage:
+ '''coverage or read depth - the number of reads involved in the call.'''
+ def __get__(self): return self._coverage
+
+ def __str__(self):
+
+ return "\t".join( map(str, (
+ self.tid,
+ self.pos,
+ self.reference_base,
+ self.genotype,
+ self.consensus_quality,
+ self.snp_quality,
+ self.mapping_quality,
+ self.coverage ) ) )
+
+
+# cdef class SNPCallerBase:
+# '''Base class for SNP callers.
+
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
+
+# cdef bam_maqcns_t * c
+# cdef IteratorColumn iter
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# self.iter = iterator_column
+# self.c = bam_maqcns_init()
+
+# # set the default parameterization according to
+# # samtools
+
+# # new default mode for samtools >0.1.10
+# self.c.errmod = kwargs.get( "errmod", BAM_ERRMOD_MAQ2 )
+
+# self.c.min_baseQ = kwargs.get( "min_baseQ", 13 )
+# # self.c.capQ_thres = kwargs.get( "capQ_threshold", 60 )
+# self.c.n_hap = kwargs.get( "n_haplotypes", 2 )
+# self.c.het_rate = kwargs.get( "het_rate", 0.001 )
+# self.c.theta = kwargs.get( "theta", 0.83 )
+
+# if self.c.errmod != BAM_ERRMOD_MAQ2:
+# self.c.theta += 0.02
+
+# # call prepare AFTER setting parameters
+# bam_maqcns_prepare( self.c )
+
+# def __dealloc__(self):
+# bam_maqcns_destroy( self.c )
+
+ # cdef __dump( self, glf1_t * g, uint32_t cns, int rb ):
+ # '''debugging output.'''
+
+ # pysam_dump_glf( g, self.c );
+ # print ""
+ # for x in range(self.iter.n_plp):
+ # print "--> read %i %s %i" % (x,
+ # bam1_qname(self.iter.plp[x].b),
+ # self.iter.plp[x].qpos,
+ # )
+
+ # print "pos=%i, cns=%i, q_r = %f, depth=%i, n=%i, rb=%i, cns-cq=%i %i %i %i" \
+ # % (self.iter.pos,
+ # cns,
+ # self.c.q_r,
+ # self.iter.n_plp,
+ # self.iter.n_plp,
+ # rb,
+ # cns >> 8 & 0xff,
+ # cns >> 16 & 0xff,
+ # cns & 0xff,
+ # cns >> 28,
+ # )
+
+ # printf("-------------------------------------\n");
+ # sys.stdout.flush()
+
+# cdef class IteratorSNPCalls( SNPCallerBase ):
+# """*(IteratorColumn iterator)*
+
+# call SNPs within a region.
+
+# *iterator* is a pileup iterator. SNPs will be called
+# on all positions returned by this iterator.
+
+# This caller is fast if SNPs are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
+
+# """
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# assert self.iter.hasReference(), "IteratorSNPCalls requires a pileup iterator with reference sequence"
+
+# def __iter__(self):
+# return self
+
+# def __next__(self):
+# """python version of next().
+# """
+
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
+
+# if self.iter.plp == NULL:
+# raise StopIteration
+
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
+
+# assert seq != NULL
+
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# cdef glf1_t * g
+
+# g = bam_maqcns_glfgen( self.iter.n_plp,
+# self.iter.plp,
+# bam_nt16_table[rb],
+# self.c )
+
+# if pysam_glf_depth( g ) == 0:
+# cns = 0xfu << 28 | 0xf << 24
+# else:
+# cns = glf2cns(g, <int>(self.c.q_r + .499))
+
+# free(g)
+
+# cdef SNPCall call
+
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
+
+# return call
+
+# cdef class SNPCaller( SNPCallerBase ):
+# '''*(IteratorColumn iterator_column )*
+
+# The samtools SNP caller.
+
+# This object will call SNPs in *samfile* against the reference
+# sequence in *fasta*.
+
+# This caller is fast for calling few SNPs in selected regions.
+
+# It is slow, if called over large genomic regions.
+# '''
+
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# pass
+
+# def call(self, reference, int pos ):
+# """call a snp on chromosome *reference*
+# and position *pos*.
+
+# returns a :class:`SNPCall` object.
+# """
+
+# cdef int tid = self.iter.samfile.gettid( reference )
+
+# self.iter.reset( tid, pos, pos + 1 )
+
+# while 1:
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
-__all__ = ["Samfile",
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
+
+# if self.iter.pos == pos: break
+
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
+
+# assert seq != NULL
+
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+# cdef int rb = seq[self.iter.pos]
+# cdef uint32_t cns
+# # cdef glf1_t * g
+# #
+# # g = bam_maqcns_glfgen( self.iter.n_plp,
+# # self.iter.plp,
+# # bam_nt16_table[rb],
+# # self.c )
+# ##
+# #
+# # if pysam_glf_depth( g ) == 0:
+# # cns = 0xfu << 28 | 0xf << 24
+# # else:
+# # cns = glf2cns(g, <int>(self.c.q_r + .499))
+# #
+# # free(g)
+
+# cdef SNPCall call
+
+# call = SNPCall()
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._reference_base = rb
+# call._genotype = bam_nt16_rev_table[cns>>28]
+# call._consensus_quality = cns >> 8 & 0xff
+# call._snp_quality = cns & 0xff
+# call._rms_mapping_quality = cns >> 16&0xff
+# call._coverage = self.iter.n_plp
+
+# return call
+
+# cdef class IndelCall:
+# '''the results of an indel call.'''
+# cdef int _tid
+# cdef int _pos
+# cdef int _coverage
+# cdef int _rms_mapping_quality
+# cdef bam_maqindel_ret_t * _r
+
+# def __cinit__(self):
+# #assert r != NULL
+# #self._r = r
+# pass
+
+# property tid:
+# '''the chromosome ID as is defined in the header'''
+# def __get__(self):
+# return self._tid
+
+# property pos:
+# '''nucleotide position of SNP.'''
+# def __get__(self): return self._pos
+
+# property genotype:
+# '''the genotype called.'''
+# def __get__(self):
+# if self._r.gt == 0:
+# s = PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+# return "%s/%s" % (s,s)
+# elif self._r.gt == 1:
+# s = PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+# return "%s/%s" % (s,s)
+# else:
+# return "%s/%s" % (self.first_allele, self.second_allele )
+
+# property consensus_quality:
+# '''the genotype quality (Phred-scaled).'''
+# def __get__(self): return self._r.q_cns
+
+# property snp_quality:
+# '''the snp quality (Phred scaled) - probability of consensus being identical to reference sequence.'''
+# def __get__(self): return self._r.q_ref
+
+# property mapping_quality:
+# '''the root mean square (rms) of the mapping quality of all reads involved in the call.'''
+# def __get__(self): return self._rms_mapping_quality
+
+# property coverage:
+# '''coverage or read depth - the number of reads involved in the call.'''
+# def __get__(self): return self._coverage
+
+# property first_allele:
+# '''sequence of first allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[0], self._r.indel1 + 1)
+
+# property second_allele:
+# '''sequence of second allele.'''
+# def __get__(self): return PyString_FromStringAndSize( self._r.s[1], self._r.indel2 + 1)
+
+# property reads_first:
+# '''reads supporting first allele.'''
+# def __get__(self): return self._r.cnt1
+
+# property reads_second:
+# '''reads supporting second allele.'''
+# def __get__(self): return self._r.cnt2
+
+# property reads_diff:
+# '''reads supporting neither allele.'''
+# def __get__(self): return self._r.cnt_anti
+
+# def __str__(self):
+
+# return "\t".join( map(str, (
+# self.tid,
+# self.pos,
+# self.genotype,
+# self.consensus_quality,
+# self.snp_quality,
+# self.mapping_quality,
+# self.coverage,
+# self.first_allele,
+# self.second_allele,
+# self.reads_first,
+# self.reads_second,
+# self.reads_diff ) ) )
+
+# def __dealloc__(self ):
+# bam_maqindel_ret_destroy(self._r)
+
+# cdef class IndelCallerBase:
+# '''Base class for SNP callers.
+
+# *min_baseQ*
+# minimum base quality (possibly capped by BAQ)
+# *capQ_threshold*
+# coefficient for adjusting mapQ of poor mappings
+# *theta*
+# theta in maq consensus calling model
+# *n_haplotypes*
+# number of haplotypes in the sample
+# *het_rate*
+# prior of a difference between two haplotypes
+# '''
+
+# cdef bam_maqindel_opt_t * options
+# cdef IteratorColumn iter
+# cdef int cap_mapQ
+# cdef int max_depth
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+
+# self.iter = iterator_column
+
+# assert iterator_column.hasReference(), "IndelCallerBase requires a pileup iterator with reference sequence"
+
+# self.options = bam_maqindel_opt_init()
+
+# # set the default parameterization according to
+# # samtools
+
+# self.options.r_indel = kwargs.get( "r_indel", 0.00015 )
+# self.options.q_indel = kwargs.get( "q_indel", 40 )
+# self.cap_mapQ = kwargs.get( "cap_mapQ", 60 )
+# self.max_depth = kwargs.get( "max_depth", 1024 )
+
+# def __dealloc__(self):
+# free( self.options )
+
+# def _call( self ):
+
+# cdef char * seq = self.iter.getSequence()
+# cdef int seq_len = self.iter.seq_len
+
+# assert seq != NULL
+
+# # reference base
+# if self.iter.pos >= seq_len:
+# raise ValueError( "position %i out of bounds on reference sequence (len=%i)" % (self.iter.pos, seq_len) )
+
+# cdef bam_maqindel_ret_t * r
+
+# cdef int m = min( self.max_depth, self.iter.n_plp )
+
+# # printf("pysam: m=%i, q_indel=%i, r_indel=%f, r_snp=%i, mm_penalty=%i, indel_err=%i, ambi_thres=%i\n",
+# # m, self.options.q_indel, self.options.r_indel, self.options.r_snp, self.options.mm_penalty,
+# # self.options.indel_err, self.options.ambi_thres );
+
+# r = bam_maqindel(m,
+# self.iter.pos,
+# self.options,
+# self.iter.plp,
+# seq,
+# 0,
+# NULL)
+
+# if r == NULL: return None
+
+# cdef IndelCall call
+# call = IndelCall()
+# call._r = r
+# call._tid = self.iter.tid
+# call._pos = self.iter.pos
+# call._coverage = self.iter.n_plp
+
+# cdef uint64_t rms_aux = 0
+# cdef int i = 0
+# cdef bam_pileup1_t * p
+# cdef int tmp
+
+# for i from 0 <= i < self.iter.n_plp:
+# p = self.iter.plp + i
+# if p.b.core.qual < self.cap_mapQ:
+# tmp = p.b.core.qual
+# else:
+# tmp = self.cap_mapQ
+# rms_aux += tmp * tmp
+
+# call._rms_mapping_quality = <uint64_t>(sqrt(<double>rms_aux / self.iter.n_plp) + .499)
+
+# return call
+
+# cdef class IndelCaller( IndelCallerBase ):
+# '''*(IteratorColumn iterator_column )*
+
+#     The samtools indel caller.
+
+#     This object will call indels in *samfile* against the reference
+#     sequence in *fasta*.
+
+#     This caller is fast for calling few indels in selected regions.
+
+# It is slow, if called over large genomic regions.
+# '''
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+
+# pass
+
+# def call(self, reference, int pos ):
+# """call a snp on chromosome *reference*
+# and position *pos*.
+
+#         returns a :class:`IndelCall` object or None, if no indel call could be made.
+# """
+
+# cdef int tid = self.iter.samfile.gettid( reference )
+
+# self.iter.reset( tid, pos, pos + 1 )
+
+# while 1:
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
+
+# if self.iter.plp == NULL:
+# raise ValueError( "no reads in region - no call" )
+
+# if self.iter.pos == pos: break
+
+# return self._call()
+
+# cdef class IteratorIndelCalls( IndelCallerBase ):
+# """*(IteratorColumn iterator)*
+
+# call indels within a region.
+
+#     *iterator* is a pileup iterator. Indels will be called
+#     on all positions returned by this iterator.
+
+#     This caller is fast if indels are called over large continuous
+# regions. It is slow, if instantiated frequently and in random
+# order as the sequence will have to be reloaded.
+
+# """
+
+# def __cinit__(self,
+# IteratorColumn iterator_column,
+# **kwargs ):
+# pass
+
+
+# def __iter__(self):
+# return self
+
+# def __next__(self):
+# """python version of next().
+# """
+
+# # the following code was adapted from bam_plcmd.c:pileup_func()
+# self.iter.cnext()
+
+# if self.iter.n_plp < 0:
+# raise ValueError("error during iteration" )
+
+# if self.iter.plp == NULL:
+# raise StopIteration
+
+# return self._call()
+
+
+
+cdef class IndexedReads:
+    """in-memory index of a BAM file by read name.
+
+    Maps each read name (qname) to the file offsets of its alignment
+    records, so that all alignments for a given read can be retrieved
+    directly via :meth:`find`.
+
+    By default, the file is re-opened to avoid conflicts if
+    multiple operators work on the same file. Set *reopen* = False
+    to not re-open *samfile*.
+    """
+
+    def __init__(self, Samfile samfile, int reopen = True ):
+        # keep a reference so find() can hand it to IteratorRowSelection
+        self.samfile = samfile
+
+        if samfile.isbam: mode = b"rb"
+        else: mode = b"r"
+
+        # reopen the file - note that this makes the iterator
+        # slow and causes pileup to slow down significantly.
+        if reopen:
+            # StderrStore silences samopen's stderr output during open
+            store = StderrStore()
+            self.fp = samopen( samfile._filename, mode, NULL )
+            store.release()
+            assert self.fp != NULL
+            # we own this handle and must samclose() it in __dealloc__
+            self.owns_samfile = True
+        else:
+            # borrow the caller's handle; do not close it on dealloc
+            self.fp = samfile.samfile
+            self.owns_samfile = False
+
+        # NOTE(review): only BAM supports bam_tell()-based offsets, so SAM
+        # input is rejected here - after the file has already been opened.
+        assert samfile.isbam, "can only IndexReads on bam files"
+
+    def build( self ):
+        '''build the read-name index.
+
+        Scans the file from its current position to the end and records,
+        for every alignment, the file offset at which it starts.
+        '''
+
+        # qname -> list of file offsets (a read may have multiple alignments)
+        self.index = collections.defaultdict( list )
+
+        # indexing starts from the current file position, not from the
+        # beginning of the file
+        cdef int ret = 1
+        cdef bam1_t * b = <bam1_t*> calloc(1, sizeof( bam1_t) )
+
+        cdef uint64_t pos
+
+        while ret > 0:
+            # record the offset *before* reading, so it points at the record
+            pos = bam_tell( self.fp.x.bam )
+            ret = samread( self.fp, b)
+            if ret > 0:
+                qname = _charptr_to_str(bam1_qname( b ))
+                self.index[qname].append( pos )
+
+        # bam_destroy1 frees both the record's data and the struct itself
+        bam_destroy1( b )
+
+    def find( self, qname ):
+        '''return an iterator over all alignments with read name *qname*.
+
+        Raises KeyError if *qname* is not in the index.
+        '''
+        if qname in self.index:
+            return IteratorRowSelection( self.samfile, self.index[qname], reopen = False )
+        else:
+            raise KeyError( "read %s not found" % qname )
+
+    def __dealloc__(self):
+        # close the file handle only if we opened it ourselves
+        if self.owns_samfile: samclose( self.fp )
+
+__all__ = ["Samfile",
"Fastafile",
- "IteratorRow",
- "IteratorRowAll",
- "IteratorColumn",
- "AlignedRead",
- "PileupColumn",
- "PileupProxy",
- "PileupRead" ]
-
-
+ "IteratorRow",
+ "IteratorColumn",
+ "AlignedRead",
+ "PileupColumn",
+ "PileupProxy",
+ "PileupRead",
+ # "IteratorSNPCalls",
+ # "SNPCaller",
+ # "IndelCaller",
+ # "IteratorIndelCalls",
+ "IndexedReads" ]
+
+