Imported Upstream version 0.2
author Diane Trout <diane@caltech.edu>
Fri, 19 Nov 2010 19:43:01 +0000 (11:43 -0800)
committer Diane Trout <diane@caltech.edu>
Fri, 19 Nov 2010 19:43:01 +0000 (11:43 -0800)
69 files changed:
COPYING [new file with mode: 0644]
INSTALL [new file with mode: 0644]
KNOWN_BUGS [new file with mode: 0644]
MANIFEST.in [new file with mode: 0644]
PKG-INFO [new file with mode: 0644]
THANKS [new file with mode: 0644]
pysam/Pileup.py [new file with mode: 0644]
pysam/__init__.py [new file with mode: 0644]
pysam/csamtools.pxd [new file with mode: 0644]
pysam/csamtools.pyx [new file with mode: 0644]
pysam/namedtuple.py [new file with mode: 0644]
pysam/pysam_util.c [new file with mode: 0644]
pysam/pysam_util.h [new file with mode: 0644]
samtools/bam.c [new file with mode: 0644]
samtools/bam.h [new file with mode: 0644]
samtools/bam_aux.c [new file with mode: 0644]
samtools/bam_color.c [new file with mode: 0644]
samtools/bam_endian.h [new file with mode: 0644]
samtools/bam_import.c [new file with mode: 0644]
samtools/bam_index.c [new file with mode: 0644]
samtools/bam_lpileup.c [new file with mode: 0644]
samtools/bam_maqcns.c [new file with mode: 0644]
samtools/bam_maqcns.h [new file with mode: 0644]
samtools/bam_mate.c [new file with mode: 0644]
samtools/bam_md.c [new file with mode: 0644]
samtools/bam_pileup.c [new file with mode: 0644]
samtools/bam_plcmd.c [new file with mode: 0644]
samtools/bam_rmdup.c [new file with mode: 0644]
samtools/bam_rmdupse.c [new file with mode: 0644]
samtools/bam_sort.c [new file with mode: 0644]
samtools/bam_stat.c [new file with mode: 0644]
samtools/bam_tview.c [new file with mode: 0644]
samtools/bgzf.c [new file with mode: 0644]
samtools/bgzf.h [new file with mode: 0644]
samtools/faidx.c [new file with mode: 0644]
samtools/faidx.h [new file with mode: 0644]
samtools/glf.c [new file with mode: 0644]
samtools/glf.h [new file with mode: 0644]
samtools/kaln.c [new file with mode: 0644]
samtools/kaln.h [new file with mode: 0644]
samtools/khash.h [new file with mode: 0644]
samtools/klist.h [new file with mode: 0644]
samtools/knetfile.c [new file with mode: 0644]
samtools/knetfile.h [new file with mode: 0644]
samtools/kseq.h [new file with mode: 0644]
samtools/ksort.h [new file with mode: 0644]
samtools/kstring.c [new file with mode: 0644]
samtools/kstring.h [new file with mode: 0644]
samtools/razf.c [new file with mode: 0644]
samtools/razf.h [new file with mode: 0644]
samtools/sam.c [new file with mode: 0644]
samtools/sam.h [new file with mode: 0644]
samtools/sam_header.c [new file with mode: 0644]
samtools/sam_header.h [new file with mode: 0644]
samtools/sam_view.c [new file with mode: 0644]
setup.cfg [new file with mode: 0644]
setup.py [new file with mode: 0644]
tests/00README.txt [new file with mode: 0644]
tests/Makefile [new file with mode: 0644]
tests/ex1.fa [new file with mode: 0644]
tests/ex1.sam.gz [new file with mode: 0644]
tests/ex3.sam [new file with mode: 0644]
tests/ex4.sam [new file with mode: 0644]
tests/ex5.sam [new file with mode: 0644]
tests/ex6.sam [new file with mode: 0644]
tests/ex7.sam [new file with mode: 0644]
tests/example.py [new file with mode: 0644]
tests/pysam_test.py [new file with mode: 0755]
tests/segfault_tests.py [new file with mode: 0755]

diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..82fa2f4
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2008-2009 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/INSTALL b/INSTALL
new file mode 100644 (file)
index 0000000..65330bc
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,50 @@
+System Requirements
+===================
+
+SAMtools depends on the zlib library <http://www.zlib.net>. The latest
+version 1.2.3 is preferred and with the latest version you can compile
+razip and use it to compress a FASTA file. SAMtools' faidx is able to
+index a razip-compressed FASTA file to save diskspace. Older zlib also
+works with SAMtools, but razip cannot be compiled.
+
+The text-based viewer (tview) requires the GNU ncurses library
+<http://www.gnu.org/software/ncurses/>, which comes with Mac OS X and
+most of the modern Linux/Unix distributions. If you do not have this
+library installed, you can still compile the rest of SAMtools by
+manually modifying one line in Makefile.
+
+Pysam requires pyrex (0.9.8 or greater) and python (2.6 or greater).
+It has not been tested on many other platforms.
+
+Compilation
+===========
+
+Unpack the distribution and enter the pysam directory. Type 
+
+python setup.py build
+
+to compile.
+
+Installation
+============
+
+Type 
+
+   python setup.py install
+
+to install it within the site-packages directory of your python
+distribution. Type
+
+   python setup.py install --help 
+
+for more options.
+
+Architecture specific options
+=============================
+
+Pysam has been compiled on various linux systems and works
+with python 2.6 and python 2.5.
+
+Python 2.7 and Python 3 have not been tested.
+
+Windows support does not work yet
diff --git a/KNOWN_BUGS b/KNOWN_BUGS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644 (file)
index 0000000..11fb9d1
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,26 @@
+#
+# Use .add_data_files and .add_data_dir methods in the appropriate
+# setup.py files to include non-python files such as documentation,
+# data, etc files to distribution. Avoid using MANIFEST.in for that.
+#
+include MANIFEST.in
+include COPYING
+include INSTALL
+include KNOWN_BUGS
+include THANKS
+include pysam/csamtools.pxd
+include pysam/pysam_util.h
+include samtools/*.h
+include tests/00README.txt
+include tests/Makefile
+include tests/ex1.fa
+include tests/ex1.sam.gz
+include tests/ex3.sam
+include tests/ex4.sam
+include tests/ex5.sam
+include tests/ex6.sam
+include tests/ex7.sam
+include tests/example.py
+include tests/pysam_test.py
+include tests/segfault_tests.py
+
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644 (file)
index 0000000..3e3b745
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,15 @@
+Metadata-Version: 1.0
+Name: pysam
+Version: 0.2
+Summary: pysam
+Home-page: http://code.google.com/p/pysam/
+Author: Andreas Heger
+Author-email: andreas.heger@gmail.com
+License: MIT
+Description: 
+        
+        pysam
+        *****
+        
+        
+Platform: ALL
diff --git a/THANKS b/THANKS
new file mode 100644 (file)
index 0000000..02fea67
--- /dev/null
+++ b/THANKS
@@ -0,0 +1,3 @@
+We would like to thank Heng Li and the other samtools contributors for their support
+and their hard work. As a wrapper, pysam merely tries to make their code accessible 
+to the python community - the heavy lifting has been done by the samtools developers.
diff --git a/pysam/Pileup.py b/pysam/Pileup.py
new file mode 100644 (file)
index 0000000..e182d12
--- /dev/null
+++ b/pysam/Pileup.py
@@ -0,0 +1,60 @@
+'''Tools for working with files in the samtools pileup -c format.'''
+import collections
+import pysam
+
+# Record type for one substitution (non-indel) line of ``samtools pileup -c``
+# output; the fields are listed in column order.
+PileupSubstitution = collections.namedtuple(
+    "PileupSubstitution",
+    ( "chromosome",
+      "position",
+      "reference_base",
+      "consensus_base",
+      "consensus_quality",
+      "snp_quality",
+      "rms_mapping_quality",
+      "coverage",
+      "read_bases",
+      "base_qualities" ) )
+
+# Record type for one indel line of ``samtools pileup -c`` output; the fields
+# are listed in column order.
+# NOTE: ``first_allelle`` (sic) is the attribute name exposed by this module;
+# its spelling differs from ``second_allele``.
+PileupIndel = collections.namedtuple(
+    "PileupIndel",
+    ( "chromosome",
+      "position",
+      "reference_base",
+      "genotype",
+      "consensus_quality",
+      "snp_quality",
+      "rms_mapping_quality",
+      "coverage",
+      "first_allelle",
+      "second_allele",
+      "reads_first",
+      "reads_second",
+      "reads_diff" ) )
+
+def iterate( infile ):
+    '''iterate over ``samtools pileup -c`` formatted file.
+
+    *infile* can be any iterable over lines of text.
+
+    The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution`
+    or :class:`pysam.Pileup.PileupIndel`. A line is treated as an indel line if its
+    third column is ``*``.
+
+    A :class:`pysam.SamtoolsError` is raised for lines that have too few columns or
+    that contain a value which can not be converted.
+
+    .. note:: 
+       The parser converts to 0-based coordinates
+    '''
+    
+    conv_subst = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str)
+    conv_indel = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str,int,int,int)
+
+    for line in infile:
+        # split() already discards the line terminator. Slicing off the last
+        # character instead would eat a real character of a final line that
+        # does not end in a newline.
+        d = line.split()
+        try:
+            if d[2] == "*":
+                record = PileupIndel( *[x(y) for x,y in zip(conv_indel,d) ] )
+            else:
+                record = PileupSubstitution( *[x(y) for x,y in zip(conv_subst,d) ] )
+        except (TypeError, ValueError, IndexError):
+            # TypeError: too few columns for the named tuple,
+            # ValueError: non-numeric text in a numeric column,
+            # IndexError: fewer than three columns (e.g. a blank line).
+            raise pysam.SamtoolsError( "parsing error in line: `%s`" % line)
+        # yield outside of the try block so that exceptions thrown into the
+        # generator by the consumer are not mistaken for parsing errors.
+        yield record
diff --git a/pysam/__init__.py b/pysam/__init__.py
new file mode 100644 (file)
index 0000000..3062753
--- /dev/null
+++ b/pysam/__init__.py
@@ -0,0 +1,101 @@
+from csamtools import *
+import Pileup
+import sys
+import os
+
+class SamtoolsError( Exception ):
+    '''exception raised in case of an error incurred in the samtools library.
+
+    *value* is the error payload. It is available as the ``value`` attribute
+    and is also handed on to :class:`Exception`, so that ``args`` is populated
+    for generic exception handling.
+    '''
+
+    def __init__(self, value):
+        Exception.__init__(self, value)
+        self.value = value
+
+    def __str__(self):
+        # the message is shown in its repr() form
+        return repr(self.value)
+
+class SamtoolsDispatcher(object):
+    '''callable wrapper around a single samtools sub-command.
+
+    An instance is called like the samtools command line: the positional
+    arguments are passed on to the command. stdout and stderr of the command
+    are captured.
+
+    A :class:`pysam.SamtoolsError` is raised if samtools returns a non-zero
+    code or writes unexpected messages to stderr.
+
+    *parsers* is an optional sequence of ``(options, parser)`` pairs. The pairs
+    are examined in order; the first one whose options are all present among
+    the call arguments gets to convert the captured stdout (for example
+    "pileup -c", which emits a tab-separated table).
+
+    Without parsers, without a matching parser, or when the keyword argument
+    ``raw`` is true, the captured stdout is returned unchanged.
+    '''
+    dispatch=None
+    parsers=None
+
+    def __init__(self,dispatch, parsers): 
+        self.dispatch = dispatch
+        self.parsers = parsers
+        self.stderr = []
+
+    def __call__(self,*args, **kwargs):
+        '''run the samtools command with *args* and return its (optionally parsed) output.
+        '''
+        retval, stderr, stdout = csamtools._samtools_dispatch( self.dispatch, args )
+        if retval:
+            raise SamtoolsError( "\n".join( stderr ) )
+        self.stderr = stderr
+
+        # samtools commands do not propagate the return code correctly, hence
+        # any output on stderr is treated as an error. Purely informative
+        # messages such as "[sam_header_read2] 2 sequences loaded." are
+        # exempt from this.
+        errors = [ msg for msg in stderr if not msg.startswith( "[sam_header_read2]" ) ]
+        if errors:
+            raise SamtoolsError( "\n".join( errors ) )
+
+        # nothing to parse, or parsing explicitly switched off
+        if kwargs.get("raw") or not stdout or not self.parsers:
+            return stdout
+
+        # the first parser whose options were all given wins
+        for required, parser in self.parsers:
+            if all( flag in args for flag in required ):
+                return parser(stdout)
+
+        return stdout
+
+    def getMessages( self ):
+        '''return the stderr output captured by the last successful call.'''
+        return self.stderr
+
+    def usage(self):
+        '''return the samtools usage information for this command'''
+        # called without arguments; the text is taken from stderr
+        retval, stderr, stdout = csamtools._samtools_dispatch( self.dispatch )
+        return "".join(stderr)
+
+#
+# samtools command line options to export in python
+#
+# Each entry maps the name of the python callable to a tuple
+# (samtools sub-command, parsers). parsers is either None or a sequence of
+# ((required options), parser function) pairs as consumed by
+# SamtoolsDispatcher.
+#
+# import is a python reserved word, hence the command is exported
+# as "samimport".
+SAMTOOLS_DISPATCH = { 
+    "view" : ( "view", None ),
+    "sort" : ( "sort", None),
+    "samimport": ( "import", None),
+    "pileup" : ( "pileup", ( (("-c",), Pileup.iterate ), ), ),
+    "faidx" : ("faidx", None),
+    "tview" : ("tview", None),
+    "index" : ("index", None),
+    "fixmate" : ("fixmate", None),
+    "glfview" : ("glfview", None),
+    "flagstat" : ("flagstat", None),
+    "calmd" : ("calmd", None),
+    "merge" : ("merge", None),  
+    "rmdup" : ("rmdup", None) }
+
+# instantiate samtools commands as python functions
+# (one module-level SamtoolsDispatcher per entry; dict.iteritems() is python 2 only)
+for key, options in SAMTOOLS_DISPATCH.iteritems():
+    cmd, parser = options
+    globals()[key] = SamtoolsDispatcher(cmd, parser)
+
+# hack to export all the symbols from csamtools
+# (plus the names defined in this module and the generated command wrappers)
+__all__ = csamtools.__all__ + [ "SamtoolsError", "SamtoolsDispatcher" ] + list(SAMTOOLS_DISPATCH) +\
+    ["Pileup",] 
+
diff --git a/pysam/csamtools.pxd b/pysam/csamtools.pxd
new file mode 100644 (file)
index 0000000..7dac38d
--- /dev/null
+++ b/pysam/csamtools.pxd
@@ -0,0 +1,251 @@
+
+# memory block functions of the C library
+cdef extern from "string.h":
+  # stand-in typedef: only tells Cython that size_t is an integer type
+  ctypedef int size_t
+  void *memcpy(void *dst,void *src,size_t len)
+  void *memmove(void *dst,void *src,size_t len)
+  void *memset(void *b,int c,size_t len)
+
+# memory allocation and a few utilities of the C library
+cdef extern from "stdlib.h":
+  void free(void *)
+  void *malloc(size_t)
+  void *calloc(size_t,size_t)
+  void *realloc(void *,size_t)
+  # C abs() is made available under the name c_abs
+  int c_abs "abs" (int)
+  void qsort(void *base, size_t nmemb, size_t size,
+             int (*compar)(void *,void *))
+
+# stream I/O functions of the C library
+cdef extern from "stdio.h":
+  # opaque type, only ever used through pointers
+  ctypedef struct FILE:
+    pass
+  FILE *fopen(char *,char *)
+  FILE *freopen(char *path, char *mode, FILE *stream)
+  int fileno(FILE *stream)
+  # NOTE: POSIX declares dup2() in <unistd.h>, not in <stdio.h>. The C
+  # prototype is picked up through the unistd.h extern block of this file.
+  int dup2(int oldfd, int newfd)
+  int fflush(FILE *stream)
+
+  FILE * stderr
+  FILE * stdout
+  int fclose(FILE *)
+  int sscanf(char *str,char *fmt,...)
+  # printf() takes the format string as its first argument; unlike
+  # sprintf() there is no destination buffer.
+  int printf(char *fmt,...)
+  int sprintf(char *str,char *fmt,...)
+  int fprintf(FILE *ifile,char *fmt,...)
+  char *fgets(char *str,int size,FILE *ifile)
+
+# character case conversion
+cdef extern from "ctype.h":
+  int toupper(int c)
+  int tolower(int c)
+  
+# terminal queries on file descriptors
+cdef extern from "unistd.h":
+  char *ttyname(int fd)
+  int isatty(int fd)  
+
+# C string functions (a second extern block for the same header)
+cdef extern from "string.h":
+  int strcmp(char *s1, char *s2)
+  int strncmp(char *s1,char *s2,size_t len)
+  char *strcpy(char *dest,char *src)
+  char *strncpy(char *dest,char *src, size_t len)
+  char *strdup(char *)
+  char *strcat(char *,char *)
+  size_t strlen(char *s)
+  int memcmp( void * s1, void *s2, size_t len )
+
+# no names are declared here; the block only makes razf.h part of the
+# generated C code
+cdef extern from "razf.h":
+  pass
+
+# fixed-width integer types
+# NOTE(review): every type is declared as plain ``int``, including the
+# unsigned and the 64 bit ones. Presumably that is enough for Cython to treat
+# them as integers, but confirm that conversions to python objects behave as
+# intended for large unsigned/64 bit values.
+cdef extern from "stdint.h":
+  ctypedef int int64_t
+  ctypedef int int32_t
+  ctypedef int uint32_t
+  ctypedef int uint8_t
+  ctypedef int uint64_t
+
+
+# declarations of the samtools BAM API used by pysam
+cdef extern from "bam.h":
+
+  # IF _IOLIB=2, bamFile = BGZF, see bgzf.h
+  # samtools uses KNETFILE, check how this works
+
+  # file handles are opaque to pysam
+  ctypedef struct tamFile:
+      pass
+
+  ctypedef struct bamFile:
+      pass
+
+  # fixed-size part of an alignment record
+  ctypedef struct bam1_core_t:
+      int32_t tid
+      int32_t pos
+      uint32_t bin
+      uint32_t qual
+      uint32_t l_qname
+      uint32_t flag
+      uint32_t n_cigar
+      int32_t l_qseq
+      int32_t mtid
+      int32_t mpos
+      int32_t isize
+
+  # alignment record: core fields plus a variable length data block
+  ctypedef struct bam1_t:
+    bam1_core_t core
+    int l_aux
+    int data_len
+    int m_data
+    uint8_t *data
+
+  # one read within a pileup column
+  ctypedef struct bam_pileup1_t:
+      bam1_t *b
+      int32_t qpos
+      int indel
+      int level
+      uint32_t is_del
+      uint32_t is_head
+      uint32_t is_tail
+
+  # callback signatures for pileup and fetch
+  ctypedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *data)
+
+  ctypedef int (*bam_fetch_f)(bam1_t *b, void *data)
+
+  ctypedef struct bam_header_t:
+     int32_t n_targets
+     char **target_name
+     uint32_t *target_len
+     void *hash
+     void *rg2lib
+     int l_text
+     char *text
+
+  ctypedef struct bam_index_t:
+      pass
+
+  ctypedef struct bam_plbuf_t:
+      pass
+
+  bamFile razf_dopen(int data_fd, char *mode)
+
+  # removed - macros not found
+
+  # int64_t bam_seek( bamFile fp, uint64_t voffset, int where)
+  # int64_t bam_tell( bamFile fp )
+  # void bam_destroy1( bam1_t * b) 
+  # void bam_init_header_hash(bam_header_t *header)
+
+  # bam_dup1 and bam_copy1 used to be declared a second time further down;
+  # each function is now declared exactly once.
+  bam1_t * bam_dup1( bam1_t *src )
+
+  bam1_t * bam_copy1(bam1_t *bdst, bam1_t *bsrc)
+  bam_index_t *bam_index_load(char *f )
+
+  void bam_index_destroy(bam_index_t *idx)
+
+  int bam_parse_region(bam_header_t *header, char *str, int *ref_id, int *begin, int *end)
+
+  bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+
+  int bam_fetch(bamFile fp, bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+
+  int bam_plbuf_push(bam1_t *b, bam_plbuf_t *buf)
+
+  void bam_plbuf_destroy(bam_plbuf_t *buf)
+
+  int bam_read1(bamFile fp, bam1_t *b)
+
+  int bam_write1( bamFile fp, bam1_t *b)
+
+  bam_header_t *bam_header_init()
+
+  int bam_header_write( bamFile fp, bam_header_t *header)
+
+  bam_header_t *bam_header_read( bamFile fp )
+
+  void bam_header_destroy(bam_header_t *header)
+
+  # access to auxiliary (tag) data
+  uint8_t *bam_aux_get(bam1_t *b,  char tag[2])
+
+  int bam_aux2i(uint8_t *s)
+  float bam_aux2f(uint8_t *s)
+  double bam_aux2d(uint8_t *s)
+  char bam_aux2A( uint8_t *s)
+  char *bam_aux2Z( uint8_t *s)
+
+  int bam_reg2bin(uint32_t beg, uint32_t end)
+
+  uint32_t bam_calend(bam1_core_t *c, uint32_t *cigar)
+
+# unified SAM/BAM file interface of samtools
+cdef extern from "sam.h":
+
+  # the handles a samfile_t can carry (declared as a struct here)
+  ctypedef struct samfile_t_un:
+    tamFile tamr
+    bamFile bam
+    FILE *tamw
+    
+  # an open file: type, handle and header
+  ctypedef struct samfile_t:
+     int type
+     samfile_t_un x
+     bam_header_t *header
+
+  samfile_t *samopen( char *fn, char * mode, void *aux)
+
+  int sampileup( samfile_t *fp, int mask, bam_pileup_f func, void *data)
+
+  void samclose(samfile_t *fp)
+
+  int samread(samfile_t *fp, bam1_t *b)
+
+  int samwrite(samfile_t *fp, bam1_t *b)
+
+# samtools faidx interface
+cdef extern from "faidx.h":
+
+   # opaque index handle
+   ctypedef struct faidx_t:
+      pass
+
+   int fai_build(char *fn)
+
+   void fai_destroy(faidx_t *fai)
+
+   faidx_t *fai_load(char *fn)
+
+   char *fai_fetch(faidx_t *fai, char *reg, int *len)
+
+# helper functions shipped with pysam itself (see pysam_util.c)
+cdef extern from "pysam_util.h":
+
+    int pysam_bam_plbuf_push(bam1_t *b, bam_plbuf_t *buf, int cont)
+
+    # accessors for the state of a pileup buffer
+    int pysam_get_pos( bam_plbuf_t *buf)
+
+    int pysam_get_tid( bam_plbuf_t *buf)
+
+    bam_pileup1_t * pysam_get_pileup( bam_plbuf_t *buf)
+
+    int pysam_dispatch(int argc, char *argv[] )
+
+    # stand-in functions for samtools macros
+    void pysam_bam_destroy1( bam1_t * b) 
+
+    # add *nbytes* into the variable length data of *src* at *pos*
+    bam1_t * pysam_bam_update( bam1_t * b, 
+                               size_t nbytes_old,
+                               size_t nbytes_new,
+                               uint8_t * pos )
+
+    # translate char to unsigned char
+    unsigned char pysam_translate_sequence( char s )
+
+    # stand-ins for samtools macros
+    uint32_t * pysam_bam1_cigar( bam1_t * b)
+    char * pysam_bam1_qname( bam1_t * b)
+    uint8_t * pysam_bam1_seq( bam1_t * b)
+    uint8_t * pysam_bam1_qual( bam1_t * b)
+    uint8_t * pysam_bam1_aux( bam1_t * b)
+
+    # iterator implementation
+    ctypedef struct bam_fetch_iterator_t:
+        pass
+  
+    bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, bam_index_t *idx, int tid, int beg, int end)
+  
+    bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter)
+  
+    void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter)
diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx
new file mode 100644 (file)
index 0000000..0da8d9e
--- /dev/null
+++ b/pysam/csamtools.pyx
@@ -0,0 +1,1782 @@
+# cython: embedsignature=True
+# adds doc-strings for sphinx
+
+import tempfile, os, sys, types, itertools, struct, ctypes
+
+# defines imported from samtools
+# (values for the ``where`` argument of seek operations)
+DEF SEEK_SET = 0
+DEF SEEK_CUR = 1
+DEF SEEK_END = 2
+
+## These are bits set in the flag.
+## have to put these definitions here, in csamtools.pxd they got ignored
+## the read is paired in sequencing, no matter whether it is mapped in a pair
+DEF BAM_FPAIRED       =1
+## the read is mapped in a proper pair
+DEF BAM_FPROPER_PAIR  =2
+## the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
+DEF BAM_FUNMAP        =4
+## the mate is unmapped
+DEF BAM_FMUNMAP       =8
+## the read is mapped to the reverse strand
+DEF BAM_FREVERSE      =16
+## the mate is mapped to the reverse strand
+DEF BAM_FMREVERSE     =32
+## this is read1
+DEF BAM_FREAD1        =64
+## this is read2
+DEF BAM_FREAD2       =128
+## not primary alignment
+DEF BAM_FSECONDARY   =256
+## QC failure
+DEF BAM_FQCFAIL      =512
+## optical or PCR duplicate
+DEF BAM_FDUP        =1024
+
+## BAM_CIGAR_MASK selects the lowest BAM_CIGAR_SHIFT bits of a value
+## (presumably the operation code of a packed cigar entry - see samtools bam.h)
+DEF BAM_CIGAR_SHIFT=4
+DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)
+
+#####################################################################
+#####################################################################
+#####################################################################
+## private factory methods
+#####################################################################
+cdef class AlignedRead
+cdef makeAlignedRead( bam1_t * src):
+    '''return a new :class:`AlignedRead` that holds a duplicate of *src*.'''
+    cdef AlignedRead read
+    read = AlignedRead()
+    # The constructor has already created a dummy delegate. Free it
+    # before the duplicate of *src* takes its place, otherwise it leaks.
+    pysam_bam_destroy1(read._delegate)
+    read._delegate = bam_dup1(src)
+    return read
+
+cdef class PileupProxy
+cdef makePileupProxy( bam_plbuf_t * buf, int n ):
+    '''return a new :class:`PileupProxy` with *buf* and *n* stored in it.'''
+    cdef PileupProxy proxy
+    proxy = PileupProxy()
+    proxy.buf = buf
+    proxy.n = n
+    return proxy
+
+cdef class PileupRead
+cdef makePileupRead( bam_pileup1_t * src ):
+    '''return a new :class:`PileupRead` filled from the bam_pileup1_t *src*.
+
+    The alignment is wrapped via makeAlignedRead(), the remaining
+    fields are copied one by one.
+    '''
+    cdef PileupRead read
+    read = PileupRead()
+    read._alignment = makeAlignedRead( src.b )
+    read._qpos = src.qpos
+    read._indel = src.indel
+    read._level = src.level
+    read._is_del = src.is_del
+    read._is_head = src.is_head
+    read._is_tail = src.is_tail
+    return read
+
+#####################################################################
+#####################################################################
+#####################################################################
+## Generic callbacks for inserting python callbacks.
+#####################################################################
+cdef int fetch_callback( bam1_t *alignment, void *f):
+    '''callback for bam_fetch. 
+    
+    *f* is a python callable; it is invoked with a new :class:`AlignedRead`
+    object built from *alignment* as its only argument.
+    '''
+    read = makeAlignedRead( alignment )
+    callback = <object>f
+    callback(read)
+    return 0
+
+class PileupColumn(object):
+    '''A pileup column, i.e. all the reads that map
+    to a certain target base.
+
+    tid
+        chromosome ID as is defined in the header
+    pos
+        the target base coordinate (0-based)
+    n
+        number of reads mapping to this column
+    pileups
+        list of reads (:class:`pysam.PileupRead`) aligned to this column
+    '''
+    def __str__(self):
+        # first line: tab-separated tid, pos and n; then one line per read
+        summary = "\t".join( [str(value) for value in (self.tid, self.pos, self.n)] )
+        reads = "\n".join( [str(read) for read in self.pileups] )
+        return summary + "\n" + reads
+
+cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f):
+    '''callback for pileup.
+
+    calls the python callable in *f* with a new :class:`PileupColumn` object
+    as parameter.
+
+    tid
+        chromosome ID as is defined in the header
+    pos
+        start coordinate of the alignment, 0-based
+    n
+        number of elements in pl array
+    pl
+        array of alignments
+    f
+        python callable that receives the column
+    '''
+    cdef int i
+
+    column = PileupColumn()
+    column.tid = tid
+    column.pos = pos
+    column.n = n
+
+    # wrap each of the n entries of pl
+    reads = []
+    for i from 0 <= i < n:
+        reads.append( makePileupRead( &(pl[i]) ) )
+    column.pileups = reads
+
+    (<object>f)(column)
+    return 0
+
+cdef int pileup_fetch_callback( bam1_t *b, void *data):
+    '''callback for bam_fetch. 
+
+    *data* is the pileup buffer; every fetched read *b* is pushed into it.
+    '''
+    bam_plbuf_push(b, <bam_plbuf_t*>data)
+    return 0
+
+class StderrStore():
+    '''
+    stderr is captured. 
+
+    On construction a temporary file is created and an ``Outs`` object for
+    the file descriptor of ``sys.stderr`` is pointed at it. :meth:`release`
+    restores the ``Outs`` object and removes the temporary file.
+    '''
+    def __init__(self):
+        # mkstemp() returns an open file descriptor and the path of the file
+        self.stderr_h, self.stderr_f = tempfile.mkstemp()
+        # Outs is defined elsewhere in this module; presumably setfd()
+        # redirects stderr to the given descriptor - TODO confirm
+        self.stderr_save = Outs( sys.stderr.fileno() )
+        self.stderr_save.setfd( self.stderr_h )
+        
+    def release(self):
+        # NOTE(review): also called from __del__, hence restore() may run
+        # a second time on an object that was released explicitly.
+        # NOTE(review): self.stderr_h is not closed here - verify that
+        # Outs takes care of the descriptor.
+        self.stderr_save.restore()
+        if os.path.exists(self.stderr_f):
+            os.remove( self.stderr_f )
+
+    def __del__(self):
+        # make sure the capture ends when the object is collected
+        self.release()
+
+######################################################################
+######################################################################
+######################################################################
+# valid types for sam headers
+# (python type associated with each header record type)
+VALID_HEADER_TYPES = { "HD" : dict, 
+                       "SQ" : list, 
+                       "RG" : list, 
+                       "PG" : list, 
+                       "CO" : list }
+
+# order of records within sam headers
+VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" )
+
+# type conversions within sam header records
+# (record type -> field tag -> conversion function; "CO" has no entry)
+VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str },
+                        "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str },
+                        "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, },
+                        "PG" : { "ID" : str, "VN" : str, "CL" : str }, }
+
+# output order of fields within records
+# (same tags as in VALID_HEADER_FIELDS)
+VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ),
+                       "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ),
+                       "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ),
+                       "PG" : ( "ID", "VN", "CL" ), }
+
+######################################################################
+######################################################################
+######################################################################
+## Public methods
+######################################################################
+cdef class Samfile:
+    '''*(filename, mode='r', template = None, referencenames = None, referencelengths = None, text = NULL, header = None)*
+              
+    A *SAM* file. The file is automatically opened.
+    
+    *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode so for binary 
+    (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output. 
+    Use ``h`` to output header information  in text (:term:`TAM`)  mode.
+
+    If ``b`` is present, it must immediately follow ``r`` or ``w``. 
+    Currently valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``.
+    
+    so to open a :term:`BAM` file for reading::
+
+        f=Samfile('ex1.bam','rb')
+
+
+    For writing, the header of a :term:`TAM` file/:term:`BAM` file can be constituted from several
+    sources:
+
+        1. If *template* is given, the header is copied from a another *Samfile* (*template* must be of type *Samfile*).
+
+        2. If *header* is given, the header is build from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). The second level is then a list of lines, with each line being a list of tag-value pairs.
+
+        3. If *text* is given, new header text is copied from raw text.
+
+        4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists. 
+
+    If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random
+    access to reads via :meth:`fetch` and :meth:`pileup` is disabled.
+    '''
+
+    cdef char * filename
+    # pointer to samfile
+    cdef samfile_t * samfile
+    # pointer to index
+    cdef bam_index_t *index
+    # true if file is a bam file
+    cdef int isbam
+
+    # current read within iteration
+    cdef bam1_t * b
+
+    def __cinit__(self, *args, **kwargs ):
+        # start from a defined "not open" state, then open the file right
+        # away; all arguments are passed on to _open()
+        self.samfile = NULL
+        self.isbam = False
+        self._open( *args, **kwargs )
+
+        # allocate memory for iterator
+        # (a single zero-initialised bam1_t; only reached if _open() did not raise)
+        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
+
+    def _isOpen( self ):
+        '''return true if samfile has been opened, i.e. the samfile pointer is set.'''
+        return self.samfile != NULL
+
+    def _hasIndex( self ):
+        '''return true if samfile has an existing (and opened) index, i.e. the index pointer is set.'''
+        return self.index != NULL
+
+    def _open( self, 
+               char * filename, 
+               mode ='r',
+               Samfile template = None,
+               referencenames = None,
+               referencelengths = None,
+               char * text = NULL,
+               header = None,
+              ):
+        '''open a sam/bam file.
+
+        If _open is called on an existing bamfile, the current file will be
+        closed and a new file will be opened.
+
+        *mode* must be one of ``r``, ``w``, ``rb``, ``wb``, ``wh`` or ``wbu``.
+
+        For writing, the header is taken from *template* if given, otherwise
+        it is built from *header*, otherwise it is assembled from
+        *referencenames* and *referencelengths* (plus the raw header *text*,
+        if supplied).
+
+        Raises :exc:`IOError` if a file to be read does not exist, if the file
+        can not be opened or if an existing ``.bai`` index can not be loaded.
+        '''
+
+        assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode
+
+        # close a previously opened file
+        if self.samfile != NULL: self.close()
+        self.samfile = NULL
+
+        cdef bam_header_t * header_to_write
+        header_to_write = NULL
+
+        # NOTE(review): this keeps a char* into the python string passed by
+        # the caller; presumably it is only valid while that string object
+        # is alive - confirm before using self.filename later on.
+        self.filename = filename
+
+        self.isbam = len(mode) > 1 and mode[1] == 'b'
+
+        if mode[0] == 'w':
+            # open file for writing
+            
+            # header structure (used for writing)
+            if template:
+                # copy header from another file
+                header_to_write = template.samfile.header
+
+            elif header:
+                header_to_write = self._buildHeader( header )
+
+            else:
+                # build header from a target names and lengths
+                assert referencenames and referencelengths, "either supply options `template`, `header` or  both `refernencenames` and `referencelengths` for writing"
+                assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
+
+                # allocate and fill header
+                header_to_write = bam_header_init()
+                header_to_write.n_targets = len(referencenames)
+                # both arrays hold exactly one entry per reference sequence
+                header_to_write.target_name = <char**>calloc(header_to_write.n_targets, sizeof(char*))
+                header_to_write.target_len = <uint32_t*>calloc(header_to_write.n_targets, sizeof(uint32_t))
+                for x from 0 <= x < header_to_write.n_targets:
+                    header_to_write.target_len[x] = referencelengths[x]
+                    name = referencenames[x]
+                    # calloc zero-fills len(name)+1 bytes, so the copy is
+                    # always terminated
+                    header_to_write.target_name[x] = <char*>calloc(len(name)+1, sizeof(char))
+                    strncpy( header_to_write.target_name[x], name, len(name) )
+
+                if text != NULL:
+                    # l_text does not count the terminator, but the buffer
+                    # is one byte longer (and zero-filled) so that it is
+                    # also a valid C string.
+                    header_to_write.l_text = strlen(text)
+                    header_to_write.text = <char*>calloc( strlen(text) + 1, sizeof(char) )
+                    memcpy( header_to_write.text, text, strlen(text) )
+
+                header_to_write.hash = NULL
+                header_to_write.rg2lib = NULL
+                    
+            # open file. Header gets written to file at the same time for bam files
+            # and sam files (in the latter case, the mode needs to be wh)
+            store = StderrStore()
+            self.samfile = samopen( filename, mode, header_to_write )
+            store.release()
+
+            # bam_header_destroy takes care of cleaning up of all the members
+            # (a header borrowed from *template* is not ours to free)
+            if not template and header_to_write != NULL:
+                bam_header_destroy( header_to_write )
+
+        elif mode[0] == "r":
+            # open file for reading
+            # (names starting with "-" are exempt from the existence check)
+            if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ):
+                raise IOError( "file `%s` not found" % filename)
+
+            store = StderrStore()
+            self.samfile = samopen( filename, mode, NULL )
+            store.release()
+
+        if self.samfile == NULL:
+            raise IOError("could not open file `%s`" % filename )
+
+        if mode[0] == "r" and self.isbam:
+            if not os.path.exists(filename + ".bai"):
+                self.index = NULL
+            else:
+                # returns NULL if there is no index or index could not be opened
+                self.index = bam_index_load(filename)
+                if self.index == NULL:
+                    raise IOError("error while opening index `%s` " % filename )
+
+    def getrname( self, tid ):
+        '''(tid )
+        convert numerical :term:`tid` into :ref:`reference` name.
+
+        Raises :exc:`ValueError` if the file has been closed or if *tid*
+        is not within ``0 <= tid < n_targets`` of the file header.
+        '''
+        # close() resets self.samfile to NULL, so guard before dereferencing it
+        if not self._isOpen():
+            raise ValueError( "I/O operation on closed file" )
+        if not 0 <= tid < self.samfile.header.n_targets:
+            raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets )
+        return self.samfile.header.target_name[tid]
+
+    def _parseRegion( self, 
+                      reference = None, 
+                      start = None, 
+                      end = None, 
+                      region = None ):
+        '''parse region information.
+
+        raise ValueError for invalid regions.
+
+        returns a tuple of region, tid, start and end. Region
+        is a valid samtools :term:`region` or None if the region
+        extends over the whole file. If no region is given, tid,
+        start and end are all 0.
+
+        If *reference* is given it takes precedence over *region*:
+        the region string is rebuilt from *reference* and, only if
+        both are supplied, *start* and *end*.
+
+        Note that regions are 1-based, while start,end are python coordinates.
+        '''
+        
+        cdef int rtid
+        cdef int rstart
+        cdef int rend
+        cdef int max_pos
+        # upper bound (exclusive) accepted for parsed coordinates
+        max_pos = 2 << 29
+
+        rtid = rstart = rend = 0
+
+        # translate to a region
+        if reference:
+            # identity test: start = 0 is a valid coordinate and must not
+            # be mistaken for "not supplied"
+            if start is not None and end is not None:
+                # python half-open 0-based -> samtools closed 1-based
+                region = "%s:%i-%i" % (reference, start+1, end)
+            else:
+                region = reference
+
+        if region:
+            # StderrStore is created around the samtools call and released
+            # right after it
+            store = StderrStore()
+            bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)        
+            store.release()
+            if rtid < 0: raise ValueError( "invalid region `%s`" % region )
+            if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
+            if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
+            if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
+
+        return region, rtid, rstart, rend
+
+    def fetch( self, 
+               reference = None, 
+               start = None, 
+               end = None, 
+               region = None, 
+               callback = None,
+               until_eof = False ):
+        '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)*
+               
+        fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by
+        :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
+
+        Without *reference* or *region* all reads will be fetched. The reads will be returned
+        ordered by reference sequence, which will not necessarily be the order within the file.
+        If *until_eof* is given, all reads from the current file position will be returned
+        *as they are sorted within the file*.  
+        
+        If only *reference* is set, all reads matching on *reference* will be fetched.
+
+        The method returns an iterator of type :class:`pysam.IteratorRow` unless
+        a *callback* is provided. If *callback* is given, the callback will be executed 
+        for each position within the :term:`region`. Note that callbacks currently work
+        only, if *region* or *reference* is given.
+
+        Note that a :term:`TAM` file does not allow random access. If *region* or *reference* are given,
+        an exception is raised.
+
+        Raises :exc:`ValueError` if the file is closed, if the region is
+        invalid, if a callback is given without a region, if an index is
+        required but not available, or if a region is requested on a
+        sam file. Raises :exc:`NotImplementedError` if a callback is
+        given for a sam file.
+        '''
+        cdef int rtid
+        cdef int rstart
+        cdef int rend
+
+        if not self._isOpen():
+            raise ValueError( "I/O operation on closed file" )
+
+        # region is None/empty if neither reference nor region were supplied
+        region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+
+        if self.isbam:
+            if callback:
+                if not region:
+                    raise ValueError( "callback functionality requires a region/reference" )
+                if not self._hasIndex(): raise ValueError( "no index available for fetch" )
+                # callback mode: the return value of bam_fetch is handed back
+                # to the caller instead of an iterator
+                return bam_fetch(self.samfile.x.bam, 
+                                 self.index, rtid, rstart, rend, <void*>callback, fetch_callback )
+            else:
+                if region:
+                    return IteratorRow( self, rtid, rstart, rend )
+                else:
+                    if until_eof:
+                        # sequential read from the current file position, no index needed
+                        return IteratorRowAll( self )
+                    else:
+                        # return all targets by chaining the individual targets together.
+                        if not self._hasIndex(): raise ValueError( "no index available for fetch" )
+                        i = []
+                        rstart = 0
+                        # NOTE(review): 1<<29 presumably acts as "end of reference"
+                        # upper bound -- confirm against the samtools index limits
+                        rend = 1<<29
+                        for rtid from 0 <= rtid < self.nreferences: 
+                            i.append( IteratorRow( self, rtid, rstart, rend))
+                        return itertools.chain( *i )
+        else:                    
+            # sam (text) files: only sequential iteration is supported
+            if region != None:
+                raise ValueError ("fetch for a region is not available for sam files" )
+            if callback:
+                raise NotImplementedError( "callback not implemented yet" )
+            else:
+                return IteratorRowAll( self )
+
+    def pileup( self, reference = None, start = None, end = None, region = None, callback = None ):
+        '''run a pileup within a :term:`region` using 0-based indexing. The region is specified by
+        :term:`reference`, *start* and *end*. Alternatively, a samtools *region* string can be supplied.
+
+        Without *reference* or *region* all reads will be fetched. The reads will be returned
+        ordered by :term:`reference` sequence, which will not necessarily be the order within the file.
+
+        The method returns an iterator of type :class:`pysam.IteratorColumn` unless
+        a *callback* is provided. If *callback* is given, the callback will be executed 
+        for each position within the :term:`region` and the method returns None.
+
+        Pileup requires a bam file with an index: a :exc:`ValueError` is raised
+        if no index is available and a :exc:`NotImplementedError` is raised
+        for sam files. A :exc:`ValueError` is also raised if the file is closed,
+        if the region is invalid or if a callback is given without a region.
+        
+        .. Note::
+
+            *all* reads which overlap the region are returned. The first base returned will be the 
+            first base of the first read *not* necessarily the first base of the region used in the query.
+        '''
+        cdef int rtid
+        cdef int rstart
+        cdef int rend
+        cdef bam_plbuf_t *buf
+
+        if not self._isOpen():
+            raise ValueError( "I/O operation on closed file" )
+
+        region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+        
+        if self.isbam:
+            if not self._hasIndex(): raise ValueError( "no index available for pileup" )
+
+            if callback:
+                if not region:
+                    raise ValueError( "callback functionality requires a region/reference" )
+
+                # the pileup buffer invokes pileup_callback with the python
+                # callback as user data; bam_fetch feeds it via pileup_fetch_callback
+                buf = bam_plbuf_init( <bam_pileup_f>pileup_callback, <void*>callback )
+                bam_fetch(self.samfile.x.bam, 
+                          self.index, rtid, rstart, rend, 
+                          buf, pileup_fetch_callback )
+                
+                # finalize pileup
+                bam_plbuf_push( NULL, buf)
+                bam_plbuf_destroy(buf)
+            else:
+                if region:
+                    return IteratorColumn( self, rtid, rstart, rend )
+                else:
+                    # return all targets by chaining the individual targets together.
+                    i = []
+                    rstart = 0
+                    # NOTE(review): 1<<29 presumably acts as "end of reference"
+                    # upper bound -- confirm against the samtools index limits
+                    rend = 1<<29
+                    for rtid from 0 <= rtid < self.nreferences: 
+                        i.append( IteratorColumn( self, rtid, rstart, rend))
+                    return itertools.chain( *i )
+
+        else:
+            raise NotImplementedError( "pileup of samfiles not implemented yet" )
+
+    def close( self ):
+        '''closes file.
+
+        Releases the samtools file handle and destroys the index.
+        Calling close() on an already closed file does nothing.
+        '''
+        if self.samfile != NULL:
+            samclose( self.samfile )
+            bam_index_destroy(self.index)
+            self.samfile = NULL
+            # Reset the index pointer as well. _open() only assigns
+            # self.index when reading a bam file, so after re-opening in
+            # any other mode a stale pointer would be destroyed a second
+            # time by the next close().
+            self.index = NULL
+
+    def __dealloc__( self ):
+        '''clean up.'''
+        # remember: dealloc cannot call other methods
+        # Note that __del__ is not called.
+        # The object may already be partially torn down here, so the
+        # clean-up performed by close() is repeated inline instead of
+        # calling the python-level method.
+        if self.samfile != NULL:
+            samclose( self.samfile )
+            bam_index_destroy(self.index)
+            self.samfile = NULL
+            self.index = NULL
+        pysam_bam_destroy1(self.b)
+
+    def write( self, AlignedRead read ):
+        '''(AlignedRead read )
+        write a single :class:`pysam.AlignedRead`.
+
+        return the number of bytes written.
+
+        Raises :exc:`ValueError` if the file is closed and
+        :exc:`TypeError` if *read* is None.
+        '''
+        # self.samfile is NULL once the file has been closed
+        if not self._isOpen():
+            raise ValueError( "I/O operation on closed file" )
+        # a typed argument still accepts None; accessing _delegate on it
+        # would dereference an invalid pointer
+        if read is None:
+            raise TypeError( "expected an AlignedRead, got None" )
+        return samwrite( self.samfile, read._delegate )
+
+    property nreferences:
+        '''number of :term:`reference` sequences in the file.
+
+        Raises :exc:`ValueError` if the file is closed.
+        '''
+        def __get__(self):
+            # the header lives inside self.samfile, which is NULL after close()
+            if not self._isOpen():
+                raise ValueError( "I/O operation on closed file" )
+            return self.samfile.header.n_targets
+
+    property references:
+        """tuple with the names of :term:`reference` sequences.
+
+        Raises :exc:`ValueError` if the file is closed.
+        """
+        def __get__(self): 
+            # the header lives inside self.samfile, which is NULL after close()
+            if not self._isOpen():
+                raise ValueError( "I/O operation on closed file" )
+            t = []
+            for x from 0 <= x < self.samfile.header.n_targets:
+                t.append( self.samfile.header.target_name[x] )
+            return tuple(t)
+
+    property lengths:
+        """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.references`
+
+        Raises :exc:`ValueError` if the file is closed.
+        """
+        def __get__(self): 
+            # the header lives inside self.samfile, which is NULL after close()
+            if not self._isOpen():
+                raise ValueError( "I/O operation on closed file" )
+            t = []
+            for x from 0 <= x < self.samfile.header.n_targets:
+                t.append( self.samfile.header.target_len[x] )
+            return tuple(t)
+
+    property text:
+        '''full contents of the :term:`sam file` header as a string.
+
+        Returns an empty string if the header carries no text.
+        Raises :exc:`ValueError` if the file is closed and
+        :exc:`MemoryError` if the temporary copy cannot be allocated.
+        '''
+        def __get__(self):
+            cdef char * t
+            if not self._isOpen():
+                raise ValueError( "I/O operation on closed file" )
+            # nothing to copy; avoids handing a NULL source to memcpy
+            if self.samfile.header.text == NULL:
+                return ""
+            # create a temporary 0-terminated copy: the header text is
+            # stored with an explicit length (l_text), calloc zero-fills
+            # the extra byte
+            t = <char*>calloc( self.samfile.header.l_text + 1, sizeof(char) )
+            if t == NULL:
+                raise MemoryError( "could not allocate memory for header text" )
+            memcpy( t, self.samfile.header.text, self.samfile.header.l_text )
+            # the assignment copies the C string into a python string
+            result = t
+            free(t)
+            return result
+
+    property header:
+        '''header information within the :term:`sam file`. The records and fields are returned as 
+        a two-level dictionary.
+
+        The first level is keyed by the record type. Comment records
+        (``CO``) map to a list of strings. Record types declared as
+        ``dict`` in VALID_HEADER_TYPES map to a single dictionary of
+        fields, those declared as ``list`` map to a list of such
+        dictionaries. Field values are converted with the converters in
+        VALID_HEADER_FIELDS.
+
+        Raises :exc:`ValueError` if the file is closed, if a field code
+        is unknown for its record or if a record that may appear only
+        once is repeated. Malformed header lines trigger an
+        :exc:`AssertionError`.
+        '''
+        def __get__(self):
+            # the header lives inside self.samfile, which is NULL after close()
+            if not self._isOpen():
+                raise ValueError( "I/O operation on closed file" )
+
+            result = {}
+
+            if self.samfile.header.text != NULL:
+                # convert to python string (note: call self.text to create 0-terminated string)
+                t = self.text
+                for line in t.split("\n"):
+                    # skip empty lines, e.g. after the final newline
+                    if not line.strip(): continue
+                    assert line.startswith("@"), "header line without '@': '%s'" % line
+                    fields = line[1:].split("\t")
+                    record = fields[0]
+                    assert record in VALID_HEADER_TYPES, "header line with invalid type '%s': '%s'" % (record, line)
+
+                    # treat comments: free text, not key:value pairs
+                    if record == "CO":
+                        if record not in result: result[record] = []
+                        result[record].append( "\t".join( fields[1:] ) )
+                        continue
+
+                    # the following is clumsy as generators do not work?
+                    x = {}
+                    for field in fields[1:]:
+                        # split on the first ':' only, values may contain ':'
+                        key, value = field.split(":",1)
+                        if key not in VALID_HEADER_FIELDS[record]:
+                            raise ValueError( "unknown field code '%s' in record '%s'" % (key, record) )
+                        x[key] = VALID_HEADER_FIELDS[record][key](value)
+
+                    if VALID_HEADER_TYPES[record] == dict:
+                        if record in result:
+                            raise ValueError( "multiple '%s' lines are not permitted" % record )
+                        result[record] = x
+                    elif VALID_HEADER_TYPES[record] == list:
+                        if record not in result: result[record] = []
+                        result[record].append( x )
+
+            return result
+
+    def _buildLine( self, fields, record ):
+        '''return a single tab-separated header line for *record*.
+
+        For comment records (``CO``) *fields* is the comment text itself.
+        For all other records *fields* is a dictionary; its entries are
+        written as ``key:value`` in the order given by VALID_HEADER_ORDER,
+        keys not listed there are ignored.
+        '''
+
+        # TODO: add checking for field and sort order
+        tag = "@%s" % record
+
+        # comments carry free text instead of key:value pairs
+        if record == "CO":
+            return "\t".join( [ tag, fields ] )
+
+        columns = [ tag ]
+        for code in VALID_HEADER_ORDER[record]:
+            if code not in fields:
+                continue
+            columns.append( "%s:%s" % (code, str(fields[code])))
+        return "\t".join( columns )
+
+    cdef bam_header_t * _buildHeader( self, new_header ):
+        '''return a new header built from a dictionary in *new_header*.
+
+        This method inserts the text field, target_name and target_len.
+
+        *new_header* maps record types to a dictionary or to a list of
+        dictionaries/strings as required by VALID_HEADER_TYPES. A
+        :exc:`ValueError` is raised if a record has the wrong container
+        type, a :exc:`KeyError` if an ``SQ`` entry lacks ``SN`` or ``LN``.
+
+        The caller owns the returned header.
+        '''
+
+        cdef bam_header_t * dest
+
+        # All python-level validation is done before any C memory is
+        # allocated, so that a validation error does not leak a
+        # half-built header.
+        lines = []
+        for record in VALID_HEADERS:
+            if record in new_header:
+                ttype = VALID_HEADER_TYPES[record]
+                data = new_header[record]
+                if type( data ) != type( ttype() ):
+                    raise ValueError( "invalid type for record %s: %s, expected %s" % (record, type(data), type(ttype()) ) )
+                if isinstance( data, dict ):
+                    lines.append( self._buildLine( data, record ) )
+                else:
+                    for fields in data:
+                        lines.append( self._buildLine( fields, record ) )
+
+        text = "\n".join(lines) + "\n"
+
+        # collect targets
+        seqs = []
+        if "SQ" in new_header:
+            for fields in new_header["SQ"]:
+                try:
+                    seqs.append( (fields["SN"], fields["LN"] ) )
+                except KeyError:
+                    raise KeyError( "incomplete sequence information in '%s'" % str(fields))
+
+        # create new header and fill in the text
+        dest = bam_header_init()
+
+        if dest.text != NULL: free( dest.text )
+        dest.l_text = len(text)
+        # one extra zero byte so that the buffer is also a valid C string;
+        # l_text still holds the length without the terminator
+        dest.text = <char*>calloc( dest.l_text + 1, sizeof(char))
+        strncpy( dest.text, text, dest.l_text )
+
+        if "SQ" in new_header:
+            dest.n_targets = len(seqs)
+            dest.target_name = <char**>calloc( dest.n_targets, sizeof(char*) )
+            dest.target_len = <uint32_t*>calloc( dest.n_targets, sizeof(uint32_t) )
+            
+            # NOTE(review): converting SN/LN to C values below can still
+            # raise for values of the wrong python type
+            for x from 0 <= x < dest.n_targets:
+                seqname, seqlen = seqs[x]
+                dest.target_name[x] = <char*>calloc( len( seqname ) + 1, sizeof(char) )
+                strncpy( dest.target_name[x], seqname, len(seqname) + 1 )
+                dest.target_len[x] = seqlen
+
+        return dest
+
+    def __iter__(self):
+        '''the file object is its own iterator, see :meth:`__next__`.'''
+        return self 
+
+    cdef bam1_t * getCurrent( self ):
+        '''return the alignment buffer that cnext()/__next__() read into.'''
+        return self.b
+
+    cdef int cnext(self):
+        '''cversion of iterator. Used by IteratorColumn
+
+        Reads the next alignment into self.b and returns the result of
+        samread unchanged.
+        '''
+        return samread(self.samfile, self.b)
+
+    def __next__(self): 
+        """return the next read in the file as an AlignedRead.
+
+        pyrex uses this non-standard name instead of next().
+        Iteration stops as soon as samread does not return a
+        positive value.
+        """
+        cdef int status
+        status = samread(self.samfile, self.b)
+        if status <= 0:
+            raise StopIteration
+        return makeAlignedRead( self.b )
+
+cdef class Fastafile:
+    '''*(filename)*
+              
+    A *FASTA* file. The file is automatically opened.
+
+    The file expects an indexed fasta file.
+
+    TODO: 
+        add automatic indexing.
+        add function to get sequence names.
+    '''
+
+    # NOTE(review): borrowed pointer into the string object passed to
+    # _open(); it is only valid while that object is alive.
+    cdef char * filename
+    # pointer to fastafile
+    cdef faidx_t * fastafile
+
+    def __cinit__(self, *args, **kwargs ):
+        self.fastafile = NULL
+        self._open( *args, **kwargs )
+
+    def _isOpen( self ):
+        '''return true if the fasta file has been opened.'''
+        return self.fastafile != NULL
+
+    def _open( self, 
+               char * filename ):
+        '''open an indexed fasta file.
+
+        This method expects an indexed fasta file.
+
+        Raises :exc:`IOError` if the file could not be loaded.
+        '''
+
+        # close a previously opened file
+        if self.fastafile != NULL: self.close()
+        self.filename = filename
+        self.fastafile = fai_load( filename )
+
+        if self.fastafile == NULL:
+            raise IOError("could not open file `%s`" % filename )
+
+    def close( self ):
+        '''close the file and release the index. Does nothing if already closed.'''
+        if self.fastafile != NULL:
+            fai_destroy( self.fastafile )
+            self.fastafile = NULL
+
+    def __dealloc__( self ):
+        '''release the index if the object is collected without close().'''
+        # dealloc should not call other methods, so close() is repeated inline
+        if self.fastafile != NULL:
+            fai_destroy( self.fastafile )
+            self.fastafile = NULL
+
+    def fetch( self, 
+               reference = None, 
+               start = None, 
+               end = None,
+               region = None):
+               
+        '''*(reference = None, start = None, end = None, region = None)*
+               
+        fetch the sequence of a :term:`region` using 0-based indexing. The region is specified by
+        :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
+
+        Returns the sequence as a string. An empty string is returned if
+        samtools could not retrieve a sequence for the region.
+
+        Raises :exc:`ValueError` if the file is closed, if neither
+        *reference* nor *region* are given, if only one of *start* and
+        *end* is given, or if the coordinates are out of range.
+        '''
+        
+        # "length" rather than "len": do not shadow the builtin
+        cdef int length, max_pos
+        cdef char * seq
+
+        if not self._isOpen():
+            raise ValueError( "I/O operation on closed file" )
+
+        # upper bound (exclusive) accepted for coordinates
+        max_pos = 2 << 29
+
+        if not region:
+            if reference is None: raise ValueError( 'no sequence/region supplied.' )
+            if start is None and end is None:
+                region = "%s" % str(reference)
+            elif start is None or end is None:
+                raise ValueError( 'only start or only end of region supplied' )
+            else:
+                if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
+                # valid ranges are from 0 to max_pos-1
+                if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
+                if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
+                # python half-open 0-based -> samtools closed 1-based
+                region = "%s:%i-%i" % (reference, start+1, end )
+
+        # samtools adds a '\0' at the end
+        seq = fai_fetch( self.fastafile, region, &length )
+        # a NULL pointer must not be converted into a python string
+        if seq == NULL:
+            return ""
+        # copy to python
+        result = seq
+        # clean up
+        free(seq)
+        
+        return result
+
+## turning callbacks elegantly into iterators is an unsolved problem, see the following threads:
+## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1
+## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html
+## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code
+## changes, the changes have to be manually entered.
+
+cdef class IteratorRow:
+    """iterates over mapped reads in a region.
+
+    The iterator is created for an open, indexed :class:`Samfile` and
+    the region given by *tid*, *beg* and *end*. It keeps a reference
+    to the Samfile for as long as it exists.
+    """
+    
+    cdef bam_fetch_iterator_t*  bam_iter # iterator state object
+    cdef bam1_t *               b        # alignment returned by the last iteration step
+    cdef                        error_msg
+    cdef int                    error_state
+    cdef Samfile                samfile
+    def __cinit__(self, Samfile samfile, int tid, int beg, int end ):
+        self.bam_iter = NULL
+
+        assert samfile._isOpen()
+        assert samfile._hasIndex()
+        
+        # makes sure that samfile stays alive as long as the
+        # iterator is alive.
+        self.samfile = samfile
+
+        # parse the region
+        # NOTE(review): error_state/error_msg are only initialised here,
+        # nothing in this class sets them to an error value
+        self.error_state = 0
+        self.error_msg = None
+
+        cdef bamFile  fp
+        fp = samfile.samfile.x.bam
+        self.bam_iter = bam_init_fetch_iterator(fp, samfile.index, tid, beg, end)
+
+    def __iter__(self):
+        return self 
+
+    cdef bam1_t * getCurrent( self ):
+        '''return the alignment fetched by the last call to cnext()/__next__().'''
+        return self.b
+
+    cdef int cnext(self):
+        '''cversion of iterator. Used by IteratorColumn
+
+        returns 1 if an alignment was fetched and 0 at the end of iteration.
+        '''
+        self.b = bam_fetch_iterate(self.bam_iter)
+        if self.b == NULL: return 0
+        return 1
+
+    def __next__(self): 
+        """python version of next().
+
+        pyrex uses this non-standard name instead of next()
+
+        returns the next alignment as an AlignedRead; raises ValueError
+        if the iterator is in an error state.
+        """
+        if self.error_state:
+            raise ValueError( self.error_msg)
+        
+        self.b = bam_fetch_iterate(self.bam_iter)
+        if self.b != NULL:
+            return makeAlignedRead( self.b )
+        else:
+            raise StopIteration
+
+    def __dealloc__(self):
+        '''remember: dealloc cannot call other methods!'''
+        # bam_iter stays NULL if __cinit__ failed before creating it
+        if self.bam_iter:
+            bam_cleanup_fetch_iterator(self.bam_iter)
+        
+cdef class IteratorRowAll:
+    """iterates over all mapped reads
+
+    Reads are taken sequentially from the current position of the
+    underlying file; no index is required.
+    """
+
+    cdef bam1_t * b
+    cdef samfile_t * fp
+    # owner of fp, see __cinit__
+    cdef Samfile samfile
+
+    def __cinit__(self, Samfile samfile):
+
+        assert samfile._isOpen()
+
+        # makes sure that samfile stays alive as long as the
+        # iterator is alive; fp is owned by the Samfile object.
+        self.samfile = samfile
+        self.fp = samfile.samfile
+
+        # allocate memory for alignment
+        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
+
+    def __iter__(self):
+        return self 
+
+    cdef bam1_t * getCurrent( self ):
+        '''return the alignment buffer that cnext()/__next__() read into.'''
+        return self.b
+
+    cdef int cnext(self):
+        '''cversion of iterator. Used by IteratorColumn
+
+        returns the result of samread unchanged.
+        '''
+        return samread(self.fp, self.b)
+
+    def __next__(self): 
+        """python version of next().
+
+        pyrex uses this non-standard name instead of next()
+        """
+        cdef int ret
+        ret = samread(self.fp, self.b)
+        if (ret > 0):
+            return makeAlignedRead( self.b )
+        else:
+            raise StopIteration
+
+    def __dealloc__(self):
+        '''remember: dealloc cannot call other methods!'''
+        pysam_bam_destroy1(self.b)
+        
+cdef class IteratorColumn:
+    '''iterates over columns.
+
+    This iterator wraps the pileup functionality of samtools.
+    
+    For reasons of efficiency, the iterator returns the current 
+    pileup buffer. As this buffer is updated at every iteration, 
+    the contents of this iterator will change accordingly. Hence the conversion to
+    a list will not produce the expected result::
+    
+       f = Samfile("file.bam", "rb")
+       result = list( f.pileup() )
+
+    Here, result will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns, 
+    but each object will contain the same information.
+    
+    If the results of several columns are required at the same time, the results
+    need to be stored explicitly::
+
+       result = [ x.pileups() for x in f.pileup() ]
+
+    Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.
+
+    '''
+    cdef bam_plbuf_t *buf
+
+    # check if first iteration
+    # NOTE(review): not referenced by any method of this class
+    cdef int notfirst
+    # result of the last plbuf_push
+    cdef int n_pu
+    # set to 1 once the row iterator is exhausted
+    cdef int eof 
+    # source of the alignments that are pushed into the pileup buffer
+    cdef IteratorRow iter
+
+    def __cinit__(self, Samfile samfile, int tid, int start, int end ):
+
+        self.iter = IteratorRow( samfile, tid, start, end )
+        # no callback: columns are pulled from the buffer in cnext()
+        self.buf = bam_plbuf_init(NULL, NULL )
+        self.n_pu = 0
+        self.eof = 0
+
+    def __iter__(self):
+        return self 
+
+    cdef int cnext(self):
+        '''perform next iteration.
+        
+        return 1 if there is a buffer to emit. Return 0 for end of iteration.
+        '''
+
+        cdef int retval1, retval2
+
+        # pysam bam_plbuf_push returns:
+        # 1: if buf is full and can be emitted
+        # 0: if b has been added
+        # -1: if there was an error
+
+        # check if previous plbuf was incomplete. If so, continue within
+        # the loop and yield if necessary
+        if self.n_pu > 0:
+            self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1)
+            if self.n_pu > 0: return 1
+
+        if self.eof: return 0
+
+        # get next alignments and submit until plbuf indicates that
+        # an new column has finished
+        while self.n_pu == 0:
+            retval1 = self.iter.cnext()
+            # wrap up if no more input
+            if retval1 == 0: 
+                # pushing NULL flushes the buffer; its result is handed
+                # back directly as the return value of this call
+                self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0)            
+                self.eof = 1
+                return self.n_pu
+
+            # submit to plbuf
+            self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0)            
+            # NOTE(review): this cdef function declares no exception return
+            # value -- confirm that the error actually reaches the caller
+            if self.n_pu < 0: raise ValueError( "error while iterating" )
+
+        # plbuf has yielded
+        return 1
+
+    def __next__(self): 
+        """python version of next().
+
+        pyrex uses this non-standard name instead of next()
+
+        returns a proxy onto the current pileup buffer, see the class
+        documentation for the consequences.
+        """
+        cdef int ret
+        ret = self.cnext()
+        cdef bam_pileup1_t * pl
+
+        if ret > 0 :
+            return makePileupProxy( self.buf, self.n_pu )
+        else:
+            raise StopIteration
+
+    def __dealloc__(self):
+        bam_plbuf_destroy(self.buf);
+
+cdef class AlignedRead:
+    '''
+    Class representing an aligned read. see SAM format specification for meaning of fields (http://samtools.sourceforge.net/).
+
+    This class stores a handle to the samtools C-structure representing
+    an aligned read. Member read access is forwarded to the C-structure
+    and converted into python objects. This implementation should be fast,
+    as only the data needed is converted.
+
+    For write access, the C-structure is updated in-place. This is
+    not the most efficient way to build BAM entries, as the variable
+    length data is concatenated and thus needs to resized if
+    a field is updated. Furthermore, the BAM entry might be
+    in an inconsistent state. The :meth:`~validate` method can
+    be used to check if an entry is consistent.
+
+    One issue to look out for is that the sequence should always
+    be set *before* the quality scores. Setting the sequence will
+    also erase any quality scores that were set previously.
+    '''
+    cdef:
+         bam1_t * _delegate 
+
+    def __cinit__( self ):
+        '''allocate an empty, zero-initialised alignment record.'''
+        # see bam_init1
+        self._delegate = <bam1_t*>calloc( 1, sizeof( bam1_t) )
+        # allocate some memory 
+        # If size is 0, calloc does not return a pointer that can be passed to free()
+        # so allocate 40 bytes for a new read
+        self._delegate.m_data = 40
+        self._delegate.data = <uint8_t *>calloc( self._delegate.m_data, 1 )
+        # m_data is the allocated size, data_len the number of bytes in use
+        self._delegate.data_len = 0
+
+    def __dealloc__(self):
+        '''clear up memory: hand the alignment record to pysam_bam_destroy1.'''
+        pysam_bam_destroy1(self._delegate)
+    
+    def __str__(self):
+        """return the read as a single tab-separated line.
+
+        The columns are, in this order: qname, rname, pos, cigar, qual,
+        flag, seq, mapq and tags, each converted with str().
+        """
+        columns = ( self.qname,
+                    self.rname,
+                    self.pos,
+                    self.cigar,
+                    self.qual,
+                    self.flag,
+                    self.seq,
+                    self.mapq,
+                    self.tags )
+        text = []
+        for value in columns:
+            text.append( str( value ) )
+        return "\t".join( text )
+    
+       
+    def __cmp__(self, AlignedRead other):
+        '''compare the binary contents of this read with ``other``.
+
+        Follows cmp() semantics: returns 0 if both reads are binary
+        equal and a non-zero value otherwise. The fixed-size core is
+        compared first, then the length of the variable-length data
+        and finally the data itself.
+        '''
+        cdef int retval, x
+        cdef bam1_t *t, *o
+        t = self._delegate
+        o = other._delegate
+
+        # uncomment for debugging purposes
+        # cdef unsigned char * oo, * tt
+        # tt = <unsigned char*>(&t.core)
+        # oo = <unsigned char*>(&o.core)
+        # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
+        # tt = <unsigned char*>(t.data)
+        # oo = <unsigned char*>(o.data)
+        # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
+
+        retval = memcmp( &t.core, 
+                          &o.core, 
+                          sizeof( bam1_core_t ))
+
+        if retval: return retval
+        retval = cmp( t.data_len, o.data_len)
+        if retval: return retval
+        # both reads hold data_len bytes at this point. Compare all of
+        # them: sizeof( t.data_len ) would only cover the first few bytes
+        # (the size of the length field itself).
+        return memcmp( t.data, 
+                       o.data, 
+                       t.data_len )
+
+    property qname:
+        """the query name (None if not present)
+
+        Assigning None or an empty name leaves the read unchanged.
+        """
+        def __get__(self):
+            cdef bam1_t * src 
+            src = self._delegate
+            if src.core.l_qname == 0: return None
+            return <char *>pysam_bam1_qname( src )
+
+        def __set__(self, qname ):
+            if qname == None or len(qname) == 0: return
+            cdef bam1_t * src 
+            cdef int l 
+            cdef char * p
+
+            src = self._delegate            
+            p = pysam_bam1_qname( src )
+
+            # the qname is \0 terminated
+            l = len(qname) + 1
+            # resize the slot of the old name (l_qname bytes) to l bytes
+            pysam_bam_update( src, 
+                              src.core.l_qname, 
+                              l, 
+                              <uint8_t*>p )
+
+            src.core.l_qname = l
+
+            # re-acquire pointer to location in memory
+            # as it might have moved
+            p = pysam_bam1_qname(src)
+
+            # l includes the terminator, so the \0 is copied as well
+            strncpy( p, qname, l )
+            
+    property cigar:
+        """the :term:`cigar` alignment (None if not present).
+
+        The alignment is a list of ``(operation, length)`` tuples.
+        Assigning None or an empty list leaves the read unchanged.
+        """
+        def __get__(self):
+            cdef uint32_t * cigar_p
+            cdef bam1_t * src 
+            cdef op, l, cigar
+            src = self._delegate
+            if src.core.n_cigar == 0: return None
+            
+            cigar = []
+            cigar_p = pysam_bam1_cigar(src);
+            for k from 0 <= k < src.core.n_cigar:
+                # each 32-bit entry packs the operation (masked bits) and
+                # the length (remaining bits after the shift)
+                op = cigar_p[k] & BAM_CIGAR_MASK
+                l = cigar_p[k] >> BAM_CIGAR_SHIFT
+                cigar.append((op, l))
+            return cigar
+
+        def __set__(self, values ):
+            if values == None or len(values) == 0: return
+            cdef uint32_t * p
+            cdef bam1_t * src 
+            cdef op, l
+            cdef int k
+
+            k = 0
+
+            src = self._delegate
+
+            # get location of cigar string
+            p = pysam_bam1_cigar(src)
+
+            # create space for cigar data within src.data
+            # (4 bytes per operation, old size -> new size)
+            pysam_bam_update( src, 
+                              src.core.n_cigar * 4,
+                              len(values) * 4, 
+                              p )
+            
+            # length is number of cigar operations, not bytes
+            src.core.n_cigar = len(values)
+
+            # re-acquire pointer to location in memory
+            # as it might have moved
+            p = pysam_bam1_cigar(src)
+
+            # insert cigar operations
+            for op, l in values:
+                p[k] = l << BAM_CIGAR_SHIFT | op
+                k += 1
+
+            ## setting the cigar string also updates the "bin" attribute
+            src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p))
+
+    property seq:
+        """the query sequence (None if not present)
+
+        Assigning a sequence marks the quality scores as missing;
+        assigning None or an empty sequence leaves the read unchanged.
+        """
+        def __get__(self):
+            cdef bam1_t * src
+            cdef uint8_t * p 
+            cdef char * s
+            src = self._delegate
+            # NOTE(review): this local is not used below, the loop indexes
+            # the string literal directly
+            bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"
+            ## parse qseq (bam1_seq)
+            if src.core.l_qseq == 0: return None
+
+            # temporary 0-terminated buffer (calloc zero-fills the extra byte)
+            s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char))
+            p = pysam_bam1_seq( src )
+            for k from 0 <= k < src.core.l_qseq:
+            ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c)
+                # two bases per byte: even k in the high nibble, odd k in the low one
+                s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)]
+            retval=s
+            free(s)
+            return retval
+
+        def __set__(self,seq):
+            # samtools manages sequence and quality length memory together
+            # if no quality information is present, the first byte says 0xff.
+            
+            if seq == None or len(seq) == 0: return
+            cdef bam1_t * src
+            cdef uint8_t * p 
+            cdef char * s
+            src = self._delegate
+            cdef int l, k, nbytes_new, nbytes_old
+
+            l = len(seq)
+            
+            # as the sequence is stored in half-bytes, the total length (sequence
+            # plus quality scores) is (l+1)/2 + l
+            nbytes_new = (l+1)/2 + l
+            # computed from the old l_qseq, which is overwritten just below
+            nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq
+            # acquire pointer to location in memory
+            p = pysam_bam1_seq( src )
+            src.core.l_qseq = l
+
+            pysam_bam_update( src, 
+                              nbytes_old,
+                              nbytes_new,
+                              p)
+            # re-acquire pointer to location in memory
+            # as it might have moved
+            p = pysam_bam1_seq( src )
+            # clear the block first, the bases are or-ed in below
+            for k from 0 <= k < nbytes_new: p[k] = 0
+            # convert to C string
+            s = seq
+            for k from 0 <= k < l:
+                p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2)
+
+            # erase qualities
+            p = pysam_bam1_qual( src )
+            p[0] = 0xff
+
+    property qual:
+        """the base quality (None if not present)
+
+        Qualities are read and written as a string of characters with
+        an offset of 33. The sequence has to be set before the qualities
+        as it allocates the storage; assigning a string whose length
+        differs from the sequence length raises ValueError. Assigning
+        None or an empty string marks the qualities as absent.
+        """
+        def __get__(self):
+            cdef bam1_t * src 
+            cdef uint8_t * p
+            cdef char * q
+            src = self._delegate
+            if src.core.l_qseq == 0: return None
+
+            p = pysam_bam1_qual( src )
+            # a leading 0xff marks absent qualities
+            if p[0] == 0xff: return None
+
+            # one extra byte so that the buffer is always NUL terminated
+            q = < char *>calloc(src.core.l_qseq + 1 , sizeof(char))
+            if q == NULL: raise MemoryError( "could not allocate memory for qualities" )
+            for k from 0 <= k < src.core.l_qseq:
+            ## equivalent to t[i] + 33 (see bam.c)
+                q[k] = p[k] + 33
+            # convert to python string
+            retval=q
+            # clean up
+            free(q)
+            return retval
+
+        def __set__(self,qual):
+            # note that space is already allocated via the sequences
+            cdef bam1_t * src
+            cdef uint8_t * p
+            cdef char * q 
+            cdef int l
+            src = self._delegate
+            p = pysam_bam1_qual( src )
+            if qual is None or len(qual) == 0:
+                # if absent - set to 0xff. Without a sequence there is no
+                # quality storage, so writing the marker would touch memory
+                # that lies outside the quality region.
+                if src.core.l_qseq > 0: p[0] = 0xff
+                return
+            # convert to C string
+            q = qual
+            l = len(qual)
+            if src.core.l_qseq != l:
+                raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
+            for k from 0 <= k < l:
+                p[k] = <uint8_t>q[k] - 33
+
+    property tags:
+        """the tags in the AUX field.
+
+        Reading returns a list of ``(tag, value)`` tuples or None if the
+        read carries no auxiliary data; an unknown field type raises
+        ValueError. Assigning a sequence of ``(tag, value)`` tuples
+        replaces all existing tags. The stored type is chosen from the
+        python type of the value (float, int by magnitude, single
+        character, string). Assigning None or an empty sequence removes
+        all tags.
+        """
+        def __get__(self):
+            cdef char * ctag
+            cdef bam1_t * src
+            cdef uint8_t * s
+            cdef char tpe
+            
+            src = self._delegate
+            if src.l_aux == 0: return None
+            
+            s = pysam_bam1_aux( src )
+            result = []
+            ctag = <char*>calloc( 3, sizeof(char) )
+            if ctag == NULL: raise MemoryError( "could not allocate memory for tag" )
+            try:
+                while s < (src.data + src.data_len):
+                    # get tag
+                    ctag[0] = s[0]
+                    ctag[1] = s[1]
+                    pytag = ctag
+
+                    s += 2
+
+                    # convert type - is there a better way?
+                    ctag[0] = s[0]
+                    ctag[1] = 0
+                    pytype = ctag
+                    # get type and value 
+                    # how do I do char literal comparison in cython?
+                    # the code below works (i.e, is C comparison)
+                    # s points at the type byte; each branch advances s
+                    # over the value only, the type byte is skipped below
+                    tpe = toupper(s[0])
+                    if tpe == 'S'[0]:
+                        value = <int>bam_aux2i(s)            
+                        s += 2
+                    elif tpe == 'I'[0]:
+                        value = <int>bam_aux2i(s)            
+                        s += 4
+                    elif tpe == 'F'[0]:
+                        value = <float>bam_aux2f(s)
+                        s += 4
+                    elif tpe == 'D'[0]:
+                        value = <double>bam_aux2d(s)
+                        s += 8
+                    elif tpe == 'C'[0]:
+                        value = <int>bam_aux2i(s)
+                        s += 1
+                    elif tpe == 'A'[0]:
+                        # there might a more efficient way
+                        # to convert a char into a string
+                        value = "%c" % <char>bam_aux2A(s)
+                        s += 1
+                    elif tpe == 'Z'[0]:
+                        value = <char*>bam_aux2Z(s)
+                        # +1 for NULL terminated string
+                        s += len(value) + 1
+                    elif tpe == 'H'[0]:
+                        # hex strings are stored like 'Z' strings: NUL
+                        # terminated characters following the type byte
+                        value = <char*>(s + 1)
+                        s += len(value) + 1
+                    else:
+                        # the size of an unknown type is unknown, hence the
+                        # remainder of the AUX field can not be parsed
+                        raise ValueError( "unknown auxiliary field type '%s' in tag '%s'" % (pytype, pytag) )
+
+                    # skip over type
+                    s += 1
+
+                    # ignore pytype
+                    result.append( (pytag, value) )
+            finally:
+                free( ctag )
+            return result
+
+        def __set__(self, tags):
+            cdef bam1_t * src
+            cdef uint8_t * s
+            cdef char * temp 
+            cdef int max_size, size
+            src = self._delegate
+            max_size = 4000
+
+            # the getter returns None for a read without tags, accept it
+            # here as well so that tags can be copied between reads
+            if tags is None: tags = ()
+
+            # all tags are first packed into a fixed size python buffer
+            tagbuf = ctypes.create_string_buffer(max_size)
+
+            # map python type and value to samtools type code and
+            # python.struct format (tag char, tag char, type char, value)
+            offset = 0
+            for pytag, value in tags:
+                t = type(value)
+                if t == types.FloatType:
+                    fmt, pytype = "<cccf", 'f'
+                elif t == types.IntType:
+                    if value < 0:
+                        if value >= -127: fmt, pytype = "<cccb", 'c'
+                        elif value >= -32767: fmt, pytype = "<ccch", 's'
+                        elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+                        else: fmt, pytype = "<ccci", 'i'
+                    else:
+                        if value <= 255: fmt, pytype = "<cccB", 'C'
+                        elif value <= 65535: fmt, pytype = "<cccH", 'S'
+                        elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+                        else: fmt, pytype = "<cccI", 'I'
+                else:
+                    # Note: hex strings (H) are not supported yet
+                    if len(value) == 1:
+                        fmt, pytype = "<cccc", 'A'
+                    else:
+                        # +1 so that struct pads the string with a NUL byte
+                        fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
+
+                size = struct.calcsize(fmt)
+                if offset + size > max_size:
+                    raise NotImplementedError("tags field too large")
+
+                struct.pack_into( fmt,
+                                  tagbuf,
+                                  offset,
+                                  pytag[0],
+                                  pytag[1],
+                                  pytype,
+                                  value )
+                offset += size
+            
+            # delete the old data and allocate new
+            pysam_bam_update( src, 
+                              src.l_aux,
+                              offset,
+                              pysam_bam1_aux( src ) )
+            
+            src.l_aux = offset
+
+            if offset == 0: return
+
+            # get location of new data
+            s = pysam_bam1_aux( src )            
+            
+            # keep a python reference to the raw bytes while the C pointer
+            # is in use; .raw returns a new string object on each access
+            raw = tagbuf.raw
+            temp = raw
+            memcpy( s, temp, offset )            
+
+    property flag: 
+        """the bitwise flag field of the alignment (the ``is_*`` properties of this class test and set individual bits of it)"""
+        def __get__(self): return self._delegate.core.flag
+        def __set__(self, flag): self._delegate.core.flag = flag
+    property rname: 
+        """
+        :term:`target` ID
+
+        .. note::
+
+            This field contains the index of the reference sequence 
+            in the sequence dictionary. To obtain the name
+            of the reference sequence, use :meth:`pysam.Samfile.getrname()`
+
+        """
+        def __get__(self): return self._delegate.core.tid
+        def __set__(self, tid): self._delegate.core.tid = tid
+    property pos: 
+        """0-based leftmost coordinate
+
+        Setting the position also recomputes the "bin" attribute.
+        """
+        def __get__(self): return self._delegate.core.pos
+        def __set__(self, pos): 
+            ## setting the position also updates the "bin" attribute. The
+            ## new position has to be stored first as the bin is derived
+            ## from it.
+            cdef bam1_t * src
+            src = self._delegate
+            src.core.pos = pos
+            if src.core.n_cigar:
+                src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) )
+            else:
+                # without a cigar the read is treated as covering one base
+                src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
+    property bin: 
+        """the bin of the alignment as stored in the record (also recomputed whenever cigar or pos are assigned)"""
+        def __get__(self): return self._delegate.core.bin
+        def __set__(self, bin): self._delegate.core.bin = bin
+    property rlen:
+        '''length of the read (read only). Returns 0 if not given.'''
+        def __get__(self): return self._delegate.core.l_qseq
+    property mapq: 
+        """mapping quality"""
+        def __get__(self): return self._delegate.core.qual
+        def __set__(self, qual): self._delegate.core.qual = qual
+    property mrnm:
+        """the :term:`reference` id of the mate """     
+        def __get__(self): return self._delegate.core.mtid
+        def __set__(self, mtid): self._delegate.core.mtid = mtid
+    property mpos: 
+        """the position of the mate"""
+        def __get__(self): return self._delegate.core.mpos
+        def __set__(self, mpos): self._delegate.core.mpos = mpos
+    property isize: 
+        """the insert size"""
+        def __get__(self): return self._delegate.core.isize
+        def __set__(self, isize): self._delegate.core.isize = isize
+    property is_paired: 
+        """true if read is paired in sequencing"""
+        def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FPAIRED
+            else: self._delegate.core.flag &= ~BAM_FPAIRED
+    property is_proper_pair:
+        """true if read is mapped in a proper pair"""
+        def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FPROPER_PAIR
+            else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR
+    property is_unmapped:
+        """true if read itself is unmapped"""
+        def __get__(self): return (self.flag & BAM_FUNMAP) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FUNMAP
+            else: self._delegate.core.flag &= ~BAM_FUNMAP
+    property mate_is_unmapped: 
+        """true if the mate is unmapped""" 
+        def __get__(self): return (self.flag & BAM_FMUNMAP) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FMUNMAP
+            else: self._delegate.core.flag &= ~BAM_FMUNMAP
+    property is_reverse:
+        """true if read is mapped to reverse strand"""
+        def __get__(self):return (self.flag & BAM_FREVERSE) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FREVERSE
+            else: self._delegate.core.flag &= ~BAM_FREVERSE
+    property mate_is_reverse:
+        """true if the mate is mapped to reverse strand"""
+        def __get__(self): return (self.flag & BAM_FMREVERSE) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FMREVERSE
+            else: self._delegate.core.flag &= ~BAM_FMREVERSE
+    property is_read1: 
+        """true if this is read1"""
+        def __get__(self): return (self.flag & BAM_FREAD1) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FREAD1
+            else: self._delegate.core.flag &= ~BAM_FREAD1
+    property is_read2:
+        """true if this is read2"""
+        def __get__(self): return (self.flag & BAM_FREAD2) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FREAD2
+            else: self._delegate.core.flag &= ~BAM_FREAD2
+    property is_secondary:
+        """true if not primary alignment"""
+        def __get__(self): return (self.flag & BAM_FSECONDARY) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FSECONDARY
+            else: self._delegate.core.flag &= ~BAM_FSECONDARY
+    property is_qcfail:
+        """true if QC failure"""
+        def __get__(self): return (self.flag & BAM_FQCFAIL) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FQCFAIL
+            else: self._delegate.core.flag &= ~BAM_FQCFAIL
+    property is_duplicate:
+        """ true if optical or PCR duplicate"""
+        def __get__(self): return (self.flag & BAM_FDUP) != 0
+        def __set__(self,val): 
+            if val: self._delegate.core.flag |= BAM_FDUP
+            else: self._delegate.core.flag &= ~BAM_FDUP
+    
+    def opt(self, tag):
+        """retrieves optional data given a two-letter *tag*
+
+        Raises KeyError if the tag is not present. Integer types
+        ('c', 'C', 's', 'S', 'i', 'I') are returned as int, 'f' and 'd'
+        as float, 'A' as a one-character string and 'Z' as a string.
+        Any other type returns None.
+        """
+        #see bam_aux.c: bam_aux_get() and bam_aux2i() etc 
+        cdef uint8_t * v
+        v = bam_aux_get(self._delegate, tag)
+        if v == NULL: raise KeyError( "tag '%s' not present" % tag )
+        # the first byte of the returned field is its type code
+        auxtype = chr(v[0])
+        if auxtype in ('c', 'C', 's', 'S', 'i', 'I'):
+            return <int>bam_aux2i(v)            
+        elif auxtype == 'f':
+            return <float>bam_aux2f(v)
+        elif auxtype == 'd':
+            return <double>bam_aux2d(v)
+        elif auxtype == 'A':
+            # there might a more efficient way
+            # to convert a char into a string
+            return '%c' % <char>bam_aux2A(v)
+        elif auxtype == 'Z':
+            return <char*>bam_aux2Z(v)
+    
+    def fancy_str (self):
+        """returns a list of formatted "description (name) = value" lines,
+        one per attribute found in the instance dictionary, for debugging.
+
+        Known fields come first in a fixed order, followed by all other
+        attributes labelled with their plain name.
+        """
+        descriptions = {
+           "tid":           "Contig index",
+           "pos":           "Mapped position on contig",
+           "mtid":          "Contig index for mate pair",
+           "mpos":          "Position of mate pair",
+           "isize":         "Insert size",
+           "flag":          "Binary flag",
+           "n_cigar":       "Count of cigar entries",
+           "cigar":         "Cigar entries",
+           "qual":          "Mapping quality",
+           "bin":           "Bam index bin number",
+           "l_qname":       "Length of query name",
+           "qname":         "Query name",
+           "l_qseq":        "Length of query sequence",
+           "qseq":          "Query sequence",
+           "bqual":         "Quality scores",
+           "l_aux":         "Length of auxilary data",
+           "m_data":        "Maximum data length",
+           "data_len":      "Current data length",
+           }
+        display_order = ["tid", "pos", "mtid", "mpos", "isize", "flag", 
+                         "n_cigar", "cigar", "qual", "bin", "l_qname", "qname", 
+                         "l_qseq", "qseq", "bqual", "l_aux", "m_data", "data_len"]
+        
+        # known fields that are present, in display order
+        lines = [ "%-30s %-10s= %s" % (descriptions[name], "(" + name + ")", self.__getattribute__(name))
+                  for name in display_order if name in self.__dict__ ]
+
+        # everything else, labelled with the attribute name only
+        lines.extend( [ "%-30s %-10s= %s" % (name, "", self.__getattribute__(name))
+                        for name in self.__dict__ if name not in descriptions ] )
+        return lines
+
+cdef class PileupProxy:
+    '''A pileup column. A pileup column contains
+    all the reads that map to a certain target base.
+
+    tid
+        chromosome ID as is defined in the header
+    pos
+        the target base coordinate (0-based)
+    n
+        number of reads mapping to this column    
+    pileups
+        list of reads (:class:`pysam.PileupRead`) aligned to this column
+
+    This class is a proxy for results returned by the samtools pileup engine.
+    If the underlying engine iterator advances, the results of this column
+    will change.
+    '''
+    # pileup buffer that tid, pos and pileups are read from
+    cdef bam_plbuf_t * buf
+    # number of entries of the pileup that are exposed, see property n
+    cdef int n_pu
+
+    def __cinit__(self ):
+        pass
+
+    def __str__(self):
+        # first line: tab-separated tid, pos and n; followed by one line per read
+        return "\t".join( map(str, (self.tid, self.pos, self.n))) +\
+            "\n" +\
+            "\n".join( map(str, self.pileups) )
+
+    property tid:
+        '''the chromosome ID as is defined in the header'''
+        def __get__(self): return pysam_get_tid( self.buf )
+
+    property n:
+        '''number of reads mapping to this column.'''
+        def __get__(self): return self.n_pu
+        def __set__(self, n): self.n_pu = n
+
+    property pos:
+        '''the target base coordinate (0-based)'''
+        def __get__(self): return pysam_get_pos( self.buf )
+
+    property pileups:
+        '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
+        def __get__(self):
+            cdef bam_pileup1_t * pl
+            pl = pysam_get_pileup( self.buf )
+            pileups = []
+            # warning: there could be problems if self.n and self.buf are
+            # out of sync.
+            # a new PileupRead is built for each of the n_pu entries on every access
+            for x from 0 <= x < self.n_pu:
+                pileups.append( makePileupRead( &pl[x]) )
+            return pileups
+
+cdef class PileupRead:
+    '''A read aligned to a column.
+
+    All properties are read-only views of the values stored in the
+    instance.
+    '''
+
+    cdef:
+         AlignedRead _alignment
+         int32_t  _qpos
+         int _indel
+         int _level
+         uint32_t _is_del
+         uint32_t _is_head
+         uint32_t _is_tail
+
+    def __cinit__( self ):
+        pass
+
+    def __str__(self):
+        # tab-separated: alignment, qpos, indel, level, is_del, is_head, is_tail
+        return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) )
+       
+    property alignment:
+        """a :class:`pysam.AlignedRead` object of the aligned read"""
+        def __get__(self):
+            return self._alignment
+    property qpos:
+        """position of the read base at the pileup site, 0-based"""
+        def __get__(self):
+            return self._qpos
+    property indel:
+        """indel length; 0 for no indel, positive for ins and negative for del"""
+        def __get__(self):
+            return self._indel
+    property is_del:
+        """1 iff the base on the padded read is a deletion"""
+        def __get__(self):
+            return self._is_del
+    property is_head:
+        """the is_head flag of this pileup entry"""
+        # NOTE(review): presumably 1 iff this is the first aligned base of
+        # the read - confirm against bam_pileup1_t in samtools' bam.h
+        def __get__(self):
+            return self._is_head
+    property is_tail:
+        """the is_tail flag of this pileup entry"""
+        # NOTE(review): presumably 1 iff this is the last aligned base of
+        # the read - confirm against bam_pileup1_t in samtools' bam.h
+        def __get__(self):
+            return self._is_tail
+    property level:
+        """the level value of this pileup entry"""
+        # NOTE(review): presumably the display row assigned by the pileup
+        # engine - confirm against bam_pileup1_t in samtools' bam.h
+        def __get__(self):
+            return self._level
+
+class Outs:
+    '''http://mail.python.org/pipermail/python-list/2000-June/038406.html'''
+    def __init__(self, id = 1):
+        self.streams = []
+        self.id = id
+
+    def setdevice(self, filename):
+        '''open an existing file, like "/dev/null"'''
+        fd = os.open(filename, os.O_WRONLY)
+        self.setfd(fd)
+
+    def setfile(self, filename):
+        '''open a new file.'''
+        fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);
+        self.setfd(fd)
+
+    def setfd(self, fd):
+        ofd = os.dup(self.id)      #  Save old stream on new unit.
+        self.streams.append(ofd)
+        sys.stdout.flush()          #  Buffered data goes to old stream.
+        os.dup2(fd, self.id)        #  Open unit 1 on new stream.
+        os.close(fd)                #  Close other unit (look out, caller.)
+            
+    def restore(self):
+        '''restore previous output stream'''
+        if self.streams:
+            # the following was not sufficient, hence flush both stderr and stdout
+            # os.fsync( self.id )
+            sys.stdout.flush()
+            sys.stderr.flush()
+            os.dup2(self.streams[-1], self.id)
+            os.close(self.streams[-1])
+            del self.streams[-1]
+
+def _samtools_dispatch( method, args = () ):
+    '''call ``method`` in samtools providing arguments in args.
+
+    Returns a tuple ``(retval, stderr, stdout)`` with the return value
+    of samtools and the captured output as lists of lines.
+
+    Raises ValueError if option ``-o`` is given to ``view``.
+    
+    .. note:: 
+       This method redirects stdout and stderr to capture it 
+       from samtools. If for some reason stdout/stderr disappears
+       the reason might be in this method.
+
+    .. note::
+       The current implementation might only work on linux.
+       
+    .. note:: 
+       This method captures stdout and stderr using temporary files, 
+       which are then read into memory in their entirety. This method
+       is slow and might cause large memory overhead. 
+
+    See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily
+    on the topic of redirecting stderr/stdout.
+    '''
+    cdef char ** cargs
+    cdef int i, n, retval
+
+    # note that debugging this module can be a problem
+    # as stdout/stderr will not appear
+
+    # accept any sequence of arguments
+    args = tuple(args)
+
+    # check the arguments before anything is created or redirected
+    if method == "view" and "-o" in args:
+        raise ValueError("option -o is forbidden in samtools view")
+
+    stdout_save = Outs( sys.stdout.fileno() )
+    stderr_save = Outs( sys.stderr.fileno() )
+
+    # open files and redirect into it
+    stderr_h, stderr_f = tempfile.mkstemp()
+    stdout_h, stdout_f = tempfile.mkstemp()
+
+    # patch for `samtools view`
+    # samtools `view` closes stdout, from which I can not
+    # recover. Thus redirect output to file with -o option.
+    if method == "view":
+        args = ( "-o", stdout_f ) + args
+
+    n = len(args)
+    cargs = NULL
+
+    try:
+        try:
+            # redirect stderr and stdout to file
+            stdout_save.setfd( stdout_h )
+            stderr_save.setfd( stderr_h )
+
+            # do the function call to samtools
+            # allocate two more for first (dummy) argument (contains command)
+            cargs = <char**>calloc( n+2, sizeof( char *) )
+            if cargs == NULL: raise MemoryError( "could not allocate memory for arguments" )
+            cargs[0] = "samtools"
+            cargs[1] = method
+            for i from 0 <= i < n: cargs[i+2] = args[i]
+            retval = pysam_dispatch(n+2, cargs)
+        finally:
+            # always release the argument vector and restore stdout/stderr,
+            # otherwise an error above would leave the streams redirected.
+            # Restoring will also flush, so it needs to happen before
+            # reading back the file contents
+            free( cargs )
+            stdout_save.restore()
+            stderr_save.restore()
+
+        # capture stderr/stdout.
+        stderr_file = open( stderr_f, "r")
+        try:
+            out_stderr = stderr_file.readlines()
+        finally:
+            stderr_file.close()
+
+        stdout_file = open( stdout_f, "r")
+        try:
+            out_stdout = stdout_file.readlines()
+        finally:
+            stdout_file.close()
+    finally:
+        # clean up files
+        os.remove( stderr_f )
+        os.remove( stdout_f )
+
+    return retval, out_stderr, out_stdout
+
+__all__ = ["Samfile", 
+           "Fastafile",
+           "IteratorRow", 
+           "IteratorRowAll", 
+           "IteratorColumn", 
+           "AlignedRead", 
+           "PileupColumn", 
+           "PileupProxy", 
+           "PileupRead" ]
+
+               
+
diff --git a/pysam/namedtuple.py b/pysam/namedtuple.py
new file mode 100644 (file)
index 0000000..a60fb1a
--- /dev/null
@@ -0,0 +1,117 @@
+from operator import itemgetter as _itemgetter
+from keyword import iskeyword as _iskeyword
+import sys as _sys
+
+def namedtuple(typename, field_names, verbose=False, rename=False):
+    """Returns a new subclass of tuple with named fields.
+
+    >>> Point = namedtuple('Point', 'x y')
+    >>> Point.__doc__                   # docstring for the new class
+    'Point(x, y)'
+    >>> p = Point(11, y=22)             # instantiate with positional args or keywords
+    >>> p[0] + p[1]                     # indexable like a plain tuple
+    33
+    >>> x, y = p                        # unpack like a regular tuple
+    >>> x, y
+    (11, 22)
+    >>> p.x + p.y                       # fields also accessable by name
+    33
+    >>> d = p._asdict()                 # convert to a dictionary
+    >>> d['x']
+    11
+    >>> Point(**d)                      # convert from a dictionary
+    Point(x=11, y=22)
+    >>> p._replace(x=100)               # _replace() is like str.replace() but targets named fields
+    Point(x=100, y=22)
+
+    """
+
+    # Parse and validate the field names.  Validation serves two purposes,
+    # generating informative error messages and preventing template injection attacks.
+    if isinstance(field_names, basestring):
+        field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas
+    field_names = tuple(map(str, field_names))
+    if rename:
+        names = list(field_names)
+        seen = set()
+        for i, name in enumerate(names):
+            if (not min(c.isalnum() or c=='_' for c in name) or _iskeyword(name)
+                or not name or name[0].isdigit() or name.startswith('_')
+                or name in seen):
+                    names[i] = '_%d' % i
+            seen.add(name)
+        field_names = tuple(names)
+    for name in (typename,) + field_names:
+        if not min(c.isalnum() or c=='_' for c in name):
+            raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
+        if _iskeyword(name):
+            raise ValueError('Type names and field names cannot be a keyword: %r' % name)
+        if name[0].isdigit():
+            raise ValueError('Type names and field names cannot start with a number: %r' % name)
+    seen_names = set()
+    for name in field_names:
+        if name.startswith('_') and not rename:
+            raise ValueError('Field names cannot start with an underscore: %r' % name)
+        if name in seen_names:
+            raise ValueError('Encountered duplicate field name: %r' % name)
+        seen_names.add(name)
+
+    # Create and fill-in the class template
+    numfields = len(field_names)
+    argtxt = repr(field_names).replace("'", "")[1:-1]   # tuple repr without parens or quotes
+    reprtxt = ', '.join('%s=%%r' % name for name in field_names)
+    template = '''class %(typename)s(tuple):
+        '%(typename)s(%(argtxt)s)' \n
+        __slots__ = () \n
+        _fields = %(field_names)r \n
+        def __new__(_cls, %(argtxt)s):
+            return _tuple.__new__(_cls, (%(argtxt)s)) \n
+        @classmethod
+        def _make(cls, iterable, new=tuple.__new__, len=len):
+            'Make a new %(typename)s object from a sequence or iterable'
+            result = new(cls, iterable)
+            if len(result) != %(numfields)d:
+                raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
+            return result \n
+        def __repr__(self):
+            return '%(typename)s(%(reprtxt)s)' %% self \n
+        def _asdict(self):
+            'Return a new dict which maps field names to their values'
+            return dict(zip(self._fields, self)) \n
+        def _replace(_self, **kwds):
+            'Return a new %(typename)s object replacing specified fields with new values'
+            result = _self._make(map(kwds.pop, %(field_names)r, _self))
+            if kwds:
+                raise ValueError('Got unexpected field names: %%r' %% kwds.keys())
+            return result \n
+        def __getnewargs__(self):
+            return tuple(self) \n\n''' % locals()
+    for i, name in enumerate(field_names):
+        template += '        %s = _property(_itemgetter(%d))\n' % (name, i)
+    if verbose:
+        print template
+
+    # Execute the template string in a temporary namespace
+    namespace = dict(_itemgetter=_itemgetter, __name__='namedtuple_%s' % typename,
+                     _property=property, _tuple=tuple)
+    try:
+        exec template in namespace
+    except SyntaxError, e:
+        raise SyntaxError(e.message + ':\n' + template)
+    result = namespace[typename]
+
+    # For pickling to work, the __module__ variable needs to be set to the frame
+    # where the named tuple is created.  Bypass this step in enviroments where
+    # sys._getframe is not defined (Jython for example) or sys._getframe is not
+    # defined for arguments greater than 0 (IronPython).
+    try:
+        result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
+    except (AttributeError, ValueError):
+        pass
+
+    return result
+
+
+
+
+
diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c
new file mode 100644 (file)
index 0000000..5360626
--- /dev/null
@@ -0,0 +1,499 @@
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+#include "knetfile.h"
+#include "pysam_util.h"
+
+// #######################################################
+// utility routines to avoid using callbacks in bam_fetch
+// taken from bam_index.c
+// The order of the following declarations is important.
+// #######################################################
+
+// A pair of 64-bit values. In this file each pair describes one chunk of the
+// BAM file: bam_fetch_iterate() seeks to `u` and keeps reading until the
+// current file offset reaches `v`.
+typedef struct
+{
+  uint64_t u, v;
+} pair64_t;
+
+// ordering passed to KSORT_INIT: pairs are compared by their `u` member only
+#define pair64_lt(a,b) ((a).u < (b).u)
+
+// List of pair64_t entries; this is the value type of the `my_i` hash map
+// instantiated with KHASH_MAP_INIT_INT.
+// NOTE(review): m and n look like capacity and element count -- confirm in bam_index.c
+typedef struct {
+       uint32_t m, n;
+       pair64_t *list;
+} bam_binlist_t;
+
+// Array of 64-bit offsets together with two counters.
+// NOTE(review): presumably the linear index (n used, m allocated) -- confirm in bam_index.c
+typedef struct {
+       int32_t n, m;
+       uint64_t *offset;
+} bam_lidx_t;
+
+KSORT_INIT(my_off, pair64_t, pair64_lt);
+KHASH_MAP_INIT_INT(my_i, bam_binlist_t);
+
+// Definition behind the opaque bam_index_t. This section is a copy of
+// declarations from bam_index.c, so presumably the layout has to stay
+// identical to the one used there.
+struct __bam_index_t
+{
+  int32_t n;               // NOTE(review): presumably the number of reference sequences -- confirm
+  khash_t(my_i) **index;   // pointers to hash maps with int keys and bam_binlist_t values
+  bam_lidx_t *index2;      // pointer to bam_lidx_t entries
+};
+
+// Node of the singly linked list of buffered alignments used by the pileup buffer.
+typedef struct __linkbuf_t {
+       bam1_t b;                  // the buffered alignment (filled with bam_copy1 in pysam_bam_plbuf_push)
+       uint32_t beg, end;         // b's start position and the value of bam_calend() for b
+       struct __linkbuf_t *next;  // next node in the list; reset to 0 by mp_free
+} lbnode_t;
+
+// Pool that recycles lbnode_t nodes instead of freeing them.
+typedef struct {
+       int cnt, n, max;  // cnt: nodes handed out and not yet returned; n: nodes stored in buf; max: capacity of buf
+       lbnode_t **buf;   // stack of returned nodes, popped by mp_alloc and pushed by mp_free
+} mempool_t;
+
+// Definition behind the opaque bam_plbuf_t (pileup buffer).
+struct __bam_plbuf_t {
+       mempool_t *mp;                       // pool the list nodes are taken from and returned to
+       lbnode_t *head, *tail, *dummy;       // buffered alignments; tail is the node the next alignment is copied into
+       bam_pileup_f func;                   // not used by the functions in this file
+       void *func_data;                     // not used by the functions in this file
+       int32_t tid, pos, max_tid, max_pos;  // tid/pos: current pileup column; max_tid/max_pos: start of the last pushed alignment
+       int max_pu, is_eof;                  // max_pu: capacity of pu; is_eof: set once a NULL alignment has been pushed
+       bam_pileup1_t *pu;                   // pileup entries for the current column
+       int flag_mask;                       // alignments with any of these flag bits set are skipped
+};
+
+// Create an empty, zero-initialised node pool (0 if the allocation fails).
+static mempool_t *mp_init()
+{
+       return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+// Release every node stored in the pool, then the pool itself.
+static void mp_destroy(mempool_t *mp)
+{
+       int i = 0;
+       while (i < mp->n) {
+               lbnode_t *node = mp->buf[i++];
+               free(node->b.data); // variable-length alignment data owned by the node
+               free(node);
+       }
+       free(mp->buf);
+       free(mp);
+}
+// Hand out a node: reuse the most recently returned one if the pool holds
+// any, otherwise allocate a fresh zero-initialised node.
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+       mp->cnt += 1;
+       if (mp->n != 0) {
+               mp->n -= 1;
+               return mp->buf[mp->n];
+       }
+       return (lbnode_t*)calloc(1, sizeof(lbnode_t));
+}
+// Return node p to the pool so that mp_alloc can reuse it.
+// The pool's storage doubles when full (starting at 256 entries). If it
+// cannot be grown, the node is released instead of being stored, so the
+// pool is never written through a NULL buffer.
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+       --mp->cnt; p->next = 0; // clear lbnode_t::next here
+       if (mp->n == mp->max) {
+               int new_max = mp->max? mp->max<<1 : 256;
+               lbnode_t **new_buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * new_max);
+               if (new_buf == 0) { // out of memory: the old buffer is still valid, drop the node
+                       free(p->b.data);
+                       free(p);
+                       return;
+               }
+               mp->buf = new_buf;
+               mp->max = new_max;
+       }
+       mp->buf[mp->n++] = p;
+}
+
+// Work out how the alignment p->b relates to reference position pos and store
+// the result in p (qpos, indel, is_del, is_head, is_tail).
+// Returns 0 if the read is unmapped or if the CIGAR operation that carries the
+// walk past pos is a BAM_CREF_SKIP (the read is then left out of the pileup);
+// returns 1 otherwise.
+// The asserts require c->pos <= pos on entry and a CIGAR that extends beyond pos.
+static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
+{
+       unsigned k;
+       bam1_t *b = p->b;
+       bam1_core_t *c = &b->core;
+       uint32_t x = c->pos, y = 0; // x: reference coordinate, y: query coordinate at the start of the current operation
+       int ret = 1, is_restart = 1; // is_restart: the previous operation was a skip or a clip (or there was none)
+
+       if (c->flag&BAM_FUNMAP) return 0; // unmapped read
+       assert(x <= pos); // otherwise a bug
+       p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
+       for (k = 0; k < c->n_cigar; ++k) {
+               int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
+               int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
+               if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip
+                       if (x + l > pos) { // overlap with pos
+                               p->indel = p->is_del = 0;
+                               p->qpos = y + (pos - x);
+                               if (x == pos && is_restart) p->is_head = 1;
+                               if (x + l - 1 == pos) { // come to the end of a match
+                                       if (k < c->n_cigar - 1) { // there are additional operation(s)
+                                               uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
+                                               int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
+                                               if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
+                                               else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
+                                               if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
+                                                       p->is_tail = 1; // tail
+                                       } else p->is_tail = 1; // this is the last operation; set tail
+                               }
+                       }
+                       x += l; y += l; // a match consumes both reference and query
+               } else if (op == BAM_CDEL) { // then set ->is_del
+                       if (x + l > pos) {
+                               p->indel = 0; p->is_del = 1;
+                               p->qpos = y + (pos - x);
+                       }
+                       x += l; // a deletion consumes reference only
+               } else if (op == BAM_CREF_SKIP) x += l;
+               else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; // insertions and soft clips consume query only
+               is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
+               if (x > pos) { // the walk has passed pos: nothing more to resolve
+                       if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
+                       break;
+               }
+       }
+       assert(x > pos); // otherwise a bug
+       return ret;
+}
+
+
+
+
+// The following code has been taken from bam_plbuf_push and modified:
+// instead of calling a callback function, it returns to the caller and,
+// when called again with cont set to true, continues from where it
+// left off.
+
+// returns
+// >0: the number of alignments in the pileup column that is ready to be emitted
+// 0: if b has been added (or skipped) and there is nothing to emit
+// <0: if there was an error (-1 or -2: the input is not sorted)
+// Push alignment b into the pileup buffer buf (b == NULL marks the end of the
+// input), or, if cont is non-zero, resume after a previously returned column.
+// A positive return value is the number of entries stored in buf->pu for the
+// column at buf->tid / buf->pos. 0 means nothing is ready. -1 and -2 are
+// returned when the buffered alignments are found to be out of order.
+// Unsorted input detected while pushing terminates the process via abort().
+int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont)
+{
+  if (!cont)
+    {
+      if (b) { // fill buffer
+       if (b->core.tid < 0) return 0;
+       if (b->core.flag & buf->flag_mask) return 0;
+       bam_copy1(&buf->tail->b, b);
+       buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
+       if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) {
+         fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n");
+         abort();
+       }
+       buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
+       // keep the copy only if it can still contribute to the current or a later column;
+       // otherwise tail is left in place and is overwritten by the next push
+       if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) {
+         buf->tail->next = mp_alloc(buf->mp);
+         buf->tail = buf->tail->next;
+       }
+      } else buf->is_eof = 1;
+    }
+  else
+    // continue end of loop
+    {
+      // this repeats the "update tid and pos" step at the bottom of the loop
+      // below, which is not reached when the loop returns a column
+      // update tid and pos
+      if (buf->head->next) {
+       if (buf->tid > buf->head->b.core.tid) {
+         fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
+         return -1;
+       }
+      }
+      if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
+       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
+      } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
+       buf->pos = buf->head->beg; // jump to the next position
+      } else ++buf->pos; // scan contiguously
+      if (buf->is_eof && buf->head->next == 0) return 0;
+    }
+
+  // enter yield loop
+  // runs while the current column lies before the start of the last pushed
+  // alignment, or unconditionally once the end of the input has been signalled
+  while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos))
+    {
+      int n_pu = 0;
+      lbnode_t *p, *q;
+      buf->dummy->next = buf->head;
+      // p->next is the loop condition, so the tail node (not yet filled) is never visited
+      for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) {
+       if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
+         q->next = p->next; mp_free(buf->mp, p); p = q;
+       } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
+         if (n_pu == buf->max_pu) { // then double the capacity
+           buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
+           buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
+         }
+         buf->pu[n_pu].b = &p->b;
+         if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
+       }
+      }
+      buf->head = buf->dummy->next; // dummy->next may be changed
+
+      // exit if alignments need to be emitted
+      if (n_pu) { return n_pu; }
+      
+      // update tid and pos
+      if (buf->head->next) {
+       if (buf->tid > buf->head->b.core.tid) {
+         fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
+         return -2;
+       }
+      }
+      if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
+       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
+      } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
+       buf->pos = buf->head->beg; // jump to the next position
+      } else ++buf->pos; // scan contiguously
+      if (buf->is_eof && buf->head->next == 0) break;
+    }
+  return 0;
+}
+
+// Return the pos member of buf, i.e. the position of the pileup buffer's current column.
+int pysam_get_pos( const bam_plbuf_t *buf) 
+{
+  return buf->pos;
+}
+
+  
+// Return the tid member of buf, i.e. the reference id of the pileup buffer's current column.
+int pysam_get_tid( const bam_plbuf_t *buf)
+{
+  return buf->tid;
+}
+
+// Return the array of pileup entries held by buf. The number of valid
+// entries is the positive value returned by pysam_bam_plbuf_push.
+bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf)
+{
+  return buf->pu;
+}
+
+// pysam dispatch function to emulate the samtools
+// command line within python.
+// taken from the main function in bamtk.c
+// added code to reset getopt
+extern int main_samview(int argc, char *argv[]);
+extern int main_import(int argc, char *argv[]);
+extern int bam_pileup(int argc, char *argv[]);
+extern int bam_merge(int argc, char *argv[]);
+extern int bam_sort(int argc, char *argv[]);
+extern int bam_index(int argc, char *argv[]);
+extern int faidx_main(int argc, char *argv[]);
+extern int bam_mating(int argc, char *argv[]);
+extern int bam_rmdup(int argc, char *argv[]);
+extern int glf3_view_main(int argc, char *argv[]);
+extern int bam_flagstat(int argc, char *argv[]);
+extern int bam_fillmd(int argc, char *argv[]);
+
+// Run the samtools sub-command named by argv[1], passing it the remaining
+// arguments (argc-1, argv+1), and return that command's result.
+// Returns 1 if no command is given or the command is not recognised.
+int pysam_dispatch(int argc, char *argv[] )
+{
+  // declarations come first so the function also compiles where mixed
+  // declarations and statements are not accepted
+  extern int optind;
+#if _CURSES_LIB != 0
+  // declared explicitly: the call must not rely on an implicit declaration
+  extern int bam_tview_main(int argc, char *argv[]);
+#endif
+  typedef int (*pysam_command_f)(int argc, char *argv[]);
+  static const struct { const char *name; pysam_command_f run; } commands[] = {
+    { "view",     main_samview },
+    { "import",   main_import },
+    { "pileup",   bam_pileup },
+    { "merge",    bam_merge },
+    { "sort",     bam_sort },
+    { "index",    bam_index },
+    { "faidx",    faidx_main },
+    { "fixmate",  bam_mating },
+    { "rmdup",    bam_rmdup },
+    { "glfview",  glf3_view_main },
+    { "flagstat", bam_flagstat },
+    { "calmd",    bam_fillmd },
+    { "fillmd",   bam_fillmd },
+#if _CURSES_LIB != 0
+    { "tview",    bam_tview_main },
+#endif
+  };
+  size_t i;
+
+#ifdef _WIN32
+  setmode(fileno(stdout), O_BINARY);
+  setmode(fileno(stdin),  O_BINARY);
+#ifdef _USE_KNETFILE
+  knet_win32_init();
+#endif
+#endif
+
+  // reset getopt so that every call parses its arguments from the start
+  optind = 1;
+
+  if (argc < 2) return 1;
+
+  for (i = 0; i < sizeof(commands) / sizeof(commands[0]); ++i)
+    {
+      if (strcmp(argv[1], commands[i].name) == 0)
+       return commands[i].run(argc-1, argv+1);
+    }
+
+  fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
+  return 1;
+}
+
+// standin for bam_destroy1 in bam.h
+// deletes all variable length data
+// Free an alignment record together with its variable-length data.
+// Passing NULL is allowed and does nothing.
+void pysam_bam_destroy1( bam1_t * b )
+{
+  if (b != NULL)
+    {
+      free(b->data); // free(NULL) is a no-op, so no separate check is needed
+      free(b);
+    }
+}
+
+// taken from samtools/bam_import.c
+// Make sure b->data can hold at least size bytes, growing it to the value
+// produced by kroundup32 when it is too small.
+// Returns b->data, or NULL if the buffer could not be grown; in that case
+// b->data and b->m_data are left untouched so b stays valid.
+static inline uint8_t *alloc_data(bam1_t *b, size_t size)
+{
+  if (b->m_data < size)
+    {
+      size_t m_data = size;
+      uint8_t *data;
+      kroundup32(m_data);
+      // keep the old pointer until realloc has succeeded
+      data = (uint8_t*)realloc(b->data, m_data);
+      if (data == NULL) return NULL;
+      b->data = data;
+      b->m_data = m_data;
+    }
+  return b->data;
+}
+
+// update the variable length data within a bam1_t entry.
+// Adds *nbytes_new* - *nbytes_old* into the variable length data of *src* at *pos*.
+// Data within the bam1_t entry is moved so that it is
+// consistent with the data field lengths.
+// Replace the nbytes_old bytes at pos inside b->data with room for
+// nbytes_new bytes: the data following the old field is moved, the buffer is
+// grown if required and b->data_len is adjusted. The caller writes the new
+// field content afterwards (pos may be stale if the buffer was reallocated;
+// the field keeps its offset from b->data).
+// Returns b. If pos/nbytes_old do not describe a range inside b->data, or the
+// buffer cannot be grown, an error is printed and b is returned unchanged.
+bam1_t * pysam_bam_update( bam1_t * b,
+                          const size_t nbytes_old,
+                          const size_t nbytes_new, 
+                          uint8_t * pos )
+{
+  int d = nbytes_new-nbytes_old;
+  int new_size;
+  size_t offset;
+
+  // no change
+  if (d == 0) return b;
+
+  // Validate before anything is modified. offset is unsigned, so a pos in
+  // front of b->data shows up as a huge value and is rejected by the first
+  // test; the second test keeps the memmove length below from underflowing.
+  offset = pos - b->data;
+  if (offset > (size_t)b->data_len || nbytes_old > (size_t)b->data_len - offset)
+    {
+      fprintf(stderr, "[pysam_bam_update] illegal offset: '%i'\n", (int)offset);
+      return b;
+    }
+
+  new_size = d + b->data_len;
+
+  // increase memory if required
+  if (d > 0)
+    {
+      if (alloc_data( b, new_size ) == NULL)
+       {
+         fprintf(stderr, "[pysam_bam_update] out of memory\n");
+         return b;
+       }
+      // the buffer may have moved: recompute pos from the saved offset
+      pos = b->data + offset;
+    }
+
+  // shift everything behind the old field to its new place
+  memmove( pos + nbytes_new,
+          pos + nbytes_old,
+          b->data_len - (offset + nbytes_old));
+    
+  b->data_len = new_size;
+      
+  return b;
+}
+
+// translate a nucleotide character to binary code
+// Look up character s in bam_nt16_table and return the table entry.
+unsigned char pysam_translate_sequence( const unsigned char s )
+{
+  return bam_nt16_table[s];
+}
+
+// stand-ins for samtools macros in bam.h
+// The query name is stored at the very start of the variable-length data.
+char * pysam_bam1_qname( const bam1_t * b)
+{
+  uint8_t *start = b->data;
+  return (char*)start;
+}
+
+// The CIGAR array follows the query name (l_qname bytes).
+uint32_t * pysam_bam1_cigar( const bam1_t * b) 
+{
+  const bam1_core_t *core = &b->core;
+  uint8_t *cigar = b->data + core->l_qname;
+  return (uint32_t*)cigar;
+}
+
+// The sequence follows the CIGAR array (4 bytes per CIGAR operation).
+uint8_t * pysam_bam1_seq( const bam1_t * b) 
+{
+  const bam1_core_t *core = &b->core;
+  uint8_t *seq = b->data + core->n_cigar*4 + core->l_qname;
+  return seq;
+}
+
+// The qualities follow the sequence, which occupies (l_qseq + 1)/2 bytes.
+uint8_t * pysam_bam1_qual( const bam1_t * b)
+{
+  const bam1_core_t *core = &b->core;
+  uint8_t *qual = b->data + core->n_cigar*4 + core->l_qname + (core->l_qseq + 1)/2;
+  return qual;
+}
+
+// The auxiliary data follows the qualities, which occupy l_qseq bytes.
+uint8_t * pysam_bam1_aux( const bam1_t * b)
+{
+  const bam1_core_t *core = &b->core;
+  uint8_t *aux = b->data + core->n_cigar*4 + core->l_qname + core->l_qseq + (core->l_qseq + 1)/2;
+  return aux;
+}
+
+// #######################################################
+// Iterator implementation
+// #######################################################
+
+// functions defined in bam_index.c
+extern pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off);
+
+// Return 1 if alignment b overlaps the half-open interval [beg, end), else 0.
+// An alignment without CIGAR operations is treated as covering one base.
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+       uint32_t rbeg = b->core.pos;
+       uint32_t rend;
+       if (b->core.n_cigar) rend = bam_calend(&b->core, bam1_cigar(b));
+       else rend = b->core.pos + 1;
+       if (rend <= beg) return 0; // the alignment ends before the interval starts
+       return rbeg < end;
+}
+
+// State of an iteration over the alignments that overlap a region.
+struct __bam_fetch_iterator_t
+{
+  bam1_t *        b;          // alignment record that is refilled and returned by bam_fetch_iterate
+  pair64_t *      off;        // chunks to read, as returned by get_chunk_coordinates
+  int             n_off;      // number of entries in off
+  uint64_t        curr_off;   // file offset after the last seek/read; 0 before the first chunk is entered
+  int             curr_chunk; // index into off of the chunk being read; -1 before the first chunk
+  bamFile              fp;    // file the alignments are read from
+  int                          tid; // reference id of the region
+  int                          beg; // start of the region
+  int                          end; // end of the region
+  int             n_seeks;    // number of seeks performed so far
+};
+// Create an iterator over the alignments in fp that overlap the region
+// tid:beg-end, using index idx to find the chunks that have to be read.
+// Returns the new iterator, or 0 if memory could not be allocated.
+bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end)
+{
+       // iterator contains current alignment position
+       //      and will contain actual alignment during iterations
+       bam_fetch_iterator_t* iter  = (bam_fetch_iterator_t*)calloc(1, sizeof(bam_fetch_iterator_t));
+       if (iter == 0) return 0;
+       iter->b                     = (bam1_t*)calloc(1, sizeof(bam1_t));
+       if (iter->b == 0) { // do not hand out a half-built iterator
+               free(iter);
+               return 0;
+       }
+
+       // list of chunks containing our alignments
+       iter->off = get_chunk_coordinates(idx, tid, beg, end, &iter->n_off);
+
+       // initialise other state variables in iterator
+       iter->fp                = fp;
+       iter->curr_chunk        = -1; // no chunk entered yet
+       iter->curr_off          =  0;
+       iter->n_seeks           =  0;
+       iter->tid               = tid;
+       iter->beg               = beg;
+       iter->end               = end;
+       return iter;
+}
+
+// Advance the iterator to the next alignment that overlaps its region.
+// Returns iter->b (owned by the iterator and overwritten by the next call),
+// or 0 when there are no chunks, all chunks are exhausted, an alignment
+// outside the region is reached, or bam_read1 does not return a positive value.
+bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter)
+{
+       if (!iter->off) {
+               return 0;
+       }
+
+       int ret;
+       // iterate through all alignments in chunks
+       for (;;) {
+               // curr_off == 0 is the initial state; the test short-circuits there, so off[-1] is not read
+               if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->curr_chunk].v) { // then jump to the next chunk
+                       if (iter->curr_chunk == iter->n_off - 1) break; // no more chunks
+                       if (iter->curr_chunk >= 0) assert(iter->curr_off == iter->off[iter->curr_chunk].v); // otherwise bug
+                       if (iter->curr_chunk < 0 || iter->off[iter->curr_chunk].v != iter->off[iter->curr_chunk+1].u) { // not adjacent chunks; then seek
+                               bam_seek(iter->fp, iter->off[iter->curr_chunk+1].u, SEEK_SET);
+                               iter->curr_off = bam_tell(iter->fp);
+                               ++iter->n_seeks;
+                       }
+                       ++iter->curr_chunk;
+               }
+               if ((ret = bam_read1(iter->fp, iter->b)) > 0) {
+                       iter->curr_off = bam_tell(iter->fp);
+                       if (iter->b->core.tid != iter->tid || iter->b->core.pos >= iter->end) break; // no need to proceed
+                       else if (is_overlap(iter->beg, iter->end, iter->b)) 
+                               //
+                               //func(iter->b, data);
+                               //
+                               return iter->b;
+                       // alignments that do not overlap are skipped: the loop reads the next one
+               } else 
+                       return 0; // end of file
+       }
+       return 0;
+}
+
+// Release the alignment record and the chunk list held by iter.
+// Passing 0 is allowed. The pointers are cleared afterwards, so a second
+// cleanup is harmless and bam_fetch_iterate returns 0 instead of touching
+// freed memory.
+// NOTE(review): the iterator structure itself is not freed here; confirm
+// whether callers release it, otherwise it is leaked.
+void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter)
+{
+  //  fprintf(stderr, "[bam_fetch] # seek calls: %d\n", iter->n_seeks);
+  if (iter == 0) return;
+  bam_destroy1(iter->b);
+  iter->b = 0;
+  free(iter->off);
+  iter->off = 0;
+  iter->n_off = 0;
+}
+
+       
+
+
diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h
new file mode 100644 (file)
index 0000000..ff5d569
--- /dev/null
@@ -0,0 +1,95 @@
+#ifndef PYSAM_UTIL_H
+#define PYSAM_UTIL_H
+
+//////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////
+// code for iterator
+
+/*! @typedef
+  @abstract Structure for holding current state (current alignment etc.) for iterating through
+  alignments overlapping a specified region.
+  @field  b           pointer to the current alignment
+  @field  off         pointer to an array of chunk loci (each with beg/end positions)
+  @field  n_off       The number of chunks
+  @field  curr_off    The current file position
+  @field  curr_chunk  The index of the current chunk in the array of chunks
+  @discussion See also bam_fetch_iterate
+*/
+struct __bam_fetch_iterator_t;
+typedef struct __bam_fetch_iterator_t bam_fetch_iterator_t;
+       
+/*!
+  @abstract Retrieve the alignments that overlap the
+  specified region.
+  
+  @discussion Returns iterator object to retrieve successive alignments ordered by
+  start position. 
+  @param  fp    BAM file handler
+  @param  idx   pointer to the alignment index
+  @param  tid   chromosome ID as is defined in the header
+  @param  beg   start coordinate, 0-based
+  @param  end   end coordinate, 0-based
+*/
+bam_fetch_iterator_t * bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end);
+
+
+/*!
+  @abstract Iterates through the alignments that overlap the specified region.
+  @discussion Returns pointer to successive alignments ordered by start position.
+  Returns null pointer to signal the end of the iteration.
+  The alignment data is nested within the iterator to avoid unnecessary allocations.
+*/
+bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter);
+
+bam_fetch_iterator_t* bam_init_fetchall_iterator(bamFile fp, const bam_index_t *idx);
+bam1_t * bam_fetchall_iterate(bam_fetch_iterator_t *iter);
+
+//////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////
+// various helper functions
+
+int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont);
+
+// accessor functions - necessary as the definition of bam_plbuf_t is
+// hidden in the implementation
+int pysam_get_pos( const bam_plbuf_t *buf);
+int pysam_get_tid( const bam_plbuf_t *buf);
+bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf);
+
+int pysam_dispatch(int argc, char *argv[] );
+
+// stand-in for macro - not wrappable in pyrex
+void pysam_bam_destroy1( bam1_t * b );
+
+// stand-in for other samtools macros
+uint32_t * pysam_bam1_cigar( const bam1_t * b);
+char * pysam_bam1_qname( const bam1_t * b);
+uint8_t * pysam_bam1_seq( const bam1_t * b);
+uint8_t * pysam_bam1_qual( const bam1_t * b);
+uint8_t * pysam_bam1_aux( const bam1_t * b);
+
+/*!
+  @abstract Update the variable length data within a bam1_t entry
+
+  Old data is deleted and the data within b are re-arranged to 
+  make place for new data.
+  
+  @discussion Returns b
+
+  @param  b           bam1_t data
+  @param  nbytes_old  size of old data
+  @param  nbytes_new  size of new data
+  @param  pos         position of data
+*/
+bam1_t * pysam_bam_update( bam1_t * b,
+                          const size_t nbytes_old,
+                          const size_t nbytes_new,
+                          uint8_t * pos );
+
+// translate a nucleotide character to binary code
+unsigned char pysam_translate_sequence( const unsigned char s );
+
+
+#endif
diff --git a/samtools/bam.c b/samtools/bam.c
new file mode 100644 (file)
index 0000000..ee7642b
--- /dev/null
@@ -0,0 +1,303 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+#include "bam.h"
+#include "bam_endian.h"
+#include "kstring.h"
+#include "sam_header.h"
+
+int bam_is_be = 0;
+char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
+
+/**************************
+ * CIGAR related routines *
+ **************************/
+
+/* Return the reference coordinate just past the alignment: c->pos plus the
+ * lengths of all CIGAR operations that consume the reference (match,
+ * deletion, reference skip). */
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
+{
+       uint32_t end = c->pos;
+       uint32_t i = 0;
+       while (i < c->n_cigar) {
+               const uint32_t elem = cigar[i++];
+               switch (elem & BAM_CIGAR_MASK) {
+               case BAM_CMATCH:
+               case BAM_CDEL:
+               case BAM_CREF_SKIP:
+                       end += elem >> BAM_CIGAR_SHIFT;
+                       break;
+               default:
+                       break;
+               }
+       }
+       return end;
+}
+
+/* Return the query length implied by the CIGAR: the summed lengths of all
+ * operations that consume the query (match, insertion, soft clip). */
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
+{
+       int32_t qlen = 0;
+       uint32_t i = 0;
+       while (i < c->n_cigar) {
+               const uint32_t elem = cigar[i++];
+               const int op = elem & BAM_CIGAR_MASK;
+               if (op != BAM_CMATCH && op != BAM_CINS && op != BAM_CSOFT_CLIP) continue;
+               qlen += elem >> BAM_CIGAR_SHIFT;
+       }
+       return qlen;
+}
+
+/********************
+ * BAM I/O routines *
+ ********************/
+
+/* Allocate a zero-initialised header. As a side effect the global bam_is_be
+ * is set from bam_is_big_endian(). */
+bam_header_t *bam_header_init()
+{
+       bam_header_t *header;
+       bam_is_be = bam_is_big_endian();
+       header = (bam_header_t*)calloc(1, sizeof(bam_header_t));
+       return header;
+}
+
+/* Free a header and everything it owns: target names and lengths, the text,
+ * the parsed dictionary, the RG-to-library table and the header hash.
+ * Passing 0 is allowed. */
+void bam_header_destroy(bam_header_t *header)
+{
+       extern void bam_destroy_header_hash(bam_header_t *header);
+       if (header == 0) return;
+       if (header->target_name != 0) {
+               int32_t k = 0;
+               while (k < header->n_targets) free(header->target_name[k++]);
+               free(header->target_name);
+               free(header->target_len);
+       }
+       free(header->text);
+       if (header->dict != 0) sam_header_free(header->dict);
+       if (header->rg2lib != 0) sam_tbl_destroy(header->rg2lib);
+       bam_destroy_header_hash(header);
+       free(header);
+}
+
+/* Read the BAM header from fp: the magic string, the plain text and the
+ * name and length of every reference sequence.
+ * Returns a newly allocated header, or 0 if the magic string cannot be read
+ * or does not match. A missing EOF marker only produces a warning.
+ * NOTE(review): the results of the bam_read calls after the magic check and
+ * of the calloc calls are not checked, and the lengths read from the file
+ * are used as they are -- a truncated or corrupt file is not detected here. */
+bam_header_t *bam_header_read(bamFile fp)
+{
+       bam_header_t *header;
+       char buf[4];
+       int32_t i = 1, name_len;
+       // check EOF
+       i = bgzf_check_EOF(fp);
+       if (i < 0) {
+               // If the file is a pipe, checking the EOF marker will *always* fail
+               // with ESPIPE.  Suppress the error message in this case.
+               if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
+       }
+       else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n");
+       // read "BAM1"
+       if (bam_read(fp, buf, 4) != 4) return 0;
+       if (strncmp(buf, "BAM\001", 4)) {
+               fprintf(stderr, "[bam_header_read] wrong header\n");
+               return 0;
+       }
+       header = bam_header_init();
+       // read plain text and the number of reference sequences
+       bam_read(fp, &header->l_text, 4);
+       if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+       header->text = (char*)calloc(header->l_text + 1, 1); // one extra zero byte terminates the text
+       bam_read(fp, header->text, header->l_text);
+       bam_read(fp, &header->n_targets, 4);
+       if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+       // read reference sequence names and lengths
+       header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+       header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+       for (i = 0; i != header->n_targets; ++i) {
+               bam_read(fp, &name_len, 4);
+               if (bam_is_be) bam_swap_endian_4p(&name_len);
+               // bam_header_write stores strlen + 1 as the length, so the terminating zero is part of the stored name
+               header->target_name[i] = (char*)calloc(name_len, 1);
+               bam_read(fp, header->target_name[i], name_len);
+               bam_read(fp, &header->target_len[i], 4);
+               if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+       }
+       return header;
+}
+
+/* Write header to fp: the magic string, the plain text, the number of
+ * reference sequences and the name (including its terminating zero) and
+ * length of every reference. All 32-bit values are byte-swapped first when
+ * bam_is_be is set. Always returns 0. */
+int bam_header_write(bamFile fp, const bam_header_t *header)
+{
+       char magic[4];
+       int32_t k, word;
+       // write "BAM1"
+       strncpy(magic, "BAM\001", 4);
+       bam_write(fp, magic, 4);
+       // length of the plain text, followed by the text itself
+       word = header->l_text;
+       if (bam_is_be) word = bam_swap_endian_4(word);
+       bam_write(fp, &word, 4);
+       if (header->l_text) bam_write(fp, header->text, header->l_text);
+       // number of reference sequences
+       word = header->n_targets;
+       if (bam_is_be) word = bam_swap_endian_4(word);
+       bam_write(fp, &word, 4);
+       // one record per reference sequence: name length, name, sequence length
+       for (k = 0; k != header->n_targets; ++k) {
+               char *name = header->target_name[k];
+               int32_t name_len = strlen(name) + 1;
+               word = name_len;
+               if (bam_is_be) word = bam_swap_endian_4(word);
+               bam_write(fp, &word, 4);
+               bam_write(fp, name, name_len);
+               word = header->target_len[k];
+               if (bam_is_be) word = bam_swap_endian_4(word);
+               bam_write(fp, &word, 4);
+       }
+       return 0;
+}
+
+/* Byte-swap, in place, the multi-byte values inside the variable-length data
+ * of an alignment: every CIGAR element and every 2-, 4- or 8-byte auxiliary
+ * value. Query name, sequence, qualities and string tags are left untouched. */
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
+{
+       uint8_t *s;
+       uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); // the CIGAR follows the query name
+       s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; // start of the auxiliary data
+       for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
+       while (s < data + data_len) {
+               uint8_t type;
+               s += 2; // skip key
+               type = toupper(*s); ++s; // skip type
+               // toupper lets one branch handle both the upper- and the lower-case form of a type
+               if (type == 'C' || type == 'A') ++s;
+               else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
+               else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
+               else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
+               else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } // skip the string and its terminating zero
+       }
+}
+
+/* Read one alignment record from fp into b, growing b->data as required.
+ * Returns the number of bytes consumed (4 + block length) on success and a
+ * negative value otherwise:
+ *   -1  end of file (nothing could be read)
+ *   -2  the block length is truncated
+ *   -3  the fixed-size core is truncated
+ *   -4  the block length is too small to hold the core, memory for the
+ *       variable-length data cannot be allocated, or that data is truncated */
+int bam_read1(bamFile fp, bam1_t *b)
+{
+       bam1_core_t *c = &b->core;
+       int32_t block_len, ret, i;
+       uint32_t x[8];
+
+       assert(BAM_CORE_SIZE == 32);
+       if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+               if (ret == 0) return -1; // normal end-of-file
+               else return -2; // truncated
+       }
+       if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
+       if (bam_is_be) {
+               bam_swap_endian_4p(&block_len);
+               for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+       }
+       // a block shorter than the core would give a negative data length below
+       if (block_len < (int32_t)BAM_CORE_SIZE) return -4;
+       c->tid = x[0]; c->pos = x[1];
+       c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+       c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+       c->l_qseq = x[4];
+       c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+       b->data_len = block_len - BAM_CORE_SIZE;
+       if (b->m_data < b->data_len) {
+               int m_data = b->data_len;
+               uint8_t *data;
+               kroundup32(m_data);
+               // keep the old buffer until realloc has succeeded
+               data = (uint8_t*)realloc(b->data, m_data);
+               if (data == 0) return -4;
+               b->data = data;
+               b->m_data = m_data;
+       }
+       if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+       b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+       if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
+       return 4 + block_len;
+}
+
+/* Write one alignment record to fp: the block length, the core packed into
+ * eight 32-bit words and data_len bytes of variable-length data.
+ * Returns 4 + block length. The results of the bam_write calls are not checked.
+ * When bam_is_be is set, data is byte-swapped in place for writing and
+ * swapped back afterwards. */
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
+{
+       uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y;
+       int i;
+       assert(BAM_CORE_SIZE == 32);
+       // pack the core; this mirrors the unpacking in bam_read1
+       x[0] = c->tid;
+       x[1] = c->pos;
+       x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
+       x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
+       x[4] = c->l_qseq;
+       x[5] = c->mtid;
+       x[6] = c->mpos;
+       x[7] = c->isize;
+       if (bam_is_be) {
+               for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+               y = block_len; // swap a copy: block_len itself is still needed for the return value
+               bam_write(fp, bam_swap_endian_4p(&y), 4);
+               swap_endian_data(c, data_len, data);
+       } else bam_write(fp, &block_len, 4);
+       bam_write(fp, x, BAM_CORE_SIZE);
+       bam_write(fp, data, data_len);
+       if (bam_is_be) swap_endian_data(c, data_len, data); // restore the caller's data
+       return 4 + block_len;
+}
+
+/* Write alignment b to fp; returns the value of bam_write1_core. */
+int bam_write1(bamFile fp, const bam1_t *b)
+{
+       return bam_write1_core(fp, &b->core, b->data_len, b->data);
+}
+
+/* Format alignment b as one tab-separated SAM text line (without a trailing
+ * newline) and return it as a newly allocated string that the caller frees.
+ * of selects how the flag is written: BAM_OFDEC (decimal), BAM_OFHEX
+ * (hexadecimal) or anything else for the character representation taken
+ * from bam_flag2char_table. header supplies the reference names. */
+char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
+{
+       uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
+       int i;
+       const bam1_core_t *c = &b->core;
+       kstring_t str;
+       str.l = str.m = 0; str.s = 0;
+
+       ksprintf(&str, "%s\t", bam1_qname(b));
+       if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag);
+       else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
+       else { // BAM_OFSTR
+               for (i = 0; i < 16; ++i)
+                       if ((c->flag & 1<<i) && bam_flag2char_table[i])
+                               kputc(bam_flag2char_table[i], &str);
+               kputc('\t', &str);
+       }
+       if (c->tid < 0) kputs("*\t", &str);
+       else ksprintf(&str, "%s\t", header->target_name[c->tid]);
+       ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); // positions are stored 0-based and printed 1-based
+       if (c->n_cigar == 0) kputc('*', &str);
+       else {
+               for (i = 0; i < c->n_cigar; ++i)
+                       ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]);
+       }
+       kputc('\t', &str);
+       if (c->mtid < 0) kputs("*\t", &str);
+       else if (c->mtid == c->tid) kputs("=\t", &str);
+       else ksprintf(&str, "%s\t", header->target_name[c->mtid]);
+       ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize);
+       if (c->l_qseq) {
+               for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
+               kputc('\t', &str);
+               if (t[0] == 0xff) kputc('*', &str); // a first quality of 0xff is printed as '*'
+               else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+       } else ksprintf(&str, "*\t*");
+       s = bam1_aux(b);
+       while (s < b->data + b->data_len) {
+               uint8_t type, key[2];
+               key[0] = s[0]; key[1] = s[1];
+               s += 2; type = *s; ++s;
+               ksprintf(&str, "\t%c%c:", key[0], key[1]);
+               if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; }
+               else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; }
+               // 'c' is a signed byte: read it as int8_t so that negative values are not printed as 128..255
+               else if (type == 'c') { ksprintf(&str, "i:%d", *(int8_t*)s); ++s; }
+               else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; }
+               else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; }
+               else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; }
+               else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; }
+               else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
+               else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
+               else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; }
+       }
+       return str.s;
+}
+
+/* Format alignment b as a text line with the flag written in decimal
+ * (BAM_OFDEC); returns the string produced by bam_format1_core. */
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
+{
+       return bam_format1_core(header, b, BAM_OFDEC);
+}
+
+/* Print alignment b to stdout as one text line, formatted by bam_format1. */
+void bam_view1(const bam_header_t *header, const bam1_t *b)
+{
+       char *s = bam_format1(header, b);
+       printf("%s\n", s);
+       free(s); // the formatted string is owned by this function
+}
+
+// FIXME: we should also check the LB tag associated with each alignment
+/* Look up the library (LB) of alignment b through its RG tag.
+ * The parsed header dictionary and the RG-ID-to-LB table are created on
+ * first use and cached in h. Returns 0 if b has no RG tag, otherwise the
+ * result of the table lookup. */
+const char *bam_get_library(bam_header_t *h, const bam1_t *b)
+{
+       const uint8_t *rg;
+       if (h->dict == 0) h->dict = sam_header_parse2(h->text);
+       if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
+       rg = bam_aux_get(b, "RG");
+       if (rg == 0) return 0;
+       // rg + 1 skips the type byte in front of the tag value
+       return sam_tbl_get(h->rg2lib, (const char*)(rg + 1));
+}
diff --git a/samtools/bam.h b/samtools/bam.h
new file mode 100644 (file)
index 0000000..291b303
--- /dev/null
@@ -0,0 +1,697 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef BAM_BAM_H
+#define BAM_BAM_H
+
+/*!
+  @header
+
+  BAM library provides I/O and various operations on manipulating files
+  in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
+  format. It now supports importing from or exporting to TAM, sorting,
+  merging, generating pileup, and quick retrieval of reads overlapped
+  with a specified region.
+
+  @copyright Genome Research Ltd.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifndef BAM_LITE
+#define BAM_VIRTUAL_OFFSET16
+#include "bgzf.h"
+/*! @abstract BAM file handler */
+typedef BGZF *bamFile;
+#define bam_open(fn, mode) bgzf_open(fn, mode)
+#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
+#define bam_close(fp) bgzf_close(fp)
+#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
+#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
+#define bam_tell(fp) bgzf_tell(fp)
+#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
+#else
+#define BAM_TRUE_OFFSET
+#include <zlib.h>
+typedef gzFile bamFile;
+#define bam_open(fn, mode) gzopen(fn, mode)
+#define bam_dopen(fd, mode) gzdopen(fd, mode)
+#define bam_close(fp) gzclose(fp)
+#define bam_read(fp, buf, size) gzread(fp, buf, size)
+/* no bam_write/bam_tell/bam_seek() here */
+#endif
+
+/*! @typedef
+  @abstract Structure for the alignment header.
+  @field n_targets   number of reference sequences
+  @field target_name names of the reference sequences
+  @field target_len  lengths of the reference sequences
+  @field dict        header dictionary
+  @field hash        hash table for fast name lookup
+  @field rg2lib      hash table for @RG-ID -> LB lookup
+  @field l_text      length of the plain text in the header
+  @field text        plain text
+
+  @discussion Field hash points to null by default. It is a private
+  member.
+ */
+typedef struct {
+       int32_t n_targets;
+       char **target_name;
+       uint32_t *target_len;
+       void *dict, *hash, *rg2lib;
+       int l_text;
+       char *text;
+} bam_header_t;
+
+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
+#define BAM_FPAIRED        1
+/*! @abstract the read is mapped in a proper pair */
+#define BAM_FPROPER_PAIR   2
+/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
+#define BAM_FUNMAP         4
+/*! @abstract the mate is unmapped */
+#define BAM_FMUNMAP        8
+/*! @abstract the read is mapped to the reverse strand */
+#define BAM_FREVERSE      16
+/*! @abstract the mate is mapped to the reverse strand */
+#define BAM_FMREVERSE     32
+/*! @abstract this is read1 */
+#define BAM_FREAD1        64
+/*! @abstract this is read2 */
+#define BAM_FREAD2       128
+/*! @abstract not primary alignment */
+#define BAM_FSECONDARY   256
+/*! @abstract QC failure */
+#define BAM_FQCFAIL      512
+/*! @abstract optical or PCR duplicate */
+#define BAM_FDUP        1024
+
+#define BAM_OFDEC          0
+#define BAM_OFHEX          1
+#define BAM_OFSTR          2
+
+/*! @abstract default mask for pileup */
+#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
+
+#define BAM_CORE_SIZE   sizeof(bam1_core_t)
+
+/**
+ * Describing how CIGAR operation/length is packed in a 32-bit integer.
+ */
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+
+/*
+  CIGAR operations.
+ */
+/*! @abstract CIGAR: match */
+#define BAM_CMATCH      0
+/*! @abstract CIGAR: insertion to the reference */
+#define BAM_CINS        1
+/*! @abstract CIGAR: deletion from the reference */
+#define BAM_CDEL        2
+/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
+#define BAM_CREF_SKIP   3
+/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
+#define BAM_CSOFT_CLIP  4
+/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
+#define BAM_CHARD_CLIP  5
+/*! @abstract CIGAR: padding */
+#define BAM_CPAD        6
+
+/*! @typedef
+  @abstract Structure for core alignment information.
+  @field  tid     chromosome ID, defined by bam_header_t
+  @field  pos     0-based leftmost coordinate
+  @field  bin     bin calculated by bam_reg2bin()
+  @field  qual    mapping quality
+  @field  l_qname length of the query name
+  @field  flag    bitwise flag (BAM_F* bits); the strand is read from it with bam1_strand()
+  @field  n_cigar number of CIGAR operations
+  @field  l_qseq  length of the query sequence (read)
+  @field  mtid, mpos, isize  mate-related fields; presumably mate chromosome ID, mate position and insert size (TODO confirm)
+ */
+typedef struct {
+       int32_t tid;
+       int32_t pos;
+       uint32_t bin:16, qual:8, l_qname:8;
+       uint32_t flag:16, n_cigar:16;
+       int32_t l_qseq;
+       int32_t mtid;
+       int32_t mpos;
+       int32_t isize;
+} bam1_core_t;
+
+/*! @typedef
+  @abstract Structure for one alignment.
+  @field  core       core information about the alignment
+  @field  l_aux      length of auxiliary data
+  @field  data_len   current length of bam1_t::data
+  @field  m_data     maximum length of bam1_t::data
+  @field  data       all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+
+  @discussion Notes:
+   1. qname is null-terminated and core.l_qname includes the terminating '\0'.
+   2. l_qseq is calculated from the total length of an alignment block
+      on reading or from CIGAR.
+ */
+typedef struct {
+       bam1_core_t core;
+       int l_aux, data_len, m_data;
+       uint8_t *data;
+} bam1_t;
+
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
+#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
+
+/*! @function
+  @abstract  Get the CIGAR array
+  @param  b  pointer to an alignment
+  @return    pointer to the CIGAR array
+
+  @discussion In the CIGAR array, each element is a 32-bit integer. The
+  lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
+  length of a CIGAR.
+ */
+#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
+
+/*! @function
+  @abstract  Get the name of the query
+  @param  b  pointer to an alignment
+  @return    pointer to the name string, null terminated
+ */
+#define bam1_qname(b) ((char*)((b)->data))
+
+/*! @function
+  @abstract  Get query sequence
+  @param  b  pointer to an alignment
+  @return    pointer to sequence
+
+  @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+  8 for T and 15 for N. Two bases are packed in one byte with the base
+  at the higher 4 bits having smaller coordinate on the read. It is
+  recommended to use bam1_seqi() macro to get the base.
+ */
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
+
+/*! @function
+  @abstract  Get query quality
+  @param  b  pointer to an alignment
+  @return    pointer to quality string
+ */
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
+
+/*! @function
+  @abstract  Get a base on read
+  @param  s  Query sequence returned by bam1_seq()
+  @param  i  The i-th position, 0-based
+  @return    4-bit integer representing the base.
+ */
+#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+
+/*! @function
+  @abstract  Get auxiliary data
+  @param  b  pointer to an alignment
+  @return    pointer to the concatenated auxiliary data
+ */
+#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
+
+#ifndef kroundup32
+/*! @function
+  @abstract  Round an integer to the next closest power-2 integer.
+  @param  x  integer to be rounded (in place)
+  @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*!
+  @abstract Whether the machine is big-endian; modified only in
+  bam_header_init().
+ */
+extern int bam_is_be;
+
+/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
+extern unsigned char bam_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
+extern char *bam_nt16_rev_table;
+
+extern char bam_nt16_nt4_table[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /*! @abstract TAM file handler */
+       typedef struct __tamFile_t *tamFile;
+
+       /*!
+         @abstract   Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
+         @param  fn  SAM file name
+         @return     SAM file handler
+        */
+       tamFile sam_open(const char *fn);
+
+       /*!
+         @abstract   Close a SAM file handler
+         @param  fp  SAM file handler
+        */
+       void sam_close(tamFile fp);
+
+       /*!
+         @abstract      Read one alignment from a SAM file handler
+         @param  fp     SAM file handler
+         @param  header header information (ordered names of chromosomes)
+         @param  b      read alignment; all members in b will be updated
+         @return        0 if successful; otherwise negative
+        */
+       int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
+
+       /*!
+         @abstract       Read header information from a TAB-delimited list file.
+         @param  fn_list file name for the list
+         @return         a pointer to the header structure
+
+         @discussion Each line in this file consists of chromosome name and
+         the length of chromosome.
+        */
+       bam_header_t *sam_header_read2(const char *fn_list);
+
+       /*!
+         @abstract       Read header from a SAM file (if present)
+         @param  fp      SAM file handler
+         @return         pointer to header struct; 0 if no @SQ lines available
+        */
+       bam_header_t *sam_header_read(tamFile fp);
+
+       /*!
+         @abstract       Parse @SQ lines and update a header struct
+         @param  h       pointer to the header struct to be updated
+         @return         number of target sequences
+
+         @discussion bam_header_t::{n_targets,target_len,target_name} will
+         be destroyed in the first place.
+        */
+       int sam_header_parse(bam_header_t *h);
+
+       /*!
+         @abstract       Parse @RG lines and update a header struct
+         @param  h       pointer to the header struct to be updated
+         @return         number of @RG lines
+
+         @discussion bam_header_t::rg2lib will be destroyed in the first
+         place.
+        */
+       int sam_header_parse_rg(bam_header_t *h);
+
+#define sam_write1(header, b) bam_view1(header, b)
+
+       int bam_strmap_put(void *strmap, const char *rg, const char *lib);
+       const char *bam_strmap_get(const void *strmap, const char *rg);
+       void *bam_strmap_dup(const void*);
+       void *bam_strmap_init();
+       void bam_strmap_destroy(void *strmap);
+
+       /*!
+         @abstract Initialize a header structure.
+         @return   the pointer to the header structure
+
+         @discussion This function also modifies the global variable
+         bam_is_be.
+        */
+       bam_header_t *bam_header_init();
+
+       /*!
+         @abstract        Destroy a header structure.
+         @param  header  pointer to the header
+        */
+       void bam_header_destroy(bam_header_t *header);
+
+       /*!
+         @abstract   Read a header structure from BAM.
+         @param  fp  BAM file handler, opened by bam_open()
+         @return     pointer to the header structure
+
+         @discussion The file position indicator must be placed at the
+         beginning of the file. Upon success, the position indicator will
+         be set at the start of the first alignment.
+        */
+       bam_header_t *bam_header_read(bamFile fp);
+
+       /*!
+         @abstract      Write a header structure to BAM.
+         @param  fp     BAM file handler
+         @param  header pointer to the header structure
+         @return        always 0 currently
+        */
+       int bam_header_write(bamFile fp, const bam_header_t *header);
+
+       /*!
+         @abstract   Read an alignment from BAM.
+         @param  fp  BAM file handler
+         @param  b   read alignment; all members are updated.
+         @return     number of bytes read from the file
+
+         @discussion The file position indicator must be
+         placed right before an alignment. Upon success, this function
+         will set the position indicator to the start of the next
+         alignment. This function is not affected by the machine
+         endianness.
+        */
+       int bam_read1(bamFile fp, bam1_t *b);
+
+       /*!
+         @abstract Write an alignment to BAM.
+         @param  fp       BAM file handler
+         @param  c        pointer to the bam1_core_t structure
+         @param  data_len total length of variable size data related to
+                          the alignment
+         @param  data     pointer to the concatenated data
+         @return          number of bytes written to the file
+
+         @discussion This function is not affected by the machine
+         endianness.
+        */
+       int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
+
+       /*!
+         @abstract   Write an alignment to BAM.
+         @param  fp  BAM file handler
+         @param  b   alignment to write
+         @return     number of bytes written to the file
+
+         @discussion It is equivalent to:
+           bam_write1_core(fp, &b->core, b->data_len, b->data)
+        */
+       int bam_write1(bamFile fp, const bam1_t *b);
+
+       /*! @function
+         @abstract  Initiate a pointer to bam1_t struct
+        */
+#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
+
+       /*! @function
+         @abstract  Free the memory allocated for an alignment.
+         @param  b  pointer to an alignment
+        */
+#define bam_destroy1(b) do {                                   \
+               if (b) { free((b)->data); free(b); }    \
+       } while (0)
+
+       /*!
+         @abstract       Format a BAM record in the SAM format
+         @param  header  pointer to the header structure
+         @param  b       alignment to print
+         @return         a pointer to the SAM string
+        */
+       char *bam_format1(const bam_header_t *header, const bam1_t *b);
+
+       char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
+
+       const char *bam_get_library(bam_header_t *header, const bam1_t *b);
+
+       /*! @typedef
+         @abstract Structure for one alignment covering the pileup position.
+         @field  b      pointer to the alignment
+         @field  qpos   position of the read base at the pileup site, 0-based
+         @field  indel  indel length; 0 for no indel, positive for ins and negative for del
+         @field  is_del 1 iff the base on the padded read is a deletion
+         @field  level  the level of the read in the "viewer" mode
+
+         @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+         difference between the two functions is that the former does not
+         set bam_pileup1_t::level, while the latter does. Level helps the
+         implementation of alignment viewers, but calculating this has some
+         overhead.
+        */
+       typedef struct {
+               bam1_t *b;
+               int32_t qpos;
+               int indel, level;
+               uint32_t is_del:1, is_head:1, is_tail:1;
+       } bam_pileup1_t;
+
+       struct __bam_plbuf_t;
+       /*! @abstract pileup buffer */
+       typedef struct __bam_plbuf_t bam_plbuf_t;
+
+       void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+
+       /*! @typedef
+         @abstract    Type of function to be called by bam_plbuf_push().
+         @param  tid  chromosome ID as is defined in the header
+         @param  pos  start coordinate of the alignment, 0-based
+         @param  n    number of elements in pl array
+         @param  pl   array of alignments
+         @param  data user provided data
+         @discussion  See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
+        */
+       typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
+
+       /*!
+         @abstract     Reset a pileup buffer for another pileup process
+         @param  buf   the pileup buffer to be reset
+        */
+       void bam_plbuf_reset(bam_plbuf_t *buf);
+
+       /*!
+         @abstract     Initialize a buffer for pileup.
+         @param  func  function to be called by bam_pileup_core()
+         @param  data  user provided data
+         @return       pointer to the pileup buffer
+        */
+       bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
+
+       /*!
+         @abstract    Destroy a pileup buffer.
+         @param  buf  pointer to the pileup buffer
+        */
+       void bam_plbuf_destroy(bam_plbuf_t *buf);
+
+       /*!
+         @abstract    Push an alignment to the pileup buffer.
+         @param  b    alignment to be pushed
+         @param  buf  pileup buffer
+         @see         bam_plbuf_init()
+         @return      always 0 currently
+
+         @discussion If all the alignments covering a particular site have
+         been collected, this function will call the user defined function
+         as is provided to bam_plbuf_init(). The coordinate of the site and
+         all the alignments will be transferred to the user defined
+         function as function parameters.
+        
+         When all the alignments are pushed to the buffer, this function
+         needs to be called with b equal to NULL. This will flush the
+         buffer. A pileup buffer can only be reused when bam_plbuf_reset()
+         is called.
+        */
+       int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
+
+       int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
+
+       struct __bam_lplbuf_t;
+       typedef struct __bam_lplbuf_t bam_lplbuf_t;
+
+       void bam_lplbuf_reset(bam_lplbuf_t *buf);
+
+       /*! @abstract  bam_plbuf_init() equivalent with level calculated. */
+       bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
+
+       /*! @abstract  bam_plbuf_destroy() equivalent with level calculated. */
+       void bam_lplbuf_destroy(bam_lplbuf_t *tv);
+
+       /*! @abstract  bam_plbuf_push() equivalent with level calculated. */
+       int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
+
+       struct __bam_index_t;
+       typedef struct __bam_index_t bam_index_t;
+
+       /*!
+         @abstract   Build index for a BAM file.
+         @discussion Index file "fn.bai" will be created.
+         @param  fn  name of the BAM file
+         @return     always 0 currently
+        */
+       int bam_index_build(const char *fn);
+
+       /*!
+         @abstract   Load index from file "fn.bai".
+         @param  fn  name of the BAM file (NOT the index file)
+         @return     pointer to the index structure
+        */
+       bam_index_t *bam_index_load(const char *fn);
+
+       /*!
+         @abstract    Destroy an index structure.
+         @param  idx  pointer to the index structure
+        */
+       void bam_index_destroy(bam_index_t *idx);
+
+       /*! @typedef
+         @abstract      Type of function to be called by bam_fetch().
+         @param  b     the alignment
+         @param  data  user provided data
+        */
+       typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
+
+       /*!
+         @abstract Retrieve the alignments that are overlapped with the
+         specified region.
+
+         @discussion A user defined function will be called for each
+         retrieved alignment ordered by its start position.
+
+         @param  fp    BAM file handler
+         @param  idx   pointer to the alignment index
+         @param  tid   chromosome ID as is defined in the header
+         @param  beg   start coordinate, 0-based
+         @param  end   end coordinate, 0-based
+         @param  data  user provided data (will be transferred to func)
+         @param  func  user defined function
+        */
+       int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
+
+       /*!
+         @abstract       Parse a region in the format: "chr2:100,000-200,000".
+         @discussion     bam_header_t::hash will be initialized if empty.
+         @param  header  pointer to the header structure
+         @param  str     string to be parsed
+         @param  ref_id  the returned chromosome ID
+         @param  begin   the returned start coordinate
+         @param  end     the returned end coordinate
+         @return         0 on success; -1 on failure
+        */
+       int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+
+       /*!
+         @abstract       Retrieve data of a tag
+         @param  b       pointer to an alignment struct
+         @param  tag     two-character tag to be retrieved
+
+         @return  pointer to the type and data. The first character is the
+         type that can be 'iIsScCdfAZH'.
+
+         @discussion  Use bam_aux2?() series to convert the returned data to
+         the corresponding type.
+       */
+       uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
+
+       int32_t bam_aux2i(const uint8_t *s);
+       float bam_aux2f(const uint8_t *s);
+       double bam_aux2d(const uint8_t *s);
+       char bam_aux2A(const uint8_t *s);
+       char *bam_aux2Z(const uint8_t *s);
+
+       int bam_aux_del(bam1_t *b, uint8_t *s);
+       void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
+       uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
+
+       /*!  
+         @abstract Calculate the rightmost coordinate of an alignment on the
+         reference genome.
+
+         @param  c      pointer to the bam1_core_t structure
+         @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
+         @return        the rightmost coordinate, 0-based
+       */
+       uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
+
+       /*!
+         @abstract      Calculate the length of the query sequence from CIGAR.
+         @param  c      pointer to the bam1_core_t structure
+         @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
+         @return        length of the query sequence
+       */
+       int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*!
+  @abstract    Calculate the smallest bin that fully contains a region [beg,end).
+  @param  beg  start of the region, 0-based
+  @param  end  end of the region, 0-based, exclusive
+  @return      bin number; 0 when no smaller bin contains the whole region
+ */
+static inline int bam_reg2bin(uint32_t beg, uint32_t end)
+{
+       uint32_t last = end - 1; // last base covered by the half-open region
+       uint32_t first_bin = 4681; // number of the first bin at the finest (2^14-base) level
+       int shift;
+       // go from the finest to the coarsest level: bins start at 4681, 585, 73, 9 and 1
+       for (shift = 14; shift <= 26; shift += 3, first_bin >>= 3)
+               if (beg >> shift == last >> shift) return first_bin + (beg >> shift);
+       return 0;
+}
+
+/*!
+  @abstract     Copy an alignment
+  @param  bdst  destination alignment struct; its data buffer is reused and grown if too small
+  @param  bsrc  source alignment struct
+  @return       pointer to the destination alignment struct; NULL (bdst left untouched) if out of memory
+ */
+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+{
+       uint8_t *data = bdst->data;
+       int m_data = bdst->m_data;   // keep the destination's own buffer and capacity
+       if (bdst == bsrc) return bdst; // self-copy: nothing to do, and memcpy() on aliased buffers is undefined
+       if (m_data < bsrc->m_data) { // grow to the source capacity, rounded up to a power of two
+               m_data = bsrc->m_data; kroundup32(m_data);
+               data = (uint8_t*)realloc(data, m_data);
+               if (data == 0) return 0; // realloc failed: bdst->data is still valid and still owned by bdst
+       }
+       memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
+       *bdst = *bsrc; // copy core and lengths (this also overwrites data and m_data)
+       bdst->m_data = m_data; bdst->data = data; // put the destination's own buffer back
+       return bdst;
+}
+
+/*!
+  @abstract     Duplicate an alignment
+  @param  src   source alignment struct
+  @return       pointer to a newly allocated copy (release with bam_destroy1()); NULL if out of memory
+ */
+static inline bam1_t *bam_dup1(const bam1_t *src)
+{
+       bam1_t *b = bam_init1();
+       if (b == 0) return 0;
+       *b = *src; b->m_data = b->data_len; // shallow copy; the buffer below is sized exactly to the data
+       b->data = (uint8_t*)calloc(b->data_len, 1);
+       if (b->data == 0 && b->data_len > 0) { free(b); return 0; } // calloc(0, 1) may legitimately return NULL
+       if (b->data_len > 0) memcpy(b->data, src->data, b->data_len);
+       return b;
+}
+
+#endif
diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c
new file mode 100644 (file)
index 0000000..89e99f2
--- /dev/null
@@ -0,0 +1,182 @@
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+typedef char *str_p;
+KHASH_MAP_INIT_STR(s, int)
+KHASH_MAP_INIT_STR(r2l, str_p)
+
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+       int ori_len = b->data_len, new_len = ori_len + 3 + len; // 3 = two tag characters + one type byte
+       if (b->m_data < new_len) { // not enough room: grow to the next power of two
+               int m_data = new_len; uint8_t *grown;
+               kroundup32(m_data);
+               grown = (uint8_t*)realloc(b->data, m_data);
+               if (grown == 0) { fprintf(stderr, "[bam_aux_append] out of memory; tag not appended.\n"); return; }
+               b->data = grown; b->m_data = m_data;
+       }
+       b->data_len = new_len; b->l_aux += 3 + len; // lengths change only once the write is sure to fit
+       b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; b->data[ori_len + 2] = type;
+       memcpy(b->data + ori_len + 3, data, len); // len bytes of value follow tag and type
+}
+
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
+{
+       return bam_aux_get(b, tag); // pure alias: forwards both arguments to bam_aux_get() unchanged
+}
+
+#define __skip_tag(s) do { /* s points at a field's type byte; advance it past the type byte and the value */ \
+               int type = toupper(*(s)); /* fold lower/upper-case type codes: both cases share a width below */ \
+               ++(s);                                                                                                                  \
+               if (type == 'C' || type == 'A') ++(s);                                                  \
+               else if (type == 'S') (s) += 2;                                                                 \
+               else if (type == 'I' || type == 'F') (s) += 4;                                  \
+               else if (type == 'D') (s) += 8;                                                                 \
+               else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } /* string plus its '\0'; any other code skips only the type byte */ \
+       } while (0)
+
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+{
+       const int wanted = tag[0]<<8 | tag[1]; // pack the two tag characters into one key
+       uint8_t *end = b->data + b->data_len;
+       uint8_t *cur = bam1_aux(b);
+       while (cur < end) {
+               const int key = (int)cur[0]<<8 | cur[1];
+               cur += 2; // now at the type byte
+               if (key == wanted) return cur;
+               __skip_tag(cur); // step over the type byte and the value
+       }
+       return 0; // tag not present
+}
+// Remove one aux field in place; s MUST BE the pointer returned by bam_aux_get() for that field
+int bam_aux_del(bam1_t *b, uint8_t *s)
+{
+       uint8_t *aux_start = bam1_aux(b);
+       uint8_t *field = s - 2; // back up over the two-character tag
+       int removed;
+       __skip_tag(s); // s now points just past the field
+       removed = s - field;
+       memmove(field, s, b->l_aux - (s - aux_start)); // slide the remaining aux data down
+       b->data_len -= removed; b->l_aux -= removed;
+       return 0;
+}
+
+void bam_init_header_hash(bam_header_t *header)
+{
+       khash_t(s) *tbl;
+       khiter_t slot;
+       int absent, tid;
+       if (header->hash != 0) return; // table already built: nothing to do
+       tbl = kh_init(s);
+       header->hash = tbl;
+       for (tid = 0; tid < header->n_targets; ++tid) { // map every target name to its index
+               slot = kh_put(s, tbl, header->target_name[tid], &absent);
+               kh_value(tbl, slot) = tid;
+       }
+}
+
+void bam_destroy_header_hash(bam_header_t *header)
+{
+       if (header->hash) kh_destroy(s, (khash_t(s)*)header->hash);
+       header->hash = 0; // no dangling pointer: bam_init_header_hash() can safely rebuild the table later
+}
+
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
+{
+       khash_t(s) *tbl = (khash_t(s)*)header->hash; // not built here; presumably the caller ran bam_init_header_hash() first
+       khint_t slot = kh_get(s, tbl, seq_name);
+       if (slot == kh_end(tbl)) return -1; // unknown sequence name
+       return kh_value(tbl, slot);
+}
+
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end)
+{
+       /* Parse "name", "name:beg" or "name:beg-end" (commas and white space are ignored) into
+          *ref_id, *begin and *end.  Returns 0 on success, -1 for an unknown name or begin > end. */
+       char *s, *p;
+       int i, l, k;
+       khiter_t iter;
+       khash_t(s) *h;
+
+       bam_init_header_hash(header); // builds the name -> id table on first use
+       h = (khash_t(s)*)header->hash;
+
+       l = strlen(str);
+       s = (char*)malloc(l+1);
+       if (s == 0) { *ref_id = -1; return -1; } // out of memory: report as a failure
+       /* squeeze out "," and white space; the cast keeps isspace() defined for bytes >= 0x80 */
+       for (i = k = 0; i != l; ++i)
+               if (str[i] != ',' && !isspace((unsigned char)str[i])) s[k++] = str[i];
+       s[k] = 0;
+       for (i = 0; i != k; ++i) if (s[i] == ':') break;
+       s[i] = 0; // cut the name at ':' (or at the end when there is no ':')
+       iter = kh_get(s, h, s); /* get the ref_id */
+       if (iter == kh_end(h)) { // name not found
+               *ref_id = -1; free(s);
+               return -1;
+       }
+       *ref_id = kh_value(h, iter);
+       if (i == k) { /* no ':' given: the whole sequence, which is a valid region */
+               *begin = 0; *end = 1<<29; free(s);
+               return 0;
+       }
+       for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+       *begin = atoi(p);
+       *end = (i < k)? atoi(s + i + 1) : 1<<29; // open-ended when no '-' is present
+       if (*begin > 0) --*begin; // step a positive begin down by one
+       free(s);
+       if (*begin > *end) {
+               fprintf(stderr, "[bam_parse_region] invalid region.\n");
+               return -1;
+       }
+       return 0;
+}
+
+int32_t bam_aux2i(const uint8_t *s)
+{
+       if (s == 0) return 0; // no tag: bam_aux_get() returns 0 when the tag is absent
+       switch (*s) { // s[0] is the type code, the value starts at s + 1
+       case 'c': return (int32_t)*(int8_t*)(s + 1);
+       case 'C': return (int32_t)*(uint8_t*)(s + 1);
+       case 's': return (int32_t)*(int16_t*)(s + 1);
+       case 'S': return (int32_t)*(uint16_t*)(s + 1);
+       case 'i': case 'I': return *(int32_t*)(s + 1);
+       default: return 0; // not an integer type
+       }
+}
+
+float bam_aux2f(const uint8_t *s)
+{
+       int type;
+       if (s == 0) return 0.0; // check BEFORE dereferencing: bam_aux_get() returns 0 for a missing tag
+       type = *s++; // s[0] is the type code, the value follows it
+       if (type == 'f') return *(float*)s;
+       else return 0.0; // not a float field
+}
+
+double bam_aux2d(const uint8_t *s)
+{
+       int type;
+       if (s == 0) return 0.0; // check BEFORE dereferencing: bam_aux_get() returns 0 for a missing tag
+       type = *s++; // s[0] is the type code, the value follows it
+       if (type == 'd') return *(double*)s;
+       else return 0.0; // not a double field
+}
+
+char bam_aux2A(const uint8_t *s)
+{
+       int type;
+       if (s == 0) return 0; // check BEFORE dereferencing: bam_aux_get() returns 0 for a missing tag
+       type = *s++; // s[0] is the type code, the value follows it
+       if (type == 'A') return *(char*)s;
+       else return 0; // not a character field
+}
+
+char *bam_aux2Z(const uint8_t *s)
+{
+       int type;
+       if (s == 0) return 0; // check BEFORE dereferencing: bam_aux_get() returns 0 for a missing tag
+       type = *s++; // s[0] is the type code, the string follows it
+       if (type == 'Z' || type == 'H') return (char*)s;
+       else return 0; // not a string field
+}
diff --git a/samtools/bam_color.c b/samtools/bam_color.c
new file mode 100644 (file)
index 0000000..ce637f7
--- /dev/null
@@ -0,0 +1,127 @@
+#include <ctype.h>
+#include "bam.h"
+
+/*!
+ @abstract     Get the color encoding the previous and current base
+ @param b      pointer to an alignment
+ @param i      The i-th position, 0-based
+ @return       color
+
+ @discussion   Returns 0 if no color information is found.
+ */
+char bam_aux_getCSi(bam1_t *b, int i)
+{
+       uint8_t *c = bam_aux_get(b, "CS");
+       char *cs = NULL;
+       int len;
+       // return 0 if the tag was not found or does not hold a string
+       if(0 == c || 0 == (cs = bam_aux2Z(c))) return 0;
+       len = (int)strlen(cs);
+       // adjust for strandedness and leading adaptor
+       if(bam1_strand(b)) i = len - 1 - i;
+       else i++;
+       // positions outside the color string yield 0 instead of an out-of-bounds read
+       return (i < 0 || i >= len) ? 0 : cs[i];
+}
+
+/*!
+ @abstract     Get the color quality of the color encoding the previous and current base
+ @param b      pointer to an alignment
+ @param i      The i-th position, 0-based
+ @return       color quality
+
+ @discussion   Returns 0 if no color information is found.
+ */
+char bam_aux_getCQi(bam1_t *b, int i)
+{
+       uint8_t *c = bam_aux_get(b, "CQ");
+       char *cq = NULL;
+       int len;
+       // return 0 if the tag was not found or does not hold a string
+       if(0 == c || 0 == (cq = bam_aux2Z(c))) return 0;
+       len = (int)strlen(cq);
+       // adjust for strandedness
+       if(bam1_strand(b)) i = len - 1 - i;
+       // positions outside the quality string yield 0 instead of an out-of-bounds read
+       return (i < 0 || i >= len) ? 0 : cq[i];
+}
+
+char bam_aux_nt2int(char a)
+{
+       /* Map a nucleotide letter (either case) to its 2-bit code:
+        * A=0, C=1, G=2, T=3; anything else yields 4. */
+       // toupper() is undefined for negative char values, so convert
+       // through unsigned char first
+       switch(toupper((unsigned char)a)) {
+               case 'A':
+                       return 0;
+               case 'C':
+                       return 1;
+               case 'G':
+                       return 2;
+               case 'T':
+                       return 3;
+               default:
+                       // unknown or ambiguous base
+                       return 4;
+       }
+}
+
+char bam_aux_ntnt2cs(char a, char b)
+{
+       const char code_a = bam_aux_nt2int(a), code_b = bam_aux_nt2int(b);
+       // any base outside ACGT makes the color undefined, reported as '4'
+       if(code_a == 4 || code_b == 4) return '4';
+       // the color is the XOR of the two 2-bit base codes, as a digit character
+       return "0123"[code_a ^ code_b];
+}
+
+/*!
+ @abstract     Get the color error profile at the given position
+ @param b      pointer to an alignment
+ @return       the original color if the color was an error, '-' (dash) otherwise
+
+ @discussion   Returns 0 if no color information is found.
+ */
+char bam_aux_getCEi(bam1_t *b, int i)
+{
+       int cs_i; // index into the CS string that corresponds to base i
+       uint8_t *c = bam_aux_get(b, "CS");
+       char *cs = NULL;
+       char prev_b, cur_b;
+       char cur_color, cor_color; // observed color vs. color implied by the aligned bases
+
+       // return 0 if the tag was not found
+       if(0 == c) return 0;
+       
+       cs = bam_aux2Z(c); // NOTE(review): NULL if the tag is not of type Z/H; not checked here
+
+       // adjust for strandedness and leading adaptor
+       if(bam1_strand(b)) { //reverse strand
+               cs_i = strlen(cs) - 1 - i;
+               // get current color
+               cur_color = cs[cs_i];
+               // get previous base.  Note: must rc adaptor
+               prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+               // get current base
+               cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; 
+       }
+       else {
+               cs_i=i+1; // skip the adaptor base stored in cs[0]
+               // get current color
+               cur_color = cs[cs_i];
+               // get previous base; for the first position that is the adaptor itself
+               prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+               // get current base
+               cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+       }
+
+       // corrected color: the color the two aligned bases would encode
+       cor_color = bam_aux_ntnt2cs(prev_b, cur_b);
+
+       if(cur_color == cor_color) { 
+               return '-'; // observed color agrees with the bases
+       }
+       else {
+               return cur_color; // disagreement: report the observed color
+       }
+}
diff --git a/samtools/bam_endian.h b/samtools/bam_endian.h
new file mode 100644 (file)
index 0000000..0fc74a8
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef BAM_ENDIAN_H
+#define BAM_ENDIAN_H
+
+#include <stdint.h>
+
+static inline int bam_is_big_endian()
+{
+       const long probe = 1; // its lowest-addressed byte is 0 only on a big-endian host
+       return *(const char *)&probe == 0;
+}
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+       return (uint16_t)((v << 8) | (v >> 8)); // exchange the two bytes; the cast drops bits shifted past bit 15
+}
+static inline void *bam_swap_endian_2p(void *x) // byte-swap the 16-bit value stored at x, in place
+{
+       *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
+       return x; // same pointer, so the call can be used inline as an argument
+}
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+       const uint32_t halves = (v << 16) | (v >> 16); // swap the two 16-bit halves
+       return ((halves << 8) & 0xFF00FF00U) | ((halves >> 8) & 0x00FF00FFU); // then the bytes inside each half
+}
+static inline void *bam_swap_endian_4p(void *x) // byte-swap the 32-bit value stored at x, in place
+{
+       *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
+       return x; // same pointer, so the call can be used inline as an argument
+}
+static inline uint64_t bam_swap_endian_8(uint64_t v)
+{
+       v = (v << 32) | (v >> 32); // swap the 32-bit halves
+       v = ((v << 16) & 0xFFFF0000FFFF0000LLU) | ((v >> 16) & 0x0000FFFF0000FFFFLLU); // swap neighbouring 16-bit words
+       return ((v << 8) & 0xFF00FF00FF00FF00LLU) | ((v >> 8) & 0x00FF00FF00FF00FFLLU); // swap neighbouring bytes
+}
+static inline void *bam_swap_endian_8p(void *x) // byte-swap the 64-bit value stored at x, in place
+{
+       *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
+       return x; // same pointer, so the call can be used inline as an argument
+}
+
+#endif
diff --git a/samtools/bam_import.c b/samtools/bam_import.c
new file mode 100644 (file)
index 0000000..9d463d1
--- /dev/null
@@ -0,0 +1,439 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#ifdef _WIN32
+#include <fcntl.h>
+#endif
+#include "kstring.h"
+#include "bam.h"
+#include "sam_header.h"
+#include "kseq.h"
+#include "khash.h"
+
+KSTREAM_INIT(gzFile, gzread, 8192)
+KHASH_MAP_INIT_STR(ref, uint64_t)
+
+void bam_init_header_hash(bam_header_t *header);
+void bam_destroy_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+unsigned char bam_nt16_table[256] = {
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+        1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+unsigned short bam_char2flag_table[256] = {
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
+       BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+       0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
+};
+
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
+
+struct __tamFile_t { // reader state behind a tamFile handle (created by sam_open, released by sam_close)
+       gzFile fp; // zlib handle of the input file or stdin
+       kstream_t *ks; // buffered stream layered on fp
+       kstring_t *str; // token buffer reused for every field that is read
+       uint64_t n_lines; // lines consumed so far; used in parse error messages
+       int is_first; // set by sam_header_read: str already holds the first field of the first record
+};
+
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
+{
+       char **list = 0, *s; // list: growable array of heap-allocated line copies, handed to the caller
+       int n = 0, dret, m = 0; // n lines stored, m slots allocated
+       gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+       kstream_t *ks;
+       kstring_t *str;
+       if (fp == 0) { *_n = 0; return 0; } // unreadable input: report an empty list instead of streaming from a NULL handle
+       str = (kstring_t*)calloc(1, sizeof(kstring_t)); ks = ks_init(fp);
+       while (ks_getuntil(ks, '\n', str, &dret) > 0) {
+               if (n == m) { // array full: double it (16 slots initially)
+                       m = m? m << 1 : 16;
+                       list = (char**)realloc(list, m * sizeof(char*));
+               }
+               if (str->s[str->l-1] == '\r') // strip the CR of a CRLF line ending
+                       str->s[--str->l] = '\0';
+               s = list[n++] = (char*)calloc(str->l + 1, 1);
+               strcpy(s, str->s);
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       *_n = n; // number of lines returned
+       return list;
+}
+
+static bam_header_t *hash2header(const kh_ref_t *hash) // build a header from the name -> (length<<32 | index) table
+{
+       bam_header_t *header;
+       khiter_t k;
+       header = bam_header_init();
+       header->n_targets = kh_size(hash);
+       header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
+       header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
+       for (k = kh_begin(hash); k != kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       int i = (int)kh_value(hash, k); // low 32 bits: target index
+                       header->target_name[i] = (char*)kh_key(hash, k); // the key pointer itself is stored, not a copy
+                       header->target_len[i] = kh_value(hash, k)>>32; // high 32 bits: sequence length
+               }
+       }
+       bam_init_header_hash(header);
+       return header;
+}
+bam_header_t *sam_header_read2(const char *fn) // build a header from a list of "name length" lines; 0 if fn is NULL or cannot be opened
+{
+       bam_header_t *header;
+       int c, dret, ret;
+       gzFile fp;
+       kstream_t *ks;
+       kstring_t *str;
+       kh_ref_t *hash; // name -> (length << 32 | target index)
+       khiter_t k;
+       if (fn == 0) return 0;
+       fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+       if (fp == 0) return 0;
+       hash = kh_init(ref);
+       ks = ks_init(fp);
+       str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       while (ks_getuntil(ks, 0, str, &dret) > 0) { // first field: sequence name
+               char *s = strdup(str->s);
+               int len, i = kh_size(hash); // index the next new name will receive
+               ks_getuntil(ks, 0, str, &dret); // second field: sequence length
+               len = atoi(str->s);
+               k = kh_put(ref, hash, s, &ret);
+               if (ret == 0) free(s); // duplicate name: keep the first entry; storing index i again would reach past the arrays sized in hash2header()
+               else kh_value(hash, k) = (uint64_t)len<<32 | i;
+               if (dret != '\n')
+                       while ((c = ks_getc(ks)) != '\n' && c != -1); // discard the rest of the line
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+       header = hash2header(hash); // the header takes over the name strings stored as keys
+       kh_destroy(ref, hash);
+       return header;
+}
+static inline uint8_t *alloc_data(bam1_t *b, int size)
+{
+       // nothing to do while the current buffer already holds `size` bytes
+       if (size <= b->m_data) return b->data;
+       b->m_data = size;
+       kroundup32(b->m_data); // round the new capacity up before reallocating
+       b->data = (uint8_t*)realloc(b->data, b->m_data);
+       return b->data;
+}
+static inline void parse_error(int64_t n_lines, const char * __restrict msg) // report a fatal parse error; never returns
+{
+       fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg);
+       abort(); // terminates the process; callers rely on this to stop parsing
+}
+static inline void append_text(bam_header_t *header, kstring_t *str) // append str plus the delimiter byte stored at str->s[str->l] to header->text
+{
+       int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
+       kroundup32(x); kroundup32(y); // compare rounded-up sizes so the buffer is regrown only when y rounds higher than x
+       if (x < y) header->text = (char*)realloc(header->text, y);
+       strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
+       header->l_text += str->l + 1;
+       header->text[header->l_text] = 0; // keep the accumulated text null terminated
+}
+
+int sam_header_parse(bam_header_t *h) // rebuild target_name/target_len from the SQ records (SN, LN tags) of the header; returns the target count
+{
+       char **tmp;
+       int i;
+       free(h->target_len); free(h->target_name); // only the arrays are released here, not the strings target_name pointed to
+       h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+       if (h->l_text < 3) return 0; // no usable header text
+       if (h->dict == 0) h->dict = sam_header_parse2(h->text); // parse the text once and cache the result
+       tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets);
+       if (h->n_targets == 0) return 0;
+       h->target_name = calloc(h->n_targets, sizeof(void*));
+       for (i = 0; i < h->n_targets; ++i)
+               h->target_name[i] = strdup(tmp[i]); // own a copy of each name
+       free(tmp);
+       tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets); // NOTE(review): overwrites n_targets; assumes every SQ record has both SN and LN
+       h->target_len = calloc(h->n_targets, 4);
+       for (i = 0; i < h->n_targets; ++i)
+               h->target_len[i] = atoi(tmp[i]);
+       free(tmp);
+       return h->n_targets;
+}
+
+bam_header_t *sam_header_read(tamFile fp) // read the leading '@' lines of fp into a new header and parse them
+{
+       int ret, dret;
+       bam_header_t *header = bam_header_init();
+       kstring_t *str = fp->str;
+       while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
+               str->s[str->l] = dret; // note that str->s is NOT null terminated!!
+               append_text(header, str); // first field of the line plus its delimiter
+               if (dret != '\n') { // the line continues: append the remainder up to the newline
+                       ret = ks_getuntil(fp->ks, '\n', str, &dret);
+                       str->s[str->l] = '\n'; // NOT null terminated!!
+                       append_text(header, str);
+               }
+               ++fp->n_lines;
+       }
+       sam_header_parse(header);
+       bam_init_header_hash(header);
+       fp->is_first = 1; // fp->str now holds the first field of the first record; sam_read1 must reuse it
+       return header;
+}
+
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) // parse one SAM line into b; returns the field bytes consumed (z), or a negative code on EOF/truncation
+{
+       int ret, doff, doff0, dret, z = 0; // doff: write offset into b->data; z: sum of (field length + 1)
+       bam1_core_t *c = &b->core;
+       kstring_t *str = fp->str;
+       kstream_t *ks = fp->ks;
+
+       if (fp->is_first) { // the first field was already read by sam_header_read
+               fp->is_first = 0;
+               ret = str->l;
+       } else {
+               do { // special consideration for empty lines
+                       ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+                       if (ret >= 0) z += str->l + 1;
+               } while (ret == 0);
+       }
+       if (ret < 0) return -1; // end of input
+       ++fp->n_lines;
+       doff = 0;
+
+       { // name
+               c->l_qname = strlen(str->s) + 1;
+               memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
+               doff += c->l_qname;
+       }
+       { // flag
+               long flag;
+               char *s;
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+               flag = strtol((char*)str->s, &s, 0);
+               if (*s) { // not the end of the string
+                       flag = 0;
+                       for (s = str->s; *s; ++s) // letter-coded flags; index as unsigned so bytes >= 0x80 stay inside the table
+                               flag |= bam_char2flag_table[(unsigned char)*s];
+               }
+               c->flag = flag;
+       }
+       { // tid, pos, qual
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+               if (c->tid < 0 && strcmp(str->s, "*")) {
+                       if (header->n_targets == 0) {
+                               fprintf(stderr, "[sam_read1] missing header? Abort!\n");
+                               exit(1);
+                       } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
+               }
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
+               if (ret < 0) return -2;
+       }
+       { // cigar
+               char *s, *t;
+               int i, op;
+               long x;
+               c->n_cigar = 0;
+               if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
+               z += str->l + 1;
+               if (str->s[0] != '*') {
+                       for (s = str->s; *s; ++s) { // count operations; '=' is an operator too (it is decoded below) although it is not a letter
+                               if (isalpha((unsigned char)*s) || *s == '=') ++c->n_cigar;
+                               else if (!isdigit((unsigned char)*s)) parse_error(fp->n_lines, "invalid CIGAR character");
+                       }
+                       b->data = alloc_data(b, doff + c->n_cigar * 4);
+                       for (i = 0, s = str->s; i != c->n_cigar; ++i) {
+                               x = strtol(s, &t, 10);
+                               op = toupper(*t);
+                               if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
+                               else if (op == 'I') op = BAM_CINS;
+                               else if (op == 'D') op = BAM_CDEL;
+                               else if (op == 'N') op = BAM_CREF_SKIP;
+                               else if (op == 'S') op = BAM_CSOFT_CLIP;
+                               else if (op == 'H') op = BAM_CHARD_CLIP;
+                               else if (op == 'P') op = BAM_CPAD;
+                               else parse_error(fp->n_lines, "invalid CIGAR operation");
+                               s = t + 1;
+                               bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
+                       }
+                       if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
+                       c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b)));
+                       doff += c->n_cigar * 4;
+               } else {
+                       if (!(c->flag&BAM_FUNMAP)) {
+                               fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
+                               c->flag |= BAM_FUNMAP;
+                       }
+                       c->bin = bam_reg2bin(c->pos, c->pos + 1);
+               }
+       }
+       { // mtid, mpos, isize
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+               c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+               c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+               c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
+               if (ret < 0) return -4;
+       }
+       { // seq and qual
+               int i;
+               uint8_t *p = 0;
+               if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
+               z += str->l + 1;
+               if (strcmp(str->s, "*")) {
+                       c->l_qseq = strlen(str->s);
+                       if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
+                               parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+                       p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+                       memset(p, 0, (c->l_qseq+1)/2);
+                       for (i = 0; i < c->l_qseq; ++i) // two bases per byte, first base in the high nibble; unsigned index keeps bytes >= 0x80 inside the table
+                               p[i/2] |= bam_nt16_table[(unsigned char)str->s[i]] << 4*(1-i%2);
+               } else c->l_qseq = 0;
+               if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
+               z += str->l + 1;
+               if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
+                       parse_error(fp->n_lines, "sequence and quality are inconsistent");
+               p += (c->l_qseq+1)/2;
+               if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
+               else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
+               doff += c->l_qseq + (c->l_qseq+1)/2;
+       }
+       doff0 = doff; // aux data starts here
+       if (dret != '\n' && dret != '\r') { // aux
+               while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
+                       uint8_t *s, type, key[2];
+                       z += str->l + 1;
+                       if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
+                               parse_error(fp->n_lines, "missing colon in auxiliary data");
+                       key[0] = str->s[0]; key[1] = str->s[1];
+                       type = str->s[3];
+                       s = alloc_data(b, doff + 3) + doff;
+                       s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
+                       if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
+                               s = alloc_data(b, doff + 2) + doff;
+                               *s++ = 'A'; *s = str->s[5];
+                               doff += 2;
+                       } else if (type == 'I' || type == 'i') { // stored in the smallest integer type that holds the value
+                               long long x;
+                               s = alloc_data(b, doff + 5) + doff;
+                               x = (long long)atoll(str->s + 5);
+                               if (x < 0) {
+                                       if (x >= -127) {
+                                               *s++ = 'c'; *(int8_t*)s = (int8_t)x;
+                                               s += 1; doff += 2;
+                                       } else if (x >= -32767) {
+                                               *s++ = 's'; *(int16_t*)s = (int16_t)x;
+                                               s += 2; doff += 3;
+                                       } else {
+                                               *s++ = 'i'; *(int32_t*)s = (int32_t)x;
+                                               s += 4; doff += 5;
+                                               if (x < -2147483648ll)
+                                                       fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                                                       (long long)fp->n_lines, x);
+                                       }
+                               } else {
+                                       if (x <= 255) {
+                                               *s++ = 'C'; *s++ = (uint8_t)x;
+                                               doff += 2;
+                                       } else if (x <= 65535) {
+                                               *s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+                                               s += 2; doff += 3;
+                                       } else {
+                                               *s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+                                               s += 4; doff += 5;
+                                               if (x > 4294967295ll)
+                                                       fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                                                       (long long)fp->n_lines, x);
+                                       }
+                               }
+                       } else if (type == 'f') {
+                               s = alloc_data(b, doff + 5) + doff;
+                               *s++ = 'f';
+                               *(float*)s = (float)atof(str->s + 5);
+                               s += 4; doff += 5;
+                       } else if (type == 'd') {
+                               s = alloc_data(b, doff + 9) + doff;
+                               *s++ = 'd';
+                               *(double*)s = atof(str->s + 5); // the value text starts at offset 5 and all 8 reserved bytes must be written
+                               s += 8; doff += 9;
+                       } else if (type == 'Z' || type == 'H') {
+                               int size = 1 + (str->l - 5) + 1; // type byte + text + null terminator
+                               if (type == 'H') { // check whether the hex string is valid
+                                       int i;
+                                       if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+                                       for (i = 0; i < str->l - 5; ++i) {
+                                               int c = toupper(str->s[5 + i]);
+                                               if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
+                                                       parse_error(fp->n_lines, "invalid hex character");
+                                       }
+                               }
+                               s = alloc_data(b, doff + size) + doff;
+                               *s++ = type;
+                               memcpy(s, str->s + 5, str->l - 5);
+                               s[str->l - 5] = 0;
+                               doff += size;
+                       } else parse_error(fp->n_lines, "unrecognized type");
+                       if (dret == '\n' || dret == '\r') break; // last field of the line
+               }
+       }
+       b->l_aux = doff - doff0;
+       b->data_len = doff;
+       return z;
+}
+
+tamFile sam_open(const char *fn)
+{
+       tamFile reader;
+       gzFile handle = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb"); // "-" selects standard input
+       if (!handle) return 0; // could not open: no reader is created
+       reader = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+       reader->fp = handle;
+       reader->ks = ks_init(handle);
+       reader->str = (kstring_t*)calloc(1, sizeof(kstring_t)); // token buffer shared by all later reads
+       return reader;
+}
+
+void sam_close(tamFile fp)
+{
+       // closing a NULL handle is a no-op
+       if (fp == 0) return;
+       ks_destroy(fp->ks);
+       gzclose(fp->fp);
+       free(fp->str->s); free(fp->str);
+       free(fp);
+}
diff --git a/samtools/bam_index.c b/samtools/bam_index.c
new file mode 100644 (file)
index 0000000..a627884
--- /dev/null
@@ -0,0 +1,574 @@
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+/*!
+  @header
+
+  Alignment indexing. Before indexing, BAM must be sorted based on the
+  leftmost coordinate of alignments. In indexing, BAM uses two indices:
+  a UCSC binning index and a simple linear index. The binning index is
+  efficient for alignments spanning long distance, while the auxiliary
+  linear index helps to reduce unnecessary seek calls especially for
+  short alignments.
+
+  The UCSC binning scheme was suggested by Richard Durbin and Lincoln
+  Stein and is explained by Kent et al. (2002). In this scheme, each bin
+  represents a contiguous genomic region which can be fully contained in
+  another bin; each alignment is associated with a bin which represents
+  the smallest region containing the entire alignment. The binning
+  scheme is essentially another representation of R-tree. A distinct bin
+  uniquely corresponds to a distinct internal node in a R-tree. Bin A is
+  a child of Bin B if region A is contained in B.
+
+  In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
+  0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
+  585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
+  find the alignments overlapping a region [rbeg,rend), we need to
+  calculate the list of bins that may overlap the region and test
+  the alignments in the bins to confirm the overlaps. If the specified
+  region is short, typically only a few alignments in six bins need to
+  be retrieved. The overlapping alignments can be quickly fetched.
+
+ */
+
+#define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
+#define BAM_LIDX_SHIFT    14
+
+typedef struct { // one chunk of the file, as a pair of file offsets
+       uint64_t u, v; // u: offset where the chunk begins, v: offset where it ends (see insert_offset)
+} pair64_t;
+
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(off, pair64_t, pair64_lt)
+
+typedef struct { // growable list of chunks belonging to one bin
+       uint32_t m, n; // m: allocated slots, n: slots in use
+       pair64_t *list;
+} bam_binlist_t;
+
+typedef struct { // linear index of one target: one file offset per (1<<BAM_LIDX_SHIFT)-bp window
+       int32_t n, m; // n: windows in use, m: allocated windows
+       uint64_t *offset; // 0 means "not set yet" (see insert_offset2)
+} bam_lidx_t;
+
+KHASH_MAP_INIT_INT(i, bam_binlist_t)
+
+struct __bam_index_t { // complete index: one binning index and one linear index per target
+       int32_t n; // number of targets (taken from the BAM header)
+       khash_t(i) **index; // per target: bin number -> chunk list
+       bam_lidx_t *index2; // per target: linear index
+};
+
+// requirement: len <= LEN_MASK
+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
+{
+       int absent;
+       pair64_t *slot;
+       khint_t it = kh_put(i, h, bin, &absent);
+       bam_binlist_t *chunks = &kh_value(h, it);
+       if (absent) { // first chunk for this bin: start with room for one pair
+               chunks->n = 0; chunks->m = 1;
+               chunks->list = (pair64_t*)calloc(chunks->m, 16); // 16 bytes per pair64_t (two uint64_t)
+       }
+       if (chunks->n == chunks->m) { // list full: double its capacity
+               chunks->m <<= 1;
+               chunks->list = (pair64_t*)realloc(chunks->list, chunks->m * 16);
+       }
+       slot = &chunks->list[chunks->n++]; // append [beg, end) as the newest chunk
+       slot->u = beg; slot->v = end;
+}
+
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) // record `offset` in the linear index windows covered by alignment b
+{
+       int i, beg, end;
+       beg = b->core.pos >> BAM_LIDX_SHIFT; // window holding the first aligned base
+       end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; // window holding the last aligned base
+       if (index2->m < end + 1) { // grow the table and zero the newly added windows
+               int old_m = index2->m;
+               index2->m = end + 1;
+               kroundup32(index2->m);
+               index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+               memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
+       }
+       for (i = beg + 1; i <= end; ++i) // note: starts at beg + 1, so window beg itself is not written here
+               if (index2->offset[i] == 0) index2->offset[i] = offset; // the first offset recorded for a window is kept
+       index2->n = end + 1;
+}
+
+static void merge_chunks(bam_index_t *idx) // coalesce neighbouring chunks of every bin; a no-op unless one of the offset macros below is defined
+{
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+       khash_t(i) *index;
+       int i, l, m; // m: last kept chunk, l: chunk under consideration
+       khint_t k;
+       for (i = 0; i < idx->n; ++i) {
+               index = idx->index[i];
+               for (k = kh_begin(index); k != kh_end(index); ++k) {
+                       bam_binlist_t *p;
+                       if (!kh_exist(index, k)) continue;
+                       p = &kh_value(index, k);
+                       m = 0;
+                       for (l = 1; l < p->n; ++l) {
+#ifdef BAM_TRUE_OFFSET
+                               if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; // gap smaller than BAM_MIN_CHUNK_GAP: extend chunk m
+#else
+                               if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; // same upper 48 bits (presumably the same compressed block): extend chunk m
+#endif
+                               else p->list[++m] = p->list[l]; // otherwise keep chunk l as a separate entry
+                       } // ~for(l)
+                       p->n = m + 1;
+               } // ~for(k)
+       } // ~for(i)
+#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+}
+
+bam_index_t *bam_index_core(bamFile fp) // build the binning and linear indices by scanning every record of fp; exits if the input is not sorted
+{
+       bam1_t *b;
+       bam_header_t *h;
+       int i, ret;
+       bam_index_t *idx;
+       uint32_t last_bin, save_bin; // save_*: bin/target/start offset of the chunk being accumulated
+       int32_t last_coor, last_tid, save_tid;
+       bam1_core_t *c;
+       uint64_t save_off, last_off; // last_off: file offset at which the current record starts
+
+       idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+       b = (bam1_t*)calloc(1, sizeof(bam1_t));
+       h = bam_header_read(fp);
+       c = &b->core;
+
+       idx->n = h->n_targets; // only the target count is needed from the header
+       bam_header_destroy(h);
+       idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+       for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
+       idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+
+       save_bin = save_tid = last_tid = last_bin = 0xffffffffu; // sentinel: nothing seen yet
+       save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+       while ((ret = bam_read1(fp, b)) >= 0) {
+               if (last_tid != c->tid) { // change of chromosomes
+                       last_tid = c->tid;
+                       last_bin = 0xffffffffu; // force a new chunk for the new target
+               } else if (last_coor > c->pos) {
+                       fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
+                                       bam1_qname(b), last_coor, c->pos, c->tid+1);
+                       exit(1);
+               }
+               if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); // bins below 4681 are larger than the 16Kbp leaf bins
+               if (c->bin != last_bin) { // then possibly write the binning index
+                       if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+                               insert_offset(idx->index[save_tid], save_bin, save_off, last_off); // close the previous chunk at the start of this record
+                       save_off = last_off;
+                       save_bin = last_bin = c->bin;
+                       save_tid = c->tid;
+                       if (save_tid < 0) break; // record without a target: indexing stops here
+               }
+               if (bam_tell(fp) <= last_off) { // the file position must advance after every record
+                       fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
+                                       (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
+                       exit(1);
+               }
+               last_off = bam_tell(fp);
+               last_coor = b->core.pos;
+       }
+       if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); // close the final open chunk
+       merge_chunks(idx);
+       if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
+       free(b->data); free(b);
+       return idx;
+}
+
+/* Release a BAM index: the chunk list stored in every bin, the per-reference
+ * bin hash tables, the linear-index offset arrays and finally the index
+ * object itself. A null pointer is accepted and ignored. */
+void bam_index_destroy(bam_index_t *idx)
+{
+       int tid;
+       if (idx == 0) return;
+       for (tid = 0; tid < idx->n; ++tid) {
+               khash_t(i) *bins = idx->index[tid];
+               khint_t it = kh_begin(bins);
+               // only occupied buckets own a chunk list
+               while (it != kh_end(bins)) {
+                       if (kh_exist(bins, it)) free(kh_value(bins, it).list);
+                       ++it;
+               }
+               kh_destroy(i, bins);
+               free(idx->index2[tid].offset);
+       }
+       free(idx->index);
+       free(idx->index2);
+       free(idx);
+}
+
+/* Serialise `idx` to `fp`: the 4-byte magic "BAI\1", the number of
+ * references, then for every reference its binning index (bin count, and per
+ * bin the key, the chunk count and the 16-byte chunk records) followed by its
+ * linear index (entry count and 8-byte offsets).
+ * When bam_is_be is set every integer is byte-swapped before it is written;
+ * arrays are swapped in place for the write and swapped back afterwards, so
+ * the index holds its original values again on return.
+ * The stream is flushed but not closed; fwrite results are not checked. */
+void bam_index_save(const bam_index_t *idx, FILE *fp)
+{
+       int32_t i, size;
+       khint_t k;
+       fwrite("BAI\1", 1, 4, fp);
+       // number of references
+       if (bam_is_be) {
+               uint32_t x = idx->n;
+               fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+       } else fwrite(&idx->n, 4, 1, fp);
+       for (i = 0; i < idx->n; ++i) {
+               khash_t(i) *index = idx->index[i];
+               bam_lidx_t *index2 = idx->index2 + i;
+               // write binning index
+               size = kh_size(index);
+               if (bam_is_be) { // big endian
+                       uint32_t x = size;
+                       fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+               } else fwrite(&size, 4, 1, fp);
+               for (k = kh_begin(index); k != kh_end(index); ++k) {
+                       if (kh_exist(index, k)) {
+                               bam_binlist_t *p = &kh_value(index, k);
+                               if (bam_is_be) { // big endian
+                                       uint32_t x;
+                                       x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+                                       x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+                                       // swap the chunk list in place, write it, ...
+                                       for (x = 0; (int)x < p->n; ++x) {
+                                               bam_swap_endian_8p(&p->list[x].u);
+                                               bam_swap_endian_8p(&p->list[x].v);
+                                       }
+                                       fwrite(p->list, 16, p->n, fp);
+                                       // ... then swap it back to the original byte order
+                                       for (x = 0; (int)x < p->n; ++x) {
+                                               bam_swap_endian_8p(&p->list[x].u);
+                                               bam_swap_endian_8p(&p->list[x].v);
+                                       }
+                               } else {
+                                       fwrite(&kh_key(index, k), 4, 1, fp);
+                                       fwrite(&p->n, 4, 1, fp);
+                                       fwrite(p->list, 16, p->n, fp);
+                               }
+                       }
+               }
+               // write linear index (index2)
+               if (bam_is_be) {
+                       int x = index2->n;
+                       fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+               } else fwrite(&index2->n, 4, 1, fp);
+               if (bam_is_be) { // big endian
+                       int x;
+                       // same swap / write / swap-back scheme as for the chunk lists
+                       for (x = 0; (int)x < index2->n; ++x)
+                               bam_swap_endian_8p(&index2->offset[x]);
+                       fwrite(index2->offset, 8, index2->n, fp);
+                       for (x = 0; (int)x < index2->n; ++x)
+                               bam_swap_endian_8p(&index2->offset[x]);
+               } else fwrite(index2->offset, 8, index2->n, fp);
+       }
+       fflush(fp);
+}
+
+/* Parse an index from an already opened stream and return it as a newly
+ * allocated bam_index_t (release it with bam_index_destroy()).
+ * Layout read: 4-byte magic "BAI\1", the number of references, then per
+ * reference the binning index (bin count; per bin: key, chunk count, 16-byte
+ * chunks) followed by the linear index (entry count, 8-byte offsets). Values
+ * are byte-swapped after reading when bam_is_be is set.
+ * Returns 0 when `fp` is null or the magic does not match.
+ * The stream is never closed here: it belongs to the caller.
+ * NOTE(review): apart from the magic, read lengths and the counts taken from
+ * the file are not validated, so a truncated file yields a partial index. */
+static bam_index_t *bam_index_load_core(FILE *fp)
+{
+       int i;
+       char magic[4];
+       bam_index_t *idx;
+       if (fp == 0) {
+               fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+               return 0;
+       }
+       // A short read leaves `magic` undefined, so it is rejected as well.
+       // Do not fclose() here: the caller closes the stream after this call
+       // returns, and closing it twice is undefined behaviour.
+       if (fread(magic, 1, 4, fp) != 4 || strncmp(magic, "BAI\1", 4)) {
+               fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+               return 0;
+       }
+       idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+       fread(&idx->n, 4, 1, fp);
+       if (bam_is_be) bam_swap_endian_4p(&idx->n);
+       idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+       idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+       for (i = 0; i < idx->n; ++i) {
+               khash_t(i) *index;
+               bam_lidx_t *index2 = idx->index2 + i;
+               uint32_t key, size;
+               khint_t k;
+               int j, ret;
+               bam_binlist_t *p;
+               index = idx->index[i] = kh_init(i);
+               // load binning index
+               fread(&size, 4, 1, fp);
+               if (bam_is_be) bam_swap_endian_4p(&size);
+               for (j = 0; j < (int)size; ++j) {
+                       fread(&key, 4, 1, fp);
+                       if (bam_is_be) bam_swap_endian_4p(&key);
+                       k = kh_put(i, index, key, &ret);
+                       p = &kh_value(index, k);
+                       fread(&p->n, 4, 1, fp);
+                       if (bam_is_be) bam_swap_endian_4p(&p->n);
+                       p->m = p->n; // capacity equals the stored chunk count
+                       p->list = (pair64_t*)malloc(p->m * 16);
+                       fread(p->list, 16, p->n, fp);
+                       if (bam_is_be) {
+                               int x;
+                               for (x = 0; x < p->n; ++x) {
+                                       bam_swap_endian_8p(&p->list[x].u);
+                                       bam_swap_endian_8p(&p->list[x].v);
+                               }
+                       }
+               }
+               // load linear index
+               fread(&index2->n, 4, 1, fp);
+               if (bam_is_be) bam_swap_endian_4p(&index2->n);
+               index2->m = index2->n;
+               index2->offset = (uint64_t*)calloc(index2->m, 8);
+               fread(index2->offset, index2->n, 8, fp);
+               if (bam_is_be)
+                       for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+       }
+       return idx;
+}
+
+/* Load the index that belongs to BAM file `_fn` from the local file system.
+ * For an "ftp://" or "http://" name only the part after the last '/' is
+ * used, i.e. the index is looked up in the working directory.
+ * Two names are tried in order: "<fn>.bai", and, when <fn> ends in "bam",
+ * the same name with its last character replaced by 'i' ("x.bam" -> "x.bai").
+ * Returns the loaded index, or 0 when no index file could be opened or
+ * bam_index_load_core() rejected it. */
+bam_index_t *bam_index_load_local(const char *_fn)
+{
+       FILE *fp;
+       char *fnidx, *fn;
+       size_t len;
+
+       if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
+               // the prefix guarantees that at least one '/' is present
+               const char *p = strrchr(_fn, '/');
+               fn = strdup(p + 1);
+       } else fn = strdup(_fn);
+       len = strlen(fn);
+       fnidx = (char*)calloc(len + 5, 1);
+       strcpy(fnidx, fn); strcat(fnidx, ".bai");
+       fp = fopen(fnidx, "rb"); // the index is binary data
+       // Try "{base}.bai". The suffix is compared directly: searching for the
+       // first "bam" in the name misses files such as "bam_reads.bam".
+       if (fp == 0 && len >= 3 && strcmp(fn + len - 3, "bam") == 0) {
+               strcpy(fnidx, fn);
+               fnidx[len - 1] = 'i';
+               fp = fopen(fnidx, "rb");
+       }
+       free(fnidx); free(fn);
+       if (fp == 0) return 0;
+       {
+               bam_index_t *idx = bam_index_load_core(fp);
+               fclose(fp);
+               return idx;
+       }
+}
+
+#ifdef _USE_KNETFILE
+/* Copy the remote file `url` into the working directory, under the name that
+ * follows the last '/' of the URL. Names that do not start with "ftp://" or
+ * "http://" are ignored. Failures are reported on stderr only; the function
+ * has no return value. */
+static void download_from_remote(const char *url)
+{
+       const int buf_size = 1 * 1024 * 1024;
+       const char *fn;
+       FILE *fp;
+       uint8_t *buf;
+       knetFile *fp_remote;
+       int l;
+       if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
+       l = strlen(url);
+       for (fn = url + l - 1; fn >= url; --fn)
+               if (*fn == '/') break;
+       ++fn; // fn now points to the file name
+       fp_remote = knet_open(url, "r");
+       if (fp_remote == 0) {
+               fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+               return;
+       }
+       if ((fp = fopen(fn, "wb")) == 0) { // binary mode: the payload must not be translated
+               fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+               knet_close(fp_remote);
+               return;
+       }
+       buf = (uint8_t*)calloc(buf_size, 1);
+       // Stop on a zero count and also on a negative one: passed on to
+       // fwrite(), a negative count would be converted to a huge size.
+       while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+               fwrite(buf, 1, l, fp);
+       free(buf);
+       fclose(fp);
+       knet_close(fp_remote);
+}
+#else
+/* Built without _USE_KNETFILE: remote download is unavailable, do nothing. */
+static void download_from_remote(const char *url)
+{
+       return;
+}
+#endif
+
+/* Load the index of BAM file `fn`.
+ * A local index is tried first. If none is found and `fn` starts with
+ * "ftp://" or "http://", "<fn>.bai" is downloaded into the working directory
+ * and the local lookup is repeated.
+ * Returns the index, or 0 (after a message on stderr) when it cannot be
+ * loaded. */
+bam_index_t *bam_index_load(const char *fn)
+{
+       bam_index_t *idx;
+       idx = bam_index_load_local(fn);
+       if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
+               char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
+               strcat(strcpy(fnidx, fn), ".bai");
+               fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
+               download_from_remote(fnidx);
+               free(fnidx); // only needed for the download; previously leaked
+               idx = bam_index_load_local(fn);
+       }
+       if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
+       return idx;
+}
+
+/* Build the index of BAM file `fn` and write it to `_fnidx`, or to
+ * "<fn>.bai" when `_fnidx` is null.
+ * Returns 0 on success and -1 when the BAM file cannot be opened or the
+ * index file cannot be created (a message is printed on stderr). */
+int bam_index_build2(const char *fn, const char *_fnidx)
+{
+       char *fnidx;
+       FILE *fpidx;
+       bamFile fp;
+       bam_index_t *idx;
+       if ((fp = bam_open(fn, "r")) == 0) {
+               fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
+               return -1;
+       }
+       idx = bam_index_core(fp);
+       bam_close(fp);
+       if (_fnidx == 0) {
+               fnidx = (char*)calloc(strlen(fn) + 5, 1);
+               strcpy(fnidx, fn); strcat(fnidx, ".bai");
+       } else fnidx = strdup(_fnidx);
+       fpidx = fopen(fnidx, "wb"); // the index is binary data
+       if (fpidx == 0) {
+               fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
+               free(fnidx);
+               bam_index_destroy(idx); // the freshly built index was leaked on this path
+               return -1;
+       }
+       bam_index_save(idx, fpidx);
+       bam_index_destroy(idx);
+       fclose(fpidx);
+       free(fnidx);
+       return 0;
+}
+
+/* Build the index of `fn` under the default name chosen by
+ * bam_index_build2() when it is given no explicit index file name. */
+int bam_index_build(const char *fn)
+{
+       const char *default_name = 0;
+       return bam_index_build2(fn, default_name);
+}
+
+/* Command-line entry point: index <in.bam> [<out.index>].
+ * argv[1] is the BAM file and the optional argv[2] the index file to write.
+ * Returns 0 on success and 1 on a usage error or when the index could not be
+ * built, so that a failure is visible in the exit status. */
+int bam_index(int argc, char *argv[])
+{
+       int ret;
+       if (argc < 2) {
+               fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
+               return 1;
+       }
+       if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
+       else ret = bam_index_build(argv[1]);
+       return ret < 0? 1 : 0; // the builders report failure with a negative value
+}
+
+#define MAX_BIN 37450 // =(8^6-1)/7+1
+
+/* List the bins that may contain alignments overlapping [beg, end).
+ * Bin 0 is always listed first; the five finer levels start at bins 1, 9, 73,
+ * 585 and 4681 and cover windows of 2^26, 2^23, 2^20, 2^17 and 2^14 bases.
+ * `list` must have room for MAX_BIN entries. Returns the number of bins
+ * written, which is 0 for an empty or inverted interval.
+ * `end` is capped at 2^29, the largest coordinate for which the listed bins
+ * fit into MAX_BIN entries. */
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
+{
+       int i = 0, k;
+       if (beg >= end) return 0; // also keeps --end from wrapping when end == 0
+       if (end >= 1u<<29) end = 1u<<29; // a larger end would overrun list[]
+       --end;
+       list[i++] = 0;
+       for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
+       for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
+       for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
+       for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
+       for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+       return i;
+}
+
+/* Return non-zero when alignment `b` overlaps the interval [beg, end).
+ * The alignment starts at core.pos; its end comes from bam_calend() when a
+ * CIGAR is present and is core.pos + 1 otherwise. */
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+       const uint32_t aln_beg = b->core.pos;
+       uint32_t aln_end;
+       if (b->core.n_cigar) aln_end = bam_calend(&b->core, bam1_cigar(b));
+       else aln_end = b->core.pos + 1;
+       return aln_beg < end && aln_end > beg;
+}
+
+// bam_fetch helper function retrieves 
+pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off)
+{
+       uint16_t *bins;
+       int i, n_bins, n_off;
+       pair64_t *off;
+       khint_t k;
+       khash_t(i) *index;
+       uint64_t min_off;
+
+       bins = (uint16_t*)calloc(MAX_BIN, 2);
+       n_bins = reg2bins(beg, end, bins);
+       index = idx->index[tid];
+       min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+       for (i = n_off = 0; i < n_bins; ++i) {
+               if ((k = kh_get(i, index, bins[i])) != kh_end(index))
+                       n_off += kh_value(index, k).n;
+       }
+       if (n_off == 0) {
+               free(bins); return 0;
+       }
+       off = (pair64_t*)calloc(n_off, 16);
+       for (i = n_off = 0; i < n_bins; ++i) {
+               if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
+                       int j;
+                       bam_binlist_t *p = &kh_value(index, k);
+                       for (j = 0; j < p->n; ++j)
+                               if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+               }
+       }
+       free(bins);
+       {
+               bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               int l;
+               ks_introsort(off, n_off, off);
+               // resolve completely contained adjacent blocks
+               for (i = 1, l = 0; i < n_off; ++i)
+                       if (off[l].v < off[i].v)
+                               off[++l] = off[i];
+               n_off = l + 1;
+               // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+               for (i = 1; i < n_off; ++i)
+                       if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+               { // merge adjacent blocks
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+                       for (i = 1, l = 0; i < n_off; ++i) {
+#ifdef BAM_TRUE_OFFSET
+                               if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
+#else
+                               if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+#endif
+                               else off[++l] = off[i];
+                       }
+                       n_off = l + 1;
+#endif
+               }
+               bam_destroy1(b);
+       }
+       *cnt_off = n_off;
+       return off;
+}
+
+/* Call `func(b, data)` for every alignment in `fp` that overlaps [beg, end)
+ * on reference `tid`.
+ * The candidate chunks come from get_chunk_coordinates(); the file is read
+ * chunk by chunk and a seek is issued only when the next chunk does not start
+ * where the previous one ended. Reading stops at the first record on another
+ * reference or starting at or after `end`, when the chunks are exhausted, or
+ * when bam_read1() returns no record.
+ * Always returns 0, including when no chunk is found. */
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+       int n_off;
+       pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off);
+       if (off == 0) return 0;
+       {
+               // retrieve alignments
+               uint64_t curr_off;
+               int i, ret, n_seeks;
+               // i is the index of the chunk being read (-1: none yet);
+               // curr_off == 0 marks "nothing read so far"
+               n_seeks = 0; i = -1; curr_off = 0;
+               bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               for (;;) {
+                       if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
+                               if (i == n_off - 1) break; // no more chunks
+                               if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+                               if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+                                       bam_seek(fp, off[i+1].u, SEEK_SET);
+                                       curr_off = bam_tell(fp);
+                                       ++n_seeks;
+                               }
+                               ++i;
+                       }
+                       if ((ret = bam_read1(fp, b)) > 0) {
+                               curr_off = bam_tell(fp);
+                               if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
+                               else if (is_overlap(beg, end, b)) func(b, data);
+                       } else break; // end of file
+               }
+//             fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
+               bam_destroy1(b);
+       }
+       free(off);
+       return 0;
+}
diff --git a/samtools/bam_lpileup.c b/samtools/bam_lpileup.c
new file mode 100644 (file)
index 0000000..d4dd63b
--- /dev/null
@@ -0,0 +1,198 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "bam.h"
+#include "ksort.h"
+
+#define TV_GAP 2
+
+/* Node of the singly linked list of released display levels.
+ *   level - the released level (28 bits)
+ *   cnt   - countdown (4 bits): set to TV_GAP when a node is recycled and
+ *           decremented once per pileup column; a level is handed out again
+ *           only when the count of the list head has reached 0
+ *   next  - following node; the last node of the list has next == 0 */
+typedef struct __freenode_t {
+       uint32_t level:28, cnt:4;
+       struct __freenode_t *next;
+} freenode_t, *freenode_p;
+
+/* Ordering used to sort the list: smaller cnt first, then smaller level. */
+#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level))
+KSORT_INIT(node, freenode_p, freenode_lt)
+
+/* Memory pool, similar to the one in bam_pileup.c
+ *   cnt - number of nodes currently handed out (allocated minus returned)
+ *   n   - number of recycled nodes parked in buf
+ *   max - capacity of buf, in pointers
+ *   buf - stack of recycled nodes ready for reuse */
+typedef struct {
+       int cnt, n, max;
+       freenode_t **buf;
+} mempool_t;
+
+/* Create an empty pool; all counters and the buffer pointer start zeroed. */
+static mempool_t *mp_init()
+{
+       mempool_t *pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+       return pool;
+}
+/* Free every node parked in the pool, the parking array and the pool itself.
+ * Nodes that are still handed out are not touched. */
+static void mp_destroy(mempool_t *mp)
+{
+       int left = mp->n;
+       while (left > 0) free(mp->buf[--left]);
+       free(mp->buf);
+       free(mp);
+}
+/* Hand out a node: reuse the most recently parked one when available,
+ * otherwise allocate a zeroed node. The outstanding-node counter is bumped
+ * in both cases. */
+static inline freenode_t *mp_alloc(mempool_t *mp)
+{
+       freenode_t *node;
+       ++mp->cnt;
+       if (mp->n != 0) node = mp->buf[--mp->n];
+       else node = (freenode_t*)calloc(1, sizeof(freenode_t));
+       return node;
+}
+/* Return node `p` to the pool. The node is unlinked and its countdown reset
+ * to TV_GAP before it is parked; the parking array starts at 256 slots and
+ * doubles whenever it is full. */
+static inline void mp_free(mempool_t *mp, freenode_t *p)
+{
+       p->next = 0;
+       p->cnt = TV_GAP;
+       --mp->cnt;
+       if (mp->n == mp->max) { // no free slot left: grow the array
+               if (mp->max) mp->max <<= 1;
+               else mp->max = 256;
+               mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
+       }
+       mp->buf[mp->n] = p;
+       ++mp->n;
+}
+
+/* core part */
+/* State of the level-assigning pileup buffer.
+ *   max                  - capacity of cur_level and pre_level, in entries
+ *   n_cur, n_pre         - entries in use in cur_level / pre_level
+ *   max_level            - highest level in use in the last processed column
+ *   cur_level, pre_level - levels of the reads in the current column and of
+ *                          the reads carried over from the previous one
+ *   mp                   - pool the free-list nodes come from
+ *   aux                  - scratch array used to sort the free list
+ *   head, tail           - free list of released levels; tail is an unused
+ *                          sentinel node whose next is 0
+ *   n_nodes, m_aux       - number of list nodes before the sentinel, and the
+ *                          capacity of aux
+ *   func, user_data      - user callback and its argument
+ *   plbuf                - underlying pileup buffer that drives tview_func() */
+struct __bam_lplbuf_t {
+       int max, n_cur, n_pre;
+       int max_level, *cur_level, *pre_level;
+       mempool_t *mp;
+       freenode_t **aux, *head, *tail;
+       int n_nodes, m_aux;
+       bam_pileup_f func;
+       void *user_data;
+       bam_plbuf_t *plbuf;
+};
+
+/* Reset the buffer: reset the underlying pileup buffer, return every
+ * free-list node except the sentinel tail to the pool, and clear the level
+ * bookkeeping. */
+void bam_lplbuf_reset(bam_lplbuf_t *buf)
+{
+       freenode_t *node, *next;
+       bam_plbuf_reset(buf->plbuf);
+       node = buf->head;
+       while (node->next) {
+               next = node->next; // mp_free() clears the link, so read it first
+               mp_free(buf->mp, node);
+               node = next;
+       }
+       buf->head = buf->tail;
+       buf->n_nodes = 0;
+       buf->n_cur = 0;
+       buf->n_pre = 0;
+       buf->max_level = 0;
+}
+
+/* Pileup callback installed on the inner pileup buffer (see
+ * bam_lplbuf_init()). For the `n` reads in `pl` covering one position it
+ * assigns a level to each read, stores it in pl[i].level, and then forwards
+ * the column to the user callback tv->func.
+ *   - a read starting here (is_head) reuses the level at the head of the
+ *     free list if that node's countdown is 0, otherwise it gets a new level;
+ *   - any other read keeps the level it had in the previous column;
+ *   - the level of a read ending here (is_tail) is appended to the free list.
+ * Afterwards the free list is pruned of levels above the column's maximum and
+ * sorted by (cnt, level), and the levels of the surviving reads are saved for
+ * the next column. `data` is the bam_lplbuf_t. Always returns 0. */
+static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+       bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
+       freenode_t *p;
+       int i, l, max_level;
+       // allocate memory if necessary
+       if (tv->max < n) { // enlarge
+               tv->max = n;
+               kroundup32(tv->max);
+               tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
+               tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
+       }
+       tv->n_cur = n;
+       // update cnt
+       for (p = tv->head; p->next; p = p->next)
+               if (p->cnt > 0) --p->cnt;
+       // calculate cur_level[]
+       max_level = 0;
+       // l walks pre_level[]: reads that are not new appear in the same order
+       for (i = l = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               if (p->is_head) {
+                       if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
+                               freenode_t *p = tv->head->next;
+                               tv->cur_level[i] = tv->head->level;
+                               mp_free(tv->mp, tv->head);
+                               tv->head = p;
+                               --tv->n_nodes;
+                       } else tv->cur_level[i] = ++tv->max_level;
+               } else {
+                       tv->cur_level[i] = tv->pre_level[l++];
+                       if (p->is_tail) { // then return a free slot
+                               // the old sentinel receives the level; a new sentinel is appended
+                               tv->tail->level = tv->cur_level[i];
+                               tv->tail->next = mp_alloc(tv->mp);
+                               tv->tail = tv->tail->next;
+                               ++tv->n_nodes;
+                       }
+               }
+               if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
+               // the level is written through the const pointer into the caller's array
+               ((bam_pileup1_t*)p)->level = tv->cur_level[i];
+       }
+       assert(l == tv->n_pre);
+       tv->func(tid, pos, n, pl, tv->user_data);
+       // sort the linked list
+       if (tv->n_nodes) {
+               freenode_t *q;
+               if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
+                       tv->m_aux = tv->n_nodes + 1;
+                       kroundup32(tv->m_aux);
+                       tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
+               }
+               for (p = tv->head, i = l = 0; p->next;) {
+                       if (p->level > max_level) { // then discard this entry
+                               q = p->next;
+                               mp_free(tv->mp, p);
+                               p = q;
+                       } else {
+                               tv->aux[i++] = p;
+                               p = p->next;
+                       }
+               }
+               tv->aux[i] = tv->tail; // add a proper tail for the loop below
+               tv->n_nodes = i;
+               if (tv->n_nodes) {
+                       ks_introsort(node, tv->n_nodes, tv->aux);
+                       // relink the nodes in sorted order; the last one points at the sentinel
+                       for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1];
+                       tv->head = tv->aux[0];
+               } else tv->head = tv->tail;
+       }
+       // clean up
+       tv->max_level = max_level;
+       // NOTE(review): the byte count assumes a 4-byte int
+       memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4);
+       // squeeze out terminated levels
+       for (i = l = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               if (!p->is_tail)
+                       tv->pre_level[l++] = tv->pre_level[i];
+       }
+       tv->n_pre = l;
+/*
+       fprintf(stderr, "%d\t", pos+1);
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               if (p->is_head) fprintf(stderr, "^");
+               if (p->is_tail) fprintf(stderr, "$");
+               fprintf(stderr, "%d,", p->level);
+       }
+       fprintf(stderr, "\n");
+*/
+       return 0;
+}
+
+/* Create a level-assigning pileup buffer. `func` is called with `data` for
+ * every pileup column after the levels have been filled in. The free list
+ * starts as a single sentinel node, and the inner pileup buffer is wired to
+ * tview_func(). */
+bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
+{
+       bam_lplbuf_t *lp = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
+       lp->func = func;
+       lp->user_data = data;
+       lp->mp = mp_init();
+       lp->tail = mp_alloc(lp->mp);
+       lp->head = lp->tail;
+       lp->plbuf = bam_plbuf_init(tview_func, lp);
+       return lp;
+}
+
+/* Destroy the buffer: the level arrays, the inner pileup buffer, the sort
+ * scratch array, every free-list node (sentinel included), the node pool and
+ * the object itself. */
+void bam_lplbuf_destroy(bam_lplbuf_t *tv)
+{
+       freenode_t *node, *next;
+       free(tv->cur_level);
+       free(tv->pre_level);
+       bam_plbuf_destroy(tv->plbuf);
+       free(tv->aux);
+       // hand the whole list back to the pool, ending with the sentinel
+       node = tv->head;
+       do {
+               next = node->next; // mp_free() clears the link, so read it first
+               mp_free(tv->mp, node);
+               node = next;
+       } while (node);
+       assert(tv->mp->cnt == 0); // every node handed out must be back by now
+       mp_destroy(tv->mp);
+       free(tv);
+}
+
+/* Feed alignment `b` to the inner pileup buffer and return its result. */
+int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
+{
+       bam_plbuf_t *inner = tv->plbuf;
+       return bam_plbuf_push(b, inner);
+}
diff --git a/samtools/bam_maqcns.c b/samtools/bam_maqcns.c
new file mode 100644 (file)
index 0000000..71c2185
--- /dev/null
@@ -0,0 +1,601 @@
+#include <math.h>
+#include <assert.h>
+#include "bam.h"
+#include "bam_maqcns.h"
+#include "ksort.h"
+#include "kaln.h"
+KSORT_INIT_GENERIC(uint32_t)
+
+#define INDEL_WINDOW_SIZE 50
+#define INDEL_EXT_DEP 0.9
+
+/* Scratch buffer reused between calls to bam_maqcns_glfgen().
+ *   max  - capacity of info, in 32-bit entries
+ *   info - one packed 32-bit record per read of the pileup column */
+typedef struct __bmc_aux_t {
+       int max;
+       uint32_t *info;
+} bmc_aux_t;
+
+/* Per-column accumulators, one slot per base code 0..3.
+ *   esum     - sum of fk-weighted quality values
+ *   fsum     - sum of the fk weights
+ *   c        - number of reads counted for the base
+ *   rms_mapQ - root mean square of the capped mapping qualities */
+typedef struct {
+       float esum[4], fsum[4];
+       uint32_t c[4];
+       uint32_t rms_mapQ;
+} glf_call_aux_t;
+
+/* Maps a 4-bit base code to a 2-bit one: codes 1, 2, 4 and 8 map to 0, 1, 2
+ * and 3; every other code maps to 4 (presumably "not a single unambiguous
+ * base" -- confirm against the 4-bit encoding used by bam1_seqi). */
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+/*
+  P(<b1,b2>) = \theta \sum_{i=1}^{N-1} 1/i
+  P(D|<b1,b2>) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2]
+  p_k = 1/k / \sum_{i=1}^{N-1} 1/i
+
+  Rebuilds aa->lhet, a 256x256 table indexed by n1<<8|n2 that holds the
+  natural log of P(D|<b1,b2>) for allele counts n1 and n2 (N = aa->n_hap),
+  and sets aa->q_r from aa->het_rate. The log binomial coefficient is left
+  out of the table when aa->is_soap is set. Any previous table is freed.
+ */
+static void cal_het(bam_maqcns_t *aa)
+{
+       int k, n1, n2;
+       double sum_harmo; // harmonic sum
+       double poly_rate;
+
+       free(aa->lhet);
+       aa->lhet = (double*)calloc(256 * 256, sizeof(double));
+       sum_harmo = 0.0;
+       for (k = 1; k <= aa->n_hap - 1; ++k)
+               sum_harmo += 1.0 / k;
+       for (n1 = 0; n1 < 256; ++n1) {
+               for (n2 = 0; n2 < 256; ++n2) {
+                       long double sum = 0.0;
+                       double lC = aa->is_soap? 0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1}
+                       for (k = 1; k <= aa->n_hap - 1; ++k) {
+                               double pk = 1.0 / k / sum_harmo;
+                               double log1 = log((double)k/aa->n_hap);
+                               double log2 = log(1.0 - (double)k/aa->n_hap);
+                               // powers are evaluated as exp(n * log(x))
+                               sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2));
+                       }
+                       aa->lhet[n1<<8|n2] = lC + logl(sum);
+               }
+       }
+       poly_rate = aa->het_rate * sum_harmo;
+       // -4.343 is approximately -10/ln(10): natural log to a -10*log10 scale
+       aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate));
+}
+
+/** initialize the helper structure
+ *
+ * Rebuilds aa->fk (256 entries, fk[n] = theta^n * (1 - eta) + eta, fk[0] = 1)
+ * and, unless aa->is_soap is set, aa->coef: a table of 64 * 256 * 256 doubles
+ * indexed by q<<16 | n<<8 | k, filled for q in 1..63, n in 1..255 and
+ * k in 0..n. Previous tables are freed; with is_soap set, aa->coef is left
+ * null.
+ */
+static void cal_coef(bam_maqcns_t *aa)
+{
+       int k, n, q;
+       long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256];
+       double *lC;
+
+       // aa->lhet will be allocated and initialized 
+       free(aa->fk); free(aa->coef);
+       aa->coef = 0;
+       aa->fk = (double*)calloc(256, sizeof(double));
+       aa->fk[0] = fk2[0] = 1.0;
+       for (n = 1; n != 256; ++n) {
+               aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta;
+               fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands
+       }
+       if (aa->is_soap) return;
+       aa->coef = (double*)calloc(256*256*64, sizeof(double));
+       // lC[n<<8|k] = log C(n, k); entries with k == 0 keep calloc's 0, which is log C(n, 0)
+       lC = (double*)calloc(256 * 256, sizeof(double));
+       for (n = 1; n != 256; ++n)
+               for (k = 1; k <= n; ++k)
+                       lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+       for (q = 1; q != 64; ++q) {
+               double e = pow(10.0, -q/10.0);
+               double le = log(e);
+               double le1 = log(1.0-e);
+               for (n = 1; n != 256; ++n) {
+                       double *coef = aa->coef + (q<<16|n<<8);
+                       sum_a[n+1] = 0.0;
+                       for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k}
+                               sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1);
+                               b[k] = sum_a[k+1] / sum_a[k];
+                               if (b[k] > 0.99) b[k] = 0.99;
+                       }
+                       for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k})
+                               q_c[k] = -4.343 * fk2[k] * logl(b[k] / e);
+                       for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i
+                       for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9
+                               tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k])));
+                               coef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk}
+                       }
+               }
+       }
+       free(lC);
+}
+
+/* Allocate a consensus caller with its scratch buffer and the default
+ * parameters. The lookup tables are not built here; see
+ * bam_maqcns_prepare(). */
+bam_maqcns_t *bam_maqcns_init()
+{
+       bam_maqcns_t *bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t));
+       bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t));
+       // default parameters
+       bm->n_hap = 2;
+       bm->cap_mapQ = 60;
+       bm->theta = 0.85;
+       bm->eta = 0.03;
+       bm->het_rate = 0.001;
+       return bm;
+}
+
+/* Build the lookup tables from the current parameters: first the fk/coef
+ * tables, then the heterozygote table and q_r. */
+void bam_maqcns_prepare(bam_maqcns_t *bm)
+{
+       cal_coef(bm);
+       cal_het(bm);
+}
+
+/* Free the caller together with its lookup tables and scratch buffer.
+ * A null pointer is accepted and ignored. */
+void bam_maqcns_destroy(bam_maqcns_t *bm)
+{
+       if (bm != 0) {
+               free(bm->aux->info);
+               free(bm->aux);
+               free(bm->coef);
+               free(bm->fk);
+               free(bm->lhet);
+               free(bm);
+       }
+}
+
+glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm)
+{
+       glf_call_aux_t *b;
+       int i, j, k, w[8], c, n;
+       glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t));
+       float p[16], min_p = 1e30;
+       uint64_t rms;
+
+       g->ref_base = ref_base;
+       if (_n == 0) return g;
+
+       // construct aux array
+       if (bm->aux->max < _n) {
+               bm->aux->max = _n;
+               kroundup32(bm->aux->max);
+               bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max);
+       }
+       for (i = n = 0; i < _n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               uint32_t q, x = 0, qq;
+               if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue;
+               q = (uint32_t)bam1_qual(p->b)[p->qpos];
+               x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
+               if (p->b->core.qual < q) q = p->b->core.qual;
+               x |= q << 24;
+               qq = bam1_seqi(bam1_seq(p->b), p->qpos);
+               q = bam_nt16_nt4_table[qq? qq : ref_base];
+               if (!p->is_del && q < 4) x |= 1 << 21 | q << 16;
+               bm->aux->info[n++] = x;
+       }
+       ks_introsort(uint32_t, n, bm->aux->info);
+       // generate esum and fsum
+       b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t));
+       for (k = 0; k != 8; ++k) w[k] = 0;
+       rms = 0;
+       for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
+               uint32_t info = bm->aux->info[j];
+               int tmp;
+               if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
+               k = info>>16&7;
+               if (info>>24 > 0) {
+                       b->esum[k&3] += bm->fk[w[k]] * (info>>24);
+                       b->fsum[k&3] += bm->fk[w[k]];
+                       if (w[k] < 0xff) ++w[k];
+                       ++b->c[k&3];
+               }
+               tmp = (int)(info&0xff) < bm->cap_mapQ? (int)(info&0xff) : bm->cap_mapQ;
+               rms += tmp * tmp;
+       }
+       b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499);
+       // rescale ->c[]
+       for (j = c = 0; j != 4; ++j) c += b->c[j];
+       if (c > 255) {
+               for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5);
+               for (j = c = 0; j != 4; ++j) c += b->c[j];
+       }
+       if (!bm->is_soap) {
+               // generate likelihood
+               for (j = 0; j != 4; ++j) {
+                       // homozygous
+                       float tmp1, tmp3;
+                       int tmp2, bar_e;
+                       for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) {
+                               if (j == k) continue;
+                               tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k];
+                       }
+                       if (tmp2) {
+                               bar_e = (int)(tmp1 / tmp3 + 0.5);
+                               if (bar_e < 4) bar_e = 4; // should not happen
+                               if (bar_e > 63) bar_e = 63;
+                               p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+                       } else p[j<<2|j] = 0.0; // all the bases are j
+                       // heterozygous
+                       for (k = j + 1; k < 4; ++k) {
+                               for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) {
+                                       if (i == j || i == k) continue;
+                                       tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i];
+                               }
+                               if (tmp2) {
+                                       bar_e = (int)(tmp1 / tmp3 + 0.5);
+                                       if (bar_e < 4) bar_e = 4;
+                                       if (bar_e > 63) bar_e = 63;
+                                       p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+                               } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k
+                       }
+                       //
+                       for (k = 0; k != 4; ++k)
+                               if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0;
+               }
+
+               { // fix p[k<<2|k]
+                       float max1, max2, min1, min2;
+                       int max_k, min_k;
+                       max_k = min_k = -1;
+                       max1 = max2 = -1.0; min1 = min2 = 1e30;
+                       for (k = 0; k < 4; ++k) {
+                               if (b->esum[k] > max1) {
+                                       max2 = max1; max1 = b->esum[k]; max_k = k;
+                               } else if (b->esum[k] > max2) max2 = b->esum[k];
+                       }
+                       for (k = 0; k < 4; ++k) {
+                               if (p[k<<2|k] < min1) {
+                                       min2 = min1; min1 = p[k<<2|k]; min_k = k;
+                               } else if (p[k<<2|k] < min2) min2 = p[k<<2|k];
+                       }
+                       if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2))
+                               p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0;
+               }
+       } else { // apply the SOAP model
+               // generate likelihood
+               for (j = 0; j != 4; ++j) {
+                       float tmp;
+                       // homozygous
+                       for (k = 0, tmp = 0.0; k != 4; ++k)
+                               if (j != k) tmp += b->esum[k];
+                       p[j<<2|j] = tmp;
+                       // heterozygous
+                       for (k = j + 1; k < 4; ++k) {
+                               for (i = 0, tmp = 0.0; i != 4; ++i)
+                                       if (i != j && i != k) tmp += b->esum[i];
+                               p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp;
+                       }
+               }
+       }
+
+       // convert necessary information to glf1_t
+       g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ;
+       g->depth = n > 16777215? 16777215 : n;
+       for (j = 0; j != 4; ++j)
+               for (k = j; k < 4; ++k)
+                       if (p[j<<2|k] < min_p) min_p = p[j<<2|k];
+       g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5);
+       for (j = c = 0; j != 4; ++j)
+               for (k = j; k < 4; ++k)
+                       g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5);
+
+       free(b);
+       return g;
+}
+
+/*
+ * Pack the best and second-best genotypes of a glf1_t record into one word.
+ *
+ * g->lk[] holds the 10 unordered genotypes in the order (0,0),(0,1),...,(3,3);
+ * q_r is added to every heterozygous entry before ranking.
+ *
+ * Returns cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2, where cns/cns2 are
+ * 4-bit base masks (one bit per allele) of the best/second-best genotype,
+ * cnsQ is the gap between the two smallest values and cnsQ2 the gap between
+ * the second and third, both capped at 255. A mask of 0xf / a gap of 0xff is
+ * emitted when the corresponding genotype could not be determined.
+ */
+uint32_t glf2cns(const glf1_t *g, int q_r)
+{
+       int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1;
+       uint32_t x = 0;
+       // expand lk[] into a 4x4 table; the mirrored half is marked -1 so it is skipped below
+       for (i = k = 0; i < 4; ++i)
+               for (j = i; j < 4; ++j) {
+                       tmp[j<<2|i] = -1;
+                       tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r);
+               }
+       // track the three smallest values and the genotypes of the two smallest
+       for (i = 0; i < 16; ++i) {
+               if (tmp[i] < 0) continue;
+               if (tmp[i] < min) {
+                       min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i;
+               } else if (tmp[i] < min2) {
+                       min3 = min2; min2 = tmp[i]; min_g2 = i;
+               } else if (tmp[i] < min3) min3 = tmp[i];
+       }
+       // the fallback masks are unsigned: 0xf << 28 does not fit in a signed int
+       x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xfU << 28;
+       x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xfU << 24;
+       x |= (uint32_t)g->max_mapQ << 16;
+       x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8;
+       x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? min3 - min2 : 255) : 0xff;
+       return x;
+}
+
+/*
+ * Call the consensus for one pileup column of n reads.
+ * With no reads the result is 0xf in both genotype nibbles and zero elsewhere;
+ * otherwise the likelihoods from bam_maqcns_glfgen() (reference base 0xf) are
+ * packed by glf2cns() using bm->q_r rounded to the nearest integer.
+ */
+uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm)
+{
+       glf1_t *glf;
+       uint32_t cns;
+       if (n == 0) return 0xfU<<28 | 0xfU<<24; // empty column
+       glf = bam_maqcns_glfgen(n, pl, 0xf, bm);
+       cns = glf2cns(glf, (int)(bm->q_r + 0.5));
+       free(glf); // the record is only needed for the conversion
+       return cns;
+}
+
+/************** *****************/
+
+/*
+ * Allocate a zeroed bam_maqindel_opt_t and fill in the default parameters.
+ * The caller owns the returned block.
+ */
+bam_maqindel_opt_t *bam_maqindel_opt_init()
+{
+       bam_maqindel_opt_t *opt;
+       opt = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
+       // user-visible parameters
+       opt->q_indel = 40;
+       opt->r_indel = 0.00015;
+       // hidden parameters
+       opt->mm_penalty = 3;
+       opt->indel_err = 4;
+       opt->ambi_thres = 10;
+       return opt;
+}
+
+/*
+ * Release a result returned by bam_maqindel(): both indel strings and the
+ * structure itself. A null pointer is accepted and ignored.
+ */
+void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
+{
+       if (mir != 0) {
+               free(mir->s[0]);
+               free(mir->s[1]);
+               free(mir);
+       }
+}
+
+/*
+ * Map reference coordinate tpos onto the query (read) coordinate of an
+ * alignment by walking its CIGAR.
+ *
+ * c/cigar  alignment core and CIGAR array
+ * tpos     reference position to look up
+ * is_left  when tpos falls inside a deletion/skip, report the reference
+ *          position at the start (non-zero) or the end (zero) of that gap
+ * _tpos    out: the reference position actually matched
+ *
+ * Returns the query offset. If tpos lies beyond the alignment, *_tpos is the
+ * reference end of the alignment and the return value is the query offset
+ * just after the last match operation.
+ */
+int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
+{
+       int idx, ref_at = c->pos, qry_at = 0, qry_after_match = 0;
+       *_tpos = c->pos;
+       for (idx = 0; idx < c->n_cigar; ++idx) {
+               int op = cigar[idx] & BAM_CIGAR_MASK;
+               int len = cigar[idx] >> BAM_CIGAR_SHIFT;
+               if (op == BAM_CMATCH) {
+                       if (c->pos > tpos) return qry_at; // target lies left of the alignment
+                       if (ref_at + len > tpos) { // target is inside this match run
+                               *_tpos = tpos;
+                               return qry_at + (tpos - ref_at);
+                       }
+                       ref_at += len;
+                       qry_at += len;
+                       qry_after_match = qry_at;
+                       continue;
+               }
+               if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { // consumes query only
+                       qry_at += len;
+                       continue;
+               }
+               if (op == BAM_CDEL || op == BAM_CREF_SKIP) { // consumes reference only
+                       if (ref_at + len > tpos) {
+                               if (is_left) *_tpos = ref_at;
+                               else *_tpos = ref_at + len;
+                               return qry_at;
+                       }
+                       ref_at += len;
+               }
+       }
+       *_tpos = ref_at;
+       return qry_after_match;
+}
+
+// Bias added to signed indel lengths in bam_maqindel() so that they can be
+// stored and sorted as uint32_t; it is subtracted again when types[] is filled.
+#define MINUS_CONST 0x10000000
+
+/*
+ * Evaluate candidate indels located right after reference position pos.
+ *
+ * n, pl      the pileup column: n reads and their pileup records
+ * pos        reference position the indel follows
+ * mi         options; only q_indel and r_indel are read here
+ * ref        NUL-terminated reference sequence of the current target
+ * _n_types, _types
+ *            optional extra indel lengths to consider (>0 insertion,
+ *            <0 deletion; zero entries are ignored)
+ *
+ * Returns 0 when _n_types is 0 and no mapped read carries an indel at this
+ * column. Otherwise returns a newly allocated bam_maqindel_ret_t describing
+ * the two best-supported indel types; release it with
+ * bam_maqindel_ret_destroy().
+ */
+bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+                                                                int _n_types, int *_types)
+{
+       int i, j, n_types, *types, left, right, max_rd_len = 0;
+       bam_maqindel_ret_t *ret = 0;
+       // if there is no proposed indel, check if there is an indel from the alignment
+       if (_n_types == 0) {
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+               }
+               if (i == n) return 0; // no indel
+       }
+       { // calculate how many types of indels are available (set n_types and types)
+               int m;
+               uint32_t *aux;
+               // room for the zero type, one entry per read and every proposed type
+               aux = (uint32_t*)calloc(n + _n_types + 1, 4);
+               m = 0;
+               aux[m++] = MINUS_CONST; // zero indel is always a type
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
+                               aux[m++] = MINUS_CONST + p->indel;
+                       // also track the longest read; it sizes the rs buffer below
+                       j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
+                       if (j > max_rd_len) max_rd_len = j;
+               }
+               if (_n_types) // then also add this to aux[]
+                       for (i = 0; i < _n_types; ++i)
+                               if (_types[i]) aux[m++] = MINUS_CONST + _types[i];
+               // biased values sort in the same order as the signed lengths
+               ks_introsort(uint32_t, m, aux);
+               // squeeze out identical types
+               for (i = 1, n_types = 1; i < m; ++i)
+                       if (aux[i] != aux[i-1]) ++n_types;
+               types = (int*)calloc(n_types, sizeof(int));
+               j = 0;
+               types[j++] = aux[0] - MINUS_CONST; 
+               for (i = 1; i < m; ++i) {
+                       if (aux[i] != aux[i-1])
+                               types[j++] = aux[i] - MINUS_CONST;
+               }
+               free(aux);
+       }
+       { // calculate left and right boundary
+               left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+               right = pos + INDEL_WINDOW_SIZE;
+               // types[] is ascending, so types[0] is the longest deletion if any
+               if (types[0] < 0) right -= types[0];
+               // in case the alignments stand out the reference
+               for (i = pos; i < right; ++i)
+                       if (ref[i] == 0) break;
+               right = i;
+       }
+       { // the core part
+               char *ref2, *rs, *inscns = 0;
+               // types[] is ascending, so the last entry is the longest insertion (or 0)
+               int k, l, *score, *pscore, max_ins = types[n_types-1];
+               if (max_ins > 0) { // get the consensus of inserted sequences
+                       // layout: [type][position within insertion][base 0..3]
+                       int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
+                       // count occurrences
+                       for (i = 0; i < n_types; ++i) {
+                               if (types[i] <= 0) continue; // not insertion
+                               for (j = 0; j < n; ++j) {
+                                       const bam_pileup1_t *p = pl + j;
+                                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) {
+                                               for (k = 1; k <= p->indel; ++k) {
+                                                       int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)];
+                                                       if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c];
+                                               }
+                                       }
+                               }
+                       }
+                       // construct the consensus of inserted sequence
+                       inscns = (char*)calloc(n_types * max_ins, sizeof(char));
+                       for (i = 0; i < n_types; ++i) {
+                               for (j = 0; j < types[i]; ++j) {
+                                       int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4;
+                                       for (k = 0; k < 4; ++k) {
+                                               if (ia[k] > max) {
+                                                       max = ia[k];
+                                                       max_k = k;
+                                               }
+                                       }
+                                       // one bit per base; 15 when no read voted for this position
+                                       inscns[i*max_ins + j] = max? 1<<max_k : 15;
+                               }
+                       }
+                       free(inscns_aux);
+               }
+               // calculate score
+               // score[] and pscore[] are indexed [type * n + read]; entries of
+               // unmapped reads keep the zero written by calloc
+               ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
+               rs   = (char*)calloc(right - left + max_rd_len + types[n_types-1] + 2, 1);
+               score = (int*)calloc(n_types * n, sizeof(int));
+               pscore = (int*)calloc(n_types * n, sizeof(int));
+               for (i = 0; i < n_types; ++i) {
+                       ka_param_t ap = ka_param_blast;
+                       ap.band_width = 2 * types[n_types - 1] + 2;
+                       // write ref2
+                       // ref2 = reference up to and including pos, then the indel
+                       // (skipped bases for a deletion, consensus bases for an
+                       // insertion), then the rest of the window
+                       for (k = 0, j = left; j <= pos; ++j)
+                               ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
+                       if (types[i] <= 0) j += -types[i];
+                       else for (l = 0; l < types[i]; ++l)
+                                        ref2[k++] = bam_nt16_nt4_table[(int)inscns[i*max_ins + l]];
+                       for (; j < right && ref[j]; ++j)
+                               ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
+                       // note: this shrinks right for all later types as well
+                       if (j < right) right = j;
+                       // calculate score for each read
+                       for (j = 0; j < n; ++j) {
+                               const bam_pileup1_t *p = pl + j;
+                               int qbeg, qend, tbeg, tend;
+                               if (p->b->core.flag & BAM_FUNMAP) continue;
+                               // clip the read to the [left, right) window
+                               qbeg = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), left,  0, &tbeg);
+                               qend = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend);
+                               assert(tbeg >= left);
+                               for (l = qbeg; l < qend; ++l)
+                                       rs[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), l)];
+                               {
+                                       int x, y, n_acigar, ps;
+                                       uint32_t *acigar;
+                                       ps = 0;
+                                       // nothing left of the reference to align against: give
+                                       // this read the worst possible scores for this type
+                                       if (tend - tbeg + types[i] <= 0) {
+                                               score[i*n+j] = -(1<<20);
+                                               pscore[i*n+j] = 1<<20;
+                                               continue;
+                                       }
+                                       acigar = ka_global_core((uint8_t*)ref2 + tbeg - left, tend - tbeg + types[i], (uint8_t*)rs, qend - qbeg, &ap, &score[i*n+j], &n_acigar);
+                                       // ps accumulates base qualities at mismatches plus
+                                       // q_indel per inserted/deleted base of the re-alignment
+                                       x = tbeg - left; y = 0;
+                                       for (l = 0; l < n_acigar; ++l) {
+                                               int op = acigar[l]&0xf;
+                                               int len = acigar[l]>>4;
+                                               if (op == BAM_CMATCH) {
+                                                       int k;
+                                                       for (k = 0; k < len; ++k)
+                                                               if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k];
+                                                       x += len; y += len;
+                                               } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                                                       if (op == BAM_CINS) ps += mi->q_indel * len;
+                                                       y += len;
+                                               } else if (op == BAM_CDEL) {
+                                                       ps += mi->q_indel * len;
+                                                       x += len;
+                                               }
+                                       }
+                                       pscore[i*n+j] = ps;
+                                       /*if (pos == 2618517) { // for debugging only
+                                               fprintf(stderr, "pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend);
+                                               for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); fprintf(stderr, "\n");
+                                               for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l]], stderr); fputc('\n', stderr);
+                                               for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); fputc('\n', stderr);
+                                               }*/
+                                       free(acigar);
+                               }
+                       }
+               }
+               { // get final result
+                       int *sum, max1, max2, max1_i, max2_i;
+                       // pick up the best two score
+                       // (a type's score is the negated sum of its pscore over all reads)
+                       sum = (int*)calloc(n_types, sizeof(int));
+                       for (i = 0; i < n_types; ++i)
+                               for (j = 0; j < n; ++j)
+                                       sum[i] += -pscore[i*n+j];
+                       max1 = max2 = -0x7fffffff; max1_i = max2_i = -1;
+                       for (i = 0; i < n_types; ++i) {
+                               if (sum[i] > max1) {
+                                       max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i;
+                               } else if (sum[i] > max2) {
+                                       max2 = sum[i]; max2_i = i;
+                               }
+                       }
+                       free(sum);
+                       // write ret
+                       // NOTE(review): if n_types is 1 (only reachable when _n_types != 0
+                       // and no non-zero type was collected) max2_i stays -1 and
+                       // types[max2_i] below indexes before the array -- confirm callers
+                       ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));
+                       ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i];
+                       // one sign character + sequence + terminating NUL
+                       ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1);
+                       ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1);
+                       // write indel sequence
+                       // "+SEQ" = insertion consensus, "-SEQ" = deleted reference bases, "*" = no indel
+                       if (ret->indel1 > 0) {
+                               ret->s[0][0] = '+';
+                               for (k = 0; k < ret->indel1; ++k)
+                                       ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
+                       } else if (ret->indel1 < 0) {
+                               ret->s[0][0] = '-';
+                               for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k)
+                                       ret->s[0][k+1] = ref[pos + k + 1];
+                       } else ret->s[0][0] = '*';
+                       if (ret->indel2 > 0) {
+                               ret->s[1][0] = '+';
+                               for (k = 0; k < ret->indel2; ++k)
+                                       ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
+                       } else if (ret->indel2 < 0) {
+                               ret->s[1][0] = '-';
+                               for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)
+                                       ret->s[1][k+1] = ref[pos + k + 1];
+                       } else ret->s[1][0] = '*';
+                       // write count
+                       // (unlike the scoring above, unmapped reads are not filtered out here)
+                       for (i = 0; i < n; ++i) {
+                               const bam_pileup1_t *p = pl + i;
+                               if (p->indel == ret->indel1) ++ret->cnt1;
+                               else if (p->indel == ret->indel2) ++ret->cnt2;
+                               else ++ret->cnt_anti;
+                       }
+                       { // write gl[]
+                               int tmp, seq_err = 0;
+                               double x = 1.0;
+                               tmp = max1_i - max2_i;
+                               if (tmp < 0) tmp = -tmp;
+                               // seq_err caps the contribution of a single read:
+                               // q_indel * (1 - d^(tmp+1)) / (1 - d) with d = INDEL_EXT_DEP
+                               for (j = 0; j < tmp + 1; ++j) x *= INDEL_EXT_DEP;
+                               seq_err = mi->q_indel * (1.0 - x) / (1.0 - INDEL_EXT_DEP);
+                               ret->gl[0] = ret->gl[1] = 0;
+                               for (j = 0; j < n; ++j) {
+                                       int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
+                                       //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2);
+                                       // a read that fits type 2 better counts against type 1 and vice versa
+                                       if (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err;
+                                       else ret->gl[1] += s2 - s1 < seq_err? s2 - s1 : seq_err;
+                               }
+                       }
+                       // write cnt_ref and cnt_ambi
+                       // NOTE(review): score[j] is the score of type index 0, which is the
+                       // zero-indel type only when no deletion sorts before it -- confirm
+                       if (max1_i != 0 && max2_i != 0) {
+                               for (j = 0; j < n; ++j) {
+                                       int diff1 = score[j] - score[max1_i * n + j];
+                                       int diff2 = score[j] - score[max2_i * n + j];
+                                       if (diff1 > 0 && diff2 > 0) ++ret->cnt_ref;
+                                       else if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi;
+                               }
+                       }
+               }
+               free(score); free(pscore); free(ref2); free(rs); free(inscns);
+       }
+       { // call genotype
+               int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);
+               int min1, min2, min1_i;
+               // both branches of each ternary below are equal, so effectively
+               // q[0] = gl[0], q[1] = gl[1] and q[2] = n * 3 + qr_indel
+               q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel;
+               q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel;
+               q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel;
+               min1 = min2 = 0x7fffffff; min1_i = -1;
+               for (i = 0; i < 3; ++i) {
+                       if (q[i] < min1) {
+                               min2 = min1; min1 = q[i]; min1_i = i;
+                       } else if (q[i] < min2) min2 = q[i];
+               }
+               // gt is the index of the smallest q; q_cns the margin to the runner-up
+               ret->gt = min1_i;
+               ret->q_cns = min2 - min1;
+               // set q_ref
+               if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3;
+               else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2];
+               if (ret->q_ref < 0) ret->q_ref = 0;
+       }
+       free(types);
+       return ret;
+}
diff --git a/samtools/bam_maqcns.h b/samtools/bam_maqcns.h
new file mode 100644 (file)
index 0000000..fa5489d
--- /dev/null
@@ -0,0 +1,56 @@
+#ifndef BAM_MAQCNS_H
+#define BAM_MAQCNS_H
+
+#include "glf.h"
+
+struct __bmc_aux_t;
+
+// Parameters and precomputed tables of the consensus caller.
+typedef struct {
+       // NOTE(review): presumably the user-settable model parameters -- confirm
+       // against bam_maqcns_init()
+       float het_rate, theta;
+       int n_hap, cap_mapQ, is_soap;
+
+       // q_r is rounded and handed to glf2cns() by bam_maqcns_call()
+       float eta, q_r;
+       // coef[] and lhet[] are lookup tables read by bam_maqcns_glfgen()
+       double *fk, *coef;
+       double *lhet;
+       // opaque working storage; the type is only forward-declared in this header
+       struct __bmc_aux_t *aux;
+} bam_maqcns_t;
+
+// Options of bam_maqindel(); bam_maqindel_opt_init() supplies the defaults.
+typedef struct {
+       // penalty added per inserted/deleted base when a read is scored (default 40)
+       int q_indel;
+       // converted to a phred-like penalty with -4.343 * log(r_indel) for the
+       // genotype call (default 0.00015)
+       float r_indel;
+       // hidden parameters, unchangeable from command line
+       // (defaults 3, 4 and 10; bam_maqindel() itself does not read them)
+       int mm_penalty, indel_err, ambi_thres;
+} bam_maqindel_opt_t;
+
+// Result of bam_maqindel(); release with bam_maqindel_ret_destroy().
+typedef struct {
+       // lengths of the best and second-best indel type (>0 insertion, <0 deletion, 0 none)
+       int indel1, indel2;
+       // number of reads whose indel equals indel1, indel2, or neither
+       int cnt1, cnt2, cnt_anti;
+       // only filled when neither of the two chosen types has type index 0
+       int cnt_ref, cnt_ambi;
+       // heap-allocated strings for indel1/indel2: "+SEQ", "-SEQ" or "*" for no indel
+       char *s[2];
+       //
+       // gt: index (0, 1 or 2) of the smallest genotype score; gl[0]/gl[1]:
+       // accumulated per-read penalties against the first/second type
+       int gt, gl[2];
+       // q_cns: margin between the two smallest genotype scores; q_ref: clamped at 0
+       int q_cns, q_ref;
+} bam_maqindel_ret_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       // life cycle of the consensus-caller parameters
+       bam_maqcns_t *bam_maqcns_init();
+       void bam_maqcns_prepare(bam_maqcns_t *bm);
+       void bam_maqcns_destroy(bam_maqcns_t *bm);
+       // genotype likelihoods of one pileup column; bam_maqcns_call() releases
+       // the returned record with free()
+       glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm);
+       // consensus of one pileup column, packed as described for glf2cns() below
+       uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm);
+       // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2
+       uint32_t glf2cns(const glf1_t *g, int q_r);
+
+       // indel calling: default options, the caller itself, and the result destructor
+       bam_maqindel_opt_t *bam_maqindel_opt_init();
+       bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+                                                                        int _n_types, int *_types);
+       void bam_maqindel_ret_destroy(bam_maqindel_ret_t*);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c
new file mode 100644 (file)
index 0000000..61f808a
--- /dev/null
@@ -0,0 +1,70 @@
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+
+// Copy the BAM stream `in` to `out`, filling in the mate fields (mtid, mpos,
+// isize and the mate-related flag bits) of adjacent records that share the
+// same query name.
+// currently, this function ONLY works if each read has one hit
+void bam_mating_core(bamFile in, bamFile out)
+{
+       bam_header_t *header;
+       bam1_t *b[2];
+       int curr, has_prev;
+
+       header = bam_header_read(in);
+       bam_header_write(out, header);
+
+       // two record buffers used alternately: b[curr] receives the next read,
+       // b[1-curr] keeps the previous one while has_prev is set
+       b[0] = bam_init1();
+       b[1] = bam_init1();
+       curr = 0; has_prev = 0;
+       while (bam_read1(in, b[curr]) >= 0) {
+               bam1_t *cur = b[curr], *pre = b[1-curr];
+               if (has_prev) {
+                       if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
+                               // each record gets the other's position as its mate position
+                               cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
+                               pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
+                               // insert size is only defined when both ends map to the same target
+                               if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
+                                       && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)))
+                               {
+                                       // 5' end: alignment end for reverse-strand reads, start otherwise
+                                       uint32_t cur5, pre5;
+                                       cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos;
+                                       pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos;
+                                       cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
+                               } else cur->core.isize = pre->core.isize = 0;
+                               // mirror each record's strand and mapped state into its mate's flags
+                               if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
+                               else cur->core.flag &= ~BAM_FMREVERSE;
+                               if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
+                               else pre->core.flag &= ~BAM_FMREVERSE;
+                               if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
+                               if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
+                               bam_write1(out, pre);
+                               bam_write1(out, cur);
+                               // both records are consumed; the next read starts a new pair
+                               has_prev = 0;
+                       } else { // unpaired or singleton
+                               // clear the mate information of the previous record and write it;
+                               // cur stays pending (has_prev remains set) for the next iteration
+                               pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
+                               if (pre->core.flag & BAM_FPAIRED) {
+                                       pre->core.flag |= BAM_FMUNMAP;
+                                       pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
+                               }
+                               bam_write1(out, pre);
+                       }
+               } else has_prev = 1;
+               curr = 1 - curr;
+       }
+       // NOTE(review): the last pending record is written as read, without the
+       // singleton cleanup applied in the loop -- confirm this is intended
+       if (has_prev) bam_write1(out, b[1-curr]);
+       bam_header_destroy(header);
+       bam_destroy1(b[0]);
+       bam_destroy1(b[1]);
+}
+
+/*
+ * Entry point of "samtools fixmate": argv[1] is the name-sorted input BAM and
+ * argv[2] the output BAM; "-" selects stdin/stdout respectively.
+ * Returns 0 on success and 1 on a usage error or when a file cannot be opened.
+ */
+int bam_mating(int argc, char *argv[])
+{
+       bamFile in, out;
+       if (argc < 3) {
+               fprintf(stderr, "samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n");
+               return 1;
+       }
+       in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+       if (in == 0) { // do not hand a null stream to bam_mating_core()
+               fprintf(stderr, "[bam_mating] fail to open '%s' for reading.\n", argv[1]);
+               return 1;
+       }
+       out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+       if (out == 0) {
+               fprintf(stderr, "[bam_mating] fail to open '%s' for writing.\n", argv[2]);
+               bam_close(in);
+               return 1;
+       }
+       bam_mating_core(in, out);
+       bam_close(in); bam_close(out);
+       return 0;
+}
diff --git a/samtools/bam_md.c b/samtools/bam_md.c
new file mode 100644 (file)
index 0000000..3ca7309
--- /dev/null
@@ -0,0 +1,149 @@
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+#include "faidx.h"
+#include "sam.h"
+#include "kstring.h"
+
+/*
+ * Recompute the NM and MD auxiliary tags of alignment b from the reference.
+ *
+ * b         alignment to update in place
+ * ref       NUL-terminated sequence of the target b is aligned to
+ * is_equal  when non-zero, query bases that match the reference are
+ *           overwritten with code 0 in the packed sequence
+ *
+ * An existing tag that differs from the recomputed value is reported on
+ * stderr and replaced. Unmapped reads are left without NM/MD changes.
+ */
+void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+{
+       uint8_t *seq = bam1_seq(b);
+       uint32_t *cigar = bam1_cigar(b);
+       bam1_core_t *c = &b->core;
+       int i, x, y, u = 0;
+       kstring_t *str;
+       uint8_t *old_md, *old_nm;
+       int32_t old_nm_i = -1, nm = 0;
+
+       // x walks the reference, y the query; u counts matches since the last
+       // MD event; str collects the MD string
+       str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+               int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+               if (op == BAM_CMATCH) {
+                       for (j = 0; j < l; ++j) {
+                               int z = y + j;
+                               int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+                               if (ref[x+j] == 0) break; // out of boundary
+                               if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+                                       // clear the 4-bit base: high nibble for even z, low nibble for odd z
+                                       if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+                                       ++u;
+                               } else {
+                                       ksprintf(str, "%d", u);
+                                       kputc(ref[x+j], str);
+                                       u = 0; ++nm;
+                               }
+                       }
+                       if (j < l) break; // ran off the end of the reference
+                       x += l; y += l;
+               } else if (op == BAM_CDEL) {
+                       ksprintf(str, "%d", u);
+                       kputc('^', str);
+                       for (j = 0; j < l; ++j) {
+                               if (ref[x+j] == 0) break;
+                               kputc(ref[x+j], str);
+                       }
+                       u = 0;
+                       if (j < l) break;
+                       x += l; nm += l;
+               } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                       y += l;
+                       if (op == BAM_CINS) nm += l;
+               } else if (op == BAM_CREF_SKIP) {
+                       x += l;
+               }
+       }
+       ksprintf(str, "%d", u);
+       // update NM
+       old_nm = bam_aux_get(b, "NM");
+       if (c->flag & BAM_FUNMAP) {
+               // nothing to update for an unmapped read; release the MD buffer
+               // before leaving instead of leaking it
+               free(str->s); free(str);
+               return;
+       }
+       if (old_nm) old_nm_i = bam_aux2i(old_nm);
+       if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+       else if (nm != old_nm_i) {
+               fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+               bam_aux_del(b, old_nm);
+               bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+       }
+       // update MD
+       old_md = bam_aux_get(b, "MD");
+       if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+       else {
+               // compare case-insensitively; old_md[0] is skipped, the string starts at old_md+1
+               int is_diff = 0;
+               if (strlen((char*)old_md+1) == str->l) {
+                       for (i = 0; i < str->l; ++i)
+                               if (toupper(old_md[i+1]) != toupper(str->s[i]))
+                                       break;
+                       if (i < str->l) is_diff = 1;
+               } else is_diff = 1;
+               if (is_diff) {
+                       fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+                       bam_aux_del(b, old_md);
+                       bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+               }
+       }
+       free(str->s); free(str);
+}
+
+/*
+ * Entry point of "samtools fillmd [-eubS] <aln.bam> <ref.fasta>": read every
+ * alignment, refresh its NM/MD tags against the reference and write it to
+ * stdout. Returns 0 on success and 1 on a usage or open/load failure.
+ */
+int bam_fillmd(int argc, char *argv[])
+{
+       int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed;
+       samfile_t *fp, *fpout = 0;
+       faidx_t *fai;
+       char *ref = 0, mode_w[8], mode_r[8];
+       bam1_t *b;
+
+       is_bam_out = is_sam_in = is_uncompressed = 0;
+       mode_w[0] = mode_r[0] = 0;
+       strcpy(mode_r, "r"); strcpy(mode_w, "w");
+       while ((c = getopt(argc, argv, "eubS")) >= 0) {
+               switch (c) {
+               case 'e': is_equal = 1; break;
+               case 'b': is_bam_out = 1; break;
+               case 'u': is_uncompressed = is_bam_out = 1; break;
+               case 'S': is_sam_in = 1; break;
+               default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+               }
+       }
+       // build the samopen() mode strings from the options
+       if (!is_sam_in) strcat(mode_r, "b");
+       if (is_bam_out) strcat(mode_w, "b");
+       else strcat(mode_w, "h");
+       if (is_uncompressed) strcat(mode_w, "u");
+       if (optind + 1 >= argc) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   samtools fillmd [-eubS] <aln.bam> <ref.fasta>\n\n");
+               fprintf(stderr, "Options: -e       change identical bases to '='\n");
+               fprintf(stderr, "         -u       uncompressed BAM output (for piping)\n");
+               fprintf(stderr, "         -b       compressed BAM output\n");
+               fprintf(stderr, "         -S       the input is SAM with header\n\n");
+               return 1;
+       }
+       fp = samopen(argv[optind], mode_r, 0);
+       if (fp == 0) return 1;
+       if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
+               fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
+               return 1;
+       }
+       fpout = samopen("-", mode_w, fp->header);
+       if (fpout == 0) { // samwrite() must not be handed a null stream
+               fprintf(stderr, "[bam_fillmd] fail to open the output stream.\n");
+               samclose(fp);
+               return 1;
+       }
+       fai = fai_load(argv[optind+1]);
+       if (fai == 0) { // fai_fetch() must not be handed a null index
+               fprintf(stderr, "[bam_fillmd] fail to load the reference '%s'.\n", argv[optind+1]);
+               samclose(fp); samclose(fpout);
+               return 1;
+       }
+
+       b = bam_init1();
+       while ((ret = samread(fp, b)) >= 0) {
+               if (b->core.tid >= 0) {
+                       // fetch the reference sequence only when the target changes
+                       if (tid != b->core.tid) {
+                               free(ref);
+                               ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
+                               tid = b->core.tid;
+                               if (ref == 0)
+                                       fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
+                                                       fp->header->target_name[tid]);
+                       }
+                       if (ref) bam_fillmd1(b, ref, is_equal);
+               }
+               // records without a usable reference are passed through unchanged
+               samwrite(fpout, b);
+       }
+       bam_destroy1(b);
+
+       free(ref);
+       fai_destroy(fai);
+       samclose(fp); samclose(fpout);
+       return 0;
+}
diff --git a/samtools/bam_pileup.c b/samtools/bam_pileup.c
new file mode 100644 (file)
index 0000000..f68f400
--- /dev/null
@@ -0,0 +1,238 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include "sam.h"
+
+typedef struct __linkbuf_t { // one node of the singly linked list of buffered alignments
+       bam1_t b; // the alignment record stored in this node
+       uint32_t beg, end; // filled by bam_plbuf_push(): beg from core.pos, end from bam_calend()
+       struct __linkbuf_t *next; // following node; 0 on the last node of the list
+} lbnode_t;
+
+/* --- BEGIN: Memory pool */
+
+typedef struct { // pool that recycles lbnode_t nodes instead of freeing them
+       int cnt, n, max; // cnt: nodes currently handed out; n: nodes parked in buf; max: capacity of buf
+       lbnode_t **buf; // stack of returned nodes waiting to be reused
+} mempool_t;
+
+/* Create an empty node pool; calloc() leaves every field zeroed. */
+static mempool_t *mp_init()
+{
+       return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+/* Free every node parked in the pool (together with the data buffer of its
+ * alignment), then the pointer array and finally the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+       int i = 0;
+       while (i < mp->n) {
+               lbnode_t *node = mp->buf[i++];
+               free(node->b.data);
+               free(node);
+       }
+       free(mp->buf);
+       free(mp);
+}
+/* Hand out a node and count it as outstanding: reuse the most recently
+ * returned node when the pool holds any, otherwise allocate a zeroed one. */
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+       mp->cnt += 1;
+       if (mp->n != 0) {
+               mp->n -= 1;
+               return mp->buf[mp->n];
+       }
+       return (lbnode_t*)calloc(1, sizeof(lbnode_t));
+}
+/* Return node p to the pool for later reuse. The node is not freed; its
+ * link is cleared and the pointer array is grown (256 slots first, then
+ * doubling) whenever it is full. */
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+       mp->cnt -= 1;
+       p->next = 0; // a recycled node must not keep its old link
+       if (mp->n == mp->max) { // array full, or not allocated yet
+               if (mp->max == 0) mp->max = 256;
+               else mp->max <<= 1;
+               mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
+       }
+       mp->buf[mp->n] = p;
+       mp->n += 1;
+}
+
+/* --- END: Memory pool */
+
+/* --- BEGIN: Auxiliary functions */
+
+static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
+{ /* Walk the CIGAR of p->b to describe how the read covers reference
+   * position pos. For a mapped read this fills p->qpos, p->indel, p->is_del,
+   * p->is_head and p->is_tail. Returns 0 when the read is flagged unmapped
+   * or when pos is reached inside a reference skip (the caller then leaves
+   * the read out of the pileup); returns 1 otherwise. */
+       unsigned k;
+       bam1_t *b = p->b;
+       bam1_core_t *c = &b->core;
+       uint32_t x = c->pos, y = 0; // x: reference coordinate, y: read coordinate, both at the start of the current operation
+       int ret = 1, is_restart = 1; // is_restart: previous operation was a skip or clip (or there was none yet)
+
+       if (c->flag&BAM_FUNMAP) return 0; // unmapped read
+       assert(x <= pos); // otherwise a bug
+       p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
+       for (k = 0; k < c->n_cigar; ++k) {
+               int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
+               int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
+               if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip
+                       if (x + l > pos) { // overlap with pos
+                               p->indel = p->is_del = 0;
+                               p->qpos = y + (pos - x);
+                               if (x == pos && is_restart) p->is_head = 1; // first base of a segment that follows a skip/clip or starts the read
+                               if (x + l - 1 == pos) { // come to the end of a match
+                                       if (k < c->n_cigar - 1) { // there are additional operation(s)
+                                               uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
+                                               int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
+                                               if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
+                                               else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
+                                               if (op_next == BAM_CDEL || op_next == BAM_CINS) { // look past the indel to decide whether the segment ends here
+                                                       if (k + 2 < c->n_cigar) op_next = bam1_cigar(b)[k+2]&BAM_CIGAR_MASK;
+                                                       else p->is_tail = 1;
+                                               }
+                                               if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
+                                                       p->is_tail = 1; // tail
+                                       } else p->is_tail = 1; // this is the last operation; set tail
+                               }
+                       }
+                       x += l; y += l;
+               } else if (op == BAM_CDEL) { // then set ->is_del
+                       if (x + l > pos) {
+                               p->indel = 0; p->is_del = 1;
+                               p->qpos = y + (pos - x);
+                       }
+                       x += l;
+               } else if (op == BAM_CREF_SKIP) x += l; // skip consumes reference only
+               else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; // insertion and soft clip consume read bases only
+               is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
+               if (x > pos) { // the operation just processed covers pos: stop walking
+                       if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
+                       break;
+               }
+       }
+       assert(x > pos); // otherwise a bug
+       return ret;
+}
+
+/* --- END: Auxiliary functions */
+
+struct __bam_plbuf_t { // state of the streaming pileup buffer
+       mempool_t *mp; // pool that recycles list nodes
+       lbnode_t *head, *tail, *dummy; // head..tail: list of buffered reads, tail being the spare node the next read is copied into; dummy: scratch predecessor of head used while unlinking nodes
+       bam_pileup_f func; // callback invoked for each position that has at least one pileup entry
+       void *func_data; // opaque pointer passed through to func
+       int32_t tid, pos, max_tid, max_pos; // tid/pos: position currently being piled up; max_tid/max_pos: reference id and start of the last accepted read (used for the sort check)
+       int max_pu, is_eof; // max_pu: allocated length of pu; is_eof: set once a null record has been pushed
+       bam_pileup1_t *pu; // array of per-read pileup entries handed to func
+       int flag_mask; // reads having any of these flag bits set are ignored
+};
+
+/* Put the buffer back into its initial state: forget the current position
+ * and sort-check state, clear the EOF mark, and return every buffered node
+ * except the spare tail node to the pool. */
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{
+       lbnode_t *cur = buf->head;
+       buf->is_eof = 0;
+       buf->tid = 0; buf->pos = 0;
+       buf->max_tid = -1; buf->max_pos = -1;
+       while (cur->next) {
+               lbnode_t *after = cur->next; // read the link first: mp_free() clears it
+               mp_free(buf->mp, cur);
+               cur = after;
+       }
+       buf->head = buf->tail;
+}
+
+/* Choose which reads are skipped by flag: a negative mask selects
+ * BAM_DEF_MASK, any other value is used with BAM_FUNMAP always added. */
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+       buf->flag_mask = (mask < 0)? BAM_DEF_MASK : (BAM_FUNMAP | mask);
+}
+
+/* Allocate a pileup buffer that reports positions through func(..., data).
+ * The list starts with a single spare node (head == tail), a separate dummy
+ * node is reserved, and the flag mask defaults to BAM_DEF_MASK. */
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+       bam_plbuf_t *pl = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t));
+       pl->mp = mp_init();
+       pl->tail = mp_alloc(pl->mp);
+       pl->head = pl->tail;
+       pl->dummy = mp_alloc(pl->mp);
+       pl->func = func;
+       pl->func_data = data;
+       pl->flag_mask = BAM_DEF_MASK;
+       pl->max_pos = -1;
+       pl->max_tid = -1;
+       return pl;
+}
+
+/* Tear the buffer down: hand the dummy and head nodes back to the pool,
+ * warn on stderr if any node is still outstanding, then free the pool, the
+ * pileup array and the buffer itself. */
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+       mempool_t *pool = buf->mp;
+       mp_free(pool, buf->dummy);
+       mp_free(pool, buf->head);
+       if (pool->cnt != 0)
+               fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", pool->cnt);
+       mp_destroy(pool);
+       free(buf->pu);
+       free(buf);
+}
+
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{ /* Feed one alignment into the pileup buffer, or pass b == 0 to signal the
+   * end of input, then report through buf->func every position that can be
+   * finished with the reads seen so far. Input must be sorted by reference
+   * id and start position. Returns 0 normally (including when the read is
+   * ignored), -1 when the pushed read breaks the sort order, and 1 when the
+   * buffered list turns out to be unsorted while positions are emitted. */
+       if (b) { // fill buffer
+               if (b->core.tid < 0) return 0; // read has no reference sequence: ignore it
+               if (b->core.flag & buf->flag_mask) return 0; // read carries a masked flag bit: ignore it
+               bam_copy1(&buf->tail->b, b); // copy into the spare node at the tail
+               buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
+               if (b->core.tid < buf->max_tid) {
+                       fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+                       return -1;
+               }
+               if ((b->core.tid == buf->max_tid) && (buf->tail->beg < buf->max_pos)) {
+                       fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+                       return -1;
+               }
+               buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
+               if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { // keep the read only if it ends after the current position or lies on a later reference
+                       buf->tail->next = mp_alloc(buf->mp); // append a fresh spare node
+                       buf->tail = buf->tail->next;
+               }
+       } else buf->is_eof = 1; // null record: no more input will arrive
+       while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) { // emit while the current position lies before the last accepted read's start (or input has ended)
+               int n_pu = 0;
+               lbnode_t *p, *q;
+               buf->dummy->next = buf->head; // dummy acts as predecessor of head so that head itself can be unlinked
+               for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) { // the spare tail node (next == 0) is never visited
+                       if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
+                               q->next = p->next; mp_free(buf->mp, p); p = q;
+                       } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
+                               if (n_pu == buf->max_pu) { // then double the capacity
+                                       buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
+                                       buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
+                               }
+                               buf->pu[n_pu].b = &p->b;
+                               if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
+                       }
+               }
+               buf->head = buf->dummy->next; // dummy->next may be changed
+               if (n_pu) { // then call user defined function
+                       buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data);
+               }
+               // update tid and pos
+               if (buf->head->next) { // at least one real read remains buffered
+                       if (buf->tid > buf->head->b.core.tid) {
+                               fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
+                               return 1;
+                       }
+               }
+               if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
+                       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
+               } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
+                       buf->pos = buf->head->beg; // jump to the next position
+               } else ++buf->pos; // scan contiguously
+               if (buf->is_eof && buf->head->next == 0) break; // input ended and the list is drained
+       }
+       return 0;
+}
+
+/* Run a pileup over every record readable from fp, calling func for each
+ * reported position. mask is applied through bam_plbuf_set_mask(). Results
+ * of individual pushes are not inspected; the function always returns 0. */
+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
+{
+       bam1_t *rec = bam_init1();
+       bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+       bam_plbuf_set_mask(plbuf, mask);
+       for (;;) {
+               if (bam_read1(fp, rec) < 0) break;
+               bam_plbuf_push(rec, plbuf);
+       }
+       bam_plbuf_push(0, plbuf); // a null record marks the end of input and flushes the buffer
+       bam_plbuf_destroy(plbuf);
+       bam_destroy1(rec);
+       return 0;
+}
diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c
new file mode 100644 (file)
index 0000000..ba787a9
--- /dev/null
@@ -0,0 +1,392 @@
+#include <math.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "sam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+#include "khash.h"
+#include "glf.h"
+#include "kstring.h"
+
+typedef int *indel_list_t; // proposed indels for one site: element 0 is the count, followed by that many values (built in load_pos)
+KHASH_MAP_INIT_INT64(64, indel_list_t) // hash type "64": 64-bit site key (tid<<32 | pos) -> indel_list_t
+
+#define BAM_PLF_SIMPLE     0x01 // set by -s
+#define BAM_PLF_CNS        0x02 // set by -c
+#define BAM_PLF_INDEL_ONLY 0x04 // set by -i
+#define BAM_PLF_GLF        0x08 // set by -g
+#define BAM_PLF_VAR_ONLY   0x10 // set by -v
+#define BAM_PLF_2ND        0x20 // set by -2
+
+typedef struct { // state shared by the pileup callbacks
+       bam_header_t *h; // header of the input file; owned by the samfile_t it was taken from
+       bam_maqcns_t *c; // consensus caller settings
+       bam_maqindel_opt_t *ido; // indel caller settings
+       faidx_t *fai; // reference FASTA index; 0 when -f was not given
+       khash_t(64) *hash; // sites loaded with -l; 0 when every site is reported
+       uint32_t format; // bitwise OR of BAM_PLF_* flags
+       int tid, len, last_pos; // tid: reference whose sequence is cached in ref; len: its length as reported by fai_fetch(); last_pos: previous position written to GLF
+       int mask; // flag mask handed to sampileup()
+       char *ref; // cached reference sequence for tid, or 0
+       glfFile fp_glf; // for glf output only
+} pu_data_t;
+
+char **__bam_get_lines(const char *fn, int *_n); // defined elsewhere; load_pos() uses it to read the lines of fn
+void bam_init_header_hash(bam_header_t *header); // defined elsewhere; called before bam_get_tid() in load_pos()
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); // defined elsewhere; a negative result is treated as "unknown name"
+
+/* Load the list of sites (-l FILE) at which pileup output is wanted.
+ *
+ * Each line holds a reference name and a 1-based position, optionally
+ * followed by proposed indel lengths written as +N or -N. The returned hash
+ * maps (tid << 32 | 0-based position) to either 0 or a calloc()ed int array
+ * whose first element is the number of proposed indels that follow.
+ *
+ * fn: path of the site list.
+ * h:  header used to translate reference names into ids.
+ *
+ * Lines with fewer than two fields, an unknown reference name or a
+ * duplicated site are skipped (the latter two with a message on stderr).
+ * Every line buffer is released on all paths.
+ */
+static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
+{
+       char **list;
+       int i, j, n, *fields, max_fields;
+       khash_t(64) *hash;
+       bam_init_header_hash(h);
+       list = __bam_get_lines(fn, &n);
+       hash = kh_init(64);
+       max_fields = 0; fields = 0;
+       for (i = 0; i < n; ++i) {
+               char *str = list[i];
+               int chr, n_fields, ret;
+               khint_t k;
+               uint64_t x;
+               n_fields = ksplit_core(str, 0, &max_fields, &fields);
+               if (n_fields < 2) goto next_line;
+               chr = bam_get_tid(h, str + fields[0]);
+               if (chr < 0) {
+                       fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]);
+                       goto next_line;
+               }
+               // go through uint32_t so that a non-positive position cannot be
+               // sign-extended over the reference id stored in the upper 32 bits
+               x = (uint64_t)chr << 32 | (uint32_t)(atoi(str + fields[1]) - 1);
+               k = kh_put(64, hash, x, &ret);
+               if (ret == 0) {
+                       fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]);
+                       goto next_line;
+               }
+               kh_val(hash, k) = 0;
+               if (n_fields > 2) {
+                       // count the leading fields that look like +N / -N
+                       for (j = 2; j < n_fields; ++j) {
+                               char *s = str + fields[j];
+                               if ((*s != '+' && *s != '-') || !isdigit(s[1])) break;
+                       }
+                       if (j > 2) { // update kh_val()
+                               int *q, y, z;
+                               q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));
+                               q[0] = j - 2; z = j; y = 1;
+                               for (j = 2; j < z; ++j)
+                                       q[y++] = atoi(str + fields[j]);
+                       }
+               }
+next_line:
+               free(str); // previously skipped by every `continue`, leaking the line
+       }
+       free(list); free(fields);
+       return hash;
+}
+
+/* Pileup callback used for GLFv3 output; the GLF counterpart of pileup_func().
+ *
+ * tid, pos: reference id and 0-based position being reported.
+ * n, pu:    number of pileup entries and the entries themselves.
+ * data:     the pu_data_t shared with bam_pileup().
+ *
+ * Writes one substitution record per call and, when the indel caller returns
+ * a result, an additional indel record. When the reference changes, an end
+ * mark is written for the previous reference (if one was loaded) followed by
+ * the new reference header. Exits the process if no reference FASTA was
+ * given. Returns 0.
+ */
+static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{
+       pu_data_t *d = (pu_data_t*)data;
+       bam_maqindel_ret_t *r = 0;
+       int rb, *proposed_indels = 0;
+       glf1_t *g;
+       glf3_t *g3;
+
+       if (d->fai == 0) {
+               fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n");
+               exit(1);
+       }
+       if (d->hash) { // only output a list of sites
+               khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+               if (k == kh_end(d->hash)) return 0;
+               proposed_indels = kh_val(d->hash, k);
+       }
+       g3 = glf3_init1();
+       if (d->fai && (int)tid != d->tid) { // moved on to another reference sequence
+               if (d->ref) { // then write the end mark
+                       g3->rtype = GLF3_RTYPE_END;
+                       glf3_write1(d->fp_glf, g3);
+               }
+               glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference
+               free(d->ref);
+               d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+               d->tid = tid;
+               d->last_pos = 0;
+       }
+       rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+       g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c);
+       memcpy(g3, g, sizeof(glf1_t));
+       g3->rtype = GLF3_RTYPE_SUB;
+       g3->offset = pos - d->last_pos; // positions are stored relative to the previous record
+       d->last_pos = pos;
+       glf3_write1(d->fp_glf, g3);
+       // call indels only when the reference sequence was actually fetched,
+       // matching the guard pileup_func() applies before the same call
+       if (d->ref && pos < d->len) {
+               if (proposed_indels) // the first element gives the size of the array
+                       r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+               else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
+       }
+       if (r) { // then write indel line
+               int het = 3 * n, min;
+               min = het;
+               if (min > r->gl[0]) min = r->gl[0];
+               if (min > r->gl[1]) min = r->gl[1];
+               g3->ref_base = 0;
+               g3->rtype = GLF3_RTYPE_INDEL;
+               memset(g3->lk, 0, 10);
+               // likelihoods are stored relative to the smallest one and capped at 255
+               g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255;
+               g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255;
+               g3->lk[2] = het - min < 255? het - min : 255;
+               g3->offset = 0;
+               g3->indel_len[0] = r->indel1;
+               g3->indel_len[1] = r->indel2;
+               g3->min_lk = min < 255? min : 255;
+               g3->max_len = (abs(r->indel1) > abs(r->indel2)? abs(r->indel1) : abs(r->indel2)) + 1;
+               g3->indel_seq[0] = strdup(r->s[0]+1);
+               g3->indel_seq[1] = strdup(r->s[1]+1);
+               glf3_write1(d->fp_glf, g3);
+               bam_maqindel_ret_destroy(r);
+       }
+       free(g);
+       glf3_destroy1(g3);
+       return 0;
+}
+
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{ /* Pileup callback for text output. For the site (tid, pos) with n pileup
+   * entries in pu it prints one tab-separated line to stdout: reference
+   * name, 1-based position, reference base, the consensus columns when -c
+   * is set, the depth, the read bases, the base qualities and the optional
+   * -2 / -s columns. If an indel call was computed, a second line describing
+   * it follows. data is the pu_data_t prepared by bam_pileup(). With GLF
+   * output selected the work is delegated to glt3_func(). Returns 0 on the
+   * text path. */
+       pu_data_t *d = (pu_data_t*)data;
+       bam_maqindel_ret_t *r = 0;
+       int i, j, rb, rms_mapq = -1, *proposed_indels = 0;
+       uint64_t rms_aux; // running sum of squared (capped) mapping qualities
+       uint32_t cns = 0;
+
+       // if GLF is required, suppress -c completely
+       if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data);
+       // if d->hash is initialized, only output the sites in the hash table
+       if (d->hash) {
+               khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+               if (k == kh_end(d->hash)) return 0;
+               proposed_indels = kh_val(d->hash, k);
+       }
+       // update d->ref if necessary
+       if (d->fai && (int)tid != d->tid) {
+               free(d->ref);
+               d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+               d->tid = tid;
+       }
+       rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+       // when the indel-only mode is asked for, return if no reads mapped with indels
+       if (d->format & BAM_PLF_INDEL_ONLY) {
+               for (i = 0; i < n; ++i)
+                       if (pu[i].indel != 0) break;
+               if (i == n) return 0;
+       }
+       // call the consensus and indel
+       if (d->format & BAM_PLF_CNS) // call consensus
+               cns = bam_maqcns_call(n, pu, d->c);
+       if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels
+               if (proposed_indels) // the first element gives the size of the array
+                       r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+               else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
+       }
+       // when only variant sites are asked for, test if the site is a variant
+       if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) {
+               if (!(bam_nt16_table[rb] != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP
+                       if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel
+                               if (r) bam_maqindel_ret_destroy(r);
+                               return 0;
+                       }
+               }
+       }
+       // print the first 3 columns
+       printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
+       // print consensus information if required
+       if (d->format & BAM_PLF_CNS) {
+               int ref_q, rb4 = bam_nt16_table[rb];
+               ref_q = 0;
+               if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP
+                       ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff);
+                       if (ref_q > 255) ref_q = 255;
+               }
+               rms_mapq = cns>>16&0xff;
+               printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq);
+       }
+       // print pileup sequences
+       printf("%d\t", n);
+       rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pu + i;
+               int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
+               rms_aux += tmp * tmp;
+               if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); // '^' plus the mapping quality as a character (capped at 126) opens a read segment
+               if (!p->is_del) {
+                       int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+                       if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; // reference match: ',' or '.' depending on bam1_strand()
+                       else c = bam1_strand(p->b)? tolower(c) : toupper(c); // mismatch: letter case depends on bam1_strand()
+                       putchar(c);
+                       if (p->indel > 0) { // insertion after this base: print its length and the inserted read bases
+                               printf("+%d", p->indel);
+                               for (j = 1; j <= p->indel; ++j) {
+                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                               }
+                       } else if (p->indel < 0) { // deletion after this base: print its (negative) length and the deleted reference bases
+                               printf("%d", p->indel);
+                               for (j = 1; j <= -p->indel; ++j) {
+                                       c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
+                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                               }
+                       }
+               } else putchar('*'); // this position is deleted in the read
+               if (p->is_tail) putchar('$'); // '$' closes a read segment
+       }
+       // finalize rms_mapq
+       rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499);
+       if (rms_mapq < 0) rms_mapq = rms_aux;
+       putchar('\t');
+       // print quality
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pu + i;
+               int c = bam1_qual(p->b)[p->qpos] + 33;
+               if (c > 126) c = 126;
+               putchar(c);
+       }
+       if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities
+               const unsigned char *q;
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pu + i;
+                       q = bam_aux_get(p->b, "E2");
+                       putchar(q? q[p->qpos + 1] : 'N'); // 'N' when the read has no E2 tag
+               }
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pu + i;
+                       q = bam_aux_get(p->b, "U2");
+                       putchar(q? q[p->qpos + 1] : '!'); // '!' when the read has no U2 tag
+               }
+       }
+       // print mapping quality if -s is flagged on the command line
+       if (d->format & BAM_PLF_SIMPLE) {
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       int c = pu[i].b->core.qual + 33;
+                       if (c > 126) c = 126;
+                       putchar(c);
+               }
+       }
+       putchar('\n');
+       // print the indel line if r has been calculated. This only happens if:
+       // a) -c or -i are flagged, AND b) the reference sequence is available
+       if (r) {
+               printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1);
+               if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]);
+               else printf("%s/%s\t", r->s[0], r->s[1]);
+               printf("%d\t%d\t", r->q_cns, r->q_ref);
+               printf("%d\t%d\t", rms_mapq, n);
+               printf("%s\t%s\t", r->s[0], r->s[1]);
+               //printf("%d\t%d\t", r->gl[0], r->gl[1]);
+               printf("%d\t%d\t%d\t", r->cnt1, r->cnt2, r->cnt_anti);
+               printf("%d\t%d\n", r->cnt_ref, r->cnt_ambi);
+               bam_maqindel_ret_destroy(r);
+       }
+       return 0;
+}
+
+/* Entry point of `samtools pileup`.
+ *
+ * Parses the command line, optionally loads the reference index (-f) and the
+ * site list (-l), opens the input as BAM (or as SAM with -S / -t) and runs
+ * sampileup() with pileup_func() as the callback.
+ *
+ * Returns 0 on success and 1 on an unrecognized option, a missing input
+ * argument (after printing the usage text) or an unreadable input file.
+ */
+int bam_pileup(int argc, char *argv[])
+{
+       int c, is_SAM = 0;
+       char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
+       pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
+       d->tid = -1; d->mask = BAM_DEF_MASK;
+       d->c = bam_maqcns_init();
+       d->ido = bam_maqindel_opt_init();
+       while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2a")) >= 0) {
+               switch (c) {
+               case 'a': d->c->is_soap = 1; break;
+               case 's': d->format |= BAM_PLF_SIMPLE; break;
+               case 't': fn_list = strdup(optarg); break;
+               case 'l': fn_pos = strdup(optarg); break;
+               case 'f': fn_fa = strdup(optarg); break;
+               case 'T': d->c->theta = atof(optarg); break;
+               case 'N': d->c->n_hap = atoi(optarg); break;
+               case 'r': d->c->het_rate = atof(optarg); break;
+               case 'M': d->c->cap_mapQ = atoi(optarg); break;
+               case 'c': d->format |= BAM_PLF_CNS; break;
+               case 'i': d->format |= BAM_PLF_INDEL_ONLY; break;
+               case 'v': d->format |= BAM_PLF_VAR_ONLY; break;
+               case 'm': d->mask = strtol(optarg, 0, 0); break;
+               case 'g': d->format |= BAM_PLF_GLF; break;
+               case '2': d->format |= BAM_PLF_2ND; break;
+               case 'I': d->ido->q_indel = atoi(optarg); break;
+               case 'G': d->ido->r_indel = atof(optarg); break;
+               case 'S': is_SAM = 1; break;
+               default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1;
+               }
+       }
+       if (fn_list) is_SAM = 1; // a list of reference sequences only makes sense for SAM input
+       if (optind == argc) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:  samtools pileup [options] <in.bam>|<in.sam>\n\n");
+               fprintf(stderr, "Option: -s        simple (yet incomplete) pileup format\n");
+               fprintf(stderr, "        -S        the input is in SAM\n");
+               fprintf(stderr, "        -a        use the SOAPsnp model for SNP calling\n");
+               fprintf(stderr, "        -2        output the 2nd best call and quality\n");
+               fprintf(stderr, "        -i        only show lines/consensus with indels\n");
+               fprintf(stderr, "        -m INT    filtering reads with bits in INT [%d]\n", d->mask);
+               fprintf(stderr, "        -M INT    cap mapping quality at INT [%d]\n", d->c->cap_mapQ);
+               fprintf(stderr, "        -t FILE   list of reference sequences (force -S)\n");
+               fprintf(stderr, "        -l FILE   list of sites at which pileup is output\n");
+               fprintf(stderr, "        -f FILE   reference sequence in the FASTA format\n\n");
+               fprintf(stderr, "        -c        output the maq consensus sequence\n");
+               fprintf(stderr, "        -v        print variants only (for -c)\n");
+               fprintf(stderr, "        -g        output in the GLFv3 format (suppressing -c/-i/-s)\n");
+               fprintf(stderr, "        -T FLOAT  theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
+               fprintf(stderr, "        -N INT    number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap);
+               fprintf(stderr, "        -r FLOAT  prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate);
+               fprintf(stderr, "        -G FLOAT  prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel);
+               fprintf(stderr, "        -I INT    phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel);
+               fprintf(stderr, "\n");
+               // release everything allocated so far, the same way the normal
+               // exit path below does (fn_pos, d->c and d->ido used to leak here)
+               free(fn_list); free(fn_fa); free(fn_pos);
+               bam_maqcns_destroy(d->c);
+               free(d->ido); free(d);
+               return 1;
+       }
+       if (fn_fa) d->fai = fai_load(fn_fa);
+       if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling
+       if (d->format & BAM_PLF_GLF) { // for glf output
+               glf3_header_t *h;
+               h = glf3_header_init();
+               d->fp_glf = bgzf_fdopen(fileno(stdout), "w");
+               glf3_header_write(d->fp_glf, h);
+               glf3_header_destroy(h);
+       }
+       if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)))
+               fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n");
+       if (fn_fa && is_SAM && fn_list == 0) fn_list = samfaipath(fn_fa); // SAM input without -t: take the sequence list from samfaipath(fn_fa)
+
+       {
+               samfile_t *fp;
+               fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0);
+               if (fp == 0 || fp->header == 0) {
+                       fprintf(stderr, "[bam_pileup] fail to read the header: non-exisiting file or wrong format.\n");
+                       return 1;
+               }
+               d->h = fp->header;
+               if (fn_pos) d->hash = load_pos(fn_pos, d->h);
+               sampileup(fp, d->mask, pileup_func, d);
+               samclose(fp); // d->h will be destroyed here
+       }
+
+       // free
+       if (d->format & BAM_PLF_GLF) bgzf_close(d->fp_glf);
+       if (fn_pos) { // free the hash table
+               khint_t k;
+               for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k)
+                       if (kh_exist(d->hash, k)) free(kh_val(d->hash, k));
+               kh_destroy(64, d->hash);
+       }
+       free(fn_pos); free(fn_list); free(fn_fa);
+       if (d->fai) fai_destroy(d->fai);
+       bam_maqcns_destroy(d->c);
+       free(d->ido); free(d->ref); free(d);
+       return 0;
+}
diff --git a/samtools/bam_rmdup.c b/samtools/bam_rmdup.c
new file mode 100644 (file)
index 0000000..f0d2b5d
--- /dev/null
@@ -0,0 +1,206 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <unistd.h>
+#include "sam.h"
+
+typedef bam1_t *bam1_p; // value type of the "pos" hash instantiated below
+
+#include "khash.h"
+KHASH_SET_INIT_STR(name) // hash set "name": string keys (read names queued for removal)
+KHASH_MAP_INIT_INT64(pos, bam1_p) // hash map "pos": 64-bit key -> bam1_t pointer
+
+#define BUFFER_SIZE 0x40000 // size threshold passed to clear_best(); del_set is pre-sized to four times this
+
+typedef struct { // per-library bookkeeping
+       uint64_t n_checked, n_removed; // counters incremented in bam_rmdup_core(): keys examined, and keys that were already present
+       khash_t(pos) *best_hash; // alignment currently kept for each key
+} lib_aux_t;
+KHASH_MAP_INIT_STR(lib, lib_aux_t) // hash map "lib": library name -> lib_aux_t
+
+typedef struct { // growable array of alignment pointers
+       int n, max; // n: entries in use; max: allocated capacity of a
+       bam1_t **a; // the entries
+} tmp_stack_t;
+
+/* Append alignment pointer b to the stack, growing the array (0x10000
+ * entries first, then doubling) whenever it is full. */
+static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
+{
+       if (stack->n == stack->max) {
+               if (stack->max == 0) stack->max = 0x10000;
+               else stack->max <<= 1;
+               stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
+       }
+       stack->a[stack->n] = b;
+       stack->n += 1;
+}
+
+/* Write every stacked alignment to out in insertion order, destroy each one
+ * after writing, and leave the stack empty (its array is kept for reuse). */
+static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
+{
+       int idx = 0;
+       while (idx != stack->n) {
+               bam1_t *rec = stack->a[idx++];
+               samwrite(out, rec);
+               bam_destroy1(rec);
+       }
+       stack->n = 0;
+}
+
+/* Empty the set of read names: free each stored key string, then clear the
+ * hash itself. */
+static void clear_del_set(khash_t(name) *del_set)
+{
+       khint_t it = kh_begin(del_set);
+       while (it < kh_end(del_set)) {
+               if (kh_exist(del_set, it)) free((char*)kh_key(del_set, it));
+               ++it;
+       }
+       kh_clear(name, del_set);
+}
+
+/* Return the bookkeeping record for library lib, creating it on first use
+ * (zeroed counters, empty best_hash) under a private copy of the name. */
+static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
+{
+       int absent;
+       lib_aux_t *entry;
+       khint_t it = kh_get(lib, aux, lib);
+       if (it != kh_end(aux)) return &kh_val(aux, it); // already known
+       it = kh_put(lib, aux, strdup(lib), &absent);
+       entry = &kh_val(aux, it);
+       entry->n_checked = 0;
+       entry->n_removed = 0;
+       entry->best_hash = kh_init(pos);
+       return entry;
+}
+
+/* For every library, clear its best_hash once it holds at least max
+ * entries; max == 0 therefore clears all of them. */
+static void clear_best(khash_t(lib) *aux, int max)
+{
+       khint_t it = kh_begin(aux);
+       for (; it != kh_end(aux); ++it) {
+               lib_aux_t *entry;
+               if (!kh_exist(aux, it)) continue;
+               entry = &kh_val(aux, it);
+               if (kh_size(entry->best_hash) >= max)
+                       kh_clear(pos, entry->best_hash);
+       }
+}
+
+/* Return the sum of all base qualities of alignment b (0 for an empty read). */
+static inline int sum_qual(const bam1_t *b)
+{
+       const uint8_t *q = bam1_qual(b);
+       int total = 0, left;
+       for (left = b->core.l_qseq; left > 0; --left) total += *q++;
+       return total;
+}
+
+/*
+ * Remove duplicates among paired-end reads while streaming `in` to `out`.
+ *
+ * Reads that are unpaired, have an unmapped end, or whose mate lies on a
+ * different reference are written through unchanged. For a pair "head"
+ * (isize > 0) the key built from (pos, isize) is looked up in the hash of
+ * the read's library; when the key is already present, the read with the
+ * larger sum_qual() is kept and the name of the other one is recorded in
+ * del_set. A "tail" read is dropped if its name is found in del_set and
+ * written otherwise.
+ *
+ * Buffered reads are flushed with dump_best() whenever tid or pos changes.
+ * NOTE(review): this presumably relies on coordinate-sorted input -- confirm.
+ * Once a read with tid == -1 is seen, it and all remaining reads are copied
+ * to `out` verbatim. Per-library statistics are printed to stderr at the end.
+ */
+void bam_rmdup_core(samfile_t *in, samfile_t *out)
+{
+       bam1_t *b;
+       int last_tid = -1, last_pos = -1;
+       tmp_stack_t stack;
+       khint_t k;
+       khash_t(lib) *aux;
+       khash_t(name) *del_set;
+       
+       aux = kh_init(lib);
+       del_set = kh_init(name);
+       b = bam_init1();
+       memset(&stack, 0, sizeof(tmp_stack_t));
+
+       kh_resize(name, del_set, 4 * BUFFER_SIZE);
+       while (samread(in, b) >= 0) {
+               bam1_core_t *c = &b->core;
+               if (c->tid != last_tid || last_pos != c->pos) {
+                       dump_best(&stack, out); // write the result
+                       clear_best(aux, BUFFER_SIZE);
+                       if (c->tid != last_tid) {
+                               clear_best(aux, 0);
+                               if (kh_size(del_set)) { // check
+                                       fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
+                                       clear_del_set(del_set);
+                               }
+                               if ((int)c->tid == -1) { // append unmapped reads
+                                       samwrite(out, b);
+                                       while (samread(in, b) >= 0) samwrite(out, b);
+                                       break;
+                               }
+                               last_tid = c->tid;
+                               fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
+                       }
+               }
+               if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
+                       samwrite(out, b);
+               } else if (c->isize > 0) { // paired, head
+                       uint64_t key = (uint64_t)c->pos<<32 | c->isize;
+                       const char *lib;
+                       lib_aux_t *q;
+                       int ret;
+                       lib = bam_get_library(in->header, b);
+                       // reads without a library share the record keyed by "\t"
+                       q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
+                       ++q->n_checked;
+                       k = kh_put(pos, q->best_hash, key, &ret);
+                       if (ret == 0) { // found in best_hash
+                               bam1_t *p = kh_val(q->best_hash, k);
+                               ++q->n_removed;
+                               if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
+                                       kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
+                                       bam_copy1(p, b); // replaced as b
+                               } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
+                               if (ret == 0)
+                                       fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+                       } else { // not found in best_hash
+                               kh_val(q->best_hash, k) = bam_dup1(b);
+                               stack_insert(&stack, kh_val(q->best_hash, k));
+                       }
+               } else { // paired, tail
+                       k = kh_get(name, del_set, bam1_qname(b));
+                       if (k != kh_end(del_set)) {
+                               free((char*)kh_key(del_set, k));
+                               kh_del(name, del_set, k);
+                       } else samwrite(out, b);
+               }
+               last_pos = c->pos;
+       }
+
+       // flush what is still buffered, report statistics and release per-library state
+       for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+               if (kh_exist(aux, k)) {
+                       lib_aux_t *q = &kh_val(aux, k);                 
+                       dump_best(&stack, out);
+                       fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+                                       (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+                       kh_destroy(pos, q->best_hash);
+                       free((char*)kh_key(aux, k));
+               }
+       }
+       kh_destroy(lib, aux);
+
+       clear_del_set(del_set);
+       kh_destroy(name, del_set);
+       free(stack.a);
+       bam_destroy1(b);
+}
+
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
+
+/*
+ * Command-line entry point for `samtools rmdup`.
+ *
+ * Options: -s selects the single-end algorithm; -S additionally forces
+ * paired-end reads to be treated as single-end. Two positional arguments
+ * are required: the input BAM and the output BAM.
+ *
+ * Returns 0 on success and 1 on a usage error or when either file cannot
+ * be opened.
+ */
+int bam_rmdup(int argc, char *argv[])
+{
+       int c, is_se = 0, force_se = 0;
+       samfile_t *in, *out;
+       while ((c = getopt(argc, argv, "sS")) >= 0) {
+               switch (c) {
+               case 's': is_se = 1; break;
+               case 'S': force_se = is_se = 1; break;
+               }
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:  samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
+               fprintf(stderr, "Option: -s    rmdup for SE reads\n");
+               fprintf(stderr, "        -S    treat PE reads as SE in rmdup (force -s)\n\n");
+               return 1;
+       }
+       // the input must be validated before its header is used to open the output
+       in = samopen(argv[optind], "rb", 0);
+       if (in == 0) {
+               fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+               return 1;
+       }
+       out = samopen(argv[optind+1], "wb", in->header);
+       if (out == 0) {
+               fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+               samclose(in); // do not leak the already opened input
+               return 1;
+       }
+       if (is_se) bam_rmdupse_core(in, out, force_se);
+       else bam_rmdup_core(in, out);
+       samclose(in); samclose(out);
+       return 0;
+}
diff --git a/samtools/bam_rmdupse.c b/samtools/bam_rmdupse.c
new file mode 100644 (file)
index 0000000..e7dbdc7
--- /dev/null
@@ -0,0 +1,159 @@
+#include <math.h>
+#include "sam.h"
+#include "khash.h"
+#include "klist.h"
+
+#define QUEUE_CLEAR_SIZE 0x100000
+#define MAX_POS 0x7fffffff
+
+/* One queued alignment. */
+typedef struct {
+       int endpos;                     // end coordinate passed to push_queue()
+       uint32_t score:31, discarded:1; // score passed to push_queue(); discarded entries are skipped by dump_alignment()
+       bam1_t *b;                      // copy of the alignment owned by the queue element
+} elem_t, *elem_p;
+/* Destructor handed to KLIST_INIT: releases the alignment held by a list node. */
+#define __free_elem(p) bam_destroy1((p)->data.b)
+KLIST_INIT(q, elem_t, __free_elem)
+typedef klist_t(q) queue_t;
+
+/* Integer-keyed hash mapping a position to the queued element kept for it. */
+KHASH_MAP_INIT_INT(best, elem_p)
+typedef khash_t(best) besthash_t;
+
+/* Per-library state: counters plus one hash for forward-strand reads (left)
+ * and one for reverse-strand reads (rght). */
+typedef struct {
+       uint64_t n_checked, n_removed;
+       besthash_t *left, *rght;
+} lib_aux_t;
+KHASH_MAP_INIT_STR(lib, lib_aux_t)
+
+/* Return the record for library `lib`. On first use a new record is inserted
+ * under a private copy of the name, with two empty hashes and zeroed counters. */
+static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
+{
+       lib_aux_t *entry;
+       int absent;
+       khint_t it = kh_get(lib, aux, lib);
+       if (it != kh_end(aux)) return &kh_val(aux, it);
+       it = kh_put(lib, aux, strdup(lib), &absent);
+       entry = &kh_val(aux, it);
+       entry->left = kh_init(best);
+       entry->rght = kh_init(best);
+       entry->n_checked = entry->n_removed = 0;
+       return entry;
+}
+
+/* Add up the l_qseq quality bytes of `b` as returned by bam1_qual(). */
+static inline int sum_qual(const bam1_t *b)
+{
+       uint8_t *quals = bam1_qual(b);
+       int acc = 0;
+       int pos = 0;
+       while (pos < b->core.l_qseq) {
+               acc += quals[pos];
+               ++pos;
+       }
+       return acc;
+}
+
+/* Append a copy of `b` to `queue` together with its end position and score,
+ * and return the new element. The element's bam1_t is allocated on demand. */
+static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
+{
+       elem_t *slot = kl_pushp(q, queue);
+       if (slot->b == 0) slot->b = bam_init1();
+       bam_copy1(slot->b, b);
+       slot->score = score;
+       slot->endpos = endpos;
+       slot->discarded = 0;
+       return slot;
+}
+
+/* Delete from `h` every entry whose element ends at or before `pos`. */
+static void clear_besthash(besthash_t *h, int32_t pos)
+{
+       khint_t it;
+       for (it = kh_begin(h); it != kh_end(h); ++it) {
+               if (!kh_exist(h, it)) continue;
+               if (kh_val(h, it)->endpos > pos) continue;
+               kh_del(best, h, it);
+       }
+}
+
+/*
+ * Flush queued alignments to `out` and prune the per-library hashes in `h`.
+ *
+ * Nothing happens unless the queue has grown beyond QUEUE_CLEAR_SIZE or
+ * `pos` is MAX_POS (used by the caller to force a full flush). Elements are
+ * taken from the head of the queue: discarded ones are dropped, the others
+ * are written, and the walk stops at the first reverse-strand element that
+ * ends after `pos`. Afterwards every hash entry ending at or before `pos`
+ * is removed.
+ */
+static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
+{
+       if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
+               khint_t k;
+               while (1) {
+                       elem_t *q;
+                       if (queue->head == queue->tail) break;
+                       q = &kl_val(queue->head);
+                       if (q->discarded) {
+                               q->b->data_len = 0;
+                               kl_shift(q, queue, 0);
+                               continue;
+                       }
+                       if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
+                       samwrite(out, q->b);
+                       q->b->data_len = 0;
+                       kl_shift(q, queue, 0);
+               }
+               for (k = kh_begin(h); k != kh_end(h); ++k) {
+                       if (kh_exist(h, k)) {
+                               clear_besthash(kh_val(h, k).left, pos);
+                               clear_besthash(kh_val(h, k).rght, pos);
+                       }
+               }
+       }
+}
+
+/*
+ * Remove duplicates among single-end reads while streaming `in` to `out`.
+ *
+ * Unmapped reads, and paired reads unless `force_se` is set, are queued
+ * unchanged. Every other read is keyed per library and per strand: forward
+ * reads by their start position, reverse reads by their end position. When
+ * a key is already present, only the read with the higher sum_qual() score
+ * is kept. Queued reads are written by dump_alignment(); the queue is fully
+ * flushed when the reference changes and at end of input. Per-library
+ * statistics are printed to stderr at the end.
+ */
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
+{
+       bam1_t *b;
+       queue_t *queue;
+       khint_t k;
+       int last_tid = -2;
+       khash_t(lib) *aux;
+
+       aux = kh_init(lib);
+       b = bam_init1();
+       queue = kl_init(q);
+       while (samread(in, b) >= 0) {
+               bam1_core_t *c = &b->core;
+               int endpos = bam_calend(c, bam1_cigar(b));
+               int score = sum_qual(b);
+               
+               if (last_tid != c->tid) {
+                       if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
+                       last_tid = c->tid;
+               } else dump_alignment(out, queue, c->pos, aux);
+               if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
+                       push_queue(queue, b, endpos, score);
+               } else {
+                       const char *lib;
+                       lib_aux_t *q;
+                       besthash_t *h;
+                       uint32_t key;
+                       int ret;
+                       lib = bam_get_library(in->header, b);
+                       // reads without a library share the record keyed by "\t"
+                       q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
+                       ++q->n_checked;
+                       h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
+                       key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
+                       k = kh_put(best, h, key, &ret);
+                       if (ret == 0) { // in the hash table
+                               elem_t *p = kh_val(h, k);
+                               ++q->n_removed;
+                               if (p->score < score) {
+                                       if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
+                                               p->discarded = 1;
+                                               kh_val(h, k) = push_queue(queue, b, endpos, score);
+                                       } else { // replace
+                                               p->score = score; p->endpos = endpos;
+                                               bam_copy1(p->b, b);
+                                       }
+                               } // otherwise, discard the alignment
+                       } else kh_val(h, k) = push_queue(queue, b, endpos, score);
+               }
+       }
+       dump_alignment(out, queue, MAX_POS, aux);
+
+       // report statistics and release per-library state
+       for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+               if (kh_exist(aux, k)) {
+                       lib_aux_t *q = &kh_val(aux, k);
+                       fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+                                       (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+                       kh_destroy(best, q->left); kh_destroy(best, q->rght);
+                       free((char*)kh_key(aux, k));
+               }
+       }
+       kh_destroy(lib, aux);
+       bam_destroy1(b);
+       kl_destroy(q, queue);
+}
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c
new file mode 100644 (file)
index 0000000..9884f3d
--- /dev/null
@@ -0,0 +1,357 @@
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "bam.h"
+#include "ksort.h"
+
+static int g_is_by_qname = 0;
+
+/*
+ * Compare two strings, treating runs of decimal digits that start at the
+ * same point in both strings as numbers (so "read2" sorts before "read10").
+ * All other characters are compared one by one. When both strings are
+ * exhausted together, the one that consumed fewer characters sorts first
+ * (e.g. "a1" before "a01").
+ *
+ * Returns -1, 0 or 1.
+ */
+static inline int strnum_cmp(const char *a, const char *b)
+{
+       char *pa, *pb;
+       pa = (char*)a; pb = (char*)b;
+       while (*pa && *pb) {
+               // <ctype.h> classifiers are only defined for unsigned char values (or EOF)
+               if (isdigit((unsigned char)*pa) && isdigit((unsigned char)*pb)) {
+                       long ai, bi;
+                       ai = strtol(pa, &pa, 10);
+                       bi = strtol(pb, &pb, 10);
+                       if (ai != bi) return ai<bi? -1 : 1;
+               } else {
+                       if (*pa != *pb) break;
+                       ++pa; ++pb;
+               }
+       }
+       if (*pa == *pb)
+               return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0;
+       return *pa<*pb? -1 : *pa>*pb? 1 : 0;
+}
+
+/* Sentinel stored in heap1_t.pos once an input file has no more records. */
+#define HEAP_EMPTY 0xffffffffffffffffull
+
+/* One merge-heap slot per input file. */
+typedef struct {
+       int i;             // index of the input file this slot reads from
+       uint64_t pos, idx; // packed (tid, pos, strand) sort key; running counter assigned as records are read
+       bam1_t *b;         // current record of that file, 0 once the file is exhausted
+} heap1_t;
+
+/* True when `a` orders after `b`: by pos, then by file index, then by idx. */
+#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
+
+/*
+ * Ordering predicate for the merge heap. In position mode it is __pos_cmp().
+ * In query-name mode a slot without a record compares as 1 against anything;
+ * otherwise names are compared with strnum_cmp() and ties fall back to
+ * __pos_cmp().
+ */
+static inline int heap_lt(const heap1_t a, const heap1_t b)
+{
+       int cmp;
+       if (!g_is_by_qname) return __pos_cmp(a, b);
+       if (a.b == 0) return 1;
+       if (b.b == 0) return 0;
+       cmp = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
+       if (cmp > 0) return 1;
+       return cmp == 0 && __pos_cmp(a, b);
+}
+
+KSORT_INIT(heap, heap1_t, heap_lt)
+
+/* Exchange the text and l_text fields of two headers. */
+static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
+{
+       char *text = h1->text;
+       int l_text = h1->l_text;
+       h1->text = h2->text; h1->l_text = h2->l_text;
+       h2->text = text; h2->l_text = l_text;
+}
+
+/*!
+  @abstract    Merge multiple sorted BAM.
+  @param  by_qname whether to sort by query name
+  @param  out  output BAM file name, or "-" for standard output
+  @param  headers  name of SAM file from which to copy '@' header lines,
+                   or NULL to copy them from the first file to be merged
+  @param  n    number of files to be merged
+  @param  fn   names of files to be merged
+  @param  add_RG  if non-zero, attach an RG tag derived from the input file
+                  name to every record that does not already carry one
+
+  @discussion Padding information may NOT be correctly maintained. This
+  function is NOT thread safe.
+ */
+void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int add_RG)
+{
+       bamFile fpout, *fp;
+       heap1_t *heap;
+       bam_header_t *hout = 0;
+       bam_header_t *hheaders = NULL;
+       int i, j, *RG_len = 0;
+       uint64_t idx = 0;
+       char **RG = 0;
+
+       if (headers) {
+               tamFile fpheaders = sam_open(headers);
+               if (fpheaders == 0) {
+                       fprintf(stderr, "[bam_merge_core] Cannot open file `%s'. Continue anyway.\n", headers);
+               } else {
+                       hheaders = sam_header_read(fpheaders);
+                       sam_close(fpheaders);
+               }
+       }
+
+       g_is_by_qname = by_qname;
+       fp = (bamFile*)calloc(n, sizeof(bamFile));
+       heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+       // prepare RG tag: the file's base name with a trailing ".bam" removed
+       if (add_RG) {
+               RG = (char**)calloc(n, sizeof(void*));
+               RG_len = (int*)calloc(n, sizeof(int));
+               for (i = 0; i != n; ++i) {
+                       int l = strlen(fn[i]);
+                       const char *s = fn[i];
+                       if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
+                       for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
+                       ++j; l -= j;
+                       RG[i] = calloc(l + 1, 1);
+                       RG_len[i] = l;
+                       strncpy(RG[i], s + j, l);
+               }
+       }
+       // read the first
+       for (i = 0; i != n; ++i) {
+               heap1_t *h;
+               bam_header_t *hin;
+               fp[i] = bam_open(fn[i], "r");
+               if (fp[i] == 0) {
+                       fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+                       // release everything acquired so far before bailing out
+                       for (j = 0; j < i; ++j) {
+                               bam_close(fp[j]);
+                               if (heap[j].b) {
+                                       free(heap[j].b->data); free(heap[j].b);
+                               }
+                       }
+                       if (RG) for (j = 0; j != n; ++j) free(RG[j]);
+                       free(RG); free(RG_len);
+                       if (hout) bam_header_destroy(hout);
+                       if (hheaders) bam_header_destroy(hheaders);
+                       free(fp); free(heap);
+                       return;
+               }
+               hin = bam_header_read(fp[i]);
+               if (i == 0) { // the first SAM
+                       hout = hin;
+                       if (hheaders) {
+                               // If the text headers to be swapped in include any @SQ headers,
+                               // check that they are consistent with the existing binary list
+                               // of reference information.
+                               if (hheaders->n_targets > 0) {
+                                       // compare only the names both lists actually have
+                                       int n_cmp = hout->n_targets < hheaders->n_targets? hout->n_targets : hheaders->n_targets;
+                                       if (hout->n_targets != hheaders->n_targets)
+                                               fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences\n", headers);
+                                       for (j = 0; j < n_cmp; ++j)
+                                               if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0)
+                                                       fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
+                               }
+                               swap_header_text(hout, hheaders);
+                               bam_header_destroy(hheaders);
+                               hheaders = NULL;
+                       }
+               } else { // validate multiple baf
+                       if (hout->n_targets != hin->n_targets) {
+                               fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]);
+                               exit(1);
+                       }
+                       for (j = 0; j < hout->n_targets; ++j) {
+                               if (strcmp(hout->target_name[j], hin->target_name[j])) {
+                                       fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n",
+                                                       hout->target_name[j], hin->target_name[j], fn[i]);
+                                       exit(1);
+                               }
+                       }
+                       bam_header_destroy(hin);
+               }
+               h = heap + i;
+               h->i = i;
+               h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               if (bam_read1(fp[i], h->b) >= 0) {
+                       // sort key: tid in the high 32 bits, then pos and strand
+                       h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b);
+                       h->idx = idx++;
+               }
+               else h->pos = HEAP_EMPTY;
+       }
+       fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
+       assert(fpout);
+       bam_header_write(fpout, hout);
+       bam_header_destroy(hout);
+
+       // repeatedly emit the record at the top of the heap and refill its slot
+       ks_heapmake(heap, n, heap);
+       while (heap->pos != HEAP_EMPTY) {
+               bam1_t *b = heap->b;
+               if (add_RG && bam_aux_get(b, "RG") == 0)
+                       bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
+               bam_write1_core(fpout, &b->core, b->data_len, b->data);
+               if ((j = bam_read1(fp[heap->i], b)) >= 0) {
+                       heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b);
+                       heap->idx = idx++;
+               } else if (j == -1) {
+                       heap->pos = HEAP_EMPTY;
+                       free(heap->b->data); free(heap->b);
+                       heap->b = 0;
+               } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+               ks_heapadjust(heap, 0, n, heap);
+       }
+
+       if (add_RG) {
+               for (i = 0; i != n; ++i) free(RG[i]);
+               free(RG); free(RG_len);
+       }
+       for (i = 0; i != n; ++i) bam_close(fp[i]);
+       bam_close(fpout);
+       free(fp); free(heap);
+}
+/*
+ * Command-line entry point for `samtools merge`.
+ *
+ * Options: -n merge by read name, -r attach RG tags inferred from the file
+ * names, -h FILE take the text header from FILE. Requires an output name
+ * followed by at least two input files.
+ *
+ * Returns 0 after calling bam_merge_core(), or 1 on a usage error.
+ */
+int bam_merge(int argc, char *argv[])
+{
+       int c, is_by_qname = 0, add_RG = 0;
+       char *fn_headers = NULL;
+
+       while ((c = getopt(argc, argv, "h:nr")) >= 0) {
+               switch (c) {
+               case 'r': add_RG = 1; break;
+               case 'h':
+                       free(fn_headers); // a repeated -h replaces the earlier value
+                       fn_headers = strdup(optarg);
+                       break;
+               case 'n': is_by_qname = 1; break;
+               }
+       }
+       if (optind + 2 >= argc) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
+               fprintf(stderr, "Options: -n       sort by read names\n");
+               fprintf(stderr, "         -r       attach RG tag (inferred from file names)\n");
+               fprintf(stderr, "         -h FILE  copy the header in FILE to <out.bam> [in1.bam]\n\n");
+               fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
+               fprintf(stderr, "      must provide the correct header with -h, or uses Picard which properly maintains\n");
+               fprintf(stderr, "      the header dictionary in merging.\n\n");
+               free(fn_headers); // do not leak the -h argument on the error path
+               return 1;
+       }
+       bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, add_RG);
+       free(fn_headers);
+       return 0;
+}
+
+typedef bam1_t *bam1_p;
+
+/*
+ * Less-than predicate used to sort records. Records are ordered by the
+ * packed (tid, pos) key; in query-name mode the name compared with
+ * strnum_cmp() takes precedence and the key only breaks ties.
+ */
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{
+       const uint64_t ka = (uint64_t)a->core.tid<<32|a->core.pos;
+       const uint64_t kb = (uint64_t)b->core.tid<<32|b->core.pos;
+       if (g_is_by_qname) {
+               int cmp = strnum_cmp(bam1_qname(a), bam1_qname(b));
+               if (cmp != 0) return cmp < 0;
+       }
+       return ka < kb;
+}
+KSORT_INIT(sort, bam1_p, bam1_lt)
+
+/*
+ * Sort the first `k` records of `buf` and write them, preceded by header
+ * `h`, to one BAM stream: standard output when `is_stdout` is set, otherwise
+ * "<prefix>.bam" when n < 0 or "<prefix>.<n>.bam" (n zero-padded to four
+ * digits) when n >= 0. A failure to create the stream is reported on stderr.
+ */
+static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout)
+{
+       bamFile fp;
+       int idx;
+       char *path = (char*)calloc(strlen(prefix) + 20, 1);
+       ks_mergesort(sort, k, buf, 0);
+       if (n < 0) sprintf(path, "%s.bam", prefix);
+       else sprintf(path, "%s.%.4d.bam", prefix, n);
+       if (is_stdout) fp = bam_dopen(fileno(stdout), "w");
+       else fp = bam_open(path, "w");
+       if (fp == 0)
+               fprintf(stderr, "[sort_blocks] fail to create file %s.\n", path);
+       free(path);
+       if (fp == 0) return;
+       bam_header_write(fp, h);
+       for (idx = 0; idx < k; ++idx) {
+               bam1_p rec = buf[idx];
+               bam_write1_core(fp, &rec->core, rec->data_len, rec->data);
+       }
+       bam_close(fp);
+}
+
+/*!
+  @abstract Sort an unsorted BAM file based on the chromosome order
+  and the leftmost position of an alignment
+
+  @param  is_by_qname whether to sort by query name
+  @param  fn       name of the file to be sorted, or "-" for standard input
+  @param  prefix   prefix of the output and the temporary files; upon
+                   success, prefix.bam will be written.
+  @param  max_mem  approximate maximum memory (very inaccurate)
+  @param  is_stdout if non-zero, write the final sorted result to standard
+                    output instead of prefix.bam
+
+  @discussion It may create multiple temporary subalignment files
+  and then merge them by calling bam_merge_core(). This function is
+  NOT thread safe.
+ */
+void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout)
+{
+       int n, ret, k, i;
+       size_t mem, max_k;
+       bam_header_t *header;
+       bamFile fp;
+       bam1_t *b, **buf;
+
+       g_is_by_qname = is_by_qname;
+       n = k = 0; mem = 0;
+       fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+       if (fp == 0) {
+               fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
+               return;
+       }
+       header = bam_header_read(fp);
+       // capacity of the record buffer; keep at least one slot so buf[0] is always valid
+       max_k = max_mem / BAM_CORE_SIZE;
+       if (max_k == 0) max_k = 1;
+       buf = (bam1_t**)calloc(max_k, sizeof(bam1_t*));
+       // write sub files
+       for (;;) {
+               if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
+               b = buf[k];
+               if ((ret = bam_read1(fp, b)) < 0) break;
+               mem += ret;
+               ++k;
+               // flush when the memory budget is reached or the buffer is full;
+               // intermediate blocks must go to temporary files, never to stdout,
+               // because they are re-opened by name for the merge below
+               if (mem >= max_mem || (size_t)k >= max_k) {
+                       sort_blocks(n++, k, buf, prefix, header, 0);
+                       mem = 0; k = 0;
+               }
+       }
+       if (ret != -1)
+               fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+       if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout);
+       else { // then merge
+               char **fns, *fnout;
+               fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1);
+               sort_blocks(n++, k, buf, prefix, header, 0);
+               fnout = (char*)calloc(strlen(prefix) + 20, 1);
+               if (is_stdout) sprintf(fnout, "-");
+               else sprintf(fnout, "%s.bam", prefix);
+               fns = (char**)calloc(n, sizeof(char*));
+               for (i = 0; i < n; ++i) {
+                       fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+                       sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+               }
+               bam_merge_core(is_by_qname, fnout, 0, n, fns, 0);
+               free(fnout);
+               // the temporary blocks are no longer needed once merged
+               for (i = 0; i < n; ++i) {
+                       unlink(fns[i]);
+                       free(fns[i]);
+               }
+               free(fns);
+       }
+       for (k = 0; (size_t)k < max_k; ++k) {
+               if (buf[k]) {
+                       free(buf[k]->data);
+                       free(buf[k]);
+               }
+       }
+       free(buf);
+       bam_header_destroy(header);
+       bam_close(fp);
+}
+
+/* Convenience wrapper around bam_sort_core_ext() with is_stdout = 0. */
+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
+{
+       bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0);
+}
+
+/*
+ * Command-line entry point for `samtools sort`.
+ * Options: -o write the result to standard output, -n sort by read name,
+ * -m <maxMem> set the memory budget. Needs an input file and an output
+ * prefix; returns 1 on a usage error and 0 otherwise.
+ */
+int bam_sort(int argc, char *argv[])
+{
+       int opt, to_stdout = 0, by_qname = 0;
+       size_t mem_limit = 500000000;
+       while ((opt = getopt(argc, argv, "nom:")) >= 0) {
+               if (opt == 'o') to_stdout = 1;
+               else if (opt == 'n') by_qname = 1;
+               else if (opt == 'm') mem_limit = atol(optarg);
+       }
+       if (argc < optind + 2) {
+               fprintf(stderr, "Usage: samtools sort [-on] [-m <maxMem>] <in.bam> <out.prefix>\n");
+               return 1;
+       }
+       bam_sort_core_ext(by_qname, argv[optind], argv[optind+1], mem_limit, to_stdout);
+       return 0;
+}
diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c
new file mode 100644 (file)
index 0000000..ea9deee
--- /dev/null
@@ -0,0 +1,78 @@
+#include <unistd.h>
+#include <assert.h>
+#include "bam.h"
+
+/* Counters filled in by flagstat_loop(), one increment per matching record. */
+typedef struct {
+       // total records, mapped, paired, paired with both ends mapped, properly paired
+       long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good;
+       // paired reads whose mate is unmapped while they are mapped; read1 / read2 flags
+       long long n_sgltn, n_read1, n_read2;
+       // QC-fail and duplicate flags
+       long long n_qcfail, n_dup;
+       // both ends mapped but mtid != tid; the same restricted to qual >= 5
+       long long n_diffchr, n_diffhigh;
+} bam_flagstat_t;
+
+/* Update the counters in `s` (a bam_flagstat_t pointer) from the flag, tid,
+ * mtid and qual fields of one record core `c`. */
+#define flagstat_loop(s, c) do {                                                                               \
+               ++(s)->n_reads;                                                                                                 \
+               if ((c)->flag & BAM_FPAIRED) {                                                                  \
+                       ++(s)->n_pair_all;                                                                                      \
+                       if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good;           \
+                       if ((c)->flag & BAM_FREAD1) ++(s)->n_read1;                                     \
+                       if ((c)->flag & BAM_FREAD2) ++(s)->n_read2;                                     \
+                       if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn;     \
+                       if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
+                               ++(s)->n_pair_map;                                                                              \
+                               if ((c)->mtid != (c)->tid) {                                                    \
+                                       ++(s)->n_diffchr;                                                                       \
+                                       if ((c)->qual >= 5) ++(s)->n_diffhigh;                          \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped;                                 \
+               if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail;                                   \
+               if ((c)->flag & BAM_FDUP) ++(s)->n_dup;                                                 \
+       } while (0)
+
+/*
+ * Read every record from `fp` and accumulate flag statistics.
+ * Returns a calloc()ed bam_flagstat_t that the caller must free(). A read
+ * status other than -1 at the end is reported on stderr as a possibly
+ * truncated file.
+ */
+bam_flagstat_t *bam_flagstat_core(bamFile fp)
+{
+       bam_flagstat_t *stat = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+       bam1_t *rec = bam_init1();
+       bam1_core_t *core = &rec->core;
+       int status;
+       for (;;) {
+               status = bam_read1(fp, rec);
+               if (status < 0) break;
+               flagstat_loop(stat, core);
+       }
+       bam_destroy1(rec);
+       if (status != -1)
+               fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+       return stat;
+}
+/*
+ * Command-line entry point for `samtools flagstat`.
+ *
+ * Reads the BAM file named by the first positional argument ("-" means
+ * standard input) and prints the flag statistics to standard output.
+ * Returns 0 on success, 1 on a usage error or when the input cannot be
+ * opened.
+ */
+int bam_flagstat(int argc, char *argv[])
+{
+       bamFile fp;
+       bam_header_t *header;
+       bam_flagstat_t *s;
+       if (argc == optind) {
+               fprintf(stderr, "Usage: samtools flagstat <in.bam>\n");
+               return 1;
+       }
+       fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       // report a failed open explicitly; an assert() would vanish under NDEBUG
+       if (fp == 0) {
+               fprintf(stderr, "[bam_flagstat] fail to open file %s\n", argv[optind]);
+               return 1;
+       }
+       // the header has to be consumed before the records can be read
+       header = bam_header_read(fp);
+       s = bam_flagstat_core(fp);
+       printf("%lld in total\n", s->n_reads);
+       printf("%lld QC failure\n", s->n_qcfail);
+       printf("%lld duplicates\n", s->n_dup);
+       printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0);
+       printf("%lld paired in sequencing\n", s->n_pair_all);
+       printf("%lld read1\n", s->n_read1);
+       printf("%lld read2\n", s->n_read2);
+       printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0);
+       printf("%lld with itself and mate mapped\n", s->n_pair_map);
+       printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0);
+       printf("%lld with mate mapped to a different chr\n", s->n_diffchr);
+       printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh);
+       free(s);
+       bam_header_destroy(header);
+       bam_close(fp);
+       return 0;
+}
diff --git a/samtools/bam_tview.c b/samtools/bam_tview.c
new file mode 100644 (file)
index 0000000..4c121e7
--- /dev/null
@@ -0,0 +1,415 @@
+#undef _HAVE_CURSES // recomputed below from _CURSES_LIB
+
+#if _CURSES_LIB == 0 // 0 (an undefined macro also evaluates to 0 here): build without curses
+#elif _CURSES_LIB == 1 // 1: <curses.h>, accepted only when it defines NCURSES_VERSION
+#include <curses.h>
+#ifndef NCURSES_VERSION
+#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled"
+#else
+#define _HAVE_CURSES
+#endif
+#elif _CURSES_LIB == 2 // 2: <xcurses.h>, accepted unconditionally
+#include <xcurses.h>
+#define _HAVE_CURSES
+#else
+#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled"
+#endif
+
+#ifdef _HAVE_CURSES
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include "bam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+
+char bam_aux_getCEi(bam1_t *b, int i); // defined outside this file; tv_pl_func compares the result with '-' to draw a color call as a dot
+char bam_aux_getCSi(bam1_t *b, int i); // defined outside this file; color-space character at read offset i (tv_pl_func treats 0 as "none")
+char bam_aux_getCQi(bam1_t *b, int i); // defined outside this file; color quality at read offset i (tv_pl_func falls back to base quality on 0)
+
+#define TV_MIN_ALNROW 2 /* screen row that pileup level 0 maps to before row_shift is subtracted */
+#define TV_MAX_GOTO  40 /* maximum number of characters accepted by the "Goto" prompt */
+#define TV_LOW_MAPQ  10 /* not referenced anywhere else in this file */
+
+#define TV_COLOR_MAPQ   0 /* values of tview_t.color_for: color by mapping quality, ... */
+#define TV_COLOR_BASEQ  1 /* ... base quality, ... */
+#define TV_COLOR_NUCL   2 /* ... nucleotide, ... */
+#define TV_COLOR_COL    3 /* ... color-space call, ... */
+#define TV_COLOR_COLQ   4 /* ... or color-space quality */
+
+#define TV_BASE_NUCL 0 /* values of tview_t.base_for: show nucleotides ... */
+#define TV_BASE_COLOR_SPACE 1 /* ... or color-space characters */
+
+typedef struct { // all state of one tview session
+       int mrow, mcol; // screen size in rows and columns, taken from getmaxyx()
+       WINDOW *wgoto, *whelp; // pop-up windows for the "Goto" prompt and the help page
+
+       bam_index_t *idx; // index returned by bam_index_load()
+       bam_lplbuf_t *lplbuf; // pileup buffer created with tv_pl_func as its callback
+       bam_header_t *header; // header read from fp
+       bamFile fp; // the open BAM file
+       int curr_tid, left_pos; // target id and 0-based reference position of the leftmost screen column
+       faidx_t *fai; // reference FASTA index; stays 0 when no FASTA file name was given
+       bam_maqcns_t *bmc; // state passed to bam_maqcns_call() for the consensus row
+
+       int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; // next screen column, last drawn position, vertical scroll, TV_BASE_* and TV_COLOR_* modes, dot view, length of ref, and the ins / ref-skip / read-name toggles
+       char *ref; // reference bases of the current window as returned by fai_fetch(), or 0
+} tview_t;
+
+int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{ // pileup callback: draw reference position pos (0-based) with its n pileup entries pl, plus any insertion columns, at screen column tv->ccol; data is the tview_t, tid is unused; always returns 0
+       tview_t *tv = (tview_t*)data;
+       int i, j, c, rb, attr, max_ins = 0;
+       uint32_t call = 0;
+       if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
+       // print reference
+       rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
+       for (i = tv->last_pos + 1; i < pos; ++i) { // positions skipped since the previous call get a reference column only
+               if (i%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", i+1);
+               c = (tv->ref && i - tv->left_pos < tv->l_ref)? tv->ref[i - tv->left_pos] : 'N'; // same bound as rb: never read past the fetched reference
+               mvaddch(1, tv->ccol++, c);
+       }
+       if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1);
+       // print consensus
+       call = bam_maqcns_call(n, pl, tv->bmc);
+       attr = A_UNDERLINE;
+       c = ",ACMGRSVTWYHKDBN"[call>>28&0xf];
+       i = (call>>8&0xff)/10+1;
+       if (i > 4) i = 4;
+       attr |= COLOR_PAIR(i);
+       if (c == toupper(rb)) c = '.';
+       attron(attr);
+       mvaddch(2, tv->ccol, c);
+       attroff(attr);
+       if(tv->ins) {
+               // calculate maximum insert
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
+               }
+       }
+       // core loop
+       for (j = 0; j <= max_ins; ++j) { // j == 0 is the aligned column, j > 0 the j-th inserted column
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       int row = TV_MIN_ALNROW + p->level - tv->row_shift;
+                       if (j == 0) {
+                               if (!p->is_del) {
+                                       if (tv->base_for == TV_BASE_COLOR_SPACE &&
+                                                       (c = bam_aux_getCSi(p->b, p->qpos))) {
+                                               // c already holds the color-space character fetched in the condition above
+                                               // assume that if we found one color, we will be able to get the color error
+                                               if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.';
+                                       } else {
+                                               if (tv->show_name) {
+                                                       char *name = bam1_qname(p->b);
+                                                       c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
+                                               } else {
+                                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+                                                       if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+                                               }
+                                       }
+                               } else c = '*';
+                       } else { // padding
+                               if (j > p->indel) c = '*';
+                               else { // insertion
+                                       if (tv->base_for ==  TV_BASE_NUCL) {
+                                               if (tv->show_name) {
+                                                       char *name = bam1_qname(p->b);
+                                                       c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
+                                               } else {
+                                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+                                                       // j > 0 here: inserted bases are never replaced by dots
+                                               }
+                                       } else {
+                                               c = bam_aux_getCSi(p->b, p->qpos + j);
+                                               if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.';
+                                       }
+                               }
+                       }
+                       if (row > TV_MIN_ALNROW && row < tv->mrow) {
+                               int x;
+                               attr = 0;
+                               if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
+                                               || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE;
+                               if (tv->color_for == TV_COLOR_BASEQ) {
+                                       x = bam1_qual(p->b)[p->qpos]/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               } else if (tv->color_for == TV_COLOR_MAPQ) {
+                                       x = p->b->core.qual/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               } else if (tv->color_for == TV_COLOR_NUCL) {
+                                       x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5;
+                                       attr |= COLOR_PAIR(x);
+                               } else if(tv->color_for == TV_COLOR_COL) {
+                                       x = 0;
+                                       switch(bam_aux_getCSi(p->b, p->qpos)) {
+                                               case '0': x = 0; break;
+                                               case '1': x = 1; break;
+                                               case '2': x = 2; break;
+                                               case '3': x = 3; break;
+                                               case '4': x = 4; break;
+                                               default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break;
+                                       }
+                                       x+=5;
+                                       attr |= COLOR_PAIR(x);
+                               } else if(tv->color_for == TV_COLOR_COLQ) {
+                                       x = bam_aux_getCQi(p->b, p->qpos);
+                                       if(0 == x) x = bam1_qual(p->b)[p->qpos];
+                                       x = x/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               }
+                               attron(attr);
+                               mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); // lower case marks the reverse strand
+                               attroff(attr);
+                       }
+               }
+               c = j? '*' : rb; // the reference row shows '*' above every insertion column
+               if (c == '*') {
+                       attr = COLOR_PAIR(8);
+                       attron(attr);
+                       mvaddch(1, tv->ccol++, c);
+                       attroff(attr);
+               } else mvaddch(1, tv->ccol++, c);
+       }
+       tv->last_pos = pos;
+       return 0;
+}
+
+tview_t *tv_init(const char *fn, const char *fn_fa)
+{ // build a viewer for BAM file fn (its index must load) and optional reference FASTA fn_fa (may be 0), then start curses; exits with status 1 when the index or the BAM file cannot be opened
+       tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t));
+       tv->is_dot = 1;
+       tv->idx = bam_index_load(fn);
+       if (tv->idx == 0) exit(1);
+       tv->fp = bam_open(fn, "r");
+       if (tv->fp == 0) { fprintf(stderr, "[tv_init] fail to open \"%s\".\n", fn); exit(1); } // checked before the first use of fp; an assert() disappears under NDEBUG
+       bgzf_set_cache_size(tv->fp, 8 * 1024 *1024);
+       tv->header = bam_header_read(tv->fp);
+       tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
+       if (fn_fa) tv->fai = fai_load(fn_fa);
+       tv->bmc = bam_maqcns_init();
+       tv->ins = 1;
+       bam_maqcns_prepare(tv->bmc);
+
+       initscr();
+       keypad(stdscr, TRUE);
+       clear();
+       noecho();
+       cbreak();
+       tv->mrow = 24; tv->mcol = 80; // defaults, overwritten by getmaxyx() on the next line
+       getmaxyx(stdscr, tv->mrow, tv->mcol);
+       tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
+       tv->whelp = newwin(29, 40, 5, 5);
+       tv->color_for = TV_COLOR_MAPQ;
+       start_color();
+       init_pair(1, COLOR_BLUE, COLOR_BLACK); // pairs 1-4: the four quality bins, lowest first
+       init_pair(2, COLOR_GREEN, COLOR_BLACK);
+       init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+       init_pair(4, COLOR_WHITE, COLOR_BLACK);
+       init_pair(5, COLOR_GREEN, COLOR_BLACK); // pairs 5-9: nucleotide / color-space index + 5; pair 8 also marks '*' on the reference row
+       init_pair(6, COLOR_CYAN, COLOR_BLACK);
+       init_pair(7, COLOR_YELLOW, COLOR_BLACK);
+       init_pair(8, COLOR_RED, COLOR_BLACK);
+       init_pair(9, COLOR_BLUE, COLOR_BLACK);
+       return tv;
+}
+
+void tv_destroy(tview_t *tv)
+{ // shut curses down first, then release everything the viewer holds, and finally tv itself
+       delwin(tv->wgoto);
+       delwin(tv->whelp);
+       endwin();
+       bam_lplbuf_destroy(tv->lplbuf);
+       bam_maqcns_destroy(tv->bmc);
+       bam_index_destroy(tv->idx);
+       if (tv->fai != 0) fai_destroy(tv->fai); // the FASTA index exists only when a reference was given
+       free(tv->ref);
+       bam_header_destroy(tv->header);
+       bam_close(tv->fp);
+       free(tv);
+}
+
+int tv_fetch_func(const bam1_t *b, void *data)
+{ // bam_fetch() callback: push alignment b into the pileup buffer of the viewer passed as data; always returns 0
+       tview_t *view = (tview_t*)data;
+       if (view->no_skip) {
+               // no-skip mode: every reference-skip CIGAR operation is rewritten in place as a
+               // deletion with the same length bits, even though b is declared const
+               uint32_t *op = bam1_cigar(b), *stop = op + b->core.n_cigar;
+               for (; op != stop; ++op)
+                       if ((*op & 0xf) == BAM_CREF_SKIP)
+                               *op = (*op & ~(uint32_t)0xf) | BAM_CDEL;
+       }
+       bam_lplbuf_push(b, view->lplbuf);
+       return 0;
+}
+
+int tv_draw_aln(tview_t *tv, int tid, int pos)
+{ // redraw the whole screen for target tid with 0-based position pos in the leftmost column; always returns 0
+       // reset
+       clear();
+       tv->curr_tid = tid; tv->left_pos = pos;
+       tv->last_pos = tv->left_pos - 1;
+       tv->ccol = 0;
+       // print ref and consensus
+       if (tv->fai) {
+               char *str;
+               if (tv->ref) free(tv->ref);
+               str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); // room for ":", "-" and two decimal ints
+               sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol);
+               tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
+               free(str);
+       }
+       // draw aln
+       bam_lplbuf_reset(tv->lplbuf);
+       bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
+       bam_lplbuf_push(0, tv->lplbuf); // a null alignment is pushed after the last fetched one; presumably this flushes the buffer -- TODO confirm
+
+       while (tv->ccol < tv->mcol) { // fill the columns to the right of the last pileup position with the reference only
+               int cur = tv->last_pos + 1;
+               if (cur%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", cur+1);
+               mvaddch(1, tv->ccol++, (tv->ref && cur - tv->left_pos < tv->l_ref)? tv->ref[cur - tv->left_pos] : 'N'); // l_ref bounds the window-relative index, as in tv_pl_func
+               ++tv->last_pos;
+       }
+       return 0;
+}
+
+static void tv_win_goto(tview_t *tv, int *tid, int *pos)
+{ // run the "Goto" prompt: Enter on a region that parses stores its target id and start in *tid / *pos and returns; Esc returns without touching them
+       char str[256] = ""; // starts empty, so Enter as the very first key parses a valid (empty) string
+       int i, l = 0;
+       wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
+       mvwprintw(tv->wgoto, 1, 2, "Goto: ");
+       for (;;) {
+               int c = wgetch(tv->wgoto);
+               wrefresh(tv->wgoto);
+               if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
+                       if (l > 0) --l; // nothing to erase on an empty buffer; l must never index before str[0]
+               } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
+                       int _tid = -1, _beg, _end;
+                       bam_parse_region(tv->header, str, &_tid, &_beg, &_end);
+                       if (_tid >= 0) { // _tid keeps its -1 initial value unless the parser assigned a target
+                               *tid = _tid; *pos = _beg;
+                               return;
+                       }
+               } else if (isgraph(c)) {
+                       if (l < TV_MAX_GOTO) str[l++] = c;
+               } else if (c == '\027') l = 0; // ctrl-W clears the input
+               else if (c == '\033') return; // Esc cancels
+               str[l] = '\0';
+               for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
+               mvwprintw(tv->wgoto, 1, 8, "%s", str);
+       }
+}
+
+static void tv_win_help(tview_t *tv) {
+       static const char *const rows[] = { // text of help-window rows 1, 2, 3, ...; a 0 entry leaves its row untouched
+               "        -=-    Help    -=- ", 0,
+               "?          This window",
+               "Arrows     Small scroll movement",
+               "h,j,k,l    Small scroll movement",
+               "H,J,K,L    Large scroll movement",
+               "ctrl-H     Scroll 1k left",
+               "ctrl-L     Scroll 1k right",
+               "space      Scroll one screen",
+               "backspace  Scroll back one screen",
+               "g          Go to specific location",
+               "m          Color for mapping qual",
+               "n          Color for nucleotide",
+               "b          Color for base quality",
+               "c          Color for cs color",
+               "z          Color for cs qual",
+               ".          Toggle on/off dot view",
+               "s          Toggle on/off ref skip",
+               "r          Toggle on/off rd name",
+               "N          Turn on nt view",
+               "C          Turn on cs view",
+               "i          Toggle on/off ins",
+               "q          Exit", 0,
+               "Underline:      Secondary or orphan",
+               "Blue:    0-9    Green: 10-19",
+               "Yellow: 20-29   White: >=30"
+       };
+       int i;
+       wborder(tv->whelp, '|', '|', '-', '-', '+', '+', '+', '+');
+       for (i = 0; i < (int)(sizeof(rows) / sizeof(rows[0])); ++i) if (rows[i]) mvwprintw(tv->whelp, i + 1, 2, "%s", rows[i]);
+       wrefresh(tv->whelp);
+       wgetch(tv->whelp); // keep the page up until any key is pressed
+}
+
+void tv_loop(tview_t *tv)
+{ // interactive loop: read a key, update the view state, redraw; returns on 'q' or Esc
+       int tid = tv->curr_tid;
+       int pos = tv->left_pos;
+       for (;;) {
+               // a key without a binding skips the clamping and the redraw below
+               int key = getch();
+               switch (key) {
+                       case '?': tv_win_help(tv); break;
+                       case 'q':
+                       case '\033': return;
+                       case 'g': tv_win_goto(tv, &tid, &pos); break;
+                       case 'm': tv->color_for = TV_COLOR_MAPQ; break;
+                       case 'b': tv->color_for = TV_COLOR_BASEQ; break;
+                       case 'n': tv->color_for = TV_COLOR_NUCL; break;
+                       case 'c': tv->color_for = TV_COLOR_COL; break;
+                       case 'z': tv->color_for = TV_COLOR_COLQ; break;
+                       case 's': tv->no_skip = !tv->no_skip; break;
+                       case 'r': tv->show_name = !tv->show_name; break;
+                       case 'h':
+                       case KEY_LEFT: --pos; break;
+                       case 'l':
+                       case KEY_RIGHT: ++pos; break;
+                       case 'H':
+                       case KEY_SLEFT: pos -= 20; break;
+                       case 'L':
+                       case KEY_SRIGHT: pos += 20; break;
+                       case '.': tv->is_dot = !tv->is_dot; break;
+                       case 'N': tv->base_for = TV_BASE_NUCL; break;
+                       case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
+                       case 'i': tv->ins = !tv->ins; break;
+                       case '\010': pos -= 1000; break;
+                       case '\014': pos += 1000; break;
+                       case ' ': pos += tv->mcol; break;
+                       case 'j':
+                       case KEY_UP: --tv->row_shift; break;
+                       case 'k':
+                       case KEY_DOWN: ++tv->row_shift; break;
+                       case '\177':
+                       case KEY_BACKSPACE: pos -= tv->mcol; break;
+                       case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
+                       default: continue;
+               }
+               // neither the scroll offset nor the position may go negative
+               if (tv->row_shift < 0) tv->row_shift = 0;
+               if (pos < 0) pos = 0;
+               tv_draw_aln(tv, tid, pos);
+       }
+}
+
+int bam_tview_main(int argc, char *argv[])
+{ // tview entry point: argv[1] is the BAM file, argv[2] an optional reference FASTA; returns 1 after the usage message, otherwise 0
+       tview_t *view;
+       if (argc != 1) {
+               view = tv_init(argv[1], (argc != 2)? argv[2] : 0);
+               tv_draw_aln(view, 0, 0); // first screen: target 0, position 0
+               tv_loop(view);
+               tv_destroy(view);
+               return 0;
+       }
+       fprintf(stderr, "Usage: bamtk tview <aln.bam> [ref.fasta]\n");
+       return 1;
+}
+#else // #ifdef _HAVE_CURSES
+#include <stdio.h>
+#warning "No curses library is available; tview is disabled."
+int bam_tview_main(int argc, char *argv[])
+{ // stand-in used when _HAVE_CURSES is not defined: report that tview is unavailable and fail
+       fputs("[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n", stderr);
+       return 1;
+}
+#endif // #ifdef _HAVE_CURSES
diff --git a/samtools/bgzf.c b/samtools/bgzf.c
new file mode 100644 (file)
index 0000000..59f902f
--- /dev/null
@@ -0,0 +1,683 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/*
+  2009-06-29 by lh3: cache recent uncompressed blocks.
+  2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
+  2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+#include "khash.h"
+typedef struct { // one cached, already-inflated block
+       int size; // fp->block_length at the time the block was cached
+       uint8_t *block; // malloc'ed MAX_BLOCK_SIZE-byte copy of fp->uncompressed_block
+       int64_t end_offset; // block address plus the bytes read for it; the file is repositioned here on a cache hit
+} cache_t;
+KHASH_MAP_INIT_INT64(cache, cache_t) // instantiates the "cache" hash map used below: int64 block address -> cache_t
+
+#if defined(_WIN32) || defined(_MSC_VER) // on these platforms fall back to the plain ftell/fseek
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else // elsewhere declare the off_t variants explicitly
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
+
+typedef int8_t bgzf_byte_t; // signed byte: check_header casts GZIP_ID2 to this type before comparing
+
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024; // uncompressed_block_size given to writers
+static const int MAX_BLOCK_SIZE = 64 * 1024; // size of every compressed/uncompressed/cache buffer allocated in this file
+
+static const int BLOCK_HEADER_LENGTH = 18; // bytes written before the deflate data (buffer[0..17] in deflate_block)
+static const int BLOCK_FOOTER_LENGTH = 8; // bytes after the deflate data: 32-bit CRC, then 32-bit input length
+
+static const int GZIP_ID1 = 31; // header byte 0
+static const int GZIP_ID2 = 139; // header byte 1
+static const int CM_DEFLATE = 8; // header byte 2
+static const int FLG_FEXTRA = 4; // header byte 3; check_header requires this bit
+static const int OS_UNKNOWN = 255; // header byte 9
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2; // header bytes 14-15: length of the block-size field stored at bytes 16-17
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8; // memLevel argument of deflateInit2
+
+
+static inline // static: a plain C99 "inline" definition provides no out-of-line symbol for calls that are not inlined
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{ // store value into buffer[0..1], least significant byte first
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+}
+
+static inline // static: a plain C99 "inline" definition provides no out-of-line symbol for calls that are not inlined
+int
+unpackInt16(const uint8_t* buffer)
+{ // read buffer[0..1] as an unsigned 16-bit value, least significant byte first (result 0..65535)
+    return (buffer[0] | (buffer[1] << 8));
+}
+
+static inline // static: a plain C99 "inline" definition provides no out-of-line symbol for calls that are not inlined
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{ // store value into buffer[0..3], least significant byte first
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+    buffer[2] = value >> 16;
+    buffer[3] = value >> 24;
+}
+
+static inline
+int
+bgzf_min(int x, int y)
+{ // the smaller of the two arguments
+    return (y > x) ? x : y;
+}
+
+static void
+report_error(BGZF* fp, const char* message)
+{ // remember message as the handle's current error text; the pointer is stored, not copied, and nothing is printed
+    fp->error = message;
+}
+
+static BGZF *bgzf_read_init()
+{ // allocate a zeroed reader handle with both block buffers and an empty block cache; the caller fills in the file fields
+       BGZF *reader = calloc(1, sizeof(BGZF));
+       // both buffers are MAX_BLOCK_SIZE bytes
+       reader->uncompressed_block_size = MAX_BLOCK_SIZE;
+       reader->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+       reader->compressed_block_size = MAX_BLOCK_SIZE;
+       reader->compressed_block = malloc(MAX_BLOCK_SIZE);
+       reader->cache = kh_init(cache);
+       reader->cache_size = 0; // cache_block stores nothing while cache_size is not above MAX_BLOCK_SIZE
+       return reader;
+}
+
+static
+BGZF*
+open_read(int fd)
+{ // wrap descriptor fd in a read-mode handle; returns 0 when no stream can be created on fd
+    BGZF* reader;
+#ifdef _USE_KNETFILE
+    knetFile *stream = knet_dopen(fd, "r");
+#else
+    FILE* stream = fdopen(fd, "r");
+#endif
+    if (stream == 0) return 0;
+    reader = bgzf_read_init();
+    reader->open_mode = 'r';
+    reader->file_descriptor = fd;
+#ifdef _USE_KNETFILE
+    reader->x.fpr = stream;
+#else
+    reader->file = stream;
+#endif
+    return reader;
+}
+
+static
+BGZF*
+open_write(int fd, bool is_uncompressed)
+{ // wrap descriptor fd in a write-mode handle; is_uncompressed selects compression level 0 in deflate_block; returns 0 when fdopen fails
+    FILE* file = fdopen(fd, "w");
+    BGZF* fp;
+       if (file == 0) return 0;
+       fp = calloc(1, sizeof(BGZF)); // zeroed so that fields not assigned below (cache, cache_size) hold defined values
+    fp->file_descriptor = fd;
+    fp->open_mode = 'w';
+    fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
+#ifdef _USE_KNETFILE
+    fp->x.fpw = file;
+#else
+    fp->file = file;
+#endif
+    fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+    fp->uncompressed_block = NULL; // not allocated here; presumably created on first write -- TODO confirm
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{ // open path for reading (mode starts with r/R) or writing (w/W; a 'u' anywhere in mode means uncompressed); returns NULL on failure or for any other mode
+    BGZF* fp = NULL;
+    if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
+#ifdef _USE_KNETFILE
+               knetFile *file = knet_open(path, mode);
+               if (file == 0) return 0;
+               fp = bgzf_read_init();
+               fp->file_descriptor = -1;
+               fp->open_mode = 'r';
+               fp->x.fpr = file;
+#else
+               int fd, oflag = O_RDONLY;
+#ifdef _WIN32
+               oflag |= O_BINARY;
+#endif
+               fd = open(path, oflag);
+               if (fd == -1) return 0;
+        fp = open_read(fd);
+               if (fp == NULL) close(fd); // no handle took the descriptor over, so release it here
+#endif
+    } else if (mode[0] == 'w' || mode[0] == 'W') {
+               int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+               oflag |= O_BINARY;
+#endif
+               fd = open(path, oflag, 0666);
+               if (fd == -1) return 0;
+        fp = open_write(fd, strstr(mode, "u")? 1 : 0);
+               if (fp == NULL) close(fd); // same as above for the write path
+    }
+    if (fp != NULL) fp->owned_file = 1; // set only for handles opened by path; bgzf_fdopen leaves it 0
+    return fp;
+}
+
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{ // wrap an already open descriptor: r/R reads, w/W writes ('u' anywhere in mode: uncompressed); NULL for fd == -1 or any other mode
+    if (fd == -1) return 0;
+    switch (mode[0]) {
+    case 'r': case 'R':
+        return open_read(fd);
+    case 'w': case 'W':
+        return open_write(fd, strstr(mode, "u")? 1 : 0);
+    }
+    return NULL;
+}
+
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{ // returns the total size of the block written to fp->compressed_block, or -1 with fp->error set; input that did not fit is moved to the front of fp->uncompressed_block and fp->block_offset is set to its length
+    // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+    // Also adds an extra field that stores the compressed block length.
+
+    bgzf_byte_t* buffer = fp->compressed_block;
+    int buffer_size = fp->compressed_block_size;
+
+    // Init gzip header
+    buffer[0] = GZIP_ID1;
+    buffer[1] = GZIP_ID2;
+    buffer[2] = CM_DEFLATE;
+    buffer[3] = FLG_FEXTRA;
+    buffer[4] = 0; // mtime
+    buffer[5] = 0;
+    buffer[6] = 0;
+    buffer[7] = 0;
+    buffer[8] = 0;
+    buffer[9] = OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN; // bytes 10-11 and 14-15 are read back as 16-bit values by check_header
+    buffer[11] = 0;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+    buffer[15] = 0;
+    buffer[16] = 0; // placeholder for block length
+    buffer[17] = 0;
+
+    // loop to retry for blocks that do not compress enough
+    int input_length = block_length;
+    int compressed_length = 0;
+    while (1) {
+               int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
+        z_stream zs;
+        zs.zalloc = NULL;
+        zs.zfree = NULL;
+        zs.next_in = fp->uncompressed_block;
+        zs.avail_in = input_length;
+        zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+        zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; // leave room for the footer written after the loop
+
+        int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
+                                  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (status != Z_OK) {
+            report_error(fp, "deflate init failed");
+            return -1;
+        }
+        status = deflate(&zs, Z_FINISH);
+        if (status != Z_STREAM_END) {
+            deflateEnd(&zs);
+            if (status == Z_OK) {
+                // Not enough space in buffer.
+                // Can happen in the rare case the input doesn't compress enough.
+                // Reduce the amount of input until it fits.
+                input_length -= 1024;
+                if (input_length <= 0) {
+                    // should never happen
+                    report_error(fp, "input reduction failed");
+                    return -1;
+                }
+                continue;
+            }
+            report_error(fp, "deflate failed");
+            return -1;
+        }
+        status = deflateEnd(&zs);
+        if (status != Z_OK) {
+            report_error(fp, "deflate end failed");
+            return -1;
+        }
+        compressed_length = zs.total_out;
+        compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if (compressed_length > MAX_BLOCK_SIZE) {
+            // should never happen
+            report_error(fp, "deflate overflow");
+            return -1;
+        }
+        break;
+    }
+
+    packInt16((uint8_t*)&buffer[16], compressed_length-1); // stored as total block size minus one; read_block adds the one back
+    uint32_t crc = crc32(0L, NULL, 0L);
+    crc = crc32(crc, fp->uncompressed_block, input_length);
+    packInt32((uint8_t*)&buffer[compressed_length-8], crc); // footer: CRC of the uncompressed input ...
+    packInt32((uint8_t*)&buffer[compressed_length-4], input_length); // ... followed by its length
+
+    int remaining = block_length - input_length;
+    if (remaining > 0) {
+        if (remaining > input_length) {
+            // should never happen (check so we can use memcpy)
+            report_error(fp, "remainder too large");
+            return -1;
+        }
+        memcpy(fp->uncompressed_block,
+               fp->uncompressed_block + input_length,
+               remaining);
+    }
+    fp->block_offset = remaining;
+    return compressed_length;
+}
+
+static
+int
+inflate_block(BGZF* fp, int block_length)
+{
+    // Inflate the block_length-byte block in fp->compressed_block into fp->uncompressed_block.
+    // Returns the number of bytes produced, or -1 with fp->error set.
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.next_in = fp->compressed_block + BLOCK_HEADER_LENGTH; // the deflate data starts right after the block header
+    zs.avail_in = block_length - BLOCK_HEADER_LENGTH; // everything left in this block; "- 16" offered zlib two bytes beyond it
+    zs.next_out = fp->uncompressed_block;
+    zs.avail_out = fp->uncompressed_block_size;
+
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if (status != Z_OK) {
+        report_error(fp, "inflate init failed");
+        return -1;
+    }
+    status = inflate(&zs, Z_FINISH); // the whole block must inflate in this single call
+    if (status != Z_STREAM_END) {
+        inflateEnd(&zs);
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    status = inflateEnd(&zs);
+    if (status != Z_OK) {
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    return zs.total_out;
+}
+
+static
+int
+check_header(const bgzf_byte_t* header)
+{ // 1 when the 18-byte block header has the expected gzip and BGZF fields, otherwise 0
+    const uint8_t* raw = (const uint8_t*)header;
+    if (header[0] != GZIP_ID1 || header[1] != (bgzf_byte_t) GZIP_ID2) return 0; // gzip identification bytes
+    if (header[2] != Z_DEFLATED) return 0; // compression method
+    if ((header[3] & FLG_FEXTRA) == 0) return 0; // the extra-field flag must be set
+    if (unpackInt16(&raw[10]) != BGZF_XLEN) return 0; // length of the whole extra field
+    if (header[12] != BGZF_ID1 || header[13] != BGZF_ID2) return 0; // subfield identifier 'B','C'
+    if (unpackInt16(&raw[14]) != BGZF_LEN) return 0; // length of the subfield payload
+    return 1;
+}
+
+static void free_cache(BGZF *fp)
+{ // free every cached block and then the hash table itself; does nothing unless the handle is in read mode
+       khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+       khint_t slot;
+       if (fp->open_mode != 'r') return;
+       for (slot = kh_begin(table); slot != kh_end(table); ++slot)
+               if (kh_exist(table, slot)) free(kh_val(table, slot).block);
+       kh_destroy(cache, table);
+}
+
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{ // cache hit: make the cached block current, reposition the file at its end_offset and return its size; miss: return 0
+       khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+       khint_t slot;
+       cache_t *hit;
+       slot = kh_get(cache, table, block_address);
+       if (slot == kh_end(table)) return 0;
+       hit = &kh_val(table, slot);
+       memcpy(fp->uncompressed_block, hit->block, MAX_BLOCK_SIZE);
+       if (fp->block_length != 0) fp->block_offset = 0; // tested against the old length, so this must precede the assignment below
+       fp->block_length = hit->size;
+       fp->block_address = block_address;
+#ifdef _USE_KNETFILE
+       knet_seek(fp->x.fpr, hit->end_offset, SEEK_SET);
+#else
+       fseeko(fp->file, hit->end_offset, SEEK_SET);
+#endif
+       return hit->size;
+}
+
+static void cache_block(BGZF *fp, int size)
+{ // remember the current uncompressed block under fp->block_address; size is the number of file bytes read for it
+       int ret;
+       khint_t k;
+       cache_t *p;
+       khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+       if (MAX_BLOCK_SIZE >= fp->cache_size || fp->block_length <= 0) return; // cache disabled, or an empty block whose size 0 would make a later hit look like a miss
+       if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
+               /* A better way would be to remove the oldest block in the cache, but here
+                * the first occupied bucket is dropped for simplicity. */
+               for (k = kh_begin(h); k < kh_end(h); ++k)
+                       if (kh_exist(h, k)) break;
+               if (k < kh_end(h)) {
+                       free(kh_val(h, k).block);
+                       kh_del(cache, h, k);
+               }
+       }
+       k = kh_put(cache, h, fp->block_address, &ret);
+       if (ret == 0) return; // if this happens, a bug!
+       p = &kh_val(h, k);
+       p->size = fp->block_length;
+       p->end_offset = fp->block_address + size;
+       p->block = malloc(MAX_BLOCK_SIZE);
+       if (p->block == 0) { kh_del(cache, h, k); return; } // out of memory: simply do not cache this block
+       memcpy(p->block, fp->uncompressed_block, MAX_BLOCK_SIZE);
+}
+
+static
+int
+read_block(BGZF* fp)
+{ // load the block at the current file position, from the cache if possible; returns 0 on success (fp->block_length set to 0 at end of file), -1 with fp->error set on failure
+    bgzf_byte_t header[BLOCK_HEADER_LENGTH];
+       int size = 0;
+#ifdef _USE_KNETFILE
+    int64_t block_address = knet_tell(fp->x.fpr);
+       if (load_block_from_cache(fp, block_address)) return 0;
+    int count = knet_read(fp->x.fpr, header, sizeof(header));
+#else
+    int64_t block_address = ftello(fp->file);
+       if (load_block_from_cache(fp, block_address)) return 0;
+    int count = fread(header, 1, sizeof(header), fp->file);
+#endif
+    if (count == 0) {
+        fp->block_length = 0;
+        return 0;
+    }
+       size = count;
+    if (count != sizeof(header)) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    int block_length = unpackInt16((uint8_t*)&header[16]) + 1; // the writer stored the total block size minus one
+    if (!check_header(header) || block_length < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) { // a shorter length would make "remaining" negative and the read below overflow the buffer
+        report_error(fp, "invalid block header");
+        return -1;
+    }
+    bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    int remaining = block_length - BLOCK_HEADER_LENGTH;
+#ifdef _USE_KNETFILE
+    count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+#else
+    count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
+#endif
+    if (count != remaining) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+       size += count;
+    count = inflate_block(fp, block_length);
+    if (count < 0) {
+        return -1;
+    }
+    if (fp->block_length != 0) {
+        // Do not reset offset if this read follows a seek.
+        fp->block_offset = 0;
+    }
+    fp->block_address = block_address;
+    fp->block_length = count;
+       cache_block(fp, size); // size is the number of file bytes consumed: header plus the rest of the block
+    return 0;
+}
+
+int
+bgzf_read(BGZF* fp, void* data, int length)
+{
+    if (length <= 0) {
+        return 0;
+    }
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for reading");
+        return -1;
+    }
+
+    int bytes_read = 0;
+    bgzf_byte_t* output = data;
+    while (bytes_read < length) {
+        int available = fp->block_length - fp->block_offset;
+        if (available <= 0) {
+            if (read_block(fp) != 0) {
+                return -1;
+            }
+            available = fp->block_length - fp->block_offset;
+            if (available <= 0) {
+                break;
+            }
+        }
+        int copy_length = bgzf_min(length-bytes_read, available);
+        bgzf_byte_t* buffer = fp->uncompressed_block;
+        memcpy(output, buffer + fp->block_offset, copy_length);
+        fp->block_offset += copy_length;
+        output += copy_length;
+        bytes_read += copy_length;
+    }
+    if (fp->block_offset == fp->block_length) {
+#ifdef _USE_KNETFILE
+        fp->block_address = knet_tell(fp->x.fpr);
+#else
+        fp->block_address = ftello(fp->file);
+#endif
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+    return bytes_read;
+}
+
+/*
+ * Compress and write out everything held in the uncompressed buffer.
+ *
+ * Loops until fp->block_offset is back to zero, advancing
+ * fp->block_address by the size of each compressed block written.
+ * Returns 0 on success and -1 when compression or the write fails.
+ */
+static
+int
+flush_block(BGZF* fp)
+{
+    for (; fp->block_offset > 0; ) {
+        int written;
+        int compressed_len = deflate_block(fp, fp->block_offset);
+        if (compressed_len < 0) return -1;
+#ifdef _USE_KNETFILE
+        written = fwrite(fp->compressed_block, 1, compressed_len, fp->x.fpw);
+#else
+        written = fwrite(fp->compressed_block, 1, compressed_len, fp->file);
+#endif
+        if (written != compressed_len) {
+            report_error(fp, "write failed");
+            return -1;
+        }
+        fp->block_address += compressed_len;
+    }
+    return 0;
+}
+
+/*
+ * Append `length` bytes from `data` to the stream.
+ *
+ * Data is staged in fp->uncompressed_block (allocated on first use) and a
+ * block is flushed each time the staging buffer fills up.
+ *
+ * Returns the number of bytes accepted. Returns -1 when the handle is not
+ * open for writing or the staging buffer cannot be allocated. If flushing
+ * a full block fails, copying stops and the count accepted so far is
+ * returned.
+ */
+int
+bgzf_write(BGZF* fp, const void* data, int length)
+{
+    if (fp->open_mode != 'w') {
+        report_error(fp, "file not open for writing");
+        return -1;
+    }
+
+    if (fp->uncompressed_block == NULL) {
+        fp->uncompressed_block = malloc(fp->uncompressed_block_size);
+        if (fp->uncompressed_block == NULL) {
+            // Without a staging buffer the memcpy below would write
+            // through a null pointer.
+            report_error(fp, "out of memory");
+            return -1;
+        }
+    }
+
+    const bgzf_byte_t* input = data;
+    int block_length = fp->uncompressed_block_size;
+    int bytes_written = 0;
+    while (bytes_written < length) {
+        int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
+        bgzf_byte_t* buffer = fp->uncompressed_block;
+        memcpy(buffer + fp->block_offset, input, copy_length);
+        fp->block_offset += copy_length;
+        input += copy_length;
+        bytes_written += copy_length;
+        if (fp->block_offset == block_length) {
+            // The staging buffer is full: compress it and write it out.
+            if (flush_block(fp) != 0) {
+                break;
+            }
+        }
+    }
+    return bytes_written;
+}
+
+int
+bgzf_close(BGZF* fp)
+{
+    if (fp->open_mode == 'w') {
+        if (flush_block(fp) != 0) {
+            return -1;
+        }
+               { // add an empty block
+                       int count, block_length = deflate_block(fp, 0);
+#ifdef _USE_KNETFILE
+                       count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
+#else
+                       count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+#endif
+               }
+#ifdef _USE_KNETFILE
+        if (fflush(fp->x.fpw) != 0) {
+#else
+        if (fflush(fp->file) != 0) {
+#endif
+            report_error(fp, "flush failed");
+            return -1;
+        }
+    }
+    if (fp->owned_file) {
+#ifdef _USE_KNETFILE
+               int ret;
+               if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
+               else ret = knet_close(fp->x.fpr);
+        if (ret != 0) return -1;
+#else
+        if (fclose(fp->file) != 0) {
+            return -1;
+        }
+#endif
+    }
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+       free_cache(fp);
+    free(fp);
+    return 0;
+}
+
+/*
+ * Return the virtual file offset of the handle: the address of the
+ * current block in the upper bits and the low 16 bits of the offset
+ * within that block in the lower bits.
+ */
+int64_t
+bgzf_tell(BGZF* fp)
+{
+    const int64_t block_part = fp->block_address << 16;
+    const int within_block = fp->block_offset & 0xFFFF;
+    return block_part | within_block;
+}
+
+/* Record the cache size limit on the handle; a null handle is ignored. */
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+       if (fp == 0) return;
+       fp->cache_size = cache_size;
+}
+
+/*
+ * Check whether the last 28 bytes of the file equal the fixed end-of-file
+ * marker block.
+ *
+ * The file position in effect on entry is restored before returning.
+ * Returns 1 when the marker is present, 0 when it is absent (including
+ * when fewer than 28 bytes could be read), and -1 when seeking to the end
+ * of the file fails.
+ */
+int bgzf_check_EOF(BGZF *fp)
+{
+       static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
+       uint8_t buf[28];
+       off_t offset;
+       int n_read;
+#ifdef _USE_KNETFILE
+       offset = knet_tell(fp->x.fpr);
+       if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
+       n_read = (int)knet_read(fp->x.fpr, buf, 28);
+       knet_seek(fp->x.fpr, offset, SEEK_SET);
+#else
+       offset = ftello(fp->file);
+       if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
+       n_read = (int)fread(buf, 1, 28, fp->file);
+       fseeko(fp->file, offset, SEEK_SET);
+#endif
+       // A short read leaves part of buf uninitialised; comparing it
+       // would give an unpredictable answer, so report "no marker".
+       if (n_read != 28) return 0;
+       return (memcmp(magic, buf, 28) == 0)? 1 : 0;
+}
+
+int64_t
+bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for read");
+        return -1;
+    }
+    if (where != SEEK_SET) {
+        report_error(fp, "unimplemented seek option");
+        return -1;
+    }
+    int block_offset = pos & 0xFFFF;
+    int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+#ifdef _USE_KNETFILE
+    if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
+#else
+    if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
+#endif
+        report_error(fp, "seek failed");
+        return -1;
+    }
+    fp->block_length = 0;  // indicates current block is not loaded
+    fp->block_address = block_address;
+    fp->block_offset = block_offset;
+    return 0;
+}
diff --git a/samtools/bgzf.h b/samtools/bgzf.h
new file mode 100644 (file)
index 0000000..91b3317
--- /dev/null
@@ -0,0 +1,134 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <zlib.h>
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+//typedef int8_t bool;
+
+/*
+ * State of one open BGZF stream, used either for reading or for writing.
+ */
+typedef struct {
+    int file_descriptor; // NOTE(review): presumably the descriptor passed to bgzf_fdopen -- confirm
+    char open_mode;  // 'r' or 'w'
+    bool owned_file, is_uncompressed; // owned_file: bgzf_close closes the underlying file only when set
+#ifdef _USE_KNETFILE
+       union {
+               knetFile *fpr; // underlying stream when open_mode is 'r'
+               FILE *fpw;     // underlying stream when open_mode is 'w'
+       } x;
+#else
+    FILE* file; // underlying stream for both modes
+#endif
+    int uncompressed_block_size; // capacity of uncompressed_block in bytes
+    int compressed_block_size;   // NOTE(review): presumably the capacity of compressed_block -- confirm
+    void* uncompressed_block; // decompressed data (read) or data staged for compression (write)
+    void* compressed_block;   // raw block as read from / written to the file
+    int64_t block_address; // file offset of the current block
+    int block_length;      // reading: uncompressed bytes in the current block; 0 means no block is loaded
+    int block_offset;      // position within uncompressed_block
+       int cache_size;        // cache size limit in bytes, see bgzf_set_cache_size
+    const char* error;     // NOTE(review): presumably the last message passed to report_error -- confirm
+       void *cache; // a pointer to a hash table
+} BGZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Open an existing file descriptor for reading or writing.
+ * Mode must be either "r" or "w".
+ * A subsequent bgzf_close will not close the file descriptor.
+ * Returns null on error.
+ */
+BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
+
+/*
+ * Open the specified file for reading or writing.
+ * Mode must be either "r" or "w".
+ * Returns null on error.
+ */
+BGZF* bgzf_open(const char* path, const char* __restrict mode);
+
+/*
+ * Close the BGZ file and free all associated resources.
+ * Does not close the underlying file descriptor if created with bgzf_fdopen.
+ * Returns zero on success, -1 on error.
+ */
+int bgzf_close(BGZF* fp);
+
+/*
+ * Read up to length bytes from the file storing into data.
+ * Returns the number of bytes actually read.
+ * Returns zero on end of file.
+ * Returns -1 on error.
+ */
+int bgzf_read(BGZF* fp, void* data, int length);
+
+/*
+ * Write length bytes from data to the file.
+ * Returns the number of bytes written.
+ * Returns -1 on error.
+ */
+int bgzf_write(BGZF* fp, const void* data, int length);
+
+/*
+ * Return a virtual file pointer to the current location in the file.
+ * No interpretation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ * Returns -1 on error.
+ */
+int64_t bgzf_tell(BGZF* fp);
+
+/*
+ * Set the file to read from the location specified by pos, which must
+ * be a value previously returned by bgzf_tell for this file (but not
+ * necessarily one returned by this file handle).
+ * The where argument must be SEEK_SET.
+ * Seeking on a file opened for write is not supported.
+ * Returns zero on success, -1 on error.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
+
+/*
+ * Set the cache size. Zero to disable. By default, caching is
+ * disabled. The recommended cache size for frequent random access is
+ * about 8M bytes.
+ */
+void bgzf_set_cache_size(BGZF *fp, int cache_size);
+
+/*
+ * Check whether the file ends with the 28-byte end-of-file marker.
+ * The current file position is restored afterwards.
+ * Returns 1 if the marker is present, 0 if it is not.
+ * Returns -1 if seeking to the end of the file fails.
+ */
+int bgzf_check_EOF(BGZF *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/faidx.c b/samtools/faidx.c
new file mode 100644 (file)
index 0000000..811bdf8
--- /dev/null
@@ -0,0 +1,422 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "faidx.h"
+#include "khash.h"
+
+/*
+ * Index record of one sequence. The three counts are packed into a single
+ * 64-bit word, which limits a line to 65535 characters.
+ *   len        number of printable characters in the sequence
+ *   line_len   characters per line including the line terminator
+ *   line_blen  printable characters per line
+ *   offset     file offset at which the sequence data starts
+ */
+typedef struct {
+       uint64_t len:32, line_len:16, line_blen:16;
+       uint64_t offset;
+} faidx1_t;
+// Hash table type "s": sequence name (string key) -> faidx1_t
+KHASH_MAP_INIT_STR(s, faidx1_t)
+
+#ifndef _NO_RAZF
+#include "razf.h"
+#else
+#ifdef _WIN32
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
+#define RAZF FILE
+#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
+#define razf_open(fn, mode) fopen(fn, mode)
+#define razf_close(fp) fclose(fp)
+#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define razf_tell(fp) ftello(fp)
+#endif
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+/*
+ * A loaded or freshly built FASTA index.
+ */
+struct __faidx_t {
+       RAZF *rz;         // open FASTA file, or 0 when none is attached (see fai_destroy)
+       int n, m;         // n: number of names stored; m: allocated capacity of name[]
+       char **name;      // sequence names in insertion order; each is owned (strdup'ed)
+       khash_t(s) *hash; // name -> faidx1_t; keys point into name[]
+};
+
+/*
+ * Round the 32-bit integer lvalue x up, in place, to a power of two: the
+ * highest set bit of x-1 is smeared into every lower bit, then one is added.
+ */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Append one sequence record to the index.
+ *
+ * A private copy of `name` is stored in idx->name[] and used as the hash
+ * key; the name array starts at 16 slots and doubles whenever it is full.
+ */
+static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
+{
+       khint_t slot;
+       int absent;
+       char *key;
+       faidx1_t rec;
+
+       if (idx->n == idx->m) {
+               if (idx->m) idx->m = idx->m<<1;
+               else idx->m = 16;
+               idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
+       }
+       key = strdup(name);
+       idx->name[idx->n] = key;
+       slot = kh_put(s, idx->hash, key, &absent);
+       rec.len = len;
+       rec.line_len = line_len;
+       rec.line_blen = line_blen;
+       rec.offset = offset;
+       kh_value(idx->hash, slot) = rec;
+       idx->n += 1;
+}
+
+/*
+ * Scan a FASTA stream one character at a time and build its index.
+ *
+ * rz  open FASTA stream positioned at its start
+ *
+ * Returns a newly allocated index (to be released with fai_destroy), or 0
+ * after printing a message to stderr when the input is malformed or
+ * memory runs out. Input without any '>' header line yields an empty
+ * index.
+ *
+ * `state` tracks the line layout within the current sequence:
+ *   1  the header was just read; the next line fixes line_len/line_blen
+ *   0  lines seen so far agree with line_len/line_blen
+ *   2  one line of a different length was seen
+ *   3  a further line followed that differing line
+ */
+faidx_t *fai_build_core(RAZF *rz)
+{
+       char c, *name;
+       int l_name, m_name, ret;
+       int len, line_len, line_blen, state;
+       int l1, l2;
+       int seen_header = 0; // set once the first '>' line has been read
+       faidx_t *idx;
+       uint64_t offset;
+
+       idx = (faidx_t*)calloc(1, sizeof(faidx_t));
+       // Start with an empty, terminated name so that `name` is never null:
+       // a header with an empty name and messages printed before the first
+       // header then have a valid string to work with.
+       name = (char*)calloc(1, 1);
+       if (idx == 0 || name == 0) {
+               fprintf(stderr, "[fai_build_core] out of memory\n");
+               free(name); free(idx);
+               return 0;
+       }
+       idx->hash = kh_init(s);
+       l_name = 0; m_name = 1;
+       len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
+       while (razf_read(rz, &c, 1)) {
+               if (c == '\n') { // an empty line
+                       if (state == 1) {
+                               offset = razf_tell(rz);
+                               continue;
+                       } else if ((state == 0 && len < 0) || state == 2) continue;
+               }
+               if (c == '>') { // fasta header
+                       // Store the previous sequence, if there was one. Data
+                       // seen before the first header belongs to no sequence.
+                       if (seen_header)
+                               fai_insert_index(idx, name, len, line_len, line_blen, offset);
+                       seen_header = 1;
+                       l_name = 0;
+                       while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace((unsigned char)c)) {
+                               if (m_name < l_name + 2) {
+                                       m_name = l_name + 2;
+                                       kroundup32(m_name);
+                                       name = (char*)realloc(name, m_name);
+                               }
+                               name[l_name++] = c;
+                       }
+                       name[l_name] = '\0';
+                       if (ret == 0) {
+                               fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
+                               free(name); fai_destroy(idx);
+                               return 0;
+                       }
+                       // Skip the rest of the header line.
+                       if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
+                       state = 1; len = 0;
+                       offset = razf_tell(rz);
+               } else {
+                       if (state == 3) {
+                               fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
+                               free(name); fai_destroy(idx);
+                               return 0;
+                       }
+                       if (state == 2) state = 3;
+                       // l1 counts every character of the line, l2 only the
+                       // printable ones.
+                       l1 = l2 = 0;
+                       do {
+                               ++l1;
+                               if (isgraph((unsigned char)c)) ++l2;
+                       } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
+                       if (state == 3 && l2) {
+                               fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
+                               free(name); fai_destroy(idx);
+                               return 0;
+                       }
+                       ++l1; len += l2; // ++l1 accounts for the line terminator
+                       if (l2 >= 0x10000) {
+                               fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name);
+                               free(name); fai_destroy(idx);
+                               return 0;
+                       }
+                       if (state == 1) line_len = l1, line_blen = l2, state = 0;
+                       else if (state == 0) {
+                               if (l1 != line_len || l2 != line_blen) state = 2;
+                       }
+               }
+       }
+       // Store the last sequence; input without any header has none.
+       if (seen_header)
+               fai_insert_index(idx, name, len, line_len, line_blen, offset);
+       free(name);
+       return idx;
+}
+
+void fai_save(const faidx_t *fai, FILE *fp)
+{
+       khint_t k;
+       int i;
+       for (i = 0; i < fai->n; ++i) {
+               faidx1_t x;
+               k = kh_get(s, fai->hash, fai->name[i]);
+               x = kh_value(fai->hash, k);
+#ifdef _WIN32
+               fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
+#else
+               fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
+#endif
+       }
+}
+
+/*
+ * Parse an index previously written by fai_save.
+ *
+ * fp  open index file
+ *
+ * Each line holds a name followed by four integers (length, offset,
+ * printable characters per line, characters per line). Lines that do not
+ * provide all four integers are reported on stderr and skipped.
+ * Returns a newly allocated index; its `rz` member is left unset.
+ */
+faidx_t *fai_read(FILE *fp)
+{
+       faidx_t *fai;
+       char *buf, *p;
+       int len, line_len, line_blen;
+#ifdef _WIN32
+       long offset;
+#else
+       long long offset;
+#endif
+       fai = (faidx_t*)calloc(1, sizeof(faidx_t));
+       fai->hash = kh_init(s);
+       buf = (char*)calloc(0x10000, 1);
+       while (!feof(fp) && fgets(buf, 0x10000, fp)) {
+               int n_fields;
+               // The name ends at the first non-printable character.
+               for (p = buf; *p && isgraph((unsigned char)*p); ++p);
+               // Terminate the name; do not step past the end of the string
+               // when the line holds nothing but a name.
+               if (*p) { *p = 0; ++p; }
+#ifdef _WIN32
+               n_fields = sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
+#else
+               n_fields = sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
+#endif
+               if (n_fields != 4) {
+                       // Inserting would store uninitialised values.
+                       fprintf(stderr, "[fai_read] skipping malformed index line.\n");
+                       continue;
+               }
+               fai_insert_index(fai, buf, len, line_len, line_blen, offset);
+       }
+       free(buf);
+       return fai;
+}
+
+/*
+ * Release an index: the stored names, the name array, the hash table, the
+ * attached FASTA file (if any) and the struct itself.
+ * A null pointer is accepted and ignored, as with free().
+ */
+void fai_destroy(faidx_t *fai)
+{
+       int i;
+       if (fai == 0) return;
+       for (i = 0; i < fai->n; ++i) free(fai->name[i]);
+       free(fai->name);
+       kh_destroy(s, fai->hash);
+       if (fai->rz) razf_close(fai->rz);
+       free(fai);
+}
+
+/*
+ * Build the index of the FASTA file `fn` and write it to "<fn>.fai".
+ *
+ * Returns 0 on success and -1 when the FASTA file cannot be opened, its
+ * content cannot be indexed, or the index file cannot be created.
+ */
+int fai_build(const char *fn)
+{
+       char *str;
+       RAZF *rz;
+       FILE *fp;
+       faidx_t *fai;
+       str = (char*)calloc(strlen(fn) + 5, 1); // room for ".fai" and the terminator
+       sprintf(str, "%s.fai", fn);
+       rz = razf_open(fn, "r");
+       if (rz == 0) {
+               // Name the file that could not be opened, not the index.
+               fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
+               free(str);
+               return -1;
+       }
+       fai = fai_build_core(rz);
+       razf_close(rz);
+       if (fai == 0) {
+               // fai_build_core has already reported the reason.
+               free(str);
+               return -1;
+       }
+       fp = fopen(str, "wb");
+       if (fp == 0) {
+               fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
+               fai_destroy(fai); free(str);
+               return -1;
+       }
+       fai_save(fai, fp);
+       fclose(fp);
+       free(str);
+       fai_destroy(fai);
+       return 0;
+}
+
+#ifdef _USE_KNETFILE
+/*
+ * Open the local copy of a remote file, downloading it first if needed.
+ *
+ * fn  URL of the remote file; the local copy lives in the working
+ *     directory under the last path component of the URL
+ *
+ * Returns a stream open for reading, or NULL when the remote file cannot
+ * be opened, the local file cannot be created, memory runs out or the
+ * download fails. An incomplete local copy is removed so that it is not
+ * picked up by a later call.
+ */
+FILE *download_and_open(const char *fn)
+{
+    const int buf_size = 1 * 1024 * 1024;
+    uint8_t *buf;
+    FILE *fp;
+    knetFile *fp_remote;
+    const char *url = fn;
+    const char *p;
+    int l;
+
+    // Keep only what follows the last '/', if there is one.
+    p = strrchr(fn, '/');
+    if (p) fn = p + 1;
+
+    // First try to open a local copy
+    fp = fopen(fn, "r");
+    if (fp)
+        return fp;
+
+    // If failed, download from remote and open
+    fp_remote = knet_open(url, "rb");
+    if (fp_remote == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
+        return NULL;
+    }
+    if ((fp = fopen(fn, "wb")) == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
+        knet_close(fp_remote);
+        return NULL;
+    }
+    buf = (uint8_t*)calloc(buf_size, 1);
+    if (buf == 0) {
+        fprintf(stderr, "[download_from_remote] out of memory\n");
+        fclose(fp);
+        knet_close(fp_remote);
+        remove(fn);
+        return NULL;
+    }
+    // Stop on a non-positive count: a negative value must never reach
+    // fwrite, where it would be converted to a huge unsigned size.
+    while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+        fwrite(buf, 1, l, fp);
+    free(buf);
+    fclose(fp);
+    knet_close(fp_remote);
+    if (l < 0) {
+        fprintf(stderr, "[download_from_remote] fail to download remote file %s\n",url);
+        remove(fn);
+        return NULL;
+    }
+
+    return fopen(fn, "r");
+}
+#endif
+
+/*
+ * Load the index "<fn>.fai" and attach the FASTA file `fn` to it.
+ *
+ * For ftp:// and http:// names (when built with _USE_KNETFILE) the index
+ * is fetched through download_and_open. For local files a missing index
+ * is built first.
+ *
+ * Returns the index, or 0 when the index or the FASTA file cannot be
+ * opened.
+ */
+faidx_t *fai_load(const char *fn)
+{
+       char *str;
+       FILE *fp;
+       faidx_t *fai;
+       str = (char*)calloc(strlen(fn) + 5, 1); // room for ".fai" and the terminator
+       sprintf(str, "%s.fai", fn);
+
+#ifdef _USE_KNETFILE
+    if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
+    {
+        fp = download_and_open(str);
+        if ( !fp )
+        {
+            fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
+            free(str);
+            return 0;
+        }
+    }
+    else
+#endif
+        fp = fopen(str, "rb");
+       if (fp == 0) {
+               fprintf(stderr, "[fai_load] build FASTA index.\n");
+               fai_build(fn);
+               fp = fopen(str, "rb");
+               if (fp == 0) {
+                       fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
+                       free(str);
+                       return 0;
+               }
+       }
+
+       fai = fai_read(fp);
+       fclose(fp);
+
+       fai->rz = razf_open(fn, "rb");
+       free(str);
+       if (fai->rz == 0) {
+               fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
+               // Release the index that was just read instead of leaking it.
+               fai_destroy(fai);
+               return 0;
+       }
+       return fai;
+}
+
+/*
+ * Fetch the sequence of a region given as text.
+ *
+ * fai  index with an attached FASTA file
+ * str  region such as "chr2:20,000-30,000"; commas and white space are
+ *      ignored, and a bare name selects the whole sequence
+ * len  receives the number of characters returned
+ *
+ * Returns a malloc'ed, terminated string to be freed by the caller. When
+ * the sequence name is unknown, the index entry is unusable or memory
+ * runs out, 0 is returned and *len is set to 0.
+ */
+char *fai_fetch(const faidx_t *fai, const char *str, int *len)
+{
+       char *s, *p, c;
+       int i, l, k;
+       khiter_t iter;
+       faidx1_t val;
+       khash_t(s) *h;
+       int beg, end;
+
+       beg = end = -1;
+       h = fai->hash;
+       l = strlen(str);
+       p = s = (char*)malloc(l+1);
+       if (s == 0) {
+               *len = 0;
+               return 0;
+       }
+       /* squeeze out "," */
+       for (i = k = 0; i != l; ++i)
+               if (str[i] != ',' && !isspace((unsigned char)str[i])) s[k++] = str[i];
+       s[k] = 0;
+       /* split the name from the coordinates at the first ':' */
+       for (i = 0; i != k; ++i) if (s[i] == ':') break;
+       s[i] = 0;
+       iter = kh_get(s, h, s); /* get the ref_id */
+       if (iter == kh_end(h)) {
+               *len = 0;
+               free(s); return 0;
+       }
+       val = kh_value(h, iter);
+       if (val.line_blen == 0) {
+               /* a line width of zero would divide by zero below */
+               *len = 0;
+               free(s); return 0;
+       }
+       if (i == k) { /* dump the whole sequence */
+               beg = 0; end = val.len;
+       } else {
+               for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+               beg = atoi(p);
+               if (i < k) {
+                       p = s + i + 1;
+                       end = atoi(p);
+               } else end = val.len;
+       }
+       if (beg > 0) --beg; /* 1-based start to 0-based */
+       if (beg >= val.len) beg = val.len;
+       if (end >= val.len) end = val.len;
+       if (beg > end) beg = end;
+       free(s);
+
+       // now retrieve the sequence
+       l = 0;
+       s = (char*)malloc(end - beg + 2);
+       if (s == 0) {
+               *len = 0;
+               return 0;
+       }
+       razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
+       while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
+               if (isgraph((unsigned char)c)) s[l++] = c;
+       s[l] = '\0';
+       *len = l;
+       return s;
+}
+
+/*
+ * Entry point of the "faidx" command.
+ *
+ * With one argument the FASTA file is indexed. With more arguments each
+ * further argument is a region that is fetched and printed in FASTA
+ * format, 60 characters per line.
+ *
+ * Returns 0 on success and 1 on a usage error or when the index cannot be
+ * built or loaded.
+ */
+int faidx_main(int argc, char *argv[])
+{
+       if (argc == 1) {
+               fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
+               return 1;
+       } else {
+               if (argc == 2) {
+                       // Report a failed build through the exit status.
+                       if (fai_build(argv[1]) != 0) return 1;
+               } else {
+                       int i, j, k, l;
+                       char *s;
+                       faidx_t *fai;
+                       fai = fai_load(argv[1]);
+                       if (fai == 0) return 1;
+                       for (i = 2; i != argc; ++i) {
+                               printf(">%s\n", argv[i]);
+                               // On failure s is null and l is 0, so nothing
+                               // is printed and free() has nothing to do.
+                               s = fai_fetch(fai, argv[i], &l);
+                               for (j = 0; j < l; j += 60) {
+                                       for (k = 0; k < 60 && k < l - j; ++k)
+                                               putchar(s[j + k]);
+                                       putchar('\n');
+                               }
+                               free(s);
+                       }
+                       fai_destroy(fai);
+               }
+       }
+       return 0;
+}
+
+/* Return the number of sequences stored in the index. */
+int faidx_fetch_nseq(const faidx_t *fai) 
+{
+       const int n_seq = fai->n;
+       return n_seq;
+}
+
+/*
+ * Fetch part of a sequence by name and zero-based, inclusive coordinates.
+ *
+ * fai      index with an attached FASTA file
+ * c_name   sequence name
+ * p_beg_i  first position; clamped to the sequence
+ * p_end_i  last position; clamped to the sequence
+ * len      receives the number of characters returned
+ *
+ * Returns a malloc'ed, terminated string to be freed by the caller. A
+ * sequence of length zero yields an empty string. When the name is
+ * unknown, the index entry is unusable or memory runs out, 0 is returned
+ * and *len is set to 0.
+ */
+char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
+{
+       int l;
+       char c;
+    khiter_t iter;
+    faidx1_t val;
+       char *seq=NULL;
+
+    // Give *len a defined value on every failure path.
+    *len = 0;
+
+    // Adjust position
+    iter = kh_get(s, fai->hash, c_name);
+    if(iter == kh_end(fai->hash)) return 0;
+    val = kh_value(fai->hash, iter);
+    // A line width of zero would divide by zero below.
+    if(val.line_blen == 0) return 0;
+    // For an empty sequence the clamping below would compute position -1
+    // and read a character that precedes the sequence data.
+    if(val.len == 0) return (char*)calloc(1, 1);
+       if(p_end_i < p_beg_i) p_beg_i = p_end_i;
+    if(p_beg_i < 0) p_beg_i = 0;
+    else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
+    if(p_end_i < 0) p_end_i = 0;
+    else if(val.len <= p_end_i) p_end_i = val.len - 1;
+
+    // Now retrieve the sequence 
+       l = 0;
+       seq = (char*)malloc(p_end_i - p_beg_i + 2);
+       if (seq == 0) return 0;
+       razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
+       while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
+               if (isgraph((unsigned char)c)) seq[l++] = c;
+       seq[l] = '\0';
+       *len = l;
+       return seq;
+}
+
+#ifdef FAIDX_MAIN
+/* Stand-alone entry point: hand the command line to faidx_main. */
+int main(int argc, char *argv[])
+{
+       const int status = faidx_main(argc, argv);
+       return status;
+}
+#endif
diff --git a/samtools/faidx.h b/samtools/faidx.h
new file mode 100644 (file)
index 0000000..1fb1b1f
--- /dev/null
@@ -0,0 +1,103 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef FAIDX_H
+#define FAIDX_H
+
+/*!
+  @header
+
+  Index FASTA files and extract subsequence.
+
+  @copyright The Wellcome Trust Sanger Institute.
+ */
+
+struct __faidx_t;
+typedef struct __faidx_t faidx_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /*!
+         @abstract   Build index for a FASTA or razip compressed FASTA file.
+         @param  fn  FASTA file name
+         @return     0 on success; or -1 on failure
+         @discussion File "fn.fai" will be generated.
+        */
+       int fai_build(const char *fn);
+
+       /*!
+         @abstract    Destroy a faidx_t struct.
+         @param  fai  Pointer to the struct to be destroyed
+        */
+       void fai_destroy(faidx_t *fai);
+
+       /*!
+         @abstract   Load index from "fn.fai".
+         @param  fn  File name of the FASTA file
+        */
+       faidx_t *fai_load(const char *fn);
+
+       /*!
+         @abstract    Fetch the sequence in a region.
+         @param  fai  Pointer to the faidx_t struct
+         @param  reg  Region in the format "chr2:20,000-30,000"
+         @param  len  Length of the region
+         @return      Pointer to the sequence; null on failure
+
+         @discussion The returned sequence is allocated by malloc family
+         and should be destroyed by end users by calling free() on it.
+        */
+       char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
+
+       /*!
+         @abstract        Fetch the number of sequences. 
+         @param  fai  Pointer to the faidx_t struct
+         @return          The number of sequences
+        */
+       int faidx_fetch_nseq(const faidx_t *fai);
+
+       /*!
+         @abstract    Fetch the sequence in a region.
+         @param  fai  Pointer to the faidx_t struct
+         @param  c_name Region name
+         @param  p_beg_i  Beginning position number (zero-based)
+         @param  p_end_i  End position number (zero-based)
+         @param  len  Length of the region
+         @return      Pointer to the sequence; null on failure
+
+         @discussion The returned sequence is allocated by malloc family
+         and should be destroyed by end users by calling free() on it.
+        */
+       char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/glf.c b/samtools/glf.c
new file mode 100644 (file)
index 0000000..8d5346a
--- /dev/null
@@ -0,0 +1,236 @@
+#include <string.h>
+#include <stdlib.h>
+#include "glf.h"
+
+#ifdef _NO_BGZF
+// then alias bgzf_*() functions
+#endif
+
+static int glf3_is_BE = 0;
+
+/* Reverse the byte order of a 32-bit value (0xAABBCCDD becomes 0xDDCCBBAA). */
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+       /* first exchange the two 16-bit halves ... */
+       uint32_t halves = (v >> 16) | ((v & 0x0000FFFFU) << 16);
+       /* ... then exchange the two bytes inside each half */
+       uint32_t from_low = (halves & 0x00FF00FFU) << 8;
+       uint32_t from_high = (halves & 0xFF00FF00U) >> 8;
+       return from_low | from_high;
+}
+
+/* Reverse the byte order of a 16-bit value (0xAABB becomes 0xBBAA). */
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+       unsigned low_byte = v & 0x00FFU;
+       unsigned high_byte = v & 0xFF00U;
+       return (uint16_t)((low_byte << 8) | (high_byte >> 8));
+}
+
+/* Return 1 when the lowest-addressed byte of a long holding 1 is zero (big-endian host), else 0. */
+static inline int bam_is_big_endian()
+{
+       long probe = 1;
+       const char *first_byte = (const char *)&probe;
+       return *first_byte == 0;
+}
+
+/*
+ * Allocate a zero-filled GLF3 header (0 if calloc fails).
+ * Side effect: refreshes the file-scope glf3_is_BE flag from the host byte order.
+ */
+glf3_header_t *glf3_header_init()
+{
+       glf3_header_t *h = (glf3_header_t*)calloc(1, sizeof(glf3_header_t));
+       glf3_is_BE = bam_is_big_endian();
+       return h;
+}
+
+/*
+ * Read a GLF3 header from fp: the 4-byte magic "GLF\3", a 32-bit text
+ * length (byte-swapped on big-endian hosts, i.e. stored little-endian)
+ * and then that many bytes of text.
+ *
+ * Returns a newly allocated header that the caller releases with
+ * glf3_header_destroy(), or 0 when the magic is wrong, the header is
+ * truncated, the text length is negative or memory cannot be allocated.
+ * Callers must check for 0 before using the result.
+ */
+glf3_header_t *glf3_header_read(glfFile fp)
+{
+       glf3_header_t *h;
+       char magic[4];
+       h = glf3_header_init();
+       if (h == 0) return 0;
+       /* test the byte count first: after a short read magic[] is not fully initialised */
+       if (bgzf_read(fp, magic, 4) != 4 || strncmp(magic, "GLF\3", 4)) {
+               fprintf(stderr, "[glf3_header_read] invalid magic.\n");
+               glf3_header_destroy(h);
+               return 0;
+       }
+       if (bgzf_read(fp, &h->l_text, 4) != 4) {
+               fprintf(stderr, "[glf3_header_read] truncated header.\n");
+               glf3_header_destroy(h);
+               return 0;
+       }
+       if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text);
+       if (h->l_text < 0) { /* a negative length would turn into a huge allocation request */
+               fprintf(stderr, "[glf3_header_read] invalid text length: %d.\n", h->l_text);
+               glf3_header_destroy(h);
+               return 0;
+       }
+       if (h->l_text) {
+               /* the extra zeroed byte keeps the text NUL-terminated */
+               h->text = (uint8_t*)calloc((size_t)h->l_text + 1, 1);
+               if (h->text == 0 || bgzf_read(fp, h->text, h->l_text) != h->l_text) {
+                       fprintf(stderr, "[glf3_header_read] truncated header.\n");
+                       glf3_header_destroy(h);
+                       return 0;
+               }
+       }
+       return h;
+}
+
+/*
+ * Write header h to fp: the magic "GLF\3", the 32-bit text length
+ * (byte-swapped on big-endian hosts) and, when non-empty, the text itself.
+ */
+void glf3_header_write(glfFile fp, const glf3_header_t *h)
+{
+       int32_t l_text_out = h->l_text;
+       if (glf3_is_BE) l_text_out = bam_swap_endian_4(h->l_text);
+       bgzf_write(fp, "GLF\3", 4);
+       bgzf_write(fp, &l_text_out, 4);
+       if (h->l_text != 0) bgzf_write(fp, h->text, h->l_text);
+}
+
+/*
+ * Release a header and its text buffer.  Passing 0 is a no-op, so the
+ * result of a failed glf3_header_read() can be handed over safely.
+ */
+void glf3_header_destroy(glf3_header_t *h)
+{
+       if (h == 0) return;
+       free(h->text);
+       free(h);
+}
+
+/*
+ * Read one reference record from fp: a 32-bit name length n, n bytes of
+ * name and a 32-bit reference length (integers byte-swapped on
+ * big-endian hosts).
+ *
+ * Returns the name in a calloc'ed buffer the caller must free() and
+ * stores the reference length in *len.  Returns 0 with *len == 0 when the
+ * name length cannot be read (e.g. end of stream) or is negative, and 0
+ * with *len == -1 when the record is truncated or memory runs out.
+ */
+char *glf3_ref_read(glfFile fp, int *len)
+{
+       int32_t n, x;
+       char *str;
+       *len = 0;
+       if (bgzf_read(fp, &n, 4) != 4) return 0;
+       if (glf3_is_BE) n = bam_swap_endian_4(n);
+       if (n < 0) {
+               fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n);
+               return 0;
+       }
+       // glf3_ref_write() already counts the NUL in n, so n+1 is not necessarily needed;
+       // the spare zeroed byte guarantees termination for any input.
+       // The size is computed in size_t so that n + 1 cannot overflow int32_t.
+       str = (char*)calloc((size_t)n + 1, 1);
+       if (str == 0) {
+               *len = -1; return 0; // out of memory
+       }
+       x = bgzf_read(fp, str, n);
+       x += bgzf_read(fp, len, 4);
+       if (x != n + 4) {
+               free(str); *len = -1; return 0; // truncated
+       }
+       if (glf3_is_BE) *len = bam_swap_endian_4(*len);
+       return str;
+}
+
+/*
+ * Write one reference record to fp: the 32-bit name length (which counts
+ * the terminating NUL), the name including that NUL, and the 32-bit
+ * reference length len.  Integers are byte-swapped on big-endian hosts.
+ */
+void glf3_ref_write(glfFile fp, const char *str, int len)
+{
+       int32_t name_len = strlen(str) + 1;
+       int32_t name_len_out = name_len;
+       if (glf3_is_BE) {
+               name_len_out = bam_swap_endian_4(name_len);
+               len = bam_swap_endian_4(len);
+       }
+       bgzf_write(fp, &name_len_out, 4);
+       bgzf_write(fp, str, name_len);
+       bgzf_write(fp, &len, 4);
+}
+
+/*
+ * Print one GLF3 record as a tab-separated line on stdout.
+ * END records print nothing.  pos is zero-based and is shown one-based.
+ * Indel records show '*' as reference base and "*" for an indel
+ * sequence whose length is zero.
+ */
+void glf3_view1(const char *ref_name, const glf3_t *g3, int pos)
+{
+       int k;
+       char ref_char = '*';
+       if (g3->rtype == GLF3_RTYPE_END) return;
+       if (g3->rtype != GLF3_RTYPE_INDEL) ref_char = "XACMGRSVTWYHKDBN"[g3->ref_base];
+       printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, ref_char,
+                  g3->depth, g3->rms_mapQ, g3->min_lk);
+       if (g3->rtype == GLF3_RTYPE_SUB) {
+               /* substitution record: all ten likelihood values */
+               for (k = 0; k < 10; ++k) printf("\t%d", g3->lk[k]);
+       } else {
+               /* otherwise: three likelihoods, both indel lengths, both indel sequences */
+               const char *seq0 = g3->indel_len[0]? g3->indel_seq[0] : "*";
+               const char *seq1 = g3->indel_len[1]? g3->indel_seq[1] : "*";
+               printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1],
+                          seq0, seq1);
+       }
+       printf("\n");
+}
+
+/*
+ * Serialise one GLF3 record to fp.
+ *
+ * Layout: one byte rtype<<4|ref_base; for non-END records a 32-bit
+ * offset, a 32-bit word min_lk<<24|depth and one byte rms_mapQ follow.
+ * A SUB record then carries the ten lk bytes; any other record carries
+ * three lk bytes, the two 16-bit indel lengths and the two indel
+ * sequences (abs(indel_len) bytes each, omitted when the length is 0).
+ * Multi-byte integers are byte-swapped on big-endian hosts.
+ *
+ * Returns the sum of the bgzf_write() return values.
+ */
+int glf3_write1(glfFile fp, const glf3_t *g3)
+{
+       int r;
+       uint8_t c;
+       uint32_t y[2];
+       c = g3->rtype<<4 | g3->ref_base;
+       r = bgzf_write(fp, &c, 1);
+       if (g3->rtype == GLF3_RTYPE_END) return r;
+       y[0] = g3->offset;
+       /* min_lk is an 8-bit bit-field and promotes to signed int; shift it as
+        * uint32_t so that values >= 128 do not overflow into the sign bit */
+       y[1] = (uint32_t)g3->min_lk<<24 | g3->depth;
+       if (glf3_is_BE) {
+               y[0] = bam_swap_endian_4(y[0]);
+               y[1] = bam_swap_endian_4(y[1]);
+       }
+       r += bgzf_write(fp, y, 8);
+       r += bgzf_write(fp, &g3->rms_mapQ, 1);
+       if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10);
+       else {
+               int16_t x[2];
+               r += bgzf_write(fp, g3->lk, 3);
+               x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0];
+               x[1] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1];
+               r += bgzf_write(fp, x, 4);
+               if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0]));
+               if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1]));
+       }
+       return r;
+}
+
+#ifndef kv_roundup32
+/* Round x up, in place, to the next power of two by smearing the highest set
+ * bit of x-1 into all lower bits and adding one; a value that already is a
+ * power of two is left unchanged.  x must be a modifiable lvalue of at most
+ * 32 bits (the shifts stop at 16). */
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Read one GLF3 record from fp into g3 (the inverse of glf3_write1()).
+ *
+ * For indel records the two sequence buffers g3->indel_seq[] are grown
+ * with realloc() when needed (g3->max_len tracks their size) and are
+ * NUL-terminated after reading.
+ *
+ * Returns 0 when the first byte cannot be read (bgzf_read() returned 0),
+ * otherwise the sum of the bgzf_read() return values.
+ */
+int glf3_read1(glfFile fp, glf3_t *g3)
+{
+       int r;
+       uint8_t c;
+       uint32_t y[2];
+       r = bgzf_read(fp, &c, 1);
+       if (r == 0) return 0;
+       g3->ref_base = c & 0xf;
+       g3->rtype = c>>4;
+       if (g3->rtype == GLF3_RTYPE_END) return r;
+       r += bgzf_read(fp, y, 8);
+       if (glf3_is_BE) {
+               y[0] = bam_swap_endian_4(y[0]);
+               y[1] = bam_swap_endian_4(y[1]);
+       }
+       g3->offset = y[0];
+       g3->min_lk = y[1]>>24;
+       g3->depth = y[1]<<8>>8; /* keep the low 24 bits */
+       r += bgzf_read(fp, &g3->rms_mapQ, 1);
+       if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10);
+       else {
+               int16_t x[2];
+               /* lengths are kept in int, not int16_t: abs(-32768) and 32767 + 1 do not
+                * fit in 16 bits and would wrap to a negative size, skipping the realloc */
+               int l[2], max;
+               r += bgzf_read(fp, g3->lk, 3);
+               r += bgzf_read(fp, x, 4);
+               if (glf3_is_BE) {
+                       x[0] = bam_swap_endian_2(x[0]);
+                       x[1] = bam_swap_endian_2(x[1]);
+               }
+               g3->indel_len[0] = x[0];
+               g3->indel_len[1] = x[1];
+               l[0] = abs(x[0]); l[1] = abs(x[1]);
+               max = (l[0] > l[1]? l[0] : l[1]) + 1; /* +1 for the terminating NUL */
+               if (g3->max_len < max) {
+                       g3->max_len = max;
+                       kv_roundup32(g3->max_len);
+                       g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len);
+                       g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len);
+               }
+               r += bgzf_read(fp, g3->indel_seq[0], l[0]);
+               r += bgzf_read(fp, g3->indel_seq[1], l[1]);
+               g3->indel_seq[0][l[0]] = g3->indel_seq[1][l[1]] = 0;
+       }
+       return r;
+}
+
+/*
+ * Print every record of the GLF3 stream fp to stdout, one line per
+ * record (see glf3_view1()).  Positions are accumulated from the
+ * per-record offsets and restart at 0 for each reference sequence.
+ * Returns without printing anything when the header cannot be read or
+ * the record buffer cannot be allocated.
+ */
+void glf3_view(glfFile fp)
+{
+       glf3_header_t *h;
+       char *name;
+       glf3_t *g3;
+       int len;
+       h = glf3_header_read(fp);
+       /* glf3_header_read() returns 0 (after reporting the problem) for a bad
+        * header; going on would dereference that null pointer on destruction */
+       if (h == 0) return;
+       g3 = glf3_init1();
+       if (g3 == 0) {
+               glf3_header_destroy(h);
+               return;
+       }
+       while ((name = glf3_ref_read(fp, &len)) != 0) {
+               int pos = 0;
+               while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) {
+                       pos += g3->offset;
+                       glf3_view1(name, g3, pos);
+               }
+               free(name);
+       }
+       glf3_header_destroy(h);
+       glf3_destroy1(g3);
+}
+
+/*
+ * Command-line driver for glf3_view(): argv[1] names the input file,
+ * "-" selects standard input.  Returns 1 after printing a usage or open
+ * error message to stderr, 0 otherwise.
+ */
+int glf3_view_main(int argc, char *argv[])
+{
+       glfFile fp;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: glfview <in.glf>\n");
+               return 1;
+       }
+       if (strcmp(argv[1], "-") == 0) fp = bgzf_fdopen(fileno(stdin), "r");
+       else fp = bgzf_open(argv[1], "r");
+       if (fp == 0) {
+               fprintf(stderr, "Fail to open file '%s'\n", argv[1]);
+               return 1;
+       }
+       glf3_view(fp);
+       bgzf_close(fp);
+       return 0;
+}
+
+#ifdef GLFVIEW_MAIN
+/* Stand-alone entry point, compiled only when GLFVIEW_MAIN is defined;
+ * it simply forwards to glf3_view_main(). */
+int main(int argc, char *argv[])
+{
+       return glf3_view_main(argc, argv);
+}
+#endif
diff --git a/samtools/glf.h b/samtools/glf.h
new file mode 100644 (file)
index 0000000..12e5400
--- /dev/null
@@ -0,0 +1,56 @@
+#ifndef GLF_H_
+#define GLF_H_
+
+/* NOTE(review): presumably the record layout of an earlier GLF version; nothing
+ * in glf.c refers to this type -- confirm with other users before changing it. */
+typedef struct {
+       unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */
+       unsigned char max_mapQ; /** maximum mapping quality */
+       unsigned char lk[10];   /** log likelihood ratio, capped at 255 */
+       unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
+} glf1_t;
+
+#include <stdint.h>
+#include "bgzf.h"
+typedef BGZF *glfFile;
+
+#define GLF3_RTYPE_END   0
+#define GLF3_RTYPE_SUB   1
+#define GLF3_RTYPE_INDEL 2
+
+/* One GLF3 record as read by glf3_read1() and written by glf3_write1(). */
+typedef struct {
+       uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base; rtype is one of GLF3_RTYPE_* */
+       uint8_t rms_mapQ; /** RMS mapping quality */
+       uint8_t lk[10];   /** log likelihood ratio, capped at 255 */
+       uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
+       int32_t offset; /** the first base in a chromosome has offset zero; glf3_view() adds it to a running position */
+       // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10])
+       int16_t indel_len[2]; // signed lengths; the I/O code uses abs() of them as the sequence byte counts
+       int32_t max_len; // allocated size of each indel_seq[] buffer; will be modified by glf3_read1()
+       char *indel_seq[2]; // (re)allocated and NUL-terminated by glf3_read1(), freed by glf3_destroy1()
+} glf3_t;
+
+/* GLF3 file header: the text block stored right after the "GLF\3" magic. */
+typedef struct {
+       int32_t l_text; /* number of text bytes stored in the file */
+       uint8_t *text;  /* the text; glf3_header_read() allocates l_text + 1 zeroed bytes, glf3_header_destroy() frees it */
+} glf3_header_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Allocate a zero-filled record / free a record together with both indel buffers.
+ * Both macros expand to calloc()/free(), so the including file needs <stdlib.h>. */
+#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t)))
+#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0)
+
+       /* allocate an empty header; also records the host byte order used by the I/O routines */
+       glf3_header_t *glf3_header_init();
+       /* read magic and header text; returns 0 on an invalid magic */
+       glf3_header_t *glf3_header_read(glfFile fp);
+       /* write magic, text length and text */
+       void glf3_header_write(glfFile fp, const glf3_header_t *h);
+       /* free the header text and the header itself */
+       void glf3_header_destroy(glf3_header_t *h);
+       /* read a reference name (caller frees) and its length into *len; returns 0 on failure */
+       char *glf3_ref_read(glfFile fp, int *len);
+       /* write a reference name (with its NUL) and the reference length len */
+       void glf3_ref_write(glfFile fp, const char *name, int len);
+       /* write / read a single record; both return the accumulated bgzf byte counts */
+       int glf3_write1(glfFile fp, const glf3_t *g3);
+       int glf3_read1(glfFile fp, glf3_t *g3);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/kaln.c b/samtools/kaln.c
new file mode 100644 (file)
index 0000000..9fa40d0
--- /dev/null
@@ -0,0 +1,370 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include "kaln.h"
+
+#define FROM_M 0
+#define FROM_I 1
+#define FROM_D 2
+
+/* One step of the backtrace built by ka_global_core(). */
+typedef struct {
+       int i, j;            /* position in seq1 (i) and seq2 (j) at this step */
+       unsigned char ctype; /* state of the step: FROM_M, FROM_I or FROM_D */
+} path_t;
+
+int aln_sm_blosum62[] = {
+/*      A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
+        4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
+       -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
+       -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
+       -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
+        0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
+       -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
+       -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+        0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
+       -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
+       -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
+       -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
+       -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+       -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
+       -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
+       -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
+        1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
+        0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
+       -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
+       -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
+        0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
+       -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
+        0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
+};
+
+int aln_sm_blast[] = {
+       1, -3, -3, -3, -2,
+       -3, 1, -3, -3, -2,
+       -3, -3, 1, -3, -2,
+       -3, -3, -3, 1, -2,
+       -2, -2, -2, -2, -2
+};
+
+ka_param_t ka_param_blast = {  5,  2,  2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10,  2,  2, aln_sm_blosum62, 22, 50 };
+
+/*
+ * Run-length encode a backtrace path into a CIGAR array.
+ *
+ * Each output element is run_length<<4 | ctype.  The path is consumed
+ * from its last element to its first, so path[path_len-1] starts the
+ * first CIGAR element.  The number of elements is stored in *n_cigar.
+ * Returns a calloc'ed array owned by the caller, or 0 (with
+ * *n_cigar == 0) when path is null or path_len is 0.
+ */
+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
+{
+       int k, n_runs, out;
+       uint32_t *cigar;
+
+       if (path == 0 || path_len == 0) {
+               *n_cigar = 0;
+               return 0;
+       }
+
+       /* pass 1: count the maximal runs of equal ctype */
+       n_runs = 1;
+       for (k = 1; k < path_len; ++k)
+               if (path[k].ctype != path[k-1].ctype) ++n_runs;
+       *n_cigar = n_runs;
+       cigar = (uint32_t*)calloc(*n_cigar, 4);
+
+       /* pass 2: walk backwards, extending the current run or opening a new one */
+       out = 0;
+       cigar[0] = 1u << 4 | path[path_len-1].ctype;
+       for (k = path_len - 2; k >= 0; --k) {
+               if (path[k].ctype != path[k+1].ctype)
+                       cigar[++out] = 1u << 4 | path[k].ctype;
+               else
+                       cigar[out] += 1u << 4;
+       }
+
+       return cigar;
+}
+
+/***************************/
+/* START OF common_align.c */
+/***************************/
+
+/* All macros below expect gap_open, gap_ext and gap_end to be variables in the
+ * calling scope.  `p` points at the predecessor score cell, `cur` at the
+ * traceback cell that records where the chosen score came from. */
+
+/* Set all three scores of cell s to MINOR_INF.  The expansion already ends in ';'. */
+#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
+
+/* MM = max(p->M, p->I, p->D) + sc; the winning state is stored in cur->Mt
+ * (M wins ties against I and D, D wins a tie against I). */
+#define set_M(MM, cur, p, sc)                                                  \
+{                                                                                                              \
+       if ((p)->M >= (p)->I) {                                                         \
+               if ((p)->M >= (p)->D) {                                                 \
+                       (MM) = (p)->M + (sc); (cur)->Mt = FROM_M;       \
+               } else {                                                                                \
+                       (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
+               }                                                                                               \
+       } else {                                                                                        \
+               if ((p)->I > (p)->D) {                                                  \
+                       (MM) = (p)->I + (sc); (cur)->Mt = FROM_I;       \
+               } else {                                                                                \
+                       (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
+               }                                                                                               \
+       }                                                                                                       \
+}
+/* II = better of opening a gap from p->M (costs gap_open + gap_ext) and
+ * extending p->I (costs gap_ext); the choice is stored in cur->It. */
+#define set_I(II, cur, p)                                                              \
+{                                                                                                              \
+       if ((p)->M - gap_open > (p)->I) {                                       \
+               (cur)->It = FROM_M;                                                             \
+               (II) = (p)->M - gap_open - gap_ext;                             \
+       } else {                                                                                        \
+               (cur)->It = FROM_I;                                                             \
+               (II) = (p)->I - gap_ext;                                                \
+       }                                                                                                       \
+}
+/* Like set_I, but charges gap_end instead of gap_ext when gap_end >= 0;
+ * a negative gap_end falls back to plain set_I. */
+#define set_end_I(II, cur, p)                                                  \
+{                                                                                                              \
+       if (gap_end >= 0) {                                                                     \
+               if ((p)->M - gap_open > (p)->I) {                               \
+                       (cur)->It = FROM_M;                                                     \
+                       (II) = (p)->M - gap_open - gap_end;                     \
+               } else {                                                                                \
+                       (cur)->It = FROM_I;                                                     \
+                       (II) = (p)->I - gap_end;                                        \
+               }                                                                                               \
+       } else set_I(II, cur, p);                                                       \
+}
+/* DD = better of opening a gap from p->M (costs gap_open + gap_ext) and
+ * extending p->D (costs gap_ext); the choice is stored in cur->Dt. */
+#define set_D(DD, cur, p)                                                              \
+{                                                                                                              \
+       if ((p)->M - gap_open > (p)->D) {                                       \
+               (cur)->Dt = FROM_M;                                                             \
+               (DD) = (p)->M - gap_open - gap_ext;                             \
+       } else {                                                                                        \
+               (cur)->Dt = FROM_D;                                                             \
+               (DD) = (p)->D - gap_ext;                                                \
+       }                                                                                                       \
+}
+/* Like set_D, but charges gap_end instead of gap_ext when gap_end >= 0;
+ * a negative gap_end falls back to plain set_D. */
+#define set_end_D(DD, cur, p)                                                  \
+{                                                                                                              \
+       if (gap_end >= 0) {                                                                     \
+               if ((p)->M - gap_open > (p)->D) {                               \
+                       (cur)->Dt = FROM_M;                                                     \
+                       (DD) = (p)->M - gap_open - gap_end;                     \
+               } else {                                                                                \
+                       (cur)->Dt = FROM_D;                                                     \
+                       (DD) = (p)->D - gap_end;                                        \
+               }                                                                                               \
+       } else set_D(DD, cur, p);                                                       \
+}
+
+/* Traceback cell: for each of the three states, the state (FROM_M, FROM_I or
+ * FROM_D) the best score was derived from; filled in by the set_* macros. */
+typedef struct {
+       uint8_t Mt:3, It:2, Dt:2;
+} dpcell_t;
+
+/* Score cell: best score ending in the M state and in the two gap states I and D. */
+typedef struct {
+       int M, I, D;
+} dpscore_t;
+
+/***************************
+ * banded global alignment *
+ ***************************/
+/*
+ * Banded global alignment of seq1 (length len1) against seq2 (length len2).
+ *
+ * seq1/seq2 hold residue codes that index the score matrix: the score of
+ * pairing seq1[i] with seq2[j] is ap->matrix[seq2[j] * ap->row + seq1[i]].
+ * Gap costs and the band width are taken from ap.
+ *
+ * On return *_score holds the M-state score of the final cell.  When
+ * n_cigar is non-null the alignment is traced back and returned as a
+ * calloc'ed CIGAR array (length<<4 | FROM_M/FROM_I/FROM_D) that the
+ * caller must free, with its length stored in *n_cigar.  Passing a null
+ * n_cigar requests the score only and 0 is returned.
+ *
+ * If either sequence is empty the function returns 0 immediately, with
+ * *n_cigar set to 0 (when given) and *_score left untouched.
+ */
+uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
+{
+       int i, j;
+       dpcell_t **dpcell, *q;
+       dpscore_t *curr, *last, *s;
+       int b1, b2, tmp_end;
+       int *mat, end, max = 0;
+       uint8_t type, ctype;
+       uint32_t *cigar = 0;
+
+       int gap_open, gap_ext, gap_end, b;
+       int *score_matrix, N_MATRIX_ROW;
+
+       /* initialize some align-related parameters. just for compatibility */
+       gap_open = ap->gap_open;
+       gap_ext = ap->gap_ext;
+       gap_end = ap->gap_end;
+       b = ap->band_width;
+       score_matrix = ap->matrix;
+       N_MATRIX_ROW = ap->row;
+
+       /* n_cigar is optional (see the backtrace below), so it must not be
+        * dereferenced unconditionally */
+       if (n_cigar) *n_cigar = 0;
+       if (len1 == 0 || len2 == 0) return 0;
+
+       /* calculate b1 and b2 */
+       if (len1 > len2) {
+               b1 = len1 - len2 + b;
+               b2 = b;
+       } else {
+               b1 = b;
+               b2 = len2 - len1 + b;
+       }
+       if (b1 > len1) b1 = len1;
+       if (b2 > len2) b2 = len2;
+       --seq1; --seq2; /* switch to 1-based indexing: seq1[1] is the first residue */
+
+       /* allocate memory */
+       end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
+       dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
+       for (j = 0; j <= len2; ++j)
+               dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
+       /* shift the row pointers so that dpcell[j][i] can be indexed with the
+        * absolute column i inside the band; undone before the rows are freed */
+       for (j = b2 + 1; j <= len2; ++j)
+               dpcell[j] -= j - b2;
+       curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+       last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+
+       /* set first row */
+       SET_INF(*curr); curr->M = 0;
+       for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
+               SET_INF(*s);
+               set_end_D(s->D, dpcell[0] + i, s - 1);
+       }
+       s = curr; curr = last; last = s;
+
+       /* core dynamic programming, part 1 */
+       tmp_end = (b2 < len2)? b2 : len2 - 1;
+       for (j = 1; j <= tmp_end; ++j) {
+               q = dpcell[j]; s = curr; SET_INF(*s);
+               set_end_I(s->I, q, last);
+               end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               ++s; ++q;
+               for (i = 1; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_D(s->D, q, s - 1);
+               if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+                       set_end_I(s->I, q, last + i);
+               } else s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+       }
+       /* last row for part 1, use set_end_D() instead of set_D() */
+       if (j == len2 && b2 != len2 - 1) {
+               q = dpcell[j]; s = curr; SET_INF(*s);
+               set_end_I(s->I, q, last);
+               end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               ++s; ++q;
+               for (i = 1; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+                       set_I(s->I, q, last + i);
+                       set_end_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_end_D(s->D, q, s - 1);
+               if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+                       set_end_I(s->I, q, last + i);
+               } else s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+               ++j;
+       }
+
+       /* core dynamic programming, part 2 */
+       for (; j <= len2 - b2 + 1; ++j) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               end = j + b1 - 1;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_D(s->D, q, s - 1);
+               s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+       }
+
+       /* core dynamic programming, part 3 */
+       for (; j < len2; ++j) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+               set_end_I(s->I, q, last + i);
+               set_D(s->D, q, s - 1);
+               s = curr; curr = last; last = s;
+       }
+       /* last row */
+       if (j == len2) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_end_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+               set_end_I(s->I, q, last + i);
+               set_end_D(s->D, q, s - 1);
+               s = curr; curr = last; last = s;
+       }
+
+       *_score = last[len1].M;
+       if (n_cigar) { /* backtrace */
+               path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
+               i = len1; j = len2;
+               q = dpcell[j] + i;
+               s = last + len1;
+               /* start from whichever state scores best in the final cell */
+               max = s->M; type = q->Mt; ctype = FROM_M;
+               if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
+               if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
+
+               p = path;
+               p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
+               ++p;
+               do {
+                       switch (ctype) {
+                       case FROM_M: --i; --j; break;
+                       case FROM_I: --j; break;
+                       case FROM_D: --i; break;
+                       }
+                       q = dpcell[j] + i;
+                       ctype = type;
+                       switch (type) {
+                       case FROM_M: type = q->Mt; break;
+                       case FROM_I: type = q->It; break;
+                       case FROM_D: type = q->Dt; break;
+                       }
+                       p->ctype = ctype; p->i = i; p->j = j;
+                       ++p;
+               } while (i || j);
+               /* the entry written for (0,0) is not part of the alignment, hence the -1 */
+               cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
+               free(path);
+       }
+
+       /* free memory */
+       for (j = b2 + 1; j <= len2; ++j)
+               dpcell[j] += j - b2;
+       for (j = 0; j <= len2; ++j)
+               free(dpcell[j]);
+       free(dpcell);
+       free(curr); free(last);
+
+       return cigar;
+}
diff --git a/samtools/kaln.h b/samtools/kaln.h
new file mode 100644 (file)
index 0000000..b04d8cc
--- /dev/null
@@ -0,0 +1,55 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef LH3_KALN_H_
+#define LH3_KALN_H_
+
+#include <stdint.h>
+
+#define MINOR_INF -1073741823
+
+/* Scoring parameters consumed by ka_global_core(). */
+typedef struct {
+       int gap_open; /* charged once, on top of gap_ext/gap_end, when a gap is opened */
+       int gap_ext;  /* charged for every gap position */
+       int gap_end;  /* when >= 0, charged instead of gap_ext by the set_end_I/set_end_D steps; negative means use gap_ext */
+
+       int *matrix;  /* row x row substitution scores; the pair (seq1[i], seq2[j]) scores matrix[seq2[j] * row + seq1[i]] */
+       int row;      /* number of residue codes, i.e. the row length of matrix */
+       int band_width; /* width of the alignment band before it is widened by the length difference */
+} ka_param_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar);
+
+#ifdef __cplusplus
+}
+#endif
+
+extern ka_param_t ka_param_blast; /* = {  5,  2,  2, aln_sm_blast, 5, 50 }; */
+
+#endif
diff --git a/samtools/khash.h b/samtools/khash.h
new file mode 100644 (file)
index 0000000..1d583ef
--- /dev/null
@@ -0,0 +1,486 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+       int ret, is_missing;
+       khiter_t k;
+       khash_t(32) *h = kh_init(32);
+       k = kh_put(32, h, 5, &ret);
+       if (!ret) kh_del(32, h, k);
+       kh_value(h, k) = 10;
+       k = kh_get(32, h, 10);
+       is_missing = (k == kh_end(h));
+       k = kh_get(32, h, 5);
+       kh_del(32, h, k);
+       for (k = kh_begin(h); k != kh_end(h); ++k)
+               if (kh_exist(h, k)) kh_value(h, k) = 1;
+       kh_destroy(32, h);
+       return 0;
+}
+*/
+
+/*
+  2008-09-19 (0.2.3):
+
+       * Corrected the example
+       * Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+       * Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+       * Added kh_clear()
+       * Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+       * Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+       * Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+       * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+
+  @copyright Heng Li
+ */
+
+#define AC_VERSION_KHASH_H "0.2.2"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef uint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_HASH_PRIME_SIZE 32 /* number of entries in __ac_prime_list */
+static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = /* permitted bucket counts, ascending */
+{
+  0ul,          3ul,          11ul,         23ul,         53ul, /* leading 0 stops kh_resize's downward scan */
+  97ul,         193ul,        389ul,        769ul,        1543ul,
+  3079ul,       6151ul,       12289ul,      24593ul,      49157ul,
+  98317ul,      196613ul,     393241ul,     786433ul,     1572869ul,
+  3145739ul,    6291469ul,    12582917ul,   25165843ul,   50331653ul,
+  100663319ul,  201326611ul,  402653189ul,  805306457ul,  1610612741ul,
+  3221225473ul, 4294967291ul /* NOTE(review): a request >= this entry makes kh_resize read one past the table */
+};
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) /* 2 state bits per bucket, 16 buckets per word; bit 1 set = never used */
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) /* bit 0 set = deleted */
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) /* non-zero if empty or deleted; 0 means the bucket holds live data */
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) /* clear the deleted bit of bucket i */
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) /* clear the empty bit of bucket i */
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) /* clear both bits: bucket i becomes live */
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) /* set the deleted bit; arguments are unparenthesised, pass simple expressions */
+
+static const double __ac_HASH_UPPER = 0.77; /* fill ratio: upper_bound = n_buckets * this + 0.5; kh_put resizes once n_occupied reaches it */
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+       typedef struct { /* parallel flags/keys/vals arrays indexed by bucket */ \
+               khint_t n_buckets, size, n_occupied, upper_bound; /* size: live entries; n_occupied: live + deleted */ \
+               uint32_t *flags; /* two state bits per bucket */ \
+               khkey_t *keys;                                                                                                  \
+               khval_t *vals; /* (re)allocated only when kh_is_map is non-zero */ \
+       } kh_##name##_t;                                                                                                        \
+       static inline kh_##name##_t *kh_init_##name() { /* zero-filled table with no buckets */ \
+               return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));                \
+       }                                                                                                                                       \
+       static inline void kh_destroy_##name(kh_##name##_t *h) /* frees the arrays and h; NULL h is ignored */ \
+       {                                                                                                                                       \
+               if (h) {                                                                                                                \
+                       free(h->keys); free(h->flags);                                                          \
+                       free(h->vals);                                                                                          \
+                       free(h);                                                                                                        \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       static inline void kh_clear_##name(kh_##name##_t *h) /* empties the table but keeps its storage */ \
+       {                                                                                                                                       \
+               if (h && h->flags) {                                                                                    \
+                       memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); /* 0xaa sets every bucket's empty bit */ \
+                       h->size = h->n_occupied = 0;                                                            \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) /* bucket index of key, or n_buckets if absent */ \
+       {                                                                                                                                       \
+               if (h->n_buckets) {                                                                                             \
+                       khint_t inc, k, i, last;                                                                        \
+                       k = __hash_func(key); i = k % h->n_buckets;                                     \
+                       inc = 1 + k % (h->n_buckets - 1); last = i; /* double hashing: probe step in 1..n_buckets-1 */ \
+                       while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+                               if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+                               else i += inc;                                                                                  \
+                               if (i == last) return h->n_buckets; /* probed a full cycle without a match */ \
+                       }                                                                                                                       \
+                       return __ac_iseither(h->flags, i)? h->n_buckets : i; /* stopped on an empty slot means absent */ \
+               } else return 0; /* no buckets yet: 0 equals n_buckets, i.e. the end iterator */ \
+       }                                                                                                                                       \
+       static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) /* rehash in place; allocation results are not checked */ \
+       {                                                                                                                                       \
+               uint32_t *new_flags = 0;                                                                                \
+               khint_t j = 1; /* first a "do the rehash" flag, later the loop index */ \
+               {                                                                                                                               \
+                       khint_t t = __ac_HASH_PRIME_SIZE - 1;                                           \
+                       while (__ac_prime_list[t] > new_n_buckets) --t;                         \
+                       new_n_buckets = __ac_prime_list[t+1]; /* first table entry above the request */ \
+                       if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* live entries would exceed the new limit: do nothing */ \
+                       else {                                                                                                          \
+                               new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t));     \
+                               memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
+                               if (h->n_buckets < new_n_buckets) { /* growing: enlarge the arrays before moving entries */ \
+                                       h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+                                       if (kh_is_map)                                                                          \
+                                               h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (j) {                                                                                                                \
+                       for (j = 0; j != h->n_buckets; ++j) {                                           \
+                               if (__ac_iseither(h->flags, j) == 0) { /* live entry that has not been moved yet */ \
+                                       khkey_t key = h->keys[j];                                                       \
+                                       khval_t val;                                                                            \
+                                       if (kh_is_map) val = h->vals[j];                                        \
+                                       __ac_set_isdel_true(h->flags, j); /* old flags: slot j is now vacated */ \
+                                       while (1) { /* place key; if that displaces an unmoved entry, continue with it */ \
+                                               khint_t inc, k, i;                                                              \
+                                               k = __hash_func(key);                                                   \
+                                               i = k % new_n_buckets;                                                  \
+                                               inc = 1 + k % (new_n_buckets - 1);                              \
+                                               while (!__ac_isempty(new_flags, i)) { /* probe the new layout for a free slot */ \
+                                                       if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
+                                                       else i += inc;                                                          \
+                                               }                                                                                               \
+                                               __ac_set_isempty_false(new_flags, i);                   \
+                                               if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* slot i still holds an unmoved entry: swap */ \
+                                                       { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+                                                       if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+                                                       __ac_set_isdel_true(h->flags, i);                       \
+                                               } else { /* slot i is free in the old layout: store and end this chain */ \
+                                                       h->keys[i] = key;                                                       \
+                                                       if (kh_is_map) h->vals[i] = val;                        \
+                                                       break;                                                                          \
+                                               }                                                                                               \
+                                       }                                                                                                       \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+                       if (h->n_buckets > new_n_buckets) { /* shrinking: trim the arrays after the move */ \
+                               h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+                               if (kh_is_map)                                                                                  \
+                                       h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+                       }                                                                                                                       \
+                       free(h->flags);                                                                                         \
+                       h->flags = new_flags;                                                                           \
+                       h->n_buckets = new_n_buckets;                                                           \
+                       h->n_occupied = h->size; /* deleted slots do not survive a rehash */ \
+                       h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) /* *ret: 0 key present, 1 took an empty slot, 2 reused a deleted slot */ \
+       {                                                                                                                                       \
+               khint_t x;                                                                                                              \
+               if (h->n_occupied >= h->upper_bound) {                                                  \
+                       if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* under half live: rehash at the same size to drop deleted slots */ \
+                       else kh_resize_##name(h, h->n_buckets + 1); /* otherwise grow to the next table size */ \
+               }                                                                                                                               \
+               {                                                                                                                               \
+                       khint_t inc, k, i, site, last;                                                          \
+                       x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
+                       if (__ac_isempty(h->flags, i)) x = i;                                           \
+                       else {                                                                                                          \
+                               inc = 1 + k % (h->n_buckets - 1); last = i;                             \
+                               while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+                                       if (__ac_isdel(h->flags, i)) site = i; /* remember a deleted slot for reuse */ \
+                                       if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+                                       else i += inc;                                                                          \
+                                       if (i == last) { x = site; break; }                                     \
+                               }                                                                                                               \
+                               if (x == h->n_buckets) {                                                                \
+                                       if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; /* key absent: prefer the deleted slot seen on the way */ \
+                                       else x = i;                                                                                     \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (__ac_isempty(h->flags, x)) {                                                                \
+                       h->keys[x] = key; /* only the key is stored here; vals[x] is left untouched */ \
+                       __ac_set_isboth_false(h->flags, x);                                                     \
+                       ++h->size; ++h->n_occupied;                                                                     \
+                       *ret = 1;                                                                                                       \
+               } else if (__ac_isdel(h->flags, x)) {                                                   \
+                       h->keys[x] = key;                                                                                       \
+                       __ac_set_isboth_false(h->flags, x);                                                     \
+                       ++h->size;                                                                                                      \
+                       *ret = 2;                                                                                                       \
+               } else *ret = 0;                                                                                                \
+               return x;                                                                                                               \
+       }                                                                                                                                       \
+       static inline void kh_del_##name(kh_##name##_t *h, khint_t x) /* flags bucket x deleted; ignores the end iterator and non-live buckets */ \
+       {                                                                                                                                       \
+               if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {                 \
+                       __ac_set_isdel_true(h->flags, x);                                                       \
+                       --h->size;                                                                                                      \
+               }                                                                                                                               \
+       }
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [uint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (uint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [uint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static inline khint_t __ac_X31_hash_string(const char *s)
+{
+       khint_t hash = *s; /* seed with the first character; "" hashes to 0 */
+       if (hash != 0) while (*++s) hash = (hash << 5) - hash + *s; /* i.e. hash * 31 + next character */
+       return hash;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other necessary macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: 0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+                               the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)                                                                               \
+       KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)                                                              \
+       KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)                                                                             \
+       KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)                                                            \
+       KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)                                                                               \
+       KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)                                                              \
+       KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/samtools/klist.h b/samtools/klist.h
new file mode 100644 (file)
index 0000000..2f17016
--- /dev/null
@@ -0,0 +1,96 @@
+#ifndef _LH3_KLIST_H
+#define _LH3_KLIST_H
+
+#include <stdlib.h>
+
+/* KMEMPOOL_INIT(name, kmptype_t, kmpfree_f)
+ *
+ * Generate a recycling pool type kmp_<name>_t for objects of type kmptype_t
+ * together with four static inline helpers:
+ *   kmp_init_<name>()       allocate an empty, zero-initialised pool
+ *   kmp_destroy_<name>(mp)  call kmpfree_f() and then free() on every object
+ *                           currently parked in the pool, then free the pool
+ *   kmp_alloc_<name>(mp)    hand out a parked object (returned as-is, it is
+ *                           not re-zeroed), or calloc() a new one when no
+ *                           object is parked
+ *   kmp_free_<name>(mp, p)  park p in the pool for later reuse; p itself is
+ *                           NOT freed. The pointer array starts at 16 slots
+ *                           and doubles whenever it is full.
+ * Fields: cnt is incremented by kmp_alloc and decremented by kmp_free, n is
+ * the number of parked objects and max is the capacity of buf.
+ * NOTE(review): the results of calloc()/realloc() are not checked for NULL.
+ */
+#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f)                                              \
+       typedef struct {                                                                                                        \
+               size_t cnt, n, max;                                                                                             \
+               kmptype_t **buf;                                                                                                \
+       } kmp_##name##_t;                                                                                                       \
+       static inline kmp_##name##_t *kmp_init_##name() {                                       \
+               return calloc(1, sizeof(kmp_##name##_t));                                               \
+       }                                                                                                                                       \
+       static inline void kmp_destroy_##name(kmp_##name##_t *mp) {                     \
+               size_t k;                                                                                                               \
+               for (k = 0; k < mp->n; ++k) {                                                                   \
+                       kmpfree_f(mp->buf[k]); free(mp->buf[k]);                                        \
+               }                                                                                                                               \
+               free(mp->buf); free(mp);                                                                                \
+       }                                                                                                                                       \
+       static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) {         \
+               ++mp->cnt;                                                                                                              \
+               if (mp->n == 0) return calloc(1, sizeof(kmptype_t));                    \
+               return mp->buf[--mp->n];                                                                                \
+       }                                                                                                                                       \
+       static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
+               --mp->cnt;                                                                                                              \
+               if (mp->n == mp->max) {                                                                                 \
+                       mp->max = mp->max? mp->max<<1 : 16;                                                     \
+                       mp->buf = realloc(mp->buf, sizeof(void*) * mp->max);            \
+               }                                                                                                                               \
+               mp->buf[mp->n++] = p;                                                                                   \
+       }
+
+#define kmempool_t(name) kmp_##name##_t
+#define kmp_init(name) kmp_init_##name()
+#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
+#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
+#define kmp_free(name, mp, p) kmp_free_##name(mp, p)
+
+/* KLIST_INIT(name, kltype_t, kmpfree_t)
+ *
+ * Generate a singly linked list of kltype_t whose nodes (kl1_<name>) are
+ * recycled through a KMEMPOOL_INIT pool. Elements are appended at the tail
+ * and removed from the head. The list always owns one spare node at the
+ * tail, so head == tail (head->next == 0) means "empty".
+ *   kl_init_<name>()        allocate the list, its pool and the spare node
+ *   kl_destroy_<name>(kl)   hand every node back to the pool, then destroy
+ *                           the pool and free the list
+ *   kl_pushp_<name>(kl)     append an element: links a new spare node after
+ *                           the old tail and returns a pointer to the old
+ *                           tail's data slot for the caller to fill in
+ *   kl_shift_<name>(kl, d)  remove the head element, copying it to *d when
+ *                           d is non-NULL; returns 0, or -1 if the list is
+ *                           empty
+ * kl_destroy reads p->next after kmp_free(p); this is valid only because
+ * kmp_free parks the node instead of freeing it.
+ */
+#define KLIST_INIT(name, kltype_t, kmpfree_t)                                                  \
+       struct __kl1_##name {                                                                                           \
+               kltype_t data;                                                                                                  \
+               struct __kl1_##name *next;                                                                              \
+       };                                                                                                                                      \
+       typedef struct __kl1_##name kl1_##name;                                                         \
+       KMEMPOOL_INIT(name, kl1_##name, kmpfree_t)                                                      \
+       typedef struct {                                                                                                        \
+               kl1_##name *head, *tail;                                                                                \
+               kmp_##name##_t *mp;                                                                                             \
+               size_t size;                                                                                                    \
+       } kl_##name##_t;                                                                                                        \
+       static inline kl_##name##_t *kl_init_##name() {                                         \
+               kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t));                   \
+               kl->mp = kmp_init(name);                                                                                \
+               kl->head = kl->tail = kmp_alloc(name, kl->mp);                                  \
+               kl->head->next = 0;                                                                                             \
+               return kl;                                                                                                              \
+       }                                                                                                                                       \
+       static inline void kl_destroy_##name(kl_##name##_t *kl) {                       \
+               kl1_##name *p;                                                                                                  \
+               for (p = kl->head; p != kl->tail; p = p->next)                                  \
+                       kmp_free(name, kl->mp, p);                                                                      \
+               kmp_free(name, kl->mp, p);                                                                              \
+               kmp_destroy(name, kl->mp);                                                                              \
+               free(kl);                                                                                                               \
+       }                                                                                                                                       \
+       static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) {            \
+               kl1_##name *q, *p = kmp_alloc(name, kl->mp);                                    \
+               q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p;    \
+               ++kl->size;                                                                                                             \
+               return &q->data;                                                                                                \
+       }                                                                                                                                       \
+       static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \
+               kl1_##name *p;                                                                                                  \
+               if (kl->head->next == 0) return -1;                                                             \
+               --kl->size;                                                                                                             \
+               p = kl->head; kl->head = kl->head->next;                                                \
+               if (d) *d = p->data;                                                                                    \
+               kmp_free(name, kl->mp, p);                                                                              \
+               return 0;                                                                                                               \
+       }
+
+#define kliter_t(name) kl1_##name
+#define klist_t(name) kl_##name##_t
+#define kl_val(iter) ((iter)->data)
+#define kl_next(iter) ((iter)->next)
+#define kl_begin(kl) ((kl)->head)
+#define kl_end(kl) ((kl)->tail)
+
+#define kl_init(name) kl_init_##name()
+#define kl_destroy(name, kl) kl_destroy_##name(kl)
+#define kl_pushp(name, kl) kl_pushp_##name(kl)
+#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
+
+#endif
diff --git a/samtools/knetfile.c b/samtools/knetfile.c
new file mode 100644 (file)
index 0000000..994babb
--- /dev/null
@@ -0,0 +1,632 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/* Probably I will not do socket programming in the next few years and
+   therefore I decide to heavily annotate this file, for Linux and
+   Windows as well.  -lh3 */
+
+#include <time.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifdef _WIN32
+#include <winsock.h>
+#else
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#include "knetfile.h"
+
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* Block for at most five seconds until descriptor fd becomes readable
+ * (is_read != 0) or writable (is_read == 0). The value of select() is
+ * returned unchanged: positive when ready, 0 on time-out, negative on
+ * error. Errors (and, on Windows, time-outs) are reported on stderr. */
+static int socket_wait(int fd, int is_read)
+{
+       fd_set watched;
+       struct timeval timeout;
+       int status;
+
+       timeout.tv_sec = 5; // 5 seconds time out
+       timeout.tv_usec = 0;
+       FD_ZERO(&watched);
+       FD_SET(fd, &watched);
+       if (is_read)
+               status = select(fd+1, &watched, 0, 0, &timeout);
+       else
+               status = select(fd+1, 0, &watched, 0, &timeout);
+#ifdef _WIN32
+       if (status == SOCKET_ERROR)
+               fprintf(stderr, "select: %d\n", WSAGetLastError());
+       else if (status == 0)
+               fprintf(stderr, "select time-out\n");
+#else
+       if (status == -1) perror("select");
+#endif
+       return status;
+}
+
+#ifndef _WIN32
+/* This function does not work with Windows due to the lack of
+ * getaddrinfo() in winsock. It is adapted from an example in "Beej's
+ * Guide to Network Programming" (http://beej.us/guide/bgnet/).
+ *
+ * Open a TCP connection to host:port, using the first address returned
+ * by getaddrinfo(). Returns the connected socket descriptor, or -1 on
+ * failure after printing a diagnostic to stderr. On every failure path
+ * the address list is released and a half-opened socket is closed. */
+static int socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); if (fd != -1) netclose(fd); freeaddrinfo(res); return -1; } while (0)
+
+       int on = 1, fd = -1, gai;
+       struct linger lng = { 0, 0 };
+       struct addrinfo hints, *res = 0;
+       memset(&hints, 0, sizeof(struct addrinfo));
+       hints.ai_family = AF_UNSPEC;
+       hints.ai_socktype = SOCK_STREAM;
+       /* In Unix/Mac, getaddrinfo() is the most convenient way to get
+        * server information. It reports failure through its return value
+        * (not errno) and leaves res unset, so neither perror() nor
+        * freeaddrinfo() may be used on this path. */
+       if ((gai = getaddrinfo(host, port, &hints, &res)) != 0) {
+               fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(gai));
+               return -1;
+       }
+       if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+       /* The following two setsockopt() are used by ftplib
+        * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they are
+        * necessary. */
+       if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+       if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+       if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+       freeaddrinfo(res);
+       return fd;
+}
+#else
+/* MinGW's printf has problem with "%lld" */
+/* Write the decimal representation of x into buf and return buf.
+ * Negative values get a leading '-'. buf must have room for at least
+ * 21 bytes (sign, up to 19 digits for an int64_t, terminating NUL). */
+char *int64tostr(char *buf, int64_t x)
+{
+       /* work on the magnitude as unsigned so that INT64_MIN is handled
+        * and '0' + digit never sees a negative remainder */
+       uint64_t mag = x < 0? 0 - (uint64_t)x : (uint64_t)x;
+       int len = 0, lo, hi;
+       do { // digits come out least-significant first
+               buf[len++] = (char)('0' + mag % 10);
+               mag /= 10;
+       } while (mag);
+       if (x < 0) buf[len++] = '-';
+       buf[len] = 0;
+       for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) { // reverse in place
+               char c = buf[lo]; buf[lo] = buf[hi]; buf[hi] = c;
+       }
+       return buf;
+}
+
+/* Parse the run of decimal digits at the start of buf and return its
+ * value; parsing stops at the first non-digit character, so trailing
+ * text (e.g. a line terminator) is ignored instead of being folded into
+ * the result. Returns 0 when buf does not start with a digit. No sign
+ * handling and no overflow detection. */
+int64_t strtoint64(const char *buf)
+{
+       int64_t x = 0;
+       for (; *buf >= '0' && *buf <= '9'; ++buf)
+               x = x * 10 + (*buf - '0');
+       return x;
+}
+/* In windows, the first thing is to establish the TCP connection.
+ * This wrapper calls WSAStartup() requesting Winsock version 2.2 and
+ * returns WSAStartup()'s result unchanged. */
+int knet_win32_init()
+{
+       WSADATA wsaData;
+       return WSAStartup(MAKEWORD(2, 2), &wsaData);
+}
+/* Counterpart of knet_win32_init(): calls WSACleanup(). */
+void knet_win32_destroy()
+{
+       WSACleanup();
+}
+/* A slightly modified version of the following function also works on
+ * Mac (and presumably Linux). However, this function is not stable on
+ * my Mac. It sometimes works fine but sometimes does not. Therefore for
+ * non-Windows OS, I do not use this one.
+ *
+ * Open an IPv4 TCP connection to host:port (port must be numeric, it is
+ * converted with atoi()). Returns the connected socket, or -1 after
+ * printing the Winsock error code; the socket is closed on every failure
+ * path so that no handle is leaked. */
+static SOCKET socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func)                                                                            \
+       do {                                                                                                            \
+               fprintf(stderr, "%s: %d\n", func, WSAGetLastError());   \
+               if (fd != INVALID_SOCKET) netclose(fd);                                 \
+               return -1;                                                                                              \
+       } while (0)
+
+       int on = 1;
+       SOCKET fd = INVALID_SOCKET;
+       struct linger lng = { 0, 0 };
+       struct sockaddr_in server;
+       struct hostent *hp = 0;
+       // open socket
+       if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
+       if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
+       if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+       // get host info; the cast keeps isalpha() defined for bytes >= 0x80
+       if (isalpha((unsigned char)host[0])) hp = gethostbyname(host);
+       else {
+               struct in_addr addr;
+               addr.s_addr = inet_addr(host);
+               hp = gethostbyaddr((char*)&addr, 4, AF_INET);
+       }
+       if (hp == 0) __err_connect("gethost");
+       // connect
+       server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
+       server.sin_family= AF_INET;
+       server.sin_port = htons(atoi(port));
+       if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
+       // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
+       return fd;
+}
+#endif
+
+/* Read up to len bytes from socket fd into buf, calling netread()
+ * repeatedly because a single call may deliver less than requested.
+ * Stops early on time-out, end-of-stream or a read error. Returns the
+ * number of bytes actually stored in buf (0 if len <= 0). */
+static off_t my_netread(int fd, void *buf, off_t len)
+{
+       char *dst = (char*)buf; // arithmetic on void* is a GNU extension
+       off_t rest = len, curr, l = 0;
+       /* recv() and read() may not read the required length of data with
+        * one call. They have to be called repeatedly. */
+       while (rest > 0) {
+               if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
+               curr = netread(fd, dst + l, rest);
+               /* According to the glibc manual, section 13.2, a zero returned
+                * value indicates end-of-file (EOF), which should mean that
+                * read() will not return zero if EOF has not been met but data
+                * are not immediately available. A negative value is an error
+                * and must not be added to the byte counters. */
+               if (curr <= 0) break;
+               l += curr; rest -= curr;
+       }
+       return l;
+}
+
+/*************************
+ * FTP specific routines *
+ *************************/
+
+/* Read one complete reply from the FTP control connection into
+ * ftp->response (grown on demand) and return its numeric reply code.
+ * Lines are read until one starts with three digits that are not followed
+ * by '-'; earlier lines are discarded. The trailing "\r\n" is stripped.
+ * Returns 0 if the socket does not become readable, -1 if no usable line
+ * was received or memory ran out. */
+static int kftp_get_response(knetFile *ftp)
+{
+#ifndef _WIN32
+       unsigned char c;
+#else
+       char c;
+#endif
+       int n = 0;
+       if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
+       /* "== 1" rather than a plain truth test: netread() returns -1 on
+        * error, which would otherwise keep the loop running on an undefined
+        * byte. */
+       while (netread(ftp->ctrl_fd, &c, 1) == 1) { // FIXME: this is *VERY BAD* for unbuffered I/O
+               //fputc(c, stderr);
+               if (n >= ftp->max_response) {
+                       int cap = ftp->max_response? ftp->max_response<<1 : 256;
+                       char *grown = realloc(ftp->response, cap);
+                       if (grown == 0) return -1; // the old buffer stays valid and owned by ftp
+                       ftp->response = grown;
+                       ftp->max_response = cap;
+               }
+               ftp->response[n++] = c;
+               if (c == '\n') {
+                       const unsigned char *r = (const unsigned char*)ftp->response;
+                       if (n >= 4 && isdigit(r[0]) && isdigit(r[1]) && isdigit(r[2])
+                               && r[3] != '-') break;
+                       n = 0; // continuation or non-reply line: start over
+               }
+       }
+       if (n < 2) return -1;
+       ftp->response[n-2] = 0;
+       return strtol(ftp->response, 0, 0);
+}
+
+/* Send the command string cmd over the FTP control connection. When
+ * is_get is non-zero the server reply is read afterwards and its code is
+ * returned; otherwise 0 is returned. Returns -1 if the socket is not
+ * writable or the write fails. */
+static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
+{
+       if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+       if (netwrite(ftp->ctrl_fd, cmd, strlen(cmd)) < 0) return -1; // do not wait for a reply to a command that was never sent
+       return is_get? kftp_get_response(ftp) : 0;
+}
+
+/* Send PASV and parse the "(h1,h2,h3,h4,p1,p2)" part of the reply into
+ * ftp->pasv_ip[] and ftp->pasv_port. Returns 0 on success and -1 if no
+ * reply is available or the reply does not contain six comma-separated
+ * numbers in the range 0..255; in that case pasv_ip/pasv_port are left
+ * untouched. */
+static int kftp_pasv_prep(knetFile *ftp)
+{
+       char *p;
+       int v[6], i;
+       kftp_send_cmd(ftp, "PASV\r\n", 1);
+       if (ftp->response == 0) return -1; // nothing has ever been received
+       for (p = ftp->response; *p && *p != '('; ++p);
+       if (*p != '(') return -1;
+       ++p;
+       if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6)
+               return -1; // v[] would be (partly) uninitialised
+       /* The numbers come from the remote server; reject anything that is
+        * not a byte so that the host/port strings built from them later
+        * stay well-formed and bounded. */
+       for (i = 0; i < 6; ++i)
+               if (v[i] < 0 || v[i] > 255) return -1;
+       memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
+       ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
+       return 0;
+}
+
+
+/* Open the FTP data connection to the address/port stored by
+ * kftp_pasv_prep() and keep the socket in ftp->fd. Returns 0 on success,
+ * -1 if no passive port has been recorded or the connection fails. */
+static int kftp_pasv_connect(knetFile *ftp)
+{
+       /* Both buffers are sized for the worst case of "%d" on an int
+        * (11 characters): 4*11+3+1 <= 80 and 11+1 <= 16. The former
+        * port[10] could be overrun by a bogus port value. */
+       char host[80], port[16];
+       if (ftp->pasv_port == 0) {
+               fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
+               return -1;
+       }
+       sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
+       sprintf(port, "%d", ftp->pasv_port);
+       ftp->fd = socket_connect(host, port);
+       if (ftp->fd == -1) return -1;
+       return 0;
+}
+
+/* Open the FTP control connection to ftp->host:ftp->port, consume the
+ * server greeting and log in anonymously in binary (TYPE I) mode. The
+ * replies to the login commands are read but not inspected. Returns 0,
+ * or -1 if the control connection cannot be established. */
+int kftp_connect(knetFile *ftp)
+{
+       static const char *const login[] = {
+               "USER anonymous\r\n",
+               "PASS kftp@\r\n",
+               "TYPE I\r\n"
+       };
+       size_t i;
+       ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
+       if (ftp->ctrl_fd == -1) return -1;
+       kftp_get_response(ftp); // server greeting
+       for (i = 0; i < sizeof(login) / sizeof(login[0]); ++i)
+               kftp_send_cmd(ftp, login[i], 1);
+       return 0;
+}
+
+/* Drop both the control and the data connection and log in again.
+ * Returns the result of kftp_connect(). */
+int kftp_reconnect(knetFile *ftp)
+{
+       int ctrl = ftp->ctrl_fd;
+       if (ctrl != -1) netclose(ctrl);
+       ftp->ctrl_fd = -1;
+       netclose(ftp->fd);
+       ftp->fd = -1;
+       return kftp_connect(ftp);
+}
+
+// initialize ->type, ->host, ->port, ->retr and ->size_cmd
+/* Parse an "ftp://host/path" URL into a freshly allocated knetFile. No
+ * connection is made; ->fd and ->ctrl_fd are both set to -1. A 'c' in
+ * mode sets ->no_reconnect. Returns NULL if fn is not an ftp:// URL with
+ * a path component or if memory cannot be allocated. */
+knetFile *kftp_parse_url(const char *fn, const char *mode)
+{
+       knetFile *fp;
+       char *p;
+       int l;
+       if (strstr(fn, "ftp://") != fn) return 0;
+       for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+       if (*p != '/') return 0;
+       l = p - fn - 6;
+       fp = calloc(1, sizeof(knetFile));
+       if (fp == 0) return 0;
+       fp->type = KNF_TYPE_FTP;
+       fp->fd = -1;
+       /* calloc() leaves ctrl_fd at 0, a valid descriptor; without this,
+        * knet_close() on a never-connected handle would close it. */
+       fp->ctrl_fd = -1;
+       /* the Linux/Mac version of socket_connect() also recognizes a port
+        * like "ftp", but the Windows version does not. */
+       fp->port = strdup("21");
+       fp->host = calloc(l + 1, 1);
+       /* "RETR " / "SIZE " (5) + path + "\r\n" (2) + NUL (1) */
+       fp->retr = calloc(strlen(p) + 8, 1);
+       fp->size_cmd = calloc(strlen(p) + 8, 1);
+       if (fp->port == 0 || fp->host == 0 || fp->retr == 0 || fp->size_cmd == 0) {
+               free(fp->port); free(fp->host); free(fp->retr); free(fp->size_cmd);
+               free(fp);
+               return 0;
+       }
+       if (strchr(mode, 'c')) fp->no_reconnect = 1;
+       strncpy(fp->host, fn + 6, l);
+       sprintf(fp->retr, "RETR %s\r\n", p);
+       sprintf(fp->size_cmd, "SIZE %s\r\n", p);
+       fp->seek_offset = 0;
+       return fp;
+}
+// place ->fd at offset ->offset
+/* (Re)open the FTP data connection so that the next read starts at
+ * fp->offset: close any existing data socket, enter passive mode, query
+ * the file size (stored in fp->file_size), send REST and RETR, connect
+ * to the passive address and expect reply code 150. On success
+ * fp->is_ready is set and 0 is returned; on failure -1 is returned and
+ * fp->fd is -1. */
+int kftp_connect_file(knetFile *fp)
+{
+       int ret;
+       long long file_size;
+       if (fp->fd != -1) {
+               netclose(fp->fd);
+               fp->fd = -1; // never leave a closed descriptor behind on the early-return paths below
+               if (fp->no_reconnect) kftp_get_response(fp);
+       }
+       if (kftp_pasv_prep(fp) != 0) {
+               /* without a fresh passive address the connect below would use
+                * a stale (or no) port */
+               fprintf(stderr, "[kftp_connect_file] fail to enter passive mode\n");
+               return -1;
+       }
+       kftp_send_cmd(fp, fp->size_cmd, 1);
+       if (fp->response == 0) return -1; // nothing has ever been received
+#ifndef _WIN32
+       if (sscanf(fp->response, "%*d %lld", &file_size) != 1) {
+               fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+               return -1;
+       }
+#else
+       {
+               /* skip the reply code, then anything up to the first digit;
+                * both scans stop at the end of the string */
+               const char *p = fp->response;
+               while (*p && *p != ' ') ++p;
+               while (*p && (*p < '0' || *p > '9')) ++p;
+               if (*p == 0) {
+                       fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+                       return -1;
+               }
+               file_size = strtoint64(p);
+       }
+#endif
+       fp->file_size = file_size;
+       if (fp->offset>=0) {
+               char tmp[32];
+#ifndef _WIN32
+               sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
+#else
+               strcpy(tmp, "REST ");
+               int64tostr(tmp + 5, fp->offset);
+               strcat(tmp, "\r\n");
+#endif
+               kftp_send_cmd(fp, tmp, 1);
+       }
+       kftp_send_cmd(fp, fp->retr, 0);
+       kftp_pasv_connect(fp);
+       ret = kftp_get_response(fp);
+       if (ret != 150) {
+               fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
+               if (fp->fd != -1) netclose(fp->fd);
+               fp->fd = -1;
+               return -1;
+       }
+       fp->is_ready = 1;
+       return 0;
+}
+
+
+/**************************
+ * HTTP specific routines *
+ **************************/
+
+/* Parse an "http://host[:port][/path]" URL into a freshly allocated
+ * knetFile without connecting. ->http_host receives the host name (any
+ * ":port" suffix is cut off). If the environment variable http_proxy is
+ * set, ->host/->port describe the proxy and ->path is the full URL;
+ * otherwise they describe the origin server and ->path is the path part
+ * ("/" when absent). The port defaults to "80". Returns NULL if fn does
+ * not start with "http://". mode is not used. */
+knetFile *khttp_parse_url(const char *fn, const char *mode)
+{
+       static const char scheme[] = "http://";
+       const size_t skip = sizeof(scheme) - 1;
+       knetFile *kf;
+       const char *path_start, *proxy_env;
+       char *colon;
+       int host_len;
+
+       if (strncmp(fn, scheme, skip) != 0) return 0;
+       // set ->http_host
+       path_start = strchr(fn + skip, '/');
+       if (path_start == 0) path_start = fn + strlen(fn);
+       host_len = (int)(path_start - (fn + skip));
+       kf = calloc(1, sizeof(knetFile));
+       kf->http_host = calloc(host_len + 1, 1);
+       memcpy(kf->http_host, fn + skip, host_len);
+       kf->http_host[host_len] = 0;
+       colon = strchr(kf->http_host, ':');
+       if (colon) *colon++ = 0; // colon now points at the port text, if any
+       // get http_proxy
+       proxy_env = getenv("http_proxy");
+       // set ->host, ->port and ->path
+       if (proxy_env) {
+               if (strncmp(proxy_env, scheme, skip) == 0) proxy_env += skip;
+               kf->host = strdup(proxy_env);
+               colon = strchr(kf->host, ':');
+               if (colon) *colon++ = 0;
+               kf->port = strdup((colon && *colon)? colon : "80");
+               kf->path = strdup(fn);
+       } else {
+               kf->host = strdup(kf->http_host); // when there is no proxy, server name is identical to http_host name.
+               kf->port = strdup((colon && *colon)? colon : "80");
+               kf->path = strdup(*path_start? path_start : "/");
+       }
+       kf->type = KNF_TYPE_HTTP;
+       kf->fd = -1;
+       kf->ctrl_fd = -1;
+       kf->seek_offset = 0;
+       return kf;
+}
+
+/* (Re)open the HTTP connection and position it at fp->offset: send a GET
+ * request with a "Range: bytes=<offset>-" header and read the response
+ * header. A 206 reply is already positioned; for a 200 reply (server
+ * ignored the range) the first fp->offset bytes of the body are read and
+ * discarded. On success fp->is_ready is set and 0 is returned. On any
+ * failure the socket is closed, fp->fd is set to -1 and -1 is returned. */
+int khttp_connect_file(knetFile *fp)
+{
+       enum { BUFSZ = 0x10000 }; // request/header buffer; also the skip chunk size
+       int ret, l;
+       char *buf = 0;
+       if (fp->fd != -1) netclose(fp->fd);
+       fp->fd = socket_connect(fp->host, fp->port);
+       if (fp->fd == -1) return -1;
+       buf = calloc(BUFSZ, 1);
+       if (buf == 0) goto fail;
+       /* snprintf() so that an over-long path cannot overrun buf */
+       l = snprintf(buf, BUFSZ, "GET %s HTTP/1.0\r\nHost: %s\r\nRange: bytes=%lld-\r\n\r\n",
+                                fp->path, fp->http_host, (long long)fp->offset);
+       if (l < 0 || l >= BUFSZ) goto fail;
+       netwrite(fp->fd, buf, l);
+       l = 0;
+       /* read HTTP header; FIXME: bad efficiency. The loop stops when the
+        * buffer is full and on a read error (netread() == -1), not only
+        * at end-of-stream. */
+       while (l < BUFSZ - 1 && netread(fp->fd, buf + l, 1) == 1) {
+               if (buf[l] == '\n' && l >= 3)
+                       if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+               ++l;
+       }
+       buf[l] = 0;
+       if (l < 14) goto fail; // premature end of header
+       ret = strtol(buf + 8, 0, 0); // HTTP return code
+       if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
+               off_t rest = fp->offset;
+               while (rest > 0) {
+                       off_t want = rest < BUFSZ? rest : BUFSZ;
+                       off_t got = my_netread(fp->fd, buf, want);
+                       if (got <= 0) goto fail; // stream ended before the requested offset
+                       rest -= got;
+               }
+       } else if (ret != 206 && ret != 200) {
+               fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
+               goto fail;
+       }
+       free(buf);
+       fp->is_ready = 1;
+       return 0;
+
+fail:
+       free(buf);
+       netclose(fp->fd);
+       fp->fd = -1;
+       return -1;
+}
+
+/********************
+ * Generic routines *
+ ********************/
+
+/* Open fn for reading. fn may be an "ftp://" URL, an "http://" URL or a
+ * local path; only modes starting with 'r' are accepted. Remote files
+ * are connected immediately. Returns a handle to be released with
+ * knet_close(), or NULL on failure. */
+knetFile *knet_open(const char *fn, const char *mode)
+{
+       knetFile *kf = 0;
+       int local_fd;
+
+       if (mode[0] != 'r') {
+               fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
+               return 0;
+       }
+       if (strncmp(fn, "ftp://", 6) == 0) {
+               if ((kf = kftp_parse_url(fn, mode)) == 0) return 0;
+               if (kftp_connect(kf) == -1) {
+                       knet_close(kf);
+                       return 0;
+               }
+               kftp_connect_file(kf);
+       } else if (strncmp(fn, "http://", 7) == 0) {
+               if ((kf = khttp_parse_url(fn, mode)) == 0) return 0;
+               khttp_connect_file(kf);
+       } else { // local file
+               /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
+                * be undefined on some systems, although it is defined on my
+                * Mac and the Linux I have tested on. */
+#ifdef _WIN32
+               local_fd = open(fn, O_RDONLY | O_BINARY);
+#else
+               local_fd = open(fn, O_RDONLY);
+#endif
+               if (local_fd == -1) {
+                       perror("open");
+                       return 0;
+               }
+               kf = (knetFile*)calloc(1, sizeof(knetFile));
+               kf->type = KNF_TYPE_LOCAL;
+               kf->fd = local_fd;
+               kf->ctrl_fd = -1;
+       }
+       /* a remote connect that failed leaves ->fd at -1 */
+       if (kf && kf->fd == -1) {
+               knet_close(kf);
+               return 0;
+       }
+       return kf;
+}
+
+/* Wrap an already open local file descriptor in a knetFile handle. mode
+ * is not used. Returns NULL if memory cannot be allocated. The
+ * descriptor is closed by knet_close(). */
+knetFile *knet_dopen(int fd, const char *mode)
+{
+       knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
+       if (fp == 0) return 0;
+       fp->type = KNF_TYPE_LOCAL;
+       fp->fd = fd;
+       /* calloc() leaves ctrl_fd at 0; knet_close() closes every ctrl_fd
+        * other than -1 and would therefore close descriptor 0. */
+       fp->ctrl_fd = -1;
+       return fp;
+}
+
+/* Read up to len bytes into buf and advance fp->offset by the number of
+ * bytes read. For FTP/HTTP handles that are not ready (e.g. after
+ * knet_seek()), the connection is re-established at fp->offset first.
+ * Returns the number of bytes read, which is smaller than len at
+ * end-of-file or after a time-out; 0 if the handle has no usable
+ * descriptor; -1 if a local read() fails before any byte was read. */
+off_t knet_read(knetFile *fp, void *buf, off_t len)
+{
+       char *dst = (char*)buf; // arithmetic on void* is a GNU extension
+       off_t l = 0;
+       if (fp->fd == -1) return 0;
+       if (fp->type == KNF_TYPE_FTP) {
+               if (fp->is_ready == 0) {
+                       if (!fp->no_reconnect) kftp_reconnect(fp);
+                       kftp_connect_file(fp);
+               }
+       } else if (fp->type == KNF_TYPE_HTTP) {
+               if (fp->is_ready == 0)
+                       khttp_connect_file(fp);
+       }
+       /* a failed (re)connect leaves ->fd at -1, which must not reach
+        * select()/FD_SET() in my_netread() */
+       if (fp->fd == -1) return 0;
+       if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
+               off_t rest = len, curr;
+               while (rest > 0) {
+                       do { // retry reads interrupted by a signal
+                               curr = read(fp->fd, dst + l, rest);
+                       } while (curr < 0 && errno == EINTR);
+                       if (curr < 0) {
+                               /* never add -1 to the counters; report the bytes
+                                * already consumed, if any */
+                               if (l == 0) return -1;
+                               break;
+                       }
+                       if (curr == 0) break;
+                       l += curr; rest -= curr;
+               }
+       } else l = my_netread(fp->fd, buf, len);
+       fp->offset += l;
+       return l;
+}
+
+/* Set the read position of fp. Local files are repositioned with
+ * lseek(); for FTP and HTTP only fp->offset is updated and fp->is_ready
+ * is cleared, so the connection is re-established lazily by the next
+ * knet_read(). SEEK_END is supported for local and FTP handles only.
+ * Returns 0 on success for every handle type and -1 on failure. */
+off_t knet_seek(knetFile *fp, int64_t off, int whence)
+{
+       if (whence == SEEK_SET && off == fp->offset) return 0;
+       if (fp->type == KNF_TYPE_LOCAL) {
+               /* Be aware that lseek() returns the offset after seeking,
+                * while fseek() returns zero on success. */
+               off_t offset = lseek(fp->fd, off, whence);
+               if (offset == -1) {
+                       // Be silent, it is OK for knet_seek to fail when the file is streamed
+                       // fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
+                       return -1;
+               }
+               fp->offset = offset;
+               return 0;
+       } else if (fp->type == KNF_TYPE_FTP) {
+               if (whence == SEEK_CUR)
+                       fp->offset += off;
+               else if (whence == SEEK_SET)
+                       fp->offset = off;
+               else if (whence == SEEK_END)
+                       fp->offset = fp->file_size + off;
+               fp->is_ready = 0;
+               return 0;
+       } else if (fp->type == KNF_TYPE_HTTP) {
+               if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
+                       fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
+                       errno = ESPIPE;
+                       return -1;
+               }
+               if (whence == SEEK_CUR)
+                       fp->offset += off;
+               else if (whence == SEEK_SET)
+                       fp->offset = off;
+               fp->is_ready = 0;
+               /* 0, like every other success path; this branch used to return
+                * the new offset, which callers testing for "!= 0" would take
+                * for a failure */
+               return 0;
+       }
+       errno = EINVAL;
+       fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
+       return -1;
+}
+
+/* Close the descriptors held by fp and release the handle together with
+ * every string it owns. Accepts NULL. Always returns 0. */
+int knet_close(knetFile *fp)
+{
+       if (fp == 0) return 0;
+       if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
+       if (fp->fd != -1) {
+               /* On Linux/Mac, netclose() is an alias of close(), but on
+                * Windows, it is an alias of closesocket(). */
+               if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
+               else netclose(fp->fd);
+       }
+       free(fp->host); free(fp->port);
+       free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
+       free(fp->path); free(fp->http_host); // HTTP specific
+       free(fp);
+       return 0;
+}
+
+#ifdef KNETFILE_MAIN
+/* Manual test driver (built only with -DKNETFILE_MAIN). "type" selects
+ * one of five hard-coded local/FTP/HTTP sources; the data read is written
+ * to stdout. Returns 1 if the source cannot be opened. */
+int main(void)
+{
+       enum { BUF_SIZE = 0x100000 };
+       const char *fn = 0;
+       char *buf;
+       knetFile *fp;
+       int type = 4, l = 0;
+#ifdef _WIN32
+       knet_win32_init();
+#endif
+       if (type == 0) fn = "knetfile.c";
+       else if (type == 1) // NCBI FTP, large file
+               fn = "ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam";
+       else if (type == 2) fn = "ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml";
+       else if (type == 3) fn = "http://www.sanger.ac.uk/Users/lh3/index.shtml";
+       else if (type == 4) fn = "http://www.sanger.ac.uk/Users/lh3/ex1.bam";
+       if (fn == 0) return 1;
+       buf = calloc(BUF_SIZE, 1);
+       if (buf == 0) return 1;
+       fp = knet_open(fn, "r");
+       if (fp == 0) { // knet_seek()/knet_read() dereference the handle
+               fprintf(stderr, "[main] fail to open %s\n", fn);
+               free(buf);
+               return 1;
+       }
+       if (type == 1) {
+               knet_seek(fp, 2500000000ll, SEEK_SET);
+               l = knet_read(fp, buf, 255);
+       } else if (type == 4) {
+               knet_read(fp, buf, 10000);
+               knet_seek(fp, 20000, SEEK_SET);
+               knet_seek(fp, 10000, SEEK_SET);
+               /* never ask for more than what is left of buf (the request
+                * used to be 10000000 bytes into a 1 MiB buffer) */
+               l = knet_read(fp, buf+10000, BUF_SIZE - 10000) + 10000;
+       } else knet_seek(fp, 1000, SEEK_SET);
+       if (type != 4 && type != 1) {
+               knet_read(fp, buf, 255);
+               buf[255] = 0;
+               printf("%s\n", buf);
+       } else write(fileno(stdout), buf, l);
+       knet_close(fp);
+       free(buf);
+       return 0;
+}
+#endif
diff --git a/samtools/knetfile.h b/samtools/knetfile.h
new file mode 100644 (file)
index 0000000..0a0e66f
--- /dev/null
@@ -0,0 +1,75 @@
+#ifndef KNETFILE_H
+#define KNETFILE_H
+
+#include <stdint.h>
+#include <fcntl.h>
+
+#ifdef _WIN32
+/* Windows: map the net* wrappers onto the Winsock calls. */
+#include <winsock2.h>
+#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
+#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
+#define netclose(fd) closesocket(fd)
+#else
+/* Everywhere else: map the net* wrappers onto read/write/close. */
+#define netread(fd, ptr, len) read(fd, ptr, len)
+#define netwrite(fd, ptr, len) write(fd, ptr, len)
+#define netclose(fd) close(fd)
+#endif
+
+// FIXME: currently I/O is unbuffered
+
+/* Values stored in knetFile::type; knet_close() compares the field against
+ * KNF_TYPE_LOCAL to choose between close() and netclose(). */
+#define KNF_TYPE_LOCAL 1
+#define KNF_TYPE_FTP   2
+#define KNF_TYPE_HTTP  3
+
+/*
+ * State of one file opened through knet_open()/knet_dopen().
+ *
+ * The string members host, port, response, retr, path and http_host are
+ * released with free() at the end of knet_close().
+ * NOTE(review): size_cmd is not among the pointers freed in the visible
+ * tail of knet_close() -- confirm it is released elsewhere.
+ */
+typedef struct knetFile_s {
+       /* type: one of KNF_TYPE_*; fd: descriptor exposed by knet_fileno() */
+       int type, fd;
+       /* position reported by knet_tell() */
+       int64_t offset;
+       /* presumably the remote host name and port/service -- confirm in knetfile.c */
+       char *host, *port;
+
+       // the following are for FTP only
+       // (ctrl_fd/pasv_* presumably describe the FTP control connection and
+       // the passive-mode data address -- TODO confirm in knetfile.c)
+       int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
+       char *response, *retr, *size_cmd;
+       int64_t seek_offset; // for lazy seek
+    int64_t file_size;
+
+       // the following are for HTTP only
+       char *path, *http_host;
+} knetFile;
+
+/* Accessors: knet_tell() yields the ->offset field (int64_t) and
+ * knet_fileno() the ->fd field of a knetFile pointer. Both expand to plain
+ * member accesses, so fp must be non-NULL. */
+#define knet_tell(fp) ((fp)->offset)
+#define knet_fileno(fp) ((fp)->fd)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+       /* Windows-only setup/teardown; the test driver in knetfile.c calls
+          knet_win32_init() before opening anything. */
+       int knet_win32_init();
+       void knet_win32_destroy();
+#endif
+
+       /*
+         Open `fn`. The test driver in knetfile.c passes a local path as
+         well as ftp:// and http:// URLs, always with mode "r".
+         NOTE(review): the failure return (presumably NULL) is not visible
+         from this header -- confirm in knetfile.c.
+        */
+       knetFile *knet_open(const char *fn, const char *mode);
+
+       /*
+          Wrap an already open descriptor `fd`.
+          This only works with local files.
+        */
+       knetFile *knet_dopen(int fd, const char *mode);
+
+       /*
+         Read up to `len` bytes into `buf`.
+         If ->is_ready==0, this routine updates ->fd; otherwise, it simply
+         reads from ->fd.
+        */
+       off_t knet_read(knetFile *fp, void *buf, off_t len);
+
+       /*
+         This routine only sets ->offset and ->is_ready=0. It does not
+         communicate with the FTP server.
+        */
+       off_t knet_seek(knetFile *fp, int64_t off, int whence);
+
+       /*
+         Release `fp` together with the strings it owns; the implementation
+         ends with `return 0`. `fp` must not be used afterwards.
+        */
+       int knet_close(knetFile *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/kseq.h b/samtools/kseq.h
new file mode 100644 (file)
index 0000000..82face0
--- /dev/null
@@ -0,0 +1,227 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+  2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
+ */
+
+/* Last Modified: 12APR2009 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_MAX   1
+
+/*
+ * Declare kstream_t, a buffered reader on top of a handle of type type_t.
+ *   buf    - read buffer, allocated by ks_init() and freed by ks_destroy()
+ *   begin  - index of the next unread byte in buf
+ *   end    - value returned by the last read, i.e. the number of valid bytes
+ *   is_eof - set once a read returned fewer bytes than the buffer size
+ *   f      - the underlying handle handed to the read function
+ */
+#define __KS_TYPE(type_t)                                              \
+       typedef struct __kstream_t {                            \
+               unsigned char *buf;                                             \
+               int begin, end, is_eof;                                 \
+               type_t f;                                                               \
+       } kstream_t;
+
+/* ks_eof(): non-zero when the stream is flagged EOF and the buffer is used up.
+ * ks_rewind(): clears the EOF flag and empties the buffer; it does not touch
+ * the underlying handle, so the caller has to reposition that separately. */
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+/*
+ * Define the constructor and destructor of kstream_t.
+ *
+ * ks_init(f):     allocate a zeroed stream over handle `f` with a read buffer
+ *                 of __bufsize bytes. Returns NULL when either allocation
+ *                 fails (nothing is leaked in that case).
+ * ks_destroy(ks): free the buffer and the stream; NULL is accepted. The
+ *                 handle `f` itself is not closed.
+ */
+#define __KS_BASIC(type_t, __bufsize)                                  \
+       static inline kstream_t *ks_init(type_t f)                      \
+       {                                                               \
+               kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+               if (ks == 0) return 0;                                  \
+               ks->f = f;                                              \
+               /* explicit cast keeps the header usable from C++ */    \
+               ks->buf = (unsigned char*)malloc(__bufsize);            \
+               if (ks->buf == 0) {                                     \
+                       free(ks);                                       \
+                       return 0;                                       \
+               }                                                       \
+               return ks;                                              \
+       }                                                               \
+       static inline void ks_destroy(kstream_t *ks)                    \
+       {                                                               \
+               if (ks) {                                               \
+                       free(ks->buf);                                  \
+                       free(ks);                                       \
+               }                                                       \
+       }
+
+/*
+ * Define ks_getc(): return the next byte of the stream as a value in 0..255,
+ * or -1 when no more data is available.
+ *
+ * The buffer is refilled with __read(ks->f, ks->buf, __bufsize) when it runs
+ * empty. A short read marks the stream as EOF. A zero or negative result
+ * (end of input or read error) yields -1 and leaves the buffer empty, so a
+ * failed read can no longer hand back a stale byte.
+ */
+#define __KS_GETC(__read, __bufsize)                                   \
+       static inline int ks_getc(kstream_t *ks)                        \
+       {                                                               \
+               if (ks->is_eof && ks->begin >= ks->end) return -1;      \
+               if (ks->begin >= ks->end) {                             \
+                       ks->begin = 0;                                  \
+                       ks->end = __read(ks->f, ks->buf, __bufsize);    \
+                       if (ks->end < __bufsize) ks->is_eof = 1;        \
+                       if (ks->end <= 0) { /* EOF or read error */     \
+                               ks->end = 0;                            \
+                               return -1;                              \
+                       }                                               \
+               }                                                       \
+               return (int)ks->buf[ks->begin++];                       \
+       }
+
+/* Growable string shared between the k*.h headers; the KSTRING_T guard keeps
+ * it from being declared twice.
+ *   l - number of characters currently stored (terminator not counted)
+ *   m - number of bytes allocated for s
+ *   s - heap buffer, NULL until first use */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+       size_t l, m;
+       char *s;
+} kstring_t;
+#endif
+
+/* Round the lvalue x up, in place, to the next power of two (a power of two
+ * is left unchanged). x is evaluated many times, so it must be free of side
+ * effects. The largest shift is 16, so only values that fit in 32 bits are
+ * rounded correctly. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Define ks_getuntil(): read one token from the stream into `str`.
+ *
+ *   delimiter - KS_SEP_SPACE (any isspace() character), KS_SEP_TAB (any
+ *               isspace() character except ' '), or a literal character
+ *               code greater than KS_SEP_MAX
+ *   str       - receives the token without the delimiter, NUL-terminated;
+ *               its buffer is grown with realloc() and reused across calls
+ *   dret      - if non-NULL, set to the delimiter that ended the token, or
+ *               to 0 when the input ended first
+ *
+ * Returns the token length, or -1 if the stream was already exhausted on
+ * entry. A zero or negative result from __read is treated as end of input.
+ * An empty token keeps the buffer `str` already owns; a fresh one-byte
+ * buffer is allocated only when `str` has none, so reusing `str` no longer
+ * leaks.
+ */
+#define __KS_GETUNTIL(__read, __bufsize)                               \
+       static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       {                                                               \
+               if (dret) *dret = 0;                                    \
+               str->l = 0;                                             \
+               if (ks->begin >= ks->end && ks->is_eof) return -1;      \
+               for (;;) {                                              \
+                       int i;                                          \
+                       if (ks->begin >= ks->end) {                     \
+                               if (!ks->is_eof) {                      \
+                                       ks->begin = 0;                  \
+                                       ks->end = __read(ks->f, ks->buf, __bufsize); \
+                                       if (ks->end < __bufsize) ks->is_eof = 1; \
+                                       if (ks->end <= 0) { /* EOF or read error */ \
+                                               ks->end = 0;            \
+                                               break;                  \
+                                       }                               \
+                               } else break;                           \
+                       }                                               \
+                       if (delimiter > KS_SEP_MAX) {                   \
+                               for (i = ks->begin; i < ks->end; ++i)   \
+                                       if (ks->buf[i] == delimiter) break; \
+                       } else if (delimiter == KS_SEP_SPACE) {         \
+                               for (i = ks->begin; i < ks->end; ++i)   \
+                                       if (isspace(ks->buf[i])) break; \
+                       } else if (delimiter == KS_SEP_TAB) {           \
+                               for (i = ks->begin; i < ks->end; ++i)   \
+                                       if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+                       } else i = 0; /* never come to here! */         \
+                       /* keep one spare byte for the terminator */    \
+                       if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+                               str->m = str->l + (i - ks->begin) + 1;  \
+                               kroundup32(str->m);                     \
+                               str->s = (char*)realloc(str->s, str->m); \
+                       }                                               \
+                       memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+                       str->l = str->l + (i - ks->begin);              \
+                       ks->begin = i + 1;                              \
+                       if (i < ks->end) {                              \
+                               if (dret) *dret = ks->buf[i];           \
+                               break;                                  \
+                       }                                               \
+               }                                                       \
+               if (str->s == 0) { /* nothing allocated yet */          \
+                       str->m = 1;                                     \
+                       str->s = (char*)calloc(1, 1);                   \
+               }                                                       \
+               str->s[str->l] = '\0';                                  \
+               return str->l;                                          \
+       }
+
+/*
+ * Instantiate the buffered stream for one handle type: declares kstream_t
+ * and defines ks_init(), ks_destroy(), ks_getc() and ks_getuntil().
+ *   type_t    - type of the underlying handle
+ *   __read    - function or macro called as __read(handle, buf, __bufsize);
+ *               its result is stored in an int as the number of bytes read
+ *   __bufsize - size of the read buffer in bytes
+ * kstream_t is a fixed typedef name, so the macro can be expanded only once
+ * per translation unit.
+ */
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+       __KS_TYPE(type_t)                                                       \
+       __KS_BASIC(type_t, __bufsize)                           \
+       __KS_GETC(__read, __bufsize)                            \
+       __KS_GETUNTIL(__read, __bufsize)
+
+/*
+ * Define the life-cycle helpers of kseq_t.
+ *
+ * kseq_init(fd):    allocate a zeroed reader over handle `fd`. Returns NULL
+ *                   when the reader or its stream cannot be allocated.
+ * kseq_rewind(ks):  forget the pending header character and empty the stream
+ *                   buffer; the handle itself is not repositioned.
+ * kseq_destroy(ks): free the four string buffers, the stream and the reader;
+ *                   NULL is accepted. The handle is not closed.
+ */
+#define __KSEQ_BASIC(type_t)                                           \
+       static inline kseq_t *kseq_init(type_t fd)                      \
+       {                                                               \
+               kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));         \
+               if (s == 0) return 0;                                   \
+               s->f = ks_init(fd);                                     \
+               if (s->f == 0) { /* stream allocation failed */         \
+                       free(s);                                        \
+                       return 0;                                       \
+               }                                                       \
+               return s;                                               \
+       }                                                               \
+       static inline void kseq_rewind(kseq_t *ks)                      \
+       {                                                               \
+               ks->last_char = 0;                                      \
+               ks->f->is_eof = ks->f->begin = ks->f->end = 0;          \
+       }                                                               \
+       static inline void kseq_destroy(kseq_t *ks)                     \
+       {                                                               \
+               if (!ks) return;                                        \
+               free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+               ks_destroy(ks->f);                                      \
+               free(ks);                                               \
+       }
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+
+   kseq_read() reads the next record into seq->name, seq->comment and
+   seq->seq, and into seq->qual when a '+' line follows the sequence. A
+   record starts at a '>' or '@' character; the name runs up to the first
+   white space and the rest of the header line becomes the comment. On a
+   non-negative return seq->seq.s is always a valid NUL-terminated string,
+   including when the record has an empty sequence.
+ */
+#define __KSEQ_READ                                                    \
+       static int kseq_read(kseq_t *seq)                               \
+       {                                                               \
+               int c;                                                  \
+               kstream_t *ks = seq->f;                                 \
+               if (seq->last_char == 0) { /* then jump to the next header line */ \
+                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+                       if (c == -1) return -1; /* end of file */       \
+                       seq->last_char = c;                             \
+               } /* the first header char has been read */             \
+               seq->comment.l = seq->seq.l = seq->qual.l = 0;          \
+               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;  \
+               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
+               while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+                       if (isgraph(c)) { /* printable non-space character */ \
+                               if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
+                                       seq->seq.m = seq->seq.l + 2;    \
+                                       kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
+                                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+                               }                                       \
+                               seq->seq.s[seq->seq.l++] = (char)c;     \
+                       }                                               \
+               }                                                       \
+               if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+               /* An empty sequence in the first record leaves seq.s NULL; */ \
+               /* make room so the terminator below has somewhere to go. */ \
+               if (seq->seq.l >= seq->seq.m) {                         \
+                       seq->seq.m = seq->seq.l + 1;                    \
+                       kroundup32(seq->seq.m);                         \
+                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+               }                                                       \
+               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */ \
+               if (c != '+') return seq->seq.l; /* FASTA */            \
+               if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
+                       seq->qual.m = seq->seq.m;                       \
+                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+               }                                                       \
+               while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+               if (c == -1) return -2; /* we should not stop here */   \
+               while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
+                       if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
+               seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
+               seq->last_char = 0;     /* we have not come to the next header line */ \
+               if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
+               return seq->seq.l;                                      \
+       }
+
+/*
+ * Declare kseq_t, the record reader state.
+ *   name, comment, seq, qual - buffers filled by kseq_read()
+ *   last_char - header character ('>' or '@') already consumed while
+ *               reading the previous record, or 0 if none is pending
+ *   f         - the underlying buffered stream
+ * The type_t parameter is not used by the expansion.
+ */
+#define __KSEQ_TYPE(type_t)                                            \
+       typedef struct {                                                        \
+               kstring_t name, comment, seq, qual;             \
+               int last_char;                                                  \
+               kstream_t *f;                                                   \
+       } kseq_t;
+
+/*
+ * Instantiate the complete sequence reader for one handle type: a kstream_t
+ * with a 4096-byte buffer, kseq_t, kseq_init(), kseq_rewind(),
+ * kseq_destroy() and kseq_read().
+ *   type_t - type of the underlying handle
+ *   __read - read function or macro, called as __read(handle, buf, size)
+ */
+#define KSEQ_INIT(type_t, __read)                              \
+       KSTREAM_INIT(type_t, __read, 4096)                      \
+       __KSEQ_TYPE(type_t)                                                     \
+       __KSEQ_BASIC(type_t)                                            \
+       __KSEQ_READ
+
+#endif
diff --git a/samtools/ksort.h b/samtools/ksort.h
new file mode 100644 (file)
index 0000000..16a03fd
--- /dev/null
@@ -0,0 +1,271 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+  2008-11-16 (0.1.4):
+
+    * Fixed a bug in introsort() that happens in rare cases.
+
+  2008-11-05 (0.1.3):
+
+    * Fixed a bug in introsort() for complex comparisons.
+
+       * Fixed a bug in mergesort(). The previous version is not stable.
+
+  2008-09-15 (0.1.2):
+
+       * Accelerated introsort. On my Mac (not on another Linux machine),
+         my implementation is as fast as std::sort on random input.
+
+       * Added combsort and in introsort, switch to combsort if the
+         recursion is too deep.
+
+  2008-09-13 (0.1.1):
+
+       * Added k-small algorithm
+
+  2008-09-05 (0.1.0):
+
+       * Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+/* One stack entry holding two untyped element pointers and a depth counter.
+ * NOTE(review): presumably a pending sub-range for the introsort mentioned
+ * in the change log above -- confirm against the sort routine that uses it. */
+typedef struct {
+       void *left, *right;
+       int depth;
+} ks_isort_stack_t;
+
+/* Exchange the lvalues a and b through a temporary of type type_t. It expands
+ * to a bare { } block (not do/while), and a and b are each evaluated twice,
+ * so they must be free of side effects. */
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt)                                                            \
+       void ks_mergesort_##name(size_t n, type_t array[], type_t temp[])       \
+       {                                                                                                                                       \
+               type_t *a2[2], *a, *b;                                                                                  \
+               int curr, shift;                                                                                                \
+                                                                                                                                               \
+               a2[0] = array;                                                                                                  \
+               a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n);               \
+               for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) {                  \
+                       a = a2[curr]; b = a2[1-curr];                                                           \
+                       if (shift == 0) {                                                                                       \
+                               type_t *p = b, *i, *eb = a + n;                                                 \
+                               for (i = a; i < eb; i += 2) {                                                   \
+                                       if (i == eb - 1) *p++ = *i;                                                     \
+                                       else {                                                                                          \
+                                               if (__sort_lt(*(i+1), *i)) {                                    \
+                                                       *p++ = *(i+1); *p++ = *i;                                       \
+                                               } else {                                                                                \
+                                                       *p++ = *i; *p++ = *(i+1);                                       \
+                                               }                                                                                               \
+                                       }                                                                                                       \
+                               }                                                                                                               \
+                       } else {                                                                                                        \
+                               size_t i, step = 1ul<<shift;                                                    \
+                               for (i = 0; i < n; i += step<<1) {                                              \
+                                       type_t *p, *j, *k, *ea, *eb;                                            \
+                                       if (n < i + step) {                                                                     \
+                                               ea = a + n; eb = a;                                                             \
+                                       } else {                                                                                        \
+                                               ea = a + i + step;                                                              \
+                                               eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+                                       }                                                                                                       \
+                                       j = a + i; k = a + i + step; p = b + i;                         \
+                                       while (j < ea && k < eb) {                                                      \
+                                               if (__sort_lt(*k, *j)) *p++ = *k++;                             \
+                                               else *p++ = *j++;                                                               \
+                                       }                                                                                                       \
+                                       while (j < ea) *p++ = *j++;                                                     \
+                                       while (k < eb) *p++ = *k++;                                                     \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+                       curr = 1 - curr;                                                                                        \
+               }                                                                                                                               \
+               if (curr == 1) {                                                                                                \
+                       type_t *p = a2[0], *i = a2[1], *eb = array + n;                         \
+                       for (; p < eb; ++i) *p++ = *i;                                                          \
+               }                                                                                                                               \
+               if (temp == 0) free(a2[1]);                                                                             \
+       }                                                                                                                                       \
+       void ks_heapadjust_##name(size_t i, size_t n, type_t l[])                       \
+       {                                                                                                                                       \
+               size_t k = i;                                                                                                   \
+               type_t tmp = l[i];                                                                                              \
+               while ((k = (k << 1) + 1) < n) {                                                                \
+                       if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k;                         \
+                       if (__sort_lt(l[k], tmp)) break;                                                        \
+                       l[i] = l[k]; i = k;                                                                                     \
+               }                                                                                                                               \
+               l[i] = tmp;                                                                                                             \
+       }                                                                                                                                       \
+       void ks_heapmake_##name(size_t lsize, type_t l[])                                       \
+       {                                                                                                                                       \
+               size_t i;                                                                                                               \
+               for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i)                              \
+                       ks_heapadjust_##name(i, lsize, l);                                                      \
+       }                                                                                                                                       \
+       void ks_heapsort_##name(size_t lsize, type_t l[])                                       \
+       {                                                                                                                                       \
+               size_t i;                                                                                                               \
+               for (i = lsize - 1; i > 0; --i) {                                                               \
+                       type_t tmp;                                                                                                     \
+                       tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       inline void __ks_insertsort_##name(type_t *s, type_t *t)                        \
+       {                                                                                                                                       \
+               type_t *i, *j, swap_tmp;                                                                                \
+               for (i = s + 1; i < t; ++i)                                                                             \
+                       for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) {                      \
+                               swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp;                  \
+                       }                                                                                                                       \
+       }                                                                                                                                       \
+       void ks_combsort_##name(size_t n, type_t a[])                                           \
+       {                                                                                                                                       \
+               const double shrink_factor = 1.2473309501039786540366528676643; \
+               int do_swap;                                                                                                    \
+               size_t gap = n;                                                                                                 \
+               type_t tmp, *i, *j;                                                                                             \
+               do {                                                                                                                    \
+                       if (gap > 2) {                                                                                          \
+                               gap = (size_t)(gap / shrink_factor);                                    \
+                               if (gap == 9 || gap == 10) gap = 11;                                    \
+                       }                                                                                                                       \
+                       do_swap = 0;                                                                                            \
+                       for (i = a; i < a + n - gap; ++i) {                                                     \
+                               j = i + gap;                                                                                    \
+                               if (__sort_lt(*j, *i)) {                                                                \
+                                       tmp = *i; *i = *j; *j = tmp;                                            \
+                                       do_swap = 1;                                                                            \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+               } while (do_swap || gap > 2);                                                                   \
+               if (gap != 1) __ks_insertsort_##name(a, a + n);                                 \
+       }                                                                                                                                       \
+       void ks_introsort_##name(size_t n, type_t a[])                                          \
+       {                                                                                                                                       \
+               int d;                                                                                                                  \
+               ks_isort_stack_t *top, *stack;                                                                  \
+               type_t rp, swap_tmp;                                                                                    \
+               type_t *s, *t, *i, *j, *k;                                                                              \
+                                                                                                                                               \
+               if (n < 1) return;                                                                                              \
+               else if (n == 2) {                                                                                              \
+                       if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+                       return;                                                                                                         \
+               }                                                                                                                               \
+               for (d = 2; 1ul<<d < n; ++d);                                                                   \
+               stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+               top = stack; s = a; t = a + (n-1); d <<= 1;                                             \
+               while (1) {                                                                                                             \
+                       if (s < t) {                                                                                            \
+                               if (--d == 0) {                                                                                 \
+                                       ks_combsort_##name(t - s + 1, s);                                       \
+                                       t = s;                                                                                          \
+                                       continue;                                                                                       \
+                               }                                                                                                               \
+                               i = s; j = t; k = i + ((j-i)>>1) + 1;                                   \
+                               if (__sort_lt(*k, *i)) {                                                                \
+                                       if (__sort_lt(*k, *j)) k = j;                                           \
+                               } else k = __sort_lt(*j, *i)? i : j;                                    \
+                               rp = *k;                                                                                                \
+                               if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; }  \
+                               for (;;) {                                                                                              \
+                                       do ++i; while (__sort_lt(*i, rp));                                      \
+                                       do --j; while (i <= j && __sort_lt(rp, *j));            \
+                                       if (j <= i) break;                                                                      \
+                                       swap_tmp = *i; *i = *j; *j = swap_tmp;                          \
+                               }                                                                                                               \
+                               swap_tmp = *i; *i = *t; *t = swap_tmp;                                  \
+                               if (i-s > t-i) {                                                                                \
+                                       if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+                                       s = t-i > 16? i+1 : t;                                                          \
+                               } else {                                                                                                \
+                                       if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+                                       t = i-s > 16? i-1 : s;                                                          \
+                               }                                                                                                               \
+                       } else {                                                                                                        \
+                               if (top == stack) {                                                                             \
+                                       free(stack);                                                                            \
+                                       __ks_insertsort_##name(a, a+n);                                         \
+                                       return;                                                                                         \
+                               } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+       /* 0 <= kk < n */                                                                                                       \
+       type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)                      \
+       {                                                                                                                                       \
+               type_t *low, *high, *k, *ll, *hh, *mid;                                                 \
+               low = arr; high = arr + n - 1; k = arr + kk;                                    \
+               for (;;) {                                                                                                              \
+                       if (high <= low) return *k;                                                                     \
+                       if (high == low + 1) {                                                                          \
+                               if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+                               return *k;                                                                                              \
+                       }                                                                                                                       \
+                       mid = low + (high - low) / 2;                                                           \
+                       if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+                       if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+                       if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);      \
+                       KSORT_SWAP(type_t, *mid, *(low+1));                                                     \
+                       ll = low + 1; hh = high;                                                                        \
+                       for (;;) {                                                                                                      \
+                               do ++ll; while (__sort_lt(*ll, *low));                                  \
+                               do --hh; while (__sort_lt(*low, *hh));                                  \
+                               if (hh < ll) break;                                                                             \
+                               KSORT_SWAP(type_t, *ll, *hh);                                                   \
+                       }                                                                                                                       \
+                       KSORT_SWAP(type_t, *low, *hh);                                                          \
+                       if (hh <= k) low = ll;                                                                          \
+                       if (hh >= k) high = hh - 1;                                                                     \
+               }                                                                                                                               \
+       }
+
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) /* expands to a call of ks_mergesort_<name> */
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a) /* expands to a call of ks_introsort_<name> */
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a) /* expands to a call of ks_combsort_<name> */
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) /* expands to a call of ks_heapsort_<name> */
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) /* expands to a call of ks_heapmake_<name> */
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) /* expands to a call of ks_heapadjust_<name> */
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) /* expands to a call of ks_ksmall_<name> */
+
+#define ks_lt_generic(a, b) ((a) < (b)) /* "less than" using the built-in < operator */
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) /* "less than" using strcmp() ordering */
+
+typedef const char *ksstr_t; /* element type used by KSORT_INIT_STR */
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) /* instance whose name is the type name itself */
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) /* instance named "str" over ksstr_t elements */
+
+#endif
diff --git a/samtools/kstring.c b/samtools/kstring.c
new file mode 100644 (file)
index 0000000..e0203fa
--- /dev/null
@@ -0,0 +1,165 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include "kstring.h"
+
+int ksprintf(kstring_t *s, const char *fmt, ...) // append printf-style formatted text to s, growing the buffer when needed
+{ // returns the number of characters appended, or vsnprintf()'s negative value on an output error
+       va_list ap;
+       int l;
+       va_start(ap, fmt);
+       l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
+       va_end(ap); // each va_start() is paired with exactly one va_end()
+       if (l >= 0 && (size_t)l + 1 > s->m - s->l) { // first attempt was truncated: grow the buffer and format again
+               s->m = s->l + l + 2;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+               va_start(ap, fmt);
+               l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+               va_end(ap);
+       }
+       if (l > 0) s->l += l; // never add a negative error code to the length
+       return l;
+}
+
+// s MUST BE a null terminated string; l = strlen(s)
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) // split s in place into fields and return the field count
+{ // delimiter == 0 splits on whitespace; with _offsets != NULL each field is NUL-terminated and its start offset is appended to *_offsets
+       int i, n, max, last_char, last_start, *offsets, l;
+       n = 0; max = _offsets? *_max : 0; offsets = _offsets? *_offsets : 0; // _offsets == NULL is count-only mode: nothing to dereference
+       l = strlen(s);
+       
+#define __ksplit_aux do {                                                                                              \
+               if (_offsets) {                                                                                                 \
+                       s[i] = 0;                                                                                                       \
+                       if (n == max) {                                                                                         \
+                               max = max? max<<1 : 2;                                                                  \
+                               offsets = (int*)realloc(offsets, sizeof(int) * max);    \
+                       }                                                                                                                       \
+                       offsets[n++] = last_start;                                                                      \
+               } else ++n;                                                                                                             \
+       } while (0)
+
+       for (i = 0, last_char = last_start = 0; i <= l; ++i) { // i == l visits the terminating NUL so the last field is closed
+               if (delimiter == 0) {
+                       if (isspace((unsigned char)s[i]) || s[i] == 0) { // ctype functions require an unsigned char value
+                               if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
+                       } else {
+                               if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+                       }
+               } else {
+                       if (s[i] == delimiter || s[i] == 0) {
+                               if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+                       } else {
+                               if (last_char == delimiter || last_char == 0) last_start = i;
+                       }
+               }
+               last_char = s[i];
+       }
+       if (_offsets) { *_max = max; *_offsets = offsets; } // hand the possibly reallocated array and its capacity back
+       return n;
+}
+
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+int *ksBM_prep(const uint8_t *pat, int m) // build the Boyer-Moore shift tables for pat[0..m-1]; the caller frees the result
+{ // layout of the returned array: m good-suffix shifts (bmGs) followed by 256 bad-character shifts (bmBc)
+       int i, *suff, *prep, *bmGs, *bmBc;
+       prep = (int*)calloc(m + 256, sizeof(int)); // m + 256 ints, not bytes: both tables are indexed as int arrays
+       bmGs = prep; bmBc = prep + m;
+       { // preBmBc()
+               for (i = 0; i < 256; ++i) bmBc[i] = m;
+               for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
+       }
+       suff = calloc(m, sizeof(int)); // scratch table, released before returning
+       { // suffixes()
+               int f = 0, g;
+               suff[m - 1] = m;
+               g = m - 1;
+               for (i = m - 2; i >= 0; --i) {
+                       if (i > g && suff[i + m - 1 - f] < i - g)
+                               suff[i] = suff[i + m - 1 - f];
+                       else {
+                               if (i < g) g = i;
+                               f = i;
+                               while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
+                               suff[i] = f - g;
+                       }
+               }
+       }
+       { // preBmGs()
+               int j = 0;
+               for (i = 0; i < m; ++i) bmGs[i] = m;
+               for (i = m - 1; i >= 0; --i)
+                       if (suff[i] == i + 1)
+                               for (; j < m - 1 - i; ++j)
+                                       if (bmGs[j] == m)
+                                               bmGs[j] = m - 1 - i;
+               for (i = 0; i <= m - 2; ++i)
+                       bmGs[m - 1 - suff[i]] = m - 1 - i;
+       }
+       free(suff);
+       return prep;
+}
+
+int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) // find every occurrence of pat[0..m-1] in str[0..n-1]
+{ // returns a realloc()-grown array of match offsets (NULL when nothing matched); the count is stored in *n_matches
+       int i, j, *prep, *bmGs, *bmBc;
+       int *matches = 0, mm = 0, nm = 0; // mm: capacity of matches, nm: entries stored so far
+       prep = _prep? _prep : ksBM_prep(pat, m); // build the tables only when the caller did not supply them
+       bmGs = prep; bmBc = prep + m; // same layout as in ksBM_prep(): bmGs first, bmBc after m entries
+       j = 0; // NOTE(review): m is not validated here; m <= 0 looks unsupported - confirm with callers
+       while (j <= n - m) { // j is the current alignment of pat inside str
+               for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); // compare from the right end of the pattern
+               if (i < 0) { // every character matched at offset j
+                       if (nm == mm) {
+                               mm = mm? mm<<1 : 1; // double the capacity, starting from 1
+                               matches = realloc(matches, mm * sizeof(int));
+                       }
+                       matches[nm++] = j;
+                       j += bmGs[0];
+               } else { // mismatch at pat[i]: advance by the larger of the two table shifts
+                       int max = bmBc[str[i+j]] - m + 1 + i;
+                       if (max < bmGs[i]) max = bmGs[i];
+                       j += max;
+               }
+       }
+       *n_matches = nm;
+       if (_prep == 0) free(prep); // free only the tables that were built in this call
+       return matches;
+}
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+int main() // self-test driver, compiled only when KSTRING_MAIN is defined
+{
+       kstring_t *s;
+       int *fields, n, i;
+       s = (kstring_t*)calloc(1, sizeof(kstring_t));
+       // test ksprintf()
+       ksprintf(s, " abcdefg:    %d ", 100);
+       printf("'%s'\n", s->s);
+       // test ksplit()
+       fields = ksplit(s, 0, &n);
+       for (i = 0; i < n; ++i)
+               printf("field[%d] = '%s'\n", i, s->s + fields[i]);
+       free(fields); free(s->s); free(s); // release the offset array and the character buffer as well as the struct
+
+       {
+               static char *str = "abcdefgcdg";
+               static char *pat = "cd";
+               int n, *matches;
+               matches = ksBM_search((const uint8_t*)str, strlen(str), (const uint8_t*)pat, strlen(pat), 0, &n); // casts match the uint8_t* prototype
+               printf("%d: \n", n);
+               for (i = 0; i < n; ++i)
+                       printf("- %d\n", matches[i]);
+               free(matches);
+       }
+       return 0;
+}
+#endif
diff --git a/samtools/kstring.h b/samtools/kstring.h
new file mode 100644 (file)
index 0000000..f4e5a99
--- /dev/null
@@ -0,0 +1,68 @@
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) /* round x up to a power of two in place; the shift cascade only spreads bits across 32 positions */
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t { /* growable character buffer used by ksprintf() and the kput*() helpers */
+       size_t l, m; /* l: bytes currently used; m: bytes allocated for s */
+       char *s; /* buffer, grown with realloc() by the append helpers, which keep s[l] == 0 */
+} kstring_t;
+#endif
+
+int ksprintf(kstring_t *s, const char *fmt, ...);
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+
+// calculate the auxiliary array, allocated by calloc()
+int *ksBM_prep(const uint8_t *pat, int m);
+
+/* Search for pat in str and return the list of matches. The size of the
+ * list is stored in *n_matches. _prep is the array returned by
+ * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */
+int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches);
+
+static inline int kputsn(const char *p, int l, kstring_t *s) // append l bytes of p to s and re-terminate; returns l
+{
+       size_t need = s->l + l + 1; // bytes required, counting the terminating NUL
+       if (need >= s->m) { // grow to the next power of two above the requirement
+               s->m = need + 1;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+       }
+       strncpy(s->s + s->l, p, l);
+       s->s[s->l += l] = 0; // advance the length, then terminate at the new end
+       return l;
+}
+
+static inline int kputs(const char *p, kstring_t *s) // append the NUL-terminated string p to s; returns the appended length
+{
+       int len = strlen(p); return kputsn(p, len, s);
+}
+
+static inline int kputc(int c, kstring_t *s) // append the single character c to s and re-terminate; returns c
+{
+       const size_t need = s->l + 1; // index that will hold the terminating NUL
+       if (need >= s->m) { // grow to the next power of two above the requirement
+               s->m = need + 1;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+       }
+       s->s[s->l] = c; s->s[need] = 0; s->l = need;
+       return c;
+}
+
+static inline int *ksplit(kstring_t *s, int delimiter, int *n) // split s->s in place; returns the heap-allocated field offsets and stores the field count in *n
+{
+       int *field_offsets = 0, capacity = 0; // start empty so ksplit_core() allocates the array
+       *n = ksplit_core(s->s, delimiter, &capacity, &field_offsets);
+       return field_offsets;
+}
+
+#endif
diff --git a/samtools/razf.c b/samtools/razf.c
new file mode 100644 (file)
index 0000000..e7499f9
--- /dev/null
@@ -0,0 +1,853 @@
+/*
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NO_RAZF
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "razf.h"
+
+
+#if ZLIB_VERNUM < 0x1221 // zlib older than 1.2.2.1: supply a local struct _gz_header_s
+struct _gz_header_s { // NOTE(review): presumably mirrors the gz_header layout of newer zlib - confirm against zlib.h
+    int     text;
+    uLong   time;
+    int     xflags;
+    int     os;
+    Bytef   *extra;
+    uInt    extra_len;
+    uInt    extra_max;
+    Bytef   *name;
+    uInt    name_max;
+    Bytef   *comment;
+    uInt    comm_max;
+    int     hcrc;
+    int     done;
+};
+#warning "zlib < 1.2.2.1; RAZF writing is disabled."
+#endif
+
+#define DEF_MEM_LEVEL 8
+
+static inline uint32_t byte_swap_4(uint32_t v){ // reverse the byte order of a 32-bit value
+       const uint32_t halves = (v >> 16) | ((v & 0x0000FFFFU) << 16); // exchange the two 16-bit halves
+       return ((halves & 0xFF00FF00U) >> 8) | ((halves & 0x00FF00FFU) << 8); // then exchange the bytes inside each half
+}
+
+static inline uint64_t byte_swap_8(uint64_t v){ // reverse the byte order of a 64-bit value
+       uint64_t w = (v >> 32) | ((v & 0x00000000FFFFFFFFLLU) << 32); // exchange the two 32-bit halves
+       w = ((w & 0xFFFF0000FFFF0000LLU) >> 16) | ((w & 0x0000FFFF0000FFFFLLU) << 16); // then the 16-bit pairs
+       return ((w & 0xFF00FF00FF00FF00LLU) >> 8) | ((w & 0x00FF00FF00FF00FFLLU) << 8); // then adjacent bytes
+}
+
+static inline int is_big_endian(){ // returns 1 when the lowest-addressed byte of an int holding 1 is not 1, else 0
+       const int probe = 0x01;
+       const char *first_byte = (const char*)&probe; // view the int through its first byte in memory
+       return *first_byte != 0x01;
+}
+
+#ifndef _RZ_READONLY
+static void add_zindex(RAZF *rz, int64_t in, int64_t out){ // append one index entry for offset out; the in argument is not used in the body
+       if(rz->index->size == rz->index->cap){ // both arrays are full: grow them before appending
+               rz->index->cap = rz->index->cap * 1.5 + 2;
+               rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap);
+               rz->index->bin_offsets  = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1));
+       }
+       if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out; // the first entry of each bin records the absolute offset
+       rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE]; // every entry stores its offset relative to its bin
+       rz->index->size ++;
+}
+
+static void save_zindex(RAZF *rz, int fd){ // write the block index to fd: entry count, then bin offsets, then cell offsets
+       int32_t i, v32;
+       int is_be;
+       is_be = is_big_endian();
+       if(is_be) write(fd, &rz->index->size, sizeof(int));
+       else { // host is not big-endian: byte-swap the count before writing it
+               v32 = byte_swap_4((uint32_t)rz->index->size);
+               write(fd, &v32, sizeof(uint32_t));
+       }
+       v32 = rz->index->size / RZ_BIN_SIZE + 1; // number of bin offsets written below
+       if(!is_be){ // NOTE(review): swaps the in-memory arrays in place, so they are no longer in host order afterwards
+               for(i=0;i<v32;i++) rz->index->bin_offsets[i]  = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
+               for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
+       }
+       write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); // NOTE(review): the result of every write() in this function is ignored
+       write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size);
+}
+#endif
+
+#ifdef _USE_KNETFILE // load_zindex: read the block index (count, bin offsets, cell offsets) from the input into rz->index
+static void load_zindex(RAZF *rz, knetFile *fp){
+#else
+static void load_zindex(RAZF *rz, int fd){
+#endif
+       int32_t i, v32;
+       int is_be;
+       if(!rz->load_index) return; // index loading was not requested for this handle
+       if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
+       is_be = is_big_endian();
+#ifdef _USE_KNETFILE
+       knet_read(fp, &rz->index->size, sizeof(int));
+#else
+       read(fd, &rz->index->size, sizeof(int));
+#endif
+       if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size); // same byte swap that save_zindex() applies on such hosts
+       rz->index->cap = rz->index->size; // NOTE(review): size comes straight from the file and is not validated before sizing the arrays
+       v32 = rz->index->size / RZ_BIN_SIZE + 1; // number of bin offsets that follow
+       rz->index->bin_offsets  = malloc(sizeof(int64_t) * v32);
+#ifdef _USE_KNETFILE
+       knet_read(fp, rz->index->bin_offsets, sizeof(int64_t) * v32);
+#else
+       read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);
+#endif
+       rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size);
+#ifdef _USE_KNETFILE
+       knet_read(fp, rz->index->cell_offsets, sizeof(int) * rz->index->size);
+#else
+       read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size); // NOTE(review): read results and malloc results are never checked
+#endif
+       if(!is_be){ // convert both arrays to host byte order
+               for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
+               for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
+       }
+}
+
+#ifdef _RZ_READONLY
+static RAZF* razf_open_w(int fd) // stub compiled when _RZ_READONLY is defined: opening for writing always fails
+{
+       fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n");
+       return 0; // fd is not used or closed here
+}
+#else
+static RAZF* razf_open_w(int fd){ // allocate and initialise a RAZF writer on the already-open descriptor fd
+       RAZF *rz;
+#ifdef _WIN32
+       setmode(fd, O_BINARY);
+#endif
+       rz = calloc(1, sizeof(RAZF));
+       rz->mode = 'w';
+#ifdef _USE_KNETFILE
+    rz->x.fpw = fd;
+#else
+       rz->filedes = fd;
+#endif
+       rz->stream = calloc(sizeof(z_stream), 1);
+       rz->inbuf  = malloc(RZ_BUFFER_SIZE);
+       rz->outbuf = malloc(RZ_BUFFER_SIZE);
+       rz->index = calloc(sizeof(ZBlockIndex), 1);
+       deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); // per the zlib manual, adding 16 to windowBits selects the gzip wrapper
+       rz->stream->avail_out = RZ_BUFFER_SIZE;
+       rz->stream->next_out  = rz->outbuf;
+       rz->header = calloc(sizeof(gz_header), 1);
+       rz->header->os    = 0x03; //Unix
+       rz->header->text  = 0;
+       rz->header->time  = 0;
+       rz->header->extra = malloc(7); // 4-byte "RAZF" tag + 1 obsolete byte + 2-byte block size
+       strncpy((char*)rz->header->extra, "RAZF", 4); // copies exactly 4 bytes, so no terminating NUL is stored
+       rz->header->extra[4] = 1; // obsolete field
+       // block size = RZ_BLOCK_SIZE, Big-Endian
+       rz->header->extra[5] = RZ_BLOCK_SIZE >> 8;
+       rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF;
+       rz->header->extra_len = 7;
+       rz->header->name = rz->header->comment  = 0;
+       rz->header->hcrc = 0;
+       deflateSetHeader(rz->stream, rz->header); // register the header, including the extra field, with the stream
+       rz->block_pos = rz->block_off = 0;
+       return rz; // NOTE(review): none of the allocations or zlib calls above are checked for failure
+}
+
+static void _razf_write(RAZF* rz, const void *data, int size){ // feed size bytes to deflate(), writing rz->outbuf to the file whenever it fills
+       int tout;
+       rz->stream->avail_in = size;
+       rz->stream->next_in  = (void*)data;
+       while(1){
+               tout = rz->stream->avail_out; // free space before the call, used to measure the bytes produced
+               deflate(rz->stream, Z_NO_FLUSH);
+               rz->out += tout - rz->stream->avail_out;
+               if(rz->stream->avail_out) break; // output buffer still has room: stop without writing it out
+#ifdef _USE_KNETFILE
+               write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else
+               write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+               rz->stream->avail_out = RZ_BUFFER_SIZE; // reuse the whole output buffer
+               rz->stream->next_out  = rz->outbuf;
+               if(rz->stream->avail_in == 0) break;
+       };
+       rz->in += size - rz->stream->avail_in; // count the input bytes that were consumed
+       rz->block_off += size - rz->stream->avail_in;
+}
+
+static void razf_flush(RAZF *rz){ // compress any staged input, then issue Z_FULL_FLUSH and record where the next block starts
+       uint32_t tout;
+       if(rz->buf_len){ // push staged input through deflate first
+               _razf_write(rz, rz->inbuf, rz->buf_len);
+               rz->buf_off = rz->buf_len = 0;
+       }
+       if(rz->stream->avail_out){ // write out whatever is pending in outbuf and reset it
+#ifdef _USE_KNETFILE    
+               write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else        
+               write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+               rz->stream->avail_out = RZ_BUFFER_SIZE;
+               rz->stream->next_out  = rz->outbuf;
+       }
+       while(1){
+               tout = rz->stream->avail_out;
+               deflate(rz->stream, Z_FULL_FLUSH);
+               rz->out += tout - rz->stream->avail_out;
+               if(rz->stream->avail_out == 0){ // buffer filled completely: write it and call deflate() again
+#ifdef _USE_KNETFILE    
+                       write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else            
+                       write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+                       rz->stream->avail_out = RZ_BUFFER_SIZE;
+                       rz->stream->next_out  = rz->outbuf;
+               } else break; // the partially filled buffer stays in outbuf for a later write
+       }
+       rz->block_pos = rz->out; // compressed offset at which the next block begins
+       rz->block_off = 0;
+}
+
+static void razf_end_flush(RAZF *rz){ // compress any staged input, then drive deflate() with Z_FINISH until it produces no more output
+       uint32_t tout;
+       if(rz->buf_len){ // push staged input through deflate first
+               _razf_write(rz, rz->inbuf, rz->buf_len);
+               rz->buf_off = rz->buf_len = 0;
+       }
+       while(1){
+               tout = rz->stream->avail_out;
+               deflate(rz->stream, Z_FINISH);
+               rz->out += tout - rz->stream->avail_out;
+               if(rz->stream->avail_out < RZ_BUFFER_SIZE){ // outbuf holds data: write it and go round again
+#ifdef _USE_KNETFILE        
+                       write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else            
+                       write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+                       rz->stream->avail_out = RZ_BUFFER_SIZE;
+                       rz->stream->next_out  = rz->outbuf;
+               } else break; // nothing was produced on this pass
+       }
+}
+
+static void _razf_buffered_write(RAZF *rz, const void *data, int size){ // Stage size bytes in inbuf; every full buffer is handed to _razf_write()
+       const char *src = (const char*)data;
+       int k, room;
+       for(;;){
+               if(rz->buf_len == RZ_BUFFER_SIZE){ // staging buffer is full: compress it and start over
+                       _razf_write(rz, rz->inbuf, rz->buf_len);
+                       rz->buf_len = 0;
+               }
+               room = RZ_BUFFER_SIZE - rz->buf_len; // free bytes left in the staging buffer
+               if(size < room){ // the remainder fits with space to spare: copy it and finish
+                       for(k = 0; k < size; k++) ((char*)rz->inbuf)[rz->buf_len + k] = src[k];
+                       rz->buf_len += size;
+                       return;
+               }
+               for(k = 0; k < room; k++) ((char*)rz->inbuf)[rz->buf_len + k] = src[k]; // top the buffer up, flush on the next pass
+               size -= room;
+               src += room;
+               rz->buf_len += room;
+       }
+}
+
+int razf_write(RAZF* rz, const void *data, int size){ // Write size bytes, flushing a compressed block at every RZ_BLOCK_SIZE boundary of the input; returns size
+       int ori_size, n;
+       int64_t next_block;
+       ori_size = size;
+       next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; // input offset at which the current block ends
+       while(rz->in + rz->buf_len + size >= next_block){ // the data reaches or crosses the block boundary
+               n = next_block - rz->in - rz->buf_len; // bytes still missing from the current block
+               _razf_buffered_write(rz, data, n);
+               data += n; // NOTE(review): arithmetic on a void pointer relies on a compiler extension
+               size -= n;
+               razf_flush(rz);
+               add_zindex(rz, rz->in, rz->out); // pass the input/output totals at the boundary to add_zindex()
+               next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+       }
+       _razf_buffered_write(rz, data, size); // the rest stays within the current block
+       return ori_size;
+}
+#endif
+
+/* gzip flag byte */
+#define ASCII_FLAG   0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC     0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD  0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME    0x08 /* bit 3 set: original file name present */
+#define COMMENT      0x10 /* bit 4 set: file comment present */
+#define RESERVED     0xE0 /* bits 5..7: reserved */
+
+static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){ // Parse a gzip header; returns its length in bytes, or 0 if data is not a usable gzip header
+       int method, flags, n, len;
+       if(size < 2) return 0;
+       if(data[0] != 0x1f || data[1] != 0x8b) return 0; // gzip magic bytes
+       if(size < 10) return 0; // the fixed header is 10 bytes; a shorter buffer must not yield a header size beyond the data
+       method = data[2];
+       flags  = data[3];
+       if(method != Z_DEFLATED || (flags & RESERVED)) return 0;
+       n = 4 + 6; // Skip 6 bytes
+       *extra_off = n + 2; // where the extra field payload would start; *extra_len stays 0 if there is none
+       *extra_len = 0;
+       if(flags & EXTRA_FIELD){
+               if(size < n + 2) return 0;
+               len = ((int)data[n + 1] << 8) | data[n]; // 16-bit little-endian length of the extra field
+               n += 2;
+               *extra_off = n;
+               while(len){ // advance over the extra field, failing if it runs past the buffer
+                       if(n >= size) return 0;
+                       n ++;
+                       len --;
+               }
+               *extra_len = n - (*extra_off);
+       }
+       if(flags & ORIG_NAME) while(n < size && data[n++]); // skip the NUL-terminated file name
+       if(flags & COMMENT) while(n < size && data[n++]); // skip the NUL-terminated comment
+       if(flags & HEAD_CRC){
+               if(n + 2 > size) return 0;
+               n += 2;
+       }
+       return n;
+}
+
+#ifdef _USE_KNETFILE
+static RAZF* razf_open_r(knetFile *fp, int _load_index){ // Open for reading; classifies the input as plain, gzip or RAZF
+#else
+static RAZF* razf_open_r(int fd, int _load_index){ // Open for reading; classifies the input as plain, gzip or RAZF
+#endif
+       RAZF *rz;
+       int ext_off, ext_len;
+       int n, is_be, ret;
+       int64_t end;
+       unsigned char c[] = "RAZF"; // tag expected at the start of the gzip extra field
+       rz = calloc(1, sizeof(RAZF));
+       rz->mode = 'r';
+#ifdef _USE_KNETFILE
+    rz->x.fpr = fp;
+#else
+#ifdef _WIN32
+       setmode(fd, O_BINARY);
+#endif
+       rz->filedes = fd;
+#endif
+       rz->stream = calloc(sizeof(z_stream), 1);
+       rz->inbuf  = malloc(RZ_BUFFER_SIZE);
+       rz->outbuf = malloc(RZ_BUFFER_SIZE);
+       rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL; // "unknown" until the trailer has been read
+#ifdef _USE_KNETFILE
+    n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
+#else
+       n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+#endif
+       if(n < 0) n = 0; // failed read: continue as an empty plain file instead of passing -1 to memcpy() below
+       if((ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len)) == 0){ // no gzip header: serve the bytes as they are
+               PLAIN_FILE:
+               rz->in = n;
+               rz->file_type = FILE_TYPE_PLAIN;
+               memcpy(rz->outbuf, rz->inbuf, n); // the bytes already read become the first buffered output
+               rz->buf_len = n;
+               free(rz->stream);
+               rz->stream = NULL;
+               return rz;
+       }
+       rz->header_size = ret;
+       ret = inflateInit2(rz->stream, -WINDOW_BITS); // negative window bits: raw deflate, the header was parsed above
+       if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
+       rz->stream->avail_in = n - rz->header_size;
+       rz->stream->next_in  = rz->inbuf + rz->header_size;
+       rz->stream->avail_out = RZ_BUFFER_SIZE;
+       rz->stream->next_out  = rz->outbuf;
+       rz->file_type = FILE_TYPE_GZ;
+       rz->in = rz->header_size;
+       rz->block_pos = rz->header_size;
+       rz->next_block_pos = rz->header_size;
+       rz->block_off = 0;
+       if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz; // no RAZF tag: ordinary gzip
+       if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){
+               fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file.  in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
+               return rz;
+       }
+       rz->load_index = _load_index;
+       rz->file_type = FILE_TYPE_RZ;
+#ifdef _USE_KNETFILE
+       if(knet_seek(fp, -16, SEEK_END) == -1){
+#else
+       if(lseek(fd, -16, SEEK_END) == -1){
+#endif
+               UNSEEKABLE:
+               rz->seekable = 0;
+               rz->index = NULL;
+               rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
+       } else {
+               is_be = is_big_endian();
+               rz->seekable = 1;
+#ifdef _USE_KNETFILE
+        knet_read(fp, &end, sizeof(int64_t));
+#else
+               read(fd, &end, sizeof(int64_t));
+#endif        
+               if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end); // trailer values are stored big-endian
+               else rz->src_end = end;
+
+#ifdef _USE_KNETFILE
+               knet_read(fp, &end, sizeof(int64_t));
+#else
+               read(fd, &end, sizeof(int64_t));
+#endif        
+               if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
+               else rz->end = end;
+               if(n > rz->end){ // the first read went past the compressed data: trim the surplus
+                       rz->stream->avail_in -= n - rz->end;
+                       n = rz->end;
+               }
+               if(rz->end > rz->src_end){ // implausible trailer: restore the position and give up on seeking
+#ifdef _USE_KNETFILE
+            knet_seek(fp, rz->in, SEEK_SET);
+#else
+                       lseek(fd, rz->in, SEEK_SET);
+#endif
+                       goto UNSEEKABLE;
+               }
+#ifdef _USE_KNETFILE
+        knet_seek(fp, rz->end, SEEK_SET);
+               if(knet_tell(fp) != rz->end){
+                       knet_seek(fp, rz->in, SEEK_SET);
+#else
+               if(lseek(fd, rz->end, SEEK_SET) != rz->end){
+                       lseek(fd, rz->in, SEEK_SET);
+#endif
+                       goto UNSEEKABLE;
+               }
+#ifdef _USE_KNETFILE
+               load_zindex(rz, fp);
+               knet_seek(fp, n, SEEK_SET);
+#else
+               load_zindex(rz, fd);
+               lseek(fd, n, SEEK_SET); // return to just after the bytes consumed by the first read
+#endif
+       }
+       return rz;
+}
+
+#ifdef _USE_KNETFILE
+RAZF* razf_dopen(int fd, const char *mode){ // knetfile build: only write mode works on a raw descriptor
+    if (strstr(mode, "r")) fprintf(stderr,"[razf_dopen] implement me\n");
+    else if(strstr(mode, "w")) return razf_open_w(fd);
+       return NULL; // read mode (after the message above) and unknown modes
+}
+
+RAZF* razf_dopen2(int fd, const char *mode)
+{ // knetfile build: not implemented, always returns NULL
+    fprintf(stderr,"[razf_dopen2] implement me\n");
+    return NULL;
+}
+#else
+RAZF* razf_dopen(int fd, const char *mode){ // Wrap an open descriptor; read mode passes _load_index = 1
+       if(strstr(mode, "r")) return razf_open_r(fd, 1);
+       else if(strstr(mode, "w")) return razf_open_w(fd);
+       else return NULL;
+}
+
+RAZF* razf_dopen2(int fd, const char *mode)
+{ // Same as razf_dopen(), but read mode passes _load_index = 0
+       if(strstr(mode, "r")) return razf_open_r(fd, 0);
+       else if(strstr(mode, "w")) return razf_open_w(fd);
+       else return NULL;
+}
+#endif
+
+static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){ // Open filename for reading ("r") or writing ("w"); NULL on failure or unknown mode
+       int fd;
+       RAZF *rz;
+       if(strstr(mode, "r")){
+#ifdef _USE_KNETFILE
+        knetFile *fd = knet_open(filename, "r"); // shadows the int fd declared above for the rest of this branch
+        if (fd == 0) {
+            fprintf(stderr, "[_razf_open] fail to open %s\n", filename);
+            return NULL;
+        }
+#else
+#ifdef _WIN32
+               fd = open(filename, O_RDONLY | O_BINARY);
+#else
+               fd = open(filename, O_RDONLY);
+#endif
+#endif
+               if(fd < 0) return NULL; // NOTE(review): in the knetfile build this compares a pointer with 0
+               rz = razf_open_r(fd, _load_index);
+       } else if(strstr(mode, "w")){
+#ifdef _WIN32
+               fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
+#else
+               fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
+#endif
+               if(fd < 0) return NULL;
+               rz = razf_open_w(fd);
+       } else return NULL;
+       return rz;
+}
+
+RAZF* razf_open(const char *filename, const char *mode){ // Open by name with _load_index = 1
+       return _razf_open(filename, mode, 1);
+}
+
+RAZF* razf_open2(const char *filename, const char *mode){ // Open by name with _load_index = 0
+       return _razf_open(filename, mode, 0);
+}
+
+int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){ // Store rz->src_end in *u_size and rz->end in *c_size; returns 1 on success, 0 when unavailable
+       int64_t n;
+       if(rz->mode != 'r' && rz->mode != 'R') return 0; // only answered for handles in read mode
+       switch(rz->file_type){
+               case FILE_TYPE_PLAIN:
+                       if(rz->end == 0x7fffffffffffffffLL){ // size not measured yet: seek to the end, remember it, seek back
+#ifdef _USE_KNETFILE
+                               if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0;
+                n = knet_tell(rz->x.fpr);
+                               knet_seek(rz->x.fpr, 0, SEEK_END);
+                rz->end = knet_tell(rz->x.fpr);
+                               knet_seek(rz->x.fpr, n, SEEK_SET);
+#else
+                               if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0;
+                               rz->end = lseek(rz->filedes, 0, SEEK_END);
+                               lseek(rz->filedes, n, SEEK_SET);
+#endif                
+                       }
+                       *u_size = *c_size = rz->end; // a plain file has the same size on both sides
+                       return 1;
+               case FILE_TYPE_GZ:
+                       return 0; // no sizes are reported for ordinary gzip input
+               case FILE_TYPE_RZ:
+                       if(rz->src_end == rz->end) return 0; // both still hold the same value, as when razf_open_r could not read the trailer
+                       *u_size = rz->src_end;
+                       *c_size = rz->end;
+                       return 1;
+               default:
+                       return 0;
+       }
+}
+
+static int _razf_read(RAZF* rz, void *data, int size){ // Fill data with up to size bytes of (inflated) content; returns the byte count, never negative
+       int ret, tin;
+       if(rz->z_eof || rz->z_err) return 0;
+       if (rz->file_type == FILE_TYPE_PLAIN) {
+#ifdef _USE_KNETFILE
+               ret = knet_read(rz->x.fpr, data, size);
+#else
+               ret = read(rz->filedes, data, size);
+#endif        
+               if (ret <= 0) { rz->z_eof = 1; rz->z_err = (ret < 0); } // end of file, or a failed read recorded as an error
+               return ret < 0? 0 : ret; // callers store the result as a buffer length, so it must not be negative
+       }
+       rz->stream->avail_out = size;
+       rz->stream->next_out  = data;
+       while(rz->stream->avail_out){
+               if(rz->stream->avail_in == 0){ // input exhausted: refill inbuf, but never read beyond rz->end
+                       if(rz->in >= rz->end){ rz->z_eof = 1; break; }
+                       if(rz->end - rz->in < RZ_BUFFER_SIZE){
+#ifdef _USE_KNETFILE
+                               tin = knet_read(rz->x.fpr, rz->inbuf, rz->end -rz->in);
+#else
+                               tin = read(rz->filedes, rz->inbuf, rz->end -rz->in);
+#endif        
+                       } else {
+#ifdef _USE_KNETFILE
+                               tin = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
+#else
+                               tin = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+#endif        
+                       }
+                       if(tin <= 0){ // end of file, or a read error (-1) that would wrap around in the unsigned avail_in
+                               rz->z_eof = 1; if(tin < 0) rz->z_err = 1;
+                               break;
+                       }
+                       rz->stream->avail_in = tin; rz->stream->next_in = rz->inbuf;
+               }
+               tin = rz->stream->avail_in;
+               ret = inflate(rz->stream, Z_BLOCK);
+               rz->in += tin - rz->stream->avail_in; // count the compressed bytes consumed by this call
+               if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
+                       fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? rz->stream->msg : "", __FILE__, __LINE__);
+                       rz->z_err = 1;
+                       break;
+               }
+               if(ret == Z_STREAM_END){
+                       rz->z_eof = 1;
+                       break;
+               }
+               if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){ // inflate stopped at a block boundary that is not the last block
+                       rz->buf_flush = 1;
+                       rz->next_block_pos = rz->in;
+                       break;
+               }
+       }
+       return size - rz->stream->avail_out;
+}
+
+int razf_read(RAZF *rz, void *data, int size){ // Copy up to size bytes of content into data; returns the number of bytes delivered
+       int ori_size, i;
+       ori_size = size;
+       while(size > 0){
+               if(rz->buf_len){ // serve from the data already sitting in outbuf
+                       if(size < rz->buf_len){
+                               for(i=0;i<size;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i];
+                               rz->buf_off += size;
+                               rz->buf_len -= size;
+                               data += size;
+                               rz->block_off += size;
+                               size = 0;
+                               break;
+                       } else {
+                               for(i=0;i<rz->buf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i];
+                               data += rz->buf_len;
+                               size -= rz->buf_len;
+                               rz->block_off += rz->buf_len;
+                               rz->buf_off = 0;
+                               rz->buf_len = 0;
+                               if(rz->buf_flush){ // buffer drained at a block boundary: move on to the next block
+                                       rz->block_pos = rz->next_block_pos;
+                                       rz->block_off = 0;
+                                       rz->buf_flush = 0;
+                               }
+                       }
+               } else if(rz->buf_flush){
+                       rz->block_pos = rz->next_block_pos;
+                       rz->block_off = 0;
+                       rz->buf_flush = 0;
+               }
+               if(rz->buf_flush) continue;
+               rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+               if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break; // also stop on z_err: _razf_read() then returns 0 forever and the loop would never end
+       }
+       rz->out += ori_size - size;
+       return ori_size - size;
+}
+
+int razf_skip(RAZF* rz, int size){ // Discard up to size bytes of content, keeping the block bookkeeping current; returns the number skipped
+       int ori_size;
+       ori_size = size;
+       while(size > 0){
+               if(rz->buf_len){ // consume from the data already sitting in outbuf
+                       if(size < rz->buf_len){
+                               rz->buf_off += size;
+                               rz->buf_len -= size;
+                               rz->block_off += size;
+                               size = 0;
+                               break;
+                       } else {
+                               size -= rz->buf_len;
+                               rz->block_off += rz->buf_len; // must run before buf_len is cleared, as in razf_read()
+                               rz->buf_off = 0;
+                               rz->buf_len = 0;
+                               if(rz->buf_flush){ // buffer drained at a block boundary: move on to the next block
+                                       rz->block_pos = rz->next_block_pos;
+                                       rz->block_off = 0;
+                                       rz->buf_flush = 0;
+                               }
+                       }
+               } else if(rz->buf_flush){
+                       rz->block_pos = rz->next_block_pos;
+                       rz->block_off = 0;
+                       rz->buf_flush = 0;
+               }
+               if(rz->buf_flush) continue;
+               rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+               if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break; // bytes returned together with EOF are still skippable
+       }
+       rz->out += ori_size - size;
+       return ori_size - size;
+}
+
+static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){ // Restart decoding at compressed offset in, which corresponds to uncompressed offset out
+#ifdef _USE_KNETFILE
+       knet_seek(rz->x.fpr, in, SEEK_SET);
+#else
+       lseek(rz->filedes, in, SEEK_SET); // NOTE(review): the seek result is not checked
+#endif
+       rz->in  = in;
+       rz->out = out;
+       rz->block_pos = in;
+       rz->next_block_pos = in;
+       rz->block_off = 0;
+       rz->buf_flush = 0;
+       rz->z_eof = rz->z_err = 0;
+       inflateReset(rz->stream);
+       rz->stream->avail_in = 0; // forces the next _razf_read() to refill inbuf from the new position
+       rz->buf_off = rz->buf_len = 0; // drop any buffered output
+}
+
+int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){ // Go to block_offset bytes into the block starting at compressed offset block_start
+       int64_t pos;
+       rz->z_eof = 0;
+       if(rz->file_type == FILE_TYPE_PLAIN){ // plain files: the two parts simply add up to a file offset
+               rz->buf_off = rz->buf_len = 0;
+               pos = block_start + block_offset;
+#ifdef _USE_KNETFILE
+               knet_seek(rz->x.fpr, pos, SEEK_SET);
+        pos = knet_tell(rz->x.fpr);
+#else
+               pos = lseek(rz->filedes, pos, SEEK_SET);
+#endif
+               rz->out = rz->in = pos;
+               return pos;
+       }
+       if(block_start == rz->block_pos && block_offset >= rz->block_off) { // target lies ahead within the current block
+               block_offset -= rz->block_off;
+               goto SKIP; // Needn't reset inflate
+       }
+       if(block_start  == 0) block_start = rz->header_size; // Automatically correct a wrong block_start
+       _razf_reset_read(rz, block_start, 0);
+       SKIP:
+       if(block_offset) razf_skip(rz, block_offset);
+       return rz->block_off; // compressed files return the offset within the block, not a file position
+}
+
+int64_t razf_seek(RAZF* rz, int64_t pos, int where){
+       int64_t idx;
+       int64_t seek_pos, new_out;
+       rz->z_eof = 0;
+       if (where == SEEK_CUR) pos += rz->out;
+       else if (where == SEEK_END) pos += rz->src_end;
+       if(rz->file_type == FILE_TYPE_PLAIN){
+#ifdef _USE_KNETFILE
+               knet_seek(rz->x.fpr, pos, SEEK_SET);
+        seek_pos = knet_tell(rz->x.fpr);
+#else
+               seek_pos = lseek(rz->filedes, pos, SEEK_SET);
+#endif
+               rz->buf_off = rz->buf_len = 0;
+               rz->out = rz->in = seek_pos;
+               return seek_pos;
+       } else if(rz->file_type == FILE_TYPE_GZ){
+               if(pos >= rz->out) goto SKIP;
+               return rz->out;
+       }
+       if(pos == rz->out) return pos;
+       if(pos > rz->src_end) return rz->out;
+       if(!rz->seekable || !rz->load_index){
+               if(pos >= rz->out) goto SKIP;
+       }
+       idx = pos / RZ_BLOCK_SIZE - 1;
+       seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+       new_out  = (idx + 1) * RZ_BLOCK_SIZE;
+       if(pos > rz->out && new_out <= rz->out) goto SKIP;
+       _razf_reset_read(rz, seek_pos, new_out);
+       SKIP:
+       razf_skip(rz, (int)(pos - rz->out));
+       return rz->out;
+}
+
+uint64_t razf_tell2(RAZF *rz)
+{ // Return the current position as block_pos shifted left by 16 bits, combined with the low 16 bits of block_off
+       /*
+       if (rz->load_index) {
+               int64_t idx, seek_pos;
+               idx = rz->out / RZ_BLOCK_SIZE - 1;
+               seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+               if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off)
+                       fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n",
+                                       (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off);
+       }
+       */
+       return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff); // the inverse split is done in razf_seek2()
+}
+
+int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
+{ // Jump to a position encoded as (block start << 16 | offset in block); only SEEK_SET is accepted, anything else yields -1
+       const int within_block = voffset & 0xffff;
+       return (where == SEEK_SET)? razf_jump(rz, voffset >> 16, within_block) : -1;
+}
+
+void razf_close(RAZF *rz){ // Finish the stream (write mode), release all buffers and close the underlying file
+       if(rz->mode == 'w'){
+#ifndef _RZ_READONLY
+               razf_end_flush(rz);
+               deflateEnd(rz->stream);
+#ifdef _USE_KNETFILE
+               save_zindex(rz, rz->x.fpw);
+               if(is_big_endian()){ // trailer: rz->in then rz->out, 8 bytes each, stored big-endian
+                       write(rz->x.fpw, &rz->in, sizeof(int64_t));
+                       write(rz->x.fpw, &rz->out, sizeof(int64_t));
+               } else {
+                       uint64_t v64 = byte_swap_8((uint64_t)rz->in);
+                       write(rz->x.fpw, &v64, sizeof(int64_t));
+                       v64 = byte_swap_8((uint64_t)rz->out);
+                       write(rz->x.fpw, &v64, sizeof(int64_t));
+               }
+#else
+               save_zindex(rz, rz->filedes);
+               if(is_big_endian()){ // trailer: rz->in then rz->out, 8 bytes each, stored big-endian
+                       write(rz->filedes, &rz->in, sizeof(int64_t));
+                       write(rz->filedes, &rz->out, sizeof(int64_t));
+               } else {
+                       uint64_t v64 = byte_swap_8((uint64_t)rz->in);
+                       write(rz->filedes, &v64, sizeof(int64_t));
+                       v64 = byte_swap_8((uint64_t)rz->out);
+                       write(rz->filedes, &v64, sizeof(int64_t));
+               }
+#endif
+#endif
+       } else if(rz->mode == 'r'){
+               if(rz->stream) inflateEnd(rz->stream); // stream is NULL for plain files
+       }
+       if(rz->inbuf) free(rz->inbuf);
+       if(rz->outbuf) free(rz->outbuf);
+       if(rz->header){
+               free(rz->header->extra);
+               free(rz->header->name);
+               free(rz->header->comment);
+               free(rz->header);
+       }
+       if(rz->index){
+               free(rz->index->bin_offsets);
+               free(rz->index->cell_offsets);
+               free(rz->index);
+       }
+       free(rz->stream);
+#ifdef _USE_KNETFILE
+    if (rz->mode == 'r')
+        knet_close(rz->x.fpr);
+    if (rz->mode == 'w')
+        close(rz->x.fpw);
+#else
+       close(rz->filedes);
+#endif
+       free(rz); // rz must not be used after this call
+}
+
+#endif
diff --git a/samtools/razf.h b/samtools/razf.h
new file mode 100644 (file)
index 0000000..60a0c96
--- /dev/null
@@ -0,0 +1,134 @@
+ /*-
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef __RAZF_RJ_H
+#define __RAZF_RJ_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+#if ZLIB_VERNUM < 0x1221
+#define _RZ_READONLY
+struct _gz_header_s;
+typedef struct _gz_header_s _gz_header;
+#define gz_header _gz_header
+#endif
+
+#define WINDOW_BITS   15
+
+#ifndef RZ_BLOCK_SIZE
+#define RZ_BLOCK_SIZE (1<<WINDOW_BITS)
+#endif
+
+#ifndef RZ_BUFFER_SIZE
+#define RZ_BUFFER_SIZE 4096
+#endif
+
+#ifndef RZ_COMPRESS_LEVEL
+#define RZ_COMPRESS_LEVEL 6
+#endif
+
+#define RZ_BIN_SIZE ((1LLU << 32) / RZ_BLOCK_SIZE)
+
+typedef struct { // Block index: entry i is read as cell_offsets[i] + bin_offsets[i / RZ_BIN_SIZE]
+       uint32_t *cell_offsets; // i
+       int64_t  *bin_offsets; // i / BIN_SIZE
+       int size; // presumably the number of entries in use -- TODO confirm
+       int cap; // presumably the allocated capacity -- TODO confirm
+} ZBlockIndex;
+/* When storing index, output bytes in Big-Endian everywhere */
+
+#define FILE_TYPE_RZ   1
+#define FILE_TYPE_PLAIN        2
+#define FILE_TYPE_GZ   3
+
+typedef struct RandomAccessZFile  {
+       char mode; /* 'w' : write mode; 'r' : read mode */
+       int file_type;
+       /* plain file or rz file; razf_read supports plain files as input too, in which case it works as a buffered fread */
+#ifdef _USE_KNETFILE
+    union {
+        knetFile *fpr;
+        int fpw;
+    } x;
+#else
+       int filedes; /* the file descriptor */
+#endif
+       z_stream *stream;
+       ZBlockIndex *index;
+       int64_t in, out, end, src_end;
+       /* in: n bytes total in; out: n bytes total out; */
+       /* end: the end of all data blocks, which is also the start of the index; src_end: the true end position in the uncompressed file */
+       int buf_flush; // buffer should be flushed; suspend inflate until the buffer is empty
+       int64_t block_pos, block_off, next_block_pos;
+       /* block_pos: the start position of the current block in the compressed file */
+       /* block_off: tells how many bytes have been read from the current block */
+       void *inbuf, *outbuf;
+       int header_size;
+       gz_header *header;
+       /* header is used to transfer inflate_state->mode from HEAD to TYPE after call inflateReset */
+       int buf_off, buf_len;
+       int z_err, z_eof;
+       int seekable;
+       /* Indicates whether the source is seekable */
+       int load_index;
+       /* set has_index to 0 in mode 'w', then index will be discarded */
+} RAZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       RAZF* razf_dopen(int data_fd, const char *mode);
+       RAZF *razf_open(const char *fn, const char *mode);
+       int razf_write(RAZF* rz, const void *data, int size);
+       int razf_read(RAZF* rz, void *data, int size);
+       int64_t razf_seek(RAZF* rz, int64_t pos, int where);
+       void razf_close(RAZF* rz);
+
+#define razf_tell(rz) ((rz)->out)
+
+       RAZF* razf_open2(const char *filename, const char *mode);
+       RAZF* razf_dopen2(int fd, const char *mode);
+       uint64_t razf_tell2(RAZF *rz);
+       int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/sam.c b/samtools/sam.c
new file mode 100644 (file)
index 0000000..ad4325b
--- /dev/null
@@ -0,0 +1,174 @@
+#include <string.h>
+#include <unistd.h>
+#include "faidx.h"
+#include "sam.h"
+
+#define TYPE_BAM  1
+#define TYPE_READ 2
+
+bam_header_t *bam_header_dup(const bam_header_t *h0)
+{ // Duplicate h0: text, target names and target lengths get their own storage
+       int k;
+       bam_header_t *copy = bam_header_init();
+       *copy = *h0; // begin with a member-wise copy, then replace the pointers below
+       copy->hash = copy->dict = copy->rg2lib = 0; // these three are cleared rather than copied
+       copy->text = (char*)calloc(copy->l_text + 1, 1);
+       memcpy(copy->text, h0->text, copy->l_text);
+       copy->target_len = (uint32_t*)calloc(copy->n_targets, 4);
+       copy->target_name = (char**)calloc(copy->n_targets, sizeof(void*));
+       // every target gets its own copy of the name
+       for (k = 0; k < copy->n_targets; ++k) {
+               copy->target_len[k] = h0->target_len[k];
+               copy->target_name[k] = strdup(h0->target_name[k]);
+       }
+       return copy;
+}
+static void append_header_text(bam_header_t *header, char* text, int len)
+{ // Append up to len bytes of text to header->text and keep the result NUL-terminated; no-op when text is NULL
+       int x = header->l_text + 1; // size needed for the current text plus terminator
+       int y = header->l_text + len + 1; // 1 byte null
+       if (text == 0) return;
+       kroundup32(x); // kroundup32 is defined elsewhere; presumably rounds up to a power of two -- TODO confirm
+       kroundup32(y);
+       if (x < y) header->text = (char*)realloc(header->text, y); // grow only when the rounded size increases
+       strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here.
+       header->l_text += len;
+       header->text[header->l_text] = 0;
+}
+
+samfile_t *samopen(const char *fn, const char *mode, const void *aux)
+{ // Open fn ("-" for stdin/stdout) for SAM/BAM reading or writing as selected by mode; returns 0 if the file cannot be opened
+       samfile_t *fp;
+       fp = (samfile_t*)calloc(1, sizeof(samfile_t));
+       if (mode[0] == 'r') { // read
+               fp->type |= TYPE_READ;
+               if (mode[1] == 'b') { // binary
+                       fp->type |= TYPE_BAM;
+                       fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+                       if (fp->x.bam == 0) goto open_err_ret;
+                       fp->header = bam_header_read(fp->x.bam);
+               } else { // text
+                       fp->x.tamr = sam_open(fn);
+                       if (fp->x.tamr == 0) goto open_err_ret;
+                       fp->header = sam_header_read(fp->x.tamr);
+                       if (fp->header->n_targets == 0) { // no @SQ fields
+                               if (aux) { // check if aux is present
+                                       bam_header_t *textheader = fp->header;
+                                       fp->header = sam_header_read2((const char*)aux); // take the targets from the list file named by aux
+                                       append_header_text(fp->header, textheader->text, textheader->l_text);
+                                       bam_header_destroy(textheader);
+                               }
+                               if (fp->header->n_targets == 0)
+                                       fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
+                       } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
+               }
+       } else if (mode[0] == 'w') { // write
+               fp->header = bam_header_dup((const bam_header_t*)aux); // aux must point to a bam_header_t here
+               if (mode[1] == 'b') { // binary
+                       char bmode[3];
+                       bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0;
+                       fp->type |= TYPE_BAM;
+                       fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
+                       if (fp->x.bam == 0) goto open_err_ret;
+                       bam_header_write(fp->x.bam, fp->header);
+               } else { // text
+                       // open file
+                       fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
+                       if (fp->x.tamw == 0) goto open_err_ret; // test the member that was just assigned
+                       if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2;
+                       else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2;
+                       else fp->type |= BAM_OFDEC<<2;
+                       // write header
+                       if (strstr(mode, "h")) {
+                               int i;
+                               bam_header_t *alt;
+                               // parse the header text 
+                               alt = bam_header_init();
+                               alt->l_text = fp->header->l_text; alt->text = fp->header->text;
+                               sam_header_parse(alt);
+                               alt->l_text = 0; alt->text = 0; // the text is only borrowed; detach it before alt is destroyed
+                               // check if there are @SQ lines in the header
+                               fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
+                               if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
+                                       if (alt->n_targets != fp->header->n_targets)
+                                               fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+                               } else { // then dump ->target_{name,len}
+                                       for (i = 0; i < fp->header->n_targets; ++i)
+                                               fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%u\n", fp->header->target_name[i], fp->header->target_len[i]);
+                               }
+                               bam_header_destroy(alt);
+                       }
+               }
+       }
+       return fp;
+open_err_ret: // opening failed: release the header duplicated for write mode, then the handle itself
+       if (fp->header) bam_header_destroy(fp->header);
+       free(fp);
+       return 0;
+}
+
+void samclose(samfile_t *fp)
+{ // Destroy the header, close the underlying file according to fp->type and free the handle; a null fp is ignored
+       if (fp == 0) return;
+       if (fp->header) bam_header_destroy(fp->header);
+       if (fp->type & TYPE_BAM) bam_close(fp->x.bam);
+       else if (fp->type & TYPE_READ) sam_close(fp->x.tamr);
+       else fclose(fp->x.tamw); // text output; NOTE(review): this also closes stdout when fn was "-"
+       free(fp);
+}
+
+int samread(samfile_t *fp, bam1_t *b)
+{ // Read one alignment into b from a BAM or SAM handle; -1 if fp is null or was not opened for reading
+       if (!fp || (fp->type & TYPE_READ) == 0) return -1; // not open for reading
+       return (fp->type & TYPE_BAM)? bam_read1(fp->x.bam, b)
+               : sam_read1(fp->x.tamr, fp->header, b);
+}
+
+int samwrite(samfile_t *fp, const bam1_t *b)
+{ // Write one alignment; -1 if fp is null or opened for reading, otherwise the result of bam_write1() or the text length written
+       if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
+       if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b);
+       else {
+               char *s = bam_format1_core(fp->header, b, fp->type>>2&3); // bits 2-3 of type hold the flag format chosen in samopen()
+               int l = strlen(s);
+               fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw);
+               free(s);
+               return l + 1; // formatted text plus the newline
+       }
+}
+
+int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
+{ // Push every record readable from fp into a pileup buffer built around func; always returns 0
+       bam1_t *rec = bam_init1();
+       bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+       bam_plbuf_set_mask(plbuf, mask);
+       for (;;) {
+               // a negative result from samread() ends the loop
+               if (samread(fp, rec) < 0) break;
+               bam_plbuf_push(rec, plbuf);
+       }
+       bam_plbuf_push(0, plbuf); // final push with a null record, presumably to signal the end of input
+       bam_plbuf_destroy(plbuf);
+       bam_destroy1(rec);
+       return 0;
+}
+
+char *samfaipath(const char *fn_ref)
+{ // Return a newly allocated "<fn_ref>.fai" path, building the index with fai_build() when it is unreadable; 0 if fn_ref is null or the build fails
+       char *fn_list = 0;
+       if (fn_ref == 0) return 0;
+       fn_list = calloc(strlen(fn_ref) + 5, 1); // room for ".fai" plus the terminator
+       strcat(strcpy(fn_list, fn_ref), ".fai");
+       if (access(fn_list, R_OK) == -1) { // fn_list is unreadable
+               if (access(fn_ref, R_OK) == -1) {
+                       fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); // NOTE(review): the path is still returned in this case
+               } else {
+                       fprintf(stderr, "[samfaipath] build FASTA index...\n");
+                       if (fai_build(fn_ref) == -1) {
+                               fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
+                               free(fn_list); fn_list = 0;
+                       }
+               }
+       }
+       return fn_list; // the caller owns the returned string
+}
diff --git a/samtools/sam.h b/samtools/sam.h
new file mode 100644 (file)
index 0000000..0b87194
--- /dev/null
@@ -0,0 +1,98 @@
+#ifndef BAM_SAM_H
+#define BAM_SAM_H
+
+#include "bam.h"
+
+/*!
+  @header
+
+  This file provides higher level of I/O routines and unifies the APIs
+  for SAM and BAM formats. These APIs are more convenient and
+  recommended.
+
+  @copyright Genome Research Ltd.
+ */
+
+/*! @typedef
+  @abstract SAM/BAM file handler
+  @field  type    type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format
+  @field  bam   BAM file handler; valid if (type&1) == 1
+  @field  tamr  SAM file handler for reading; valid if type == 2
+  @field  tamw  SAM file handler for writing; valid if type == 0
+  @field  header  header struct
+ */
+typedef struct {
+       int type;
+       union {
+               tamFile tamr;
+               bamFile bam;
+               FILE *tamw;
+       } x;
+       bam_header_t *header;
+} samfile_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /*!
+         @abstract     Open a SAM/BAM file
+
+         @param fn SAM/BAM file name; "-" is recognized as stdin (for
+         reading) or stdout (for writing).
+
+         @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading,
+         'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output,
+         'h' for outputting header in SAM, 'x' for HEX flag and 'X' for
+         string flag. If 'b' present, it must immediately follow 'r' or
+         'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX",
+         "rb", "wb" and "wbu" exclusively.
+
+         @param aux auxiliary data; if mode[0]=='w', aux points to
+         bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM
+         are absent, aux points to the file name of the list of the reference;
+         aux is not used otherwise. If @SQ header lines are present in SAM,
+         aux is not used, either.
+
+         @return       SAM/BAM file handler
+        */
+       samfile_t *samopen(const char *fn, const char *mode, const void *aux);
+
+       /*!
+         @abstract     Close a SAM/BAM handler
+         @param  fp    file handler to be closed
+        */
+       void samclose(samfile_t *fp);
+
+       /*!
+         @abstract     Read one alignment
+         @param  fp    file handler
+         @param  b     alignment
+         @return       bytes read
+        */
+       int samread(samfile_t *fp, bam1_t *b);
+
+       /*!
+         @abstract     Write one alignment
+         @param  fp    file handler
+         @param  b     alignment
+         @return       bytes written
+        */
+       int samwrite(samfile_t *fp, const bam1_t *b);
+
+       /*!
+         @abstract     Get the pileup for a whole alignment file
+         @param  fp    file handler
+         @param  mask  mask transferred to bam_plbuf_set_mask()
+         @param  func  user defined function called in the pileup process
+         @param  data  user provided data for func()
+        */
+       int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data);
+
+       char *samfaipath(const char *fn_ref);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/sam_header.c b/samtools/sam_header.c
new file mode 100644 (file)
index 0000000..a119c02
--- /dev/null
@@ -0,0 +1,701 @@
+#include "sam_header.h"
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, const char *)
+
+// Node of a singly linked list with an untyped payload.  The same node type
+// is used for the list of header lines and for each line's list of tags.
+struct _HeaderList
+{
+    struct _HeaderList *next;   // following node, NULL at the end of the list
+    void *data;                 // payload; a HeaderLine* or HeaderTag* in this file
+};
+typedef struct _HeaderList list_t;
+// A parsed header is simply the list of its lines.
+typedef list_t HeaderDict;
+
+// One "XY:value" field of a header line.
+typedef struct
+{
+    char key[2];    // two-character tag name, not NUL-terminated; two spaces mark free text without a key
+    char *value;    // heap-allocated, NUL-terminated value string
+}
+HeaderTag;
+
+// One "@XY ..." header line.
+typedef struct
+{
+    char type[2];   // two-character record type, not NUL-terminated
+    list_t *tags;   // list of HeaderTag* in the order they were added
+}
+HeaderLine;
+
+// Per-type tag tables, each NULL-terminated.  Prefix o_ = optional tags,
+// r_ = required tags, u_ = tags whose value identifies the line.
+const char *o_hd_tags[] = {"SO","GO",NULL};
+const char *r_hd_tags[] = {"VN",NULL};
+
+const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
+const char *r_sq_tags[] = {"SN","LN",NULL};
+const char *u_sq_tags[] = {"SN",NULL};
+
+const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
+const char *r_rg_tags[] = {"ID",NULL};
+const char *u_rg_tags[] = {"ID",NULL};
+
+const char *o_pg_tags[] = {"VN","CL",NULL};
+const char *r_pg_tags[] = {"ID",NULL};
+
+// The four arrays below are parallel: index i of each describes types[i].
+// A NULL entry means "no such table for this type"; "CO" has none at all
+// and is treated as free text by the parser.
+const char *types[]          = {"HD","SQ","RG","PG","CO",NULL};
+const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
+const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
+const char **unique_tags[]   = {NULL,     u_sq_tags,u_rg_tags,NULL,NULL,NULL};
+
+
+// printf-style diagnostic helper: formats the message and writes it to stderr.
+static void debug(const char *format, ...)
+{
+    va_list ap;
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+}
+
+// Append a new node holding `data' at the end of the list and return the
+// head of the list (the new node itself when `root' is NULL).
+static list_t *list_append(list_t *root, void *data)
+{
+    list_t *tail;
+    list_t *node = malloc(sizeof(list_t));
+
+    node->data = data;
+    node->next = NULL;
+    if ( !root )
+        return node;
+
+    // walk to the last node and hook the new one behind it
+    for (tail = root; tail->next; tail = tail->next)
+        ;
+    tail->next = node;
+    return root;
+}
+
+// Release every node of the list.  The payloads (`data') are not freed.
+static void list_free(list_t *root)
+{
+    list_t *following;
+    for ( ; root; root = following)
+    {
+        following = root->next;   // remember the successor before the node goes away
+        free(root);
+    }
+}
+
+
+
+// Search a NULL-terminated array of two-letter tags for `tag' (only its
+// first two characters are compared).  Returns the index of the match, or
+// -1 when the tag is absent or `tags' itself is NULL.
+static int tag_exists(const char *tag, const char **tags)
+{
+    int i;
+    if ( tags==NULL ) return -1;
+    for (i=0; tags[i]!=NULL; i++)
+    {
+        const char *candidate = tags[i];
+        if ( candidate[0]!=tag[0] ) continue;
+        if ( candidate[1]==tag[1] ) return i;
+    }
+    return -1;
+}
+
+
+
+// Mimics the behaviour of getline, except it returns a pointer to the next chunk of the
+//  text, or NULL if everything has been read (or memory ran out). The line is copied into
+//  *lineptr without its terminator; *lineptr is grown as needed (its capacity is kept in *n)
+//  and should be freed by the caller. "\n", "\r\n" and a lone "\r" all end a line.
+static const char *nextline(char **lineptr, size_t *n, const char *text)
+{
+    size_t len;
+    const char *to = text;
+
+    if ( !*to ) return NULL;
+
+    while ( *to && *to!='\n' && *to!='\r' ) to++;
+    len = (size_t)(to - text) + 1;   // characters on the line plus the terminating NUL
+
+    // Advance the pointer for the next call. A lone '\r' is consumed as well,
+    //  otherwise the next call would start on it again and never make progress.
+    if ( *to=='\r' ) to++;
+    if ( *to=='\n' ) to++;
+
+    if ( !*lineptr || *n<len )
+    {
+        // realloc(NULL,..) behaves like malloc; go through a temporary so that the
+        //  caller's buffer stays valid (and freeable) if the allocation fails
+        char *grown = realloc(*lineptr, len);
+        if ( !grown ) {
+            debug("[nextline] Insufficient memory!\n");
+            return 0;
+        }
+        *lineptr = grown;
+        *n = len;
+    }
+
+    memcpy(*lineptr,text,len-1);
+    (*lineptr)[len-1] = 0;
+
+    return to;
+}
+
+// name points to "XY", value_from points to the first character of the value string and
+//  value_to points to the last character of the value string. An empty value is expressed
+//  as value_to==value_from-1; a range that is inverted even further (as produced by a
+//  truncated "XY:" field) is treated as empty instead of yielding a negative length.
+//  The returned tag and its value are heap-allocated.
+static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
+{
+    HeaderTag *tag = malloc(sizeof(HeaderTag));
+    int len = value_to-value_from+1;
+
+    if ( len<0 ) len = 0;   // guards malloc/memcpy sizes and the value[len] write below
+
+    tag->key[0] = name[0];
+    tag->key[1] = name[1];
+    tag->value = malloc(len+1);
+    memcpy(tag->value,value_from,len);   // copy the value only, never the byte behind it
+    tag->value[len] = 0;
+    return tag;
+}
+
+// Return the first tag of the line whose two-character key equals `key',
+// or NULL when the line carries no such tag.
+static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
+{
+    list_t *node;
+    for (node = hline->tags; node; node = node->next)
+    {
+        HeaderTag *candidate = node->data;
+        if ( candidate->key[0]!=key[0] ) continue;
+        if ( candidate->key[1]==key[1] ) return candidate;
+    }
+    return NULL;
+}
+
+
+// Compare two header lines to decide how they can be combined when merging headers.
+// Return codes:
+//  -1 .. the type of the lines is not one of the known types
+//   0 .. different types or unique tags differ or conflicting tags, cannot be merged
+//   1 .. all tags identical -> no need to merge, drop one
+//   2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
+//        (also returned when a unique or required tag is absent where it should not be)
+//   3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
+static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
+{
+    HeaderTag *t1, *t2;
+
+    if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
+        return 0;
+
+    int itype = tag_exists(hline1->type,types);
+    if ( itype==-1 ) {
+        debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
+        return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
+    }
+
+    if ( unique_tags[itype] )
+    {
+        t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
+        t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
+        if ( !t1 || !t2 ) // this should never happen, the unique tags are required
+            return 2;
+
+        if ( strcmp(t1->value,t2->value) )
+            return 0;   // the unique tags differ, cannot be merged
+    }
+    if ( !required_tags[itype] && !optional_tags[itype] )
+    {
+        // Free-text line: only the first (and normally only) field is compared.
+        // A line consisting of the type and a tab alone has no tags at all, so
+        // the list heads must be checked before they are dereferenced.
+        if ( !hline1->tags || !hline2->tags )
+            return ( !hline1->tags && !hline2->tags ) ? 1 : 0;
+
+        t1 = hline1->tags->data;
+        t2 = hline2->tags->data;
+        if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
+        return 0;
+    }
+
+    int missing=0, itag=0;
+    while ( required_tags[itype] && required_tags[itype][itag] )
+    {
+        t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
+        t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
+        if ( !t1 && !t2 )
+            return 2;       // this should never happen
+        else if ( !t1 || !t2 )
+            missing = 1;    // there is some tag missing in one of the hlines
+        else if ( strcmp(t1->value,t2->value) )
+        {
+            if ( unique_tags[itype] )
+                return 2;   // the lines have a matching unique tag but have a conflicting tag
+
+            return 0;    // the lines contain conflicting tags, cannot be merged
+        }
+        itag++;
+    }
+    itag = 0;
+    while ( optional_tags[itype] && optional_tags[itype][itag] )
+    {
+        t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
+        t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
+        if ( !t1 && !t2 )
+        {
+            itag++;
+            continue;
+        }
+        if ( !t1 || !t2 )
+            missing = 1;    // there is some tag missing in one of the hlines
+        else if ( strcmp(t1->value,t2->value) )
+        {
+            if ( unique_tags[itype] )
+                return 2;   // the lines have a matching unique tag but have a conflicting tag
+
+            return 0;   // the lines contain conflicting tags, cannot be merged
+        }
+        itag++;
+    }
+    if ( missing ) return 3;    // there are some missing complementary tags with no conflicts, can be merged
+    return 1;
+}
+
+
+// Deep-copy a header line: the type, every tag and every tag value are
+// duplicated, so the copy shares no memory with the original.
+static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
+{
+    const list_t *node;
+    HeaderLine *copy = malloc(sizeof(HeaderLine));
+
+    copy->type[0] = hline->type[0];
+    copy->type[1] = hline->type[1];
+    copy->tags = NULL;
+
+    for (node = hline->tags; node; node = node->next)
+    {
+        const HeaderTag *src = node->data;
+        HeaderTag *dst = malloc(sizeof(HeaderTag));
+
+        dst->key[0] = src->key[0];
+        dst->key[1] = src->key[1];
+        dst->value  = strdup(src->value);
+        copy->tags = list_append(copy->tags, dst);
+    }
+    return copy;
+}
+
+// Copy into out_hline every tag of tmpl_hline that out_hline does not have
+// yet; tags already present are left untouched.  Returns 0 when the two
+// lines are of different types (nothing is changed), 1 otherwise.
+static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
+{
+    const list_t *node;
+
+    if ( out_hline->type[0]!=tmpl_hline->type[0] ) return 0;
+    if ( out_hline->type[1]!=tmpl_hline->type[1] ) return 0;
+
+    for (node = tmpl_hline->tags; node; node = node->next)
+    {
+        HeaderTag *src = node->data;
+        HeaderTag *added;
+
+        if ( header_line_has_tag(out_hline, src->key) )
+            continue;   // the output line already carries this tag
+
+        added = malloc(sizeof(HeaderTag));
+        added->key[0] = src->key[0];
+        added->key[1] = src->key[1];
+        added->value  = strdup(src->value);
+        out_hline->tags = list_append(out_hline->tags,added);
+    }
+    return 1;
+}
+
+
+static void sam_header_line_free(HeaderLine *hline);   // defined further down in this file
+
+// Parse one header line of the form "@XY<TAB>field<TAB>field..." into a newly
+// allocated HeaderLine. For types that have tag tables each field must look
+// like "XY:value"; for types without any (free text) the whole field becomes
+// the value of a tag with the key "  ". Fields are separated by exactly one tab.
+// Returns NULL (after printing a diagnostic) when the line is malformed or its
+// type is unknown; nothing is leaked in that case.
+static HeaderLine *sam_header_line_parse(const char *headerLine)
+{
+    HeaderLine *hline;
+    HeaderTag *tag;
+    const char *from, *to;
+    int itype, is_free_text;
+
+    from = headerLine;
+    if ( *from != '@' ) {
+        debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+        return 0;
+    }
+    to = ++from;
+
+    while (*to && *to!='\t') to++;
+    if ( to-from != 2 ) {
+        debug("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine);
+        return 0;
+    }
+
+    // The type indexes the tag tables below, so an unknown type (-1) must be
+    // rejected here rather than used as an array index.
+    itype = tag_exists(from, types);
+    if ( itype==-1 ) {
+        debug("The type [%c%c] not recognised.\n", from[0],from[1]);
+        return 0;
+    }
+    is_free_text = !required_tags[itype] && !optional_tags[itype];
+
+    hline = malloc(sizeof(HeaderLine));
+    if ( !hline ) {
+        debug("[sam_header_line_parse] Insufficient memory!\n");
+        return 0;
+    }
+    hline->type[0] = from[0];
+    hline->type[1] = from[1];
+    hline->tags = NULL;
+
+    from = to;
+    while (*to && *to=='\t') to++;
+    if ( to-from != 1 ) {
+        debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+        sam_header_line_free(hline);
+        return 0;
+    }
+    from = to;
+    while (*from)
+    {
+        while (*to && *to!='\t') to++;
+
+        if ( is_free_text )
+            tag = new_tag("  ",from,to-1);
+        else
+        {
+            // "XY:" takes three characters; anything shorter has no room for a key
+            if ( to-from < 3 ) {
+                debug("[sam_header_line_parse] malformed tag on line [%s]\n", headerLine);
+                sam_header_line_free(hline);
+                return 0;
+            }
+            tag = new_tag(from,from+3,to-1);
+        }
+
+        if ( header_line_has_tag(hline,tag->key) )
+            debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
+        hline->tags = list_append(hline->tags, tag);
+
+        from = to;
+        while (*to && *to=='\t') to++;
+        if ( *to && to-from != 1 ) {
+            debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+            sam_header_line_free(hline);
+            return 0;
+        }
+
+        from = to;
+    }
+    return hline;
+}
+
+
+// Check a parsed header line: it must be of an existing type and all required tags
+// must be present. Returns 1 when the line passes, 0 (after printing a diagnostic) otherwise.
+// NOTE(review): the final "are all tags recognised" loop is meant to reject unknown tags,
+// but tag_exists() returns -1 (not 0) when a tag is not found, so `!tag_exists(..)' is true
+// only for a tag sitting at index 0 of a table. No key is at index 0 of both tables of any
+// type, so the condition never holds and unknown tags are currently accepted. Confirm the
+// intended policy before tightening it: free-text lines have no tables at all.
+static int sam_header_line_validate(HeaderLine *hline)
+{
+    list_t *tags;
+    HeaderTag *tag;
+    int itype, itag;
+    
+    // Is the type correct?
+    itype = tag_exists(hline->type, types);
+    if ( itype==-1 ) 
+    {
+        debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
+        return 0;
+    }
+
+    // Has all required tags?
+    itag = 0;
+    while ( required_tags[itype] && required_tags[itype][itag] )
+    {
+        if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
+        {
+            debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
+                hline->type[0],hline->type[1]);
+            return 0;
+        }
+        itag++;
+    }
+
+    // Are all tags recognised? (see the review note above: this test never fires)
+    tags = hline->tags;
+    while ( tags )
+    {
+        tag = tags->data;
+        if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
+        {
+            debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
+            return 0;
+        }
+        tags = tags->next;
+    }
+
+    return 1;
+}
+
+
+// Write one header line to fp in its textual form: "@XY", then every tag
+// preceded by a tab, then a newline.  A tag whose key is two spaces is
+// printed as its bare value, without the "XY:" prefix.
+static void print_header_line(FILE *fp, HeaderLine *hline)
+{
+    const list_t *node;
+
+    fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
+    for (node = hline->tags; node; node = node->next)
+    {
+        const HeaderTag *tag = node->data;
+        int has_key = !( tag->key[0]==' ' && tag->key[1]==' ' );
+
+        fprintf(fp, "\t");
+        if ( has_key )
+            fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
+        fprintf(fp, "%s", tag->value);
+    }
+    fprintf(fp,"\n");
+}
+
+
+// Release a header line together with all of its tags and their values.
+static void sam_header_line_free(HeaderLine *hline)
+{
+    list_t *node;
+
+    // first the payloads ...
+    for (node = hline->tags; node; node = node->next)
+    {
+        HeaderTag *tag = node->data;
+        free(tag->value);
+        free(tag);
+    }
+    // ... then the list nodes and the line itself
+    list_free(hline->tags);
+    free(hline);
+}
+
+// Release a parsed header: every line it holds, then the list itself.
+// A NULL header is accepted and nothing is done for it.
+void sam_header_free(void *_header)
+{
+    HeaderDict *header = (HeaderDict*)_header;
+    list_t *node;
+
+    for (node = header; node; node = node->next)
+        sam_header_line_free(node->data);
+    list_free(header);
+}
+
+// Deep-copy a parsed header line by line, preserving the order of the lines.
+// Returns NULL for an empty (NULL) header.
+HeaderDict *sam_header_clone(const HeaderDict *dict)
+{
+    HeaderDict *copy = NULL;
+    const list_t *node;
+
+    for (node = dict; node; node = node->next)
+    {
+        HeaderLine *line = node->data;
+        copy = list_append(copy, sam_header_line_clone(line));
+    }
+    return copy;
+}
+
+// Serialise a parsed header back into text, one "@XY<TAB>..." line per header
+// line, each ended by '\n'. Returns a newly allocated, NUL-terminated string
+// that the caller must free (an empty string for an empty header), or NULL
+// when memory cannot be allocated.
+char *sam_header_write(const void *_header)
+{
+    const HeaderDict *header = (const HeaderDict*)_header;
+    char *out = NULL;
+    int len=0, nout=0;
+    const list_t *hlines;
+
+    // Calculate the exact length of the string to allocate
+    for (hlines = header; hlines; hlines = hlines->next)
+    {
+        HeaderLine *hline = hlines->data;
+        list_t *tags;
+
+        len += 4;   // @XY and \n
+        for (tags = hline->tags; tags; tags = tags->next)
+        {
+            HeaderTag *tag = tags->data;
+            len += strlen(tag->value) + 1;                  // \t and the value
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                len += 3;                                   // XY:
+        }
+    }
+
+    out = malloc(len+1);
+    if ( !out ) {
+        debug("[sam_header_write] Insufficient memory!\n");
+        return NULL;
+    }
+
+    for (hlines = header; hlines; hlines = hlines->next)
+    {
+        HeaderLine *hline = hlines->data;
+        list_t *tags;
+
+        nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
+        for (tags = hline->tags; tags; tags = tags->next)
+        {
+            HeaderTag *tag = tags->data;
+            nout += sprintf(out+nout,"\t");
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
+            nout += sprintf(out+nout,"%s", tag->value);
+        }
+        nout += sprintf(out+nout,"\n");
+    }
+    out[nout] = 0;   // terminate right behind what was written (covers the empty header too)
+    return out;
+}
+
+// Parse a complete header text into a list of header lines.
+// Every line must parse and validate; on the first line that does not, all
+// lines collected so far are released and NULL is returned.  NULL is also
+// returned for a NULL or empty text.
+void *sam_header_parse2(const char *headerText)
+{
+    list_t *lines = NULL;
+    const char *cursor = headerText;
+    char *linebuf = NULL;
+    size_t linecap = 0;
+
+    if ( !headerText )
+        return 0;
+
+    while ( (cursor=nextline(&linebuf, &linecap, cursor)) )
+    {
+        HeaderLine *parsed = sam_header_line_parse(linebuf);
+        if ( !parsed || !sam_header_line_validate(parsed) )
+        {
+            // give up: drop the offending line and everything parsed before it
+            if ( parsed ) sam_header_line_free(parsed);
+            sam_header_free(lines);
+            free(linebuf);
+            return NULL;
+        }
+        lines = list_append(lines, parsed);
+    }
+    free(linebuf);
+
+    return lines;
+}
+
+// Build a string hash table from the header lines of the given type, mapping
+// the value of `key_tag' to the value of `value_tag'.  Lines lacking either
+// tag are skipped; a repeated key is reported and its entry overwritten.
+// The table stores pointers into the header, not copies.  An empty (never
+// NULL) table is returned for a NULL header.
+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
+{
+    const list_t *node;
+    khash_t(str) *tbl = kh_init(str);
+
+    if (_dict == 0) return tbl; // return an empty (not null) hash table
+
+    for (node = (const HeaderDict*)_dict; node; node = node->next)
+    {
+        HeaderLine *hline = node->data;
+        HeaderTag *key, *value;
+        khiter_t k;
+        int ret;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key   = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        if ( !key || !value )
+            continue;
+
+        if ( kh_get(str, tbl, key->value) != kh_end(tbl) )
+            debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
+        k = kh_put(str, tbl, key->value, &ret);
+        kh_value(tbl, k) = value->value;
+    }
+    return tbl;
+}
+
+// Collect, in header order, the values of `key_tag' from all lines of the given type.
+// Returns a newly allocated array of *_n pointers (to be released with free()); the
+// strings themselves still belong to the header. Returns NULL with *_n == 0 when no
+// line matches or when memory runs out.
+char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
+{
+    const list_t *l = (const HeaderDict*)_dict;
+    int max = 0, n = 0;
+    char **ret = 0;
+
+    *_n = 0;
+    for ( ; l; l = l->next)
+    {
+        HeaderLine *hline = l->data;
+        HeaderTag *key;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        if ( !key )
+            continue;
+
+        if (n == max) {
+            // grow geometrically; keep the old array reachable until realloc succeeded
+            int new_max = max? max<<1 : 4;
+            char **grown = realloc(ret, new_max * sizeof(char*));
+            if ( !grown ) {
+                debug("[sam_header2list] Insufficient memory!\n");
+                free(ret);
+                return 0;
+            }
+            ret = grown;
+            max = new_max;
+        }
+        ret[n++] = key->value;
+    }
+    *_n = n;
+    return ret;
+}
+
+/* Look up `key' in a table built by sam_header2tbl().
+ * Returns the stored value, or 0 when the key is absent.  A null table or
+ * key is treated as "not found", in line with sam_tbl_size() accepting a
+ * null table. */
+const char *sam_tbl_get(void *h, const char *key)
+{
+       khash_t(str) *tbl = (khash_t(str)*)h;
+       khint_t k;
+       if (h == 0 || key == 0) return 0;
+       k = kh_get(str, tbl, key);
+       return k == kh_end(tbl)? 0 : kh_val(tbl, k);
+}
+
+/* Number of entries in a table built by sam_header2tbl(); 0 for a null table. */
+int sam_tbl_size(void *h)
+{
+       khash_t(str) *tbl = (khash_t(str)*)h;
+       return h? kh_size(tbl) : 0;
+}
+
+/* Release a table built by sam_header2tbl().  Only the table is destroyed
+ * here; the strings it points to were never copied into it. */
+void sam_tbl_destroy(void *h)
+{
+       khash_t(str) *tbl = (khash_t(str)*)h;
+       kh_destroy(str, tbl);
+}
+
+// Merge n (at least two) parsed headers into a newly allocated one.
+// The first header is cloned; each line of the others is compared with the lines
+// collected so far and is then dropped (identical line present), merged into an
+// existing line (complementary tags) or appended (no related line).
+// Returns NULL when n<2 or when two lines conflict; the partially built result
+// is released in the latter case. The inputs are never modified.
+void *sam_header_merge(int n, const void **_dicts)
+{
+    const HeaderDict **dicts = (const HeaderDict**)_dicts;
+    HeaderDict *out_dict;
+    int idict, status;
+
+    if ( n<2 ) return NULL;
+
+    out_dict = sam_header_clone(dicts[0]);
+
+    for (idict=1; idict<n; idict++)
+    {
+        const list_t *tmpl_hlines = dicts[idict];
+
+        while ( tmpl_hlines )
+        {
+            list_t *out_hlines = out_dict;
+            int inserted = 0;
+            while ( out_hlines )
+            {
+                status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
+                if ( status==0 )
+                {
+                    // unrelated lines, keep looking
+                    out_hlines = out_hlines->next;
+                    continue;
+                }
+
+                if ( status==2 )
+                {
+                    print_header_line(stderr,tmpl_hlines->data);
+                    print_header_line(stderr,out_hlines->data);
+                    debug("Conflicting lines, cannot merge the headers.\n");
+                    sam_header_free(out_dict);   // do not leak the partial result
+                    return 0;
+                }
+                if ( status==3 )
+                    sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
+
+                // identical or merged: the line is accounted for
+                inserted = 1;
+                break;
+            }
+            if ( !inserted )
+                out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
+
+            tmpl_hlines = tmpl_hlines->next;
+        }
+    }
+
+    return out_dict;
+}
+
+
diff --git a/samtools/sam_header.h b/samtools/sam_header.h
new file mode 100644 (file)
index 0000000..e5c754f
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef __SAM_HEADER_H__
+#define __SAM_HEADER_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       void *sam_header_parse2(const char *headerText);
+       void *sam_header_merge(int n, const void **dicts);
+       void sam_header_free(void *header);
+       char *sam_header_write(const void *headerDict);   // returns a newly allocated string
+
+       char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n);
+
+       void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]);
+       const char *sam_tbl_get(void *h, const char *key);
+       int sam_tbl_size(void *h);
+       void sam_tbl_destroy(void *h);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools/sam_view.c b/samtools/sam_view.c
new file mode 100644 (file)
index 0000000..06dd01a
--- /dev/null
@@ -0,0 +1,224 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <math.h>
+#include "sam_header.h"
+#include "sam.h"
+#include "faidx.h"
+
+/* Filter settings filled in from the command line by main_samview():
+ * minimum mapping quality (-q), required flag bits (-f), rejected flag bits (-F). */
+static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;
+/* Library (-l) and read group (-r) to restrict the output to; null when unset. */
+static char *g_library, *g_rg;
+/* Quality conversion table used by sol2sanger(), filled on first use. */
+static int g_sol2sanger_tbl[128];
+
+/* Convert the base qualities of one alignment in place through the
+ * g_sol2sanger_tbl lookup table.  The table is filled the first time the
+ * function runs (entry 30 being zero is taken as "not filled yet"); its
+ * values are capped at 93 and input qualities above 127 are read as 127. */
+static void sol2sanger(bam1_t *b)
+{
+       uint8_t *qual = bam1_qual(b);
+       int i;
+
+       if (g_sol2sanger_tbl[30] == 0) {
+               for (i = 0; i < 128; ++i) {
+                       int converted = (int)(10.0 * log(1.0 + pow(10.0, (i - 64 + 33) / 10.0)) / log(10.0) + .499);
+                       g_sol2sanger_tbl[i] = converted >= 93 ? 93 : converted;
+               }
+       }
+       for (i = 0; i < b->core.l_qseq; ++i) {
+               int q = qual[i] > 127 ? 127 : qual[i];
+               qual[i] = g_sol2sanger_tbl[q];
+       }
+}
+
+/* Decide whether an alignment is filtered out by the command-line settings.
+ * Returns 1 (skip) when the mapping quality is below g_min_mapQ, a bit of
+ * g_flag_on is missing, a bit of g_flag_off is set, the read is not in read
+ * group g_rg (when set) or not in library g_library (when set); returns 0
+ * (keep) otherwise.  A read without an RG tag, or without a library, does
+ * not belong to the requested one and is skipped.
+ *
+ * Previously a read-group mismatch fell through to `return 0', so `-r'
+ * on its own never removed any read. */
+static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b)
+{
+       if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off))
+               return 1;
+       if (g_rg) {
+               uint8_t *s = bam_aux_get(b, "RG");
+               // the string is compared from s + 1, i.e. past the first byte of the aux field
+               if (s == 0 || strcmp(g_rg, (char*)(s + 1)) != 0) return 1;
+       }
+       if (g_library) {
+               const char *p = bam_get_library((bam_header_t*)h, b);
+               if (p == 0 || strcmp(p, g_library) != 0) return 1;
+       }
+       return 0;
+}
+
+// callback function for bam_fetch(): `data' is the output handle; alignments
+// that pass the filters are written to it.  Always returns 0.
+static int view_func(const bam1_t *b, void *data)
+{
+       samfile_t *out = (samfile_t*)data;
+       if (__g_skip_aln(out->header, b)) return 0;
+       samwrite(out, b);
+       return 0;
+}
+
+static int usage(int is_long_help);
+
+/* Entry point of `samtools view'.
+ *
+ * Parses the options, opens the input (BAM unless -S/-t/-T is given) and the
+ * output (SAM unless -b/-u is given), then either converts the whole file or,
+ * when regions follow the file name, fetches the alignments of each region
+ * through the BAM index.  Alignments are filtered by __g_skip_aln().
+ *
+ * Returns 0 on success and 1 on failure: bad usage, input/output that cannot
+ * be opened, missing header, truncated input or missing index. */
+int main_samview(int argc, char *argv[])
+{
+       int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0;
+       int of_type = BAM_OFDEC, is_long_help = 0;
+       samfile_t *in = 0, *out = 0;
+       char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
+
+       /* parse command-line options */
+       strcpy(in_mode, "r"); strcpy(out_mode, "w");
+       while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:C")) >= 0) {
+               switch (c) {
+               case 'C': slx2sngr = 1; break;
+               case 'S': is_bamin = 0; break;
+               case 'b': is_bamout = 1; break;
+               case 't': fn_list = strdup(optarg); is_bamin = 0; break;
+               case 'h': is_header = 1; break;
+               case 'H': is_header_only = 1; break;
+               case 'o': fn_out = strdup(optarg); break;
+               case 'f': g_flag_on = strtol(optarg, 0, 0); break;
+               case 'F': g_flag_off = strtol(optarg, 0, 0); break;
+               case 'q': g_min_mapQ = atoi(optarg); break;
+               case 'u': is_uncompressed = 1; break;
+               case 'l': g_library = strdup(optarg); break;
+               case 'r': g_rg = strdup(optarg); break;
+               case 'x': of_type = BAM_OFHEX; break;
+               case 'X': of_type = BAM_OFSTR; break;
+               case '?': is_long_help = 1; break;
+               case 'T': fn_ref = strdup(optarg); is_bamin = 0; break;
+               default: return usage(is_long_help);
+               }
+       }
+       /* build the mode strings for samopen(); at most four characters each */
+       if (is_uncompressed) is_bamout = 1;
+       if (is_header_only) is_header = 1;
+       if (is_bamout) strcat(out_mode, "b");
+       else {
+               if (of_type == BAM_OFHEX) strcat(out_mode, "x");
+               else if (of_type == BAM_OFSTR) strcat(out_mode, "X");
+       }
+       if (is_bamin) strcat(in_mode, "b");
+       if (is_header) strcat(out_mode, "h");
+       if (is_uncompressed) strcat(out_mode, "u");
+       if (argc == optind) return usage(is_long_help);
+
+       // generate the fn_list if necessary
+       if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
+       // open file handlers; every failure below is reported through the exit status
+       if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+               fprintf(stderr, "[main_samview] fail to open file for reading.\n");
+               ret = 1;
+               goto view_end;
+       }
+       if (in->header == 0) {
+               fprintf(stderr, "[main_samview] fail to read the header.\n");
+               ret = 1;
+               goto view_end;
+       }
+       if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
+               fprintf(stderr, "[main_samview] fail to open file for writing.\n");
+               ret = 1;
+               goto view_end;
+       }
+       if (is_header_only) goto view_end; // no need to print alignments
+
+       if (argc == optind + 1) { // convert/print the entire file
+               bam1_t *b = bam_init1();
+               int r;
+               while ((r = samread(in, b)) >= 0) { // read one alignment from `in'
+                       if (!__g_skip_aln(in->header, b)) {
+                               if (slx2sngr) sol2sanger(b);
+                               samwrite(out, b); // write the alignment to `out'
+                       }
+               }
+               if (r < -1) {
+                       fprintf(stderr, "[main_samview] truncated file.\n");
+                       ret = 1;
+               }
+               bam_destroy1(b);
+       } else { // retrieve alignments in specified regions
+               int i;
+               bam_index_t *idx = 0;
+               if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index
+               if (idx == 0) { // index is unavailable
+                       fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n");
+                       ret = 1;
+                       goto view_end;
+               }
+               for (i = optind + 1; i < argc; ++i) {
+                       int tid, beg, end;
+                       bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200'
+                       if (tid < 0) { // reference name is not found
+                               fprintf(stderr, "[main_samview] fail to get the reference name. Continue anyway.\n");
+                               continue;
+                       }
+                       bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments
+               }
+               bam_index_destroy(idx); // destroy the BAM index
+       }
+
+view_end:
+       // close files, free and return
+       free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg);
+       samclose(in);
+       samclose(out);
+       return ret;
+}
+
+/* Print the help text of `samtools view' to stderr; the explanatory notes
+ * are appended when is_long_help is non-zero.  Always returns 1, which
+ * main_samview() passes on as its exit status. */
+static int usage(int is_long_help)
+{
+       fprintf(stderr, "\n");
+       fprintf(stderr, "Usage:   samtools view [options] <in.bam>|<in.sam> [region1 [...]]\n\n");
+       fprintf(stderr, "Options: -b       output BAM\n");
+       fprintf(stderr, "         -h       print header for the SAM output\n");
+       fprintf(stderr, "         -H       print header only (no alignments)\n");
+       fprintf(stderr, "         -S       input is SAM\n");
+       fprintf(stderr, "         -u       uncompressed BAM output (force -b)\n");
+       fprintf(stderr, "         -x       output FLAG in HEX (samtools-C specific)\n");
+       fprintf(stderr, "         -X       output FLAG in string (samtools-C specific)\n");
+       fprintf(stderr, "         -t FILE  list of reference names and lengths (force -S) [null]\n");
+       fprintf(stderr, "         -T FILE  reference sequence file (force -S) [null]\n");
+       fprintf(stderr, "         -o FILE  output file name [stdout]\n");
+       fprintf(stderr, "         -f INT   required flag, 0 for unset [0]\n");
+       fprintf(stderr, "         -F INT   filtering flag, 0 for unset [0]\n");
+       fprintf(stderr, "         -q INT   minimum mapping quality [0]\n");
+       fprintf(stderr, "         -l STR   only output reads in library STR [null]\n");
+       fprintf(stderr, "         -r STR   only output reads in read group STR [null]\n");
+       fprintf(stderr, "         -?       longer help\n");
+       fprintf(stderr, "\n");
+       if (is_long_help)
+               fprintf(stderr, "Notes:\n\
+\n\
+  1. By default, this command assumes the file on the command line is in\n\
+     the BAM format and it prints the alignments in SAM. If `-t' is\n\
+     applied, the input file is assumed to be in the SAM format. The\n\
+     file supplied with `-t' is SPACE/TAB delimited with the first two\n\
+     fields of each line consisting of the reference name and the\n\
+     corresponding sequence length. The `.fai' file generated by `faidx'\n\
+     can be used here. This file may be empty if reads are unaligned.\n\
+\n\
+  2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\
+\n\
+  3. BAM->SAM conversion: `samtools view in.bam'.\n\
+\n\
+  4. A region should be presented in one of the following formats:\n\
+     `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\
+     specified, the input alignment file must be an indexed BAM file.\n\
+\n\
+  5. Option `-u' is preferred over `-b' when the output is piped to\n\
+     another samtools command.\n\
+\n\
+  6. In a string FLAG, each character represents one bit with\n\
+     p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\n\
+     U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\n\
+     1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \n\
+     f=0x200 (failure) and d=0x400 (duplicate). Note that `-x' and\n\
+     `-X' are samtools-C specific. Picard and older samtools do not\n\
+     support HEX or string flags.\n\
+\n");
+       return 1;
+}
+
+/* Entry point of the `import' command: converts a SAM file to BAM by
+ * running `view -o <out.bam> -bt <in.ref_list> <in.sam>' through
+ * main_samview().  Expects exactly three arguments after the command name.
+ * Returns 1 on bad usage or when memory runs out, otherwise the result of
+ * main_samview(). */
+int main_import(int argc, char *argv[])
+{
+       enum { N_ARGS = 6 };
+       int ret;
+       char **argv2;
+       if (argc != 4) {
+               fprintf(stderr, "Usage: bamtk import <in.ref_list> <in.sam> <out.bam>\n");
+               return 1;
+       }
+       // one extra zeroed slot keeps the vector null-terminated like a real argv
+       argv2 = calloc(N_ARGS + 1, sizeof(char*));
+       if (argv2 == 0) {
+               fprintf(stderr, "[main_import] insufficient memory.\n");
+               return 1;
+       }
+       argv2[0] = "import";
+       argv2[1] = "-o";
+       argv2[2] = argv[3];
+       argv2[3] = "-bt";
+       argv2[4] = argv[1];
+       argv2[5] = argv[2];
+       ret = main_samview(N_ARGS, argv2);
+       free(argv2);
+       return ret;
+}
diff --git a/setup.cfg b/setup.cfg
new file mode 100644 (file)
index 0000000..652736c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,6 @@
+[bdist_rpm]
+doc_files = README doc/*.html ChangeLog
+vendor = TDB
+packager = TDB <email@email.com>
+distribution-name = Red Hat Linux
+requires = python
diff --git a/setup.py b/setup.py
new file mode 100644 (file)
index 0000000..098cb7f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+'''
+
+pysam
+*****
+
+'''
+
+import os, sys, glob, shutil
+
+name = "pysam"
+version = "0.2"
+
+samtools_exclude = ( "bamtk.c", "razip.c", "bgzip.c" )
+samtools_dest = os.path.abspath( "samtools" )
+
+# copy samtools source
+#
+# "setup.py import PATH" copies the *.c and *.h files found directly in
+# PATH into the local samtools directory and then exits without running
+# the distutils setup further down.
+if len(sys.argv) >= 2 and sys.argv[1] == "import":
+   if len(sys.argv) < 3: raise ValueError("missing PATH to samtools source directory")
+   samtools_src = os.path.abspath( sys.argv[2] )
+   if not os.path.exists( samtools_src ): raise IOError( "samtools src dir `%s` does not exist." % samtools_src )
+
+   cfiles = glob.glob( os.path.join( samtools_src, "*.c" ) )
+   hfiles = glob.glob( os.path.join( samtools_src, "*.h" ) )
+   ncopied = 0
+   for p in cfiles + hfiles:
+      f = os.path.basename(p)
+      # files listed in samtools_exclude are never copied
+      if f in samtools_exclude: continue
+      # files that already exist in the destination are left untouched,
+      # i.e. an existing copy is never overwritten by this command
+      if os.path.exists( os.path.join( samtools_dest, f )): continue
+      shutil.copy( p, samtools_dest )
+      ncopied += 1
+   print "installed latest source code from %s: %i files copied" % (samtools_src, ncopied)
+   sys.exit(0)
+
+from distutils.core import setup, Extension
+from Pyrex.Distutils import build_ext
+
+classifiers = """
+Development Status :: 2 - Alpha
+Operating System :: MacOS :: MacOS X
+Operating System :: Microsoft :: Windows :: Windows NT/2000
+Operating System :: OS Independent
+Operating System :: POSIX
+Operating System :: POSIX :: Linux
+Operating System :: Unix
+Programming Language :: Python
+Topic :: Scientific/Engineering
+Topic :: Scientific/Engineering :: Bioinformatics
+"""
+
+# C extension module built from the .pyx wrapper, the pysam helper C file
+# and every C file found in the samtools directory; it links against "z".
+pysam = Extension(
+    "pysam/csamtools",                   # name of extension
+    [ "pysam/csamtools.pyx" ]  +\
+       [ "pysam/%s" % x for x in (
+             "pysam_util.c", )] +\
+       glob.glob( os.path.join( "samtools", "*.c" ) ),
+    library_dirs=[],
+    include_dirs=[ "samtools", ],
+    libraries=[ "z", ],
+    language="c",
+    )
+
+# keyword arguments handed to setup() below.
+# NOTE(review): the `classifiers` string defined above is not part of this
+# dictionary, so it currently has no effect - confirm whether it should be
+# passed to setup().
+metadata = {
+    'name': name,
+    'version': version,
+    'description': "pysam", 
+    'long_description': __doc__,
+    'author': "Andreas Heger",
+    'author_email': "andreas.heger@gmail.com",
+    'license': "MIT",
+    'platforms': "ALL",
+    'url': "http://code.google.com/p/pysam/",
+    'py_modules': [
+      "pysam/__init__", "pysam/Pileup", "pysam/namedtuple" ],
+    'ext_modules': [pysam,],
+    'cmdclass' : {'build_ext': build_ext} }
+
+# only run the build/installation when executed as a script
+if __name__=='__main__':
+   dist = setup(**metadata)
diff --git a/tests/00README.txt b/tests/00README.txt
new file mode 100644 (file)
index 0000000..67b8689
--- /dev/null
@@ -0,0 +1,32 @@
+File ex1.fa contains two sequences cut from the human genome
+build36. They were extracted with command:
+
+  samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550
+
+Sequence names were changed manually for simplicity. File ex1.sam.gz
+contains MAQ alignments extracted with:
+
+  (samtools view NA18507_maq.bam 2:2044001-2045500;
+   samtools view NA18507_maq.bam 20:68001-69500)
+
+and processed with `samtools fixmate' to make it self-consistent as a
+standalone alignment.
+
+To try samtools, you may run the following commands:
+
+  samtools faidx ex1.fa                 # index the reference FASTA
+  samtools import ex1.fa.fai ex1.sam.gz ex1.bam   # SAM->BAM
+  samtools index ex1.bam                # index BAM
+  samtools tview ex1.bam ex1.fa         # view alignment
+  samtools pileup -cf ex1.fa ex1.bam    # pileup and consensus
+  samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz
+
+In order for the script pysam_test.py to work, you will need pysam
+in your PYTHONPATH.
+
+In order for the script example.py to work, you will need pysam
+in your PYTHONPATH and run
+
+  make all
+
+beforehand.
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644 (file)
index 0000000..5403750
--- /dev/null
@@ -0,0 +1,32 @@
+all: ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz \
+       ex2.sam.gz ex2.sam ex1.sam \
+       ex2.bam \
+       ex3.bam ex3.bam.bai \
+       ex4.bam ex4.bam.bai \
+       ex5.bam ex5.bam.bai \
+       ex6.bam 
+
+ex2.sam.gz: ex1.bam ex1.bam.bai
+               samtools view -h ex1.bam | gzip > ex2.sam.gz
+
+%.bam: %.sam ex1.fa.fai
+       samtools import ex1.fa.fai $< $@
+
+%.sam: %.sam.gz
+       gunzip < $< > $@
+
+ex1.fa.fai:ex1.fa
+               samtools faidx ex1.fa
+ex1.bam:ex1.sam.gz ex1.fa.fai
+               samtools import ex1.fa.fai ex1.sam.gz ex1.bam
+%.bam.bai:%.bam
+               samtools index $<
+ex1.pileup.gz:ex1.bam ex1.fa
+               samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz
+ex1.glf:ex1.bam ex1.fa 
+               samtools pileup -gf ex1.fa ex1.bam > ex1.glf
+ex1.glfview.gz:ex1.glf
+               samtools glfview ex1.glf | gzip > ex1.glfview.gz
+
+clean:
+               rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM pysam_*.sam ex2.sam ex2.sam.gz ex1.sam
diff --git a/tests/ex1.fa b/tests/ex1.fa
new file mode 100644 (file)
index 0000000..b4ed0cf
--- /dev/null
@@ -0,0 +1,56 @@
+>chr1
+CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT
+GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC
+GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG
+TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC
+AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA
+CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC
+AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT
+CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA
+ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC
+AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC
+AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC
+ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC
+CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT
+TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT
+TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT
+GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT
+ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA
+ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG
+TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA
+CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG
+TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC
+TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC
+TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG
+TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG
+AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA
+TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC
+TCCCTCGTCTTCTTA
+>chr2
+TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG
+CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT
+TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT
+CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA
+AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT
+AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC
+ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG
+GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT
+CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT
+TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA
+AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA
+ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT
+TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA
+AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC
+TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA
+GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT
+AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA
+AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT
+AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT
+AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT
+ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT
+GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG
+CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA
+GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA
+AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA
+TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC
+CAGAAAAAAATATTTACAGTAACT
diff --git a/tests/ex1.sam.gz b/tests/ex1.sam.gz
new file mode 100644 (file)
index 0000000..8dd2bc4
Binary files /dev/null and b/tests/ex1.sam.gz differ
diff --git a/tests/ex3.sam b/tests/ex3.sam
new file mode 100644 (file)
index 0000000..bae2a22
--- /dev/null
@@ -0,0 +1,13 @@
+@HD    VN:1.0
+@SQ    SN:chr1 LN:1575
+@SQ    SN:chr2 LN:1584
+@RG    ID:L1   PU:SC_1_10      LB:SC_1 SM:NA12891      CN:name:with:colon
+@RG    ID:L2   PU:SC_2_12      LB:SC_2 SM:NA12891      CN:name:with:colon
+@PG    ID:P1   VN:1.0
+@PG    ID:P2   VN:1.1
+@CO    this is a comment
+@CO    this is another comment
+read_28833_29006_6945  99      chr1    33      20      10M1D25M        =       200     167     AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG     <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<     NM:i:1  RG:Z:L1 PG:Z:P1 XT:A:U
+read_28701_28881_323b  147     chr2    88      30      35M     =       500     412     ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA     <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<     MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R
+read_28701_28881_323c  147     chr2    88      30      35M     =       500     412     ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA     <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<
+
diff --git a/tests/ex4.sam b/tests/ex4.sam
new file mode 100644 (file)
index 0000000..b2282b8
--- /dev/null
@@ -0,0 +1,9 @@
+@HD    VN:1.0
+@SQ    SN:chr1 LN:100
+@SQ    SN:chr2 LN:100
+@RG    ID:L1   PU:SC_1_10      LB:SC_1 SM:NA12891
+@RG    ID:L2   PU:SC_2_12      LB:SC_2 SM:NA12891
+@CO    this is a comment
+@CO    this is another comment
+read_28833_29006_6945  99      chr1    21      20      10M1D25M        =       200     167     AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG     <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<     NM:i:1  RG:Z:L1
+read_28701_28881_323b  147     chr2    21      30      35M     =       500     412     ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA     <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<     MF:i:18 RG:Z:L2
diff --git a/tests/ex5.sam b/tests/ex5.sam
new file mode 100644 (file)
index 0000000..f1f8aad
--- /dev/null
@@ -0,0 +1,5 @@
+@HD    VN:1.0
+@SQ    SN:chr1 LN:100
+@SQ    SN:chr2 LN:100
+read_28833_29006_6945  0       *       *       *       *       *       200     167     AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG     <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<
+read_28701_28881_323b  0       *       *       *       *       *       500     412     ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA     <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<
diff --git a/tests/ex6.sam b/tests/ex6.sam
new file mode 100644 (file)
index 0000000..7ae90f3
--- /dev/null
@@ -0,0 +1,5 @@
+@HD    VN:1.0
+@SQ    SN:chr1 LN:1575
+@SQ    SN:chr2 LN:1584
+read_28833_29006_6945  99      chr1    33      20      10M1D25M        =       200     167     AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG     <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<     NM:i:1  RG:Z:L1
+read_28701_28881_323b  147     chr2    88      30      35M     =       500     412     ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA     <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<     MF:i:18 RG:Z:L2
diff --git a/tests/ex7.sam b/tests/ex7.sam
new file mode 100644 (file)
index 0000000..12befae
--- /dev/null
@@ -0,0 +1,2 @@
+read_28833_29006_6945  99      chr1    33      20      10M1D25M        =       200     167     AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG     <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<     NM:i:1  RG:Z:L1 PG:Z:P1 XT:A:U
+read_28701_28881_323b  147     chr2    88      30      35M     =       500     412     ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA     <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<     MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R
diff --git a/tests/example.py b/tests/example.py
new file mode 100644 (file)
index 0000000..a1ca7a0
--- /dev/null
@@ -0,0 +1,121 @@
+import sys
+import pysam
+
+# open the BAM file used by all examples up to the sys.exit() call below
+samfile = pysam.Samfile( "ex1.bam", "rb" )
+
+print "###################"
+# check different ways to iterate
+# fetch() is called without arguments, with (reference, start, end)
+# positional arguments and with a region string; each call prints the
+# number of items returned.
+print len(list(samfile.fetch()))
+print len(list(samfile.fetch( "chr1", 10, 200 )))
+print len(list(samfile.fetch( region="chr1:10-200" )))
+print len(list(samfile.fetch( "chr1" )))
+print len(list(samfile.fetch( region="chr1")))
+print len(list(samfile.fetch( "chr2" )))
+print len(list(samfile.fetch( region="chr2")))
+print len(list(samfile.fetch()))
+print len(list(samfile.fetch( "chr1" )))
+print len(list(samfile.fetch( region="chr1")))
+print len(list(samfile.fetch()))
+
+# the same calling conventions, applied to pileup()
+print len(list(samfile.pileup( "chr1", 10, 200 )))
+print len(list(samfile.pileup( region="chr1:10-200" )))
+print len(list(samfile.pileup( "chr1" )))
+print len(list(samfile.pileup( region="chr1")))
+print len(list(samfile.pileup( "chr2" )))
+print len(list(samfile.pileup( region="chr2")))
+print len(list(samfile.pileup()))
+print len(list(samfile.pileup()))
+
+print "########### fetch with callback ################"
+# the callback is passed by keyword together with the region
+def my_fetch_callback( alignment ): print str(alignment)
+samfile.fetch( region="chr1:10-200", callback=my_fetch_callback )
+
+print "########## pileup with callback ################"
+def my_pileup_callback( column ): print str(column)
+samfile.pileup( region="chr1:10-200", callback=my_pileup_callback )
+
+# the iterator classes can also be instantiated directly.
+# NOTE: the name `iter` shadows the builtin of the same name for the
+# remainder of this script.
+print "##########iterator row #################"
+iter = pysam.IteratorRow( samfile, 0, 10, 200)
+for x in iter: print str(x)
+
+print "##########iterator col #################"
+iter = pysam.IteratorColumn( samfile, 0, 10, 200 )
+for x in iter: print str(x)
+
+print "#########row all##################"
+iter = pysam.IteratorRowAll( samfile )
+for x in iter: print str(x)
+
+
+print "###################"
+
+class Counter:
+    mCounts = 0
+    def __call__(self, alignment):
+        self.mCounts += 1
+
+c = Counter()
+samfile.fetch( "chr1:10-200", c )
+print "counts=", c.mCounts
+
+# NOTE: the script stops here - none of the statements below this call
+# are executed.
+# NOTE(review): presumably the remainder is kept as a collection of
+# further usage examples - confirm whether it should be re-activated.
+sys.exit(0)
+print samfile.getTarget( 0 )
+print samfile.getTarget( 1 )
+
+for p in pysam.pileup( "-c", "ex1.bam" ):
+    print str(p)
+
+print pysam.pileup.getMessages()
+
+for p in pysam.pileup( "-c", "ex1.bam", raw=True ):
+    print str(p),
+
+
+
+print "###########################"
+
+samfile = pysam.Samfile( "ex2.sam.gz", "r" )
+
+print "num targets=", samfile.getNumTargets()
+
+iter = pysam.IteratorRowAll( samfile )
+for x in iter: print str(x)
+
+samfile.close()
+
+print "###########################"
+samfile = pysam.Samfile( "ex2.sam.gz", "r" )
+def my_fetch_callback( alignment ):
+    print str(alignment)
+
+# a failing fetch is expected here and reported instead of propagated
+try:
+    samfile.fetch( "chr1:10-20", my_fetch_callback )
+except AssertionError:
+    print "caught fetch exception"
+
+samfile.close()
+
+print "###########################"
+samfile = pysam.Samfile( "ex2.sam.gz", "r" )
+def my_pileup_callback( pileups ):
+    print str(pileups)
+# a failing pileup is expected here and reported instead of propagated
+try:
+    samfile.pileup( "chr1:10-20", my_pileup_callback )
+except NotImplementedError:
+    print "caught pileup exception"
+
+# playing arount with headers
+samfile = pysam.Samfile( "ex3.sam", "r" )
+print samfile.targets
+print samfile.lengths
+print samfile.text
+print samdile.header
+header = samfile.header
+samfile.close()
+
+header["HD"]["SO"] = "unsorted"
+outfile = pysam.Samfile( "out.sam", "wh", 
+                         header = header )
+
+outfile.close()
+
diff --git a/tests/pysam_test.py b/tests/pysam_test.py
new file mode 100755 (executable)
index 0000000..c2ae6fa
--- /dev/null
@@ -0,0 +1,841 @@
+#!/usr/bin/env python
+'''unit testing code for pysam.
+
+Execute in the :file:`tests` directory as it requires the Makefile
+and data files located there.
+'''
+
+import pysam
+import unittest
+import os
+import itertools
+import subprocess
+import shutil
+
+
+def checkBinaryEqual( filename1, filename2 ):
+    '''return true if the two files are binary equal.'''
+    if os.path.getsize( filename1 ) !=  os.path.getsize( filename2 ):
+        return False
+
+    infile1 = open(filename1, "rb")
+    infile2 = open(filename2, "rb")
+
+    def chariter( infile ):
+        while 1:
+            c = infile.read(1)
+            if c == "": break
+            yield c
+
+    found = False
+    for c1,c2 in itertools.izip( chariter( infile1), chariter( infile2) ):
+        if c1 != c2: break
+    else:
+        found = True
+
+    infile1.close()
+    infile2.close()
+    return found
+
+def runSamtools( cmd ):
+    '''run a samtools command'''
+
+    try:
+        retcode = subprocess.call(cmd, shell=True)
+        if retcode < 0:
+            print >>sys.stderr, "Child was terminated by signal", -retcode
+    except OSError, e:
+        print >>sys.stderr, "Execution failed:", e
+
+        
+class BinaryTest(unittest.TestCase):
+    '''test samtools command line commands and compare
+    against pysam commands.
+
+    Tests fail, if the output is not binary identical.
+    '''
+
+    # class-level flag: the commands are only run by the first setUp() call
+    first_time = True
+
+    # a list of commands to test
+    # maps a label to a pair of
+    #   (samtools output file, samtools shell command) and
+    #   (pysam output file, (pysam function, option string))
+    mCommands = \
+        { "faidx" : \
+        ( 
+            ("ex1.fa.fai", "samtools faidx ex1.fa"), 
+            ("pysam_ex1.fa.fai", (pysam.faidx, "ex1.fa") ),
+            ),
+          "import" :
+              (
+                ("ex1.bam", "samtools import ex1.fa.fai ex1.sam.gz ex1.bam" ),
+                ("pysam_ex1.bam", (pysam.samimport, "ex1.fa.fai ex1.sam.gz pysam_ex1.bam") ),
+                ),
+          "index":
+              (
+                ("ex1.bam.bai", "samtools index ex1.bam" ),
+                ("pysam_ex1.bam.bai", (pysam.index, "pysam_ex1.bam" ) ),
+                ),
+          "pileup1" :
+          (
+                ("ex1.pileup", "samtools pileup -cf ex1.fa ex1.bam > ex1.pileup" ),
+                ("pysam_ex1.pileup", (pysam.pileup, "-c -f ex1.fa ex1.bam" ) )
+                ),
+          "pileup2" :
+          (
+                ("ex1.glf", "samtools pileup -gf ex1.fa ex1.bam > ex1.glf" ),
+                ("pysam_ex1.glf", (pysam.pileup, "-g -f ex1.fa ex1.bam" ) )
+                ),
+          "glfview" :
+        (
+                ("ex1.glfview", "samtools glfview ex1.glf > ex1.glfview"),
+                ("pysam_ex1.glfview", (pysam.glfview, "ex1.glf" ) ),
+                ),
+          "view" :
+        (
+                ("ex1.view", "samtools view ex1.bam > ex1.view"),
+                ("pysam_ex1.view", (pysam.view, "ex1.bam" ) ),
+                ),
+        }
+
+    # some tests depend on others. The order specifies in which order
+    # the samtools commands are executed.
+    mOrder = ('faidx', 'import', 'index', 'pileup1', 'pileup2', 'glfview', 'view' )
+
+    def setUp( self ):
+        '''setup tests. 
+
+        For setup, all commands will be run before the first test is
+        executed. Individual tests will then just compare the output
+        files.
+        '''
+        if BinaryTest.first_time:
+            # copy the source 
+            shutil.copy( "ex1.fa", "pysam_ex1.fa" )
+
+            for label in self.mOrder:
+                command = self.mCommands[label]
+                samtools_target, samtools_command = command[0]
+                pysam_target, pysam_command = command[1]
+                runSamtools( samtools_command )
+                pysam_method, pysam_options = pysam_command
+                output = pysam_method( *pysam_options.split(" "), raw=True)
+                # if the samtools command redirects its output into a file,
+                # the output returned by pysam is written to the pysam target
+                if ">" in samtools_command:
+                    outfile = open( pysam_target, "w" )
+                    for line in output: outfile.write( line )
+                    outfile.close()
+
+            BinaryTest.first_time = False
+
+    def checkCommand( self, command ):
+        '''assert that the samtools and the pysam output file of
+        *command* are binary equal.'''
+        if command:
+            samtools_target, pysam_target = self.mCommands[command][0][0], self.mCommands[command][1][0]
+            self.assertTrue( checkBinaryEqual( samtools_target, pysam_target ), 
+                             "%s failed: files %s and %s are not the same" % (command, samtools_target, pysam_target) )
+
+    def testImport( self ):
+        self.checkCommand( "import" )
+
+    def testIndex( self ):
+        self.checkCommand( "index" )
+        
+    def testPileup1( self ):
+        self.checkCommand( "pileup1" )
+    
+    def testPileup2( self ):
+        self.checkCommand( "pileup2" )
+
+    def testGLFView( self ):
+        self.checkCommand( "glfview" )
+
+    def testView( self ):
+        self.checkCommand( "view" )
+
+    def testEmptyIndex( self ):
+        # indexing a file that does not exist has to raise an error
+        self.assertRaises( pysam.SamtoolsError, pysam.index, "exdoesntexist.bam" )
+
+    def __del__(self):
+        # remove the output files of all commands and the copied fasta file.
+        # NOTE(review): this also deletes ex1.bam and ex1.bam.bai, which
+        # other test cases in this file open - confirm this is intended.
+
+        for label, command in self.mCommands.iteritems():
+            samtools_target, samtools_command = command[0]
+            pysam_target, pysam_command = command[1]
+            if os.path.exists( samtools_target): os.remove( samtools_target )
+            if os.path.exists( pysam_target): os.remove( pysam_target )
+        if os.path.exists( "pysam_ex1.fa" ): os.remove( "pysam_ex1.fa" )
+
+class IOTest(unittest.TestCase):
+    '''check if reading samfile and writing a samfile are consistent.'''
+
+    def checkEcho( self, input_filename, reference_filename, 
+                   output_filename, 
+                   input_mode, output_mode, use_template = True):
+        '''iterate through *input_filename* writing to *output_filename* and
+        comparing the output to *reference_filename*. 
+        
+        The files are opened according to the *input_mode* and *output_mode*.
+
+        If *use_template* is set, the header is copied from infile using the
+        template mechanism, otherwise target names and lengths are passed explicitely. 
+        '''
+
+        infile = pysam.Samfile( input_filename, input_mode )
+        if use_template:
+            outfile = pysam.Samfile( output_filename, output_mode, template = infile )
+        else:
+            outfile = pysam.Samfile( output_filename, output_mode, 
+                                     referencenames = infile.references,
+                                     referencelengths = infile.lengths )
+
+        iter = infile.fetch()
+        for x in iter: outfile.write( x )
+        infile.close()
+        outfile.close()
+
+        self.assertTrue( checkBinaryEqual( reference_filename, output_filename), 
+                         "files %s and %s are not the same" % (reference_filename, output_filename) )
+
+    def testReadWriteBam( self ):
+        
+        input_filename = "ex1.bam"
+        output_filename = "pysam_ex1.bam"
+        reference_filename = "ex1.bam"
+
+        self.checkEcho( input_filename, reference_filename, output_filename,
+                        "rb", "wb" )
+
+    def testReadWriteBamWithTargetNames( self ):
+        
+        input_filename = "ex1.bam"
+        output_filename = "pysam_ex1.bam"
+        reference_filename = "ex1.bam"
+
+        self.checkEcho( input_filename, reference_filename, output_filename,
+                        "rb", "wb", use_template = False )
+
+    def testReadWriteSamWithHeader( self ):
+        
+        input_filename = "ex2.sam"
+        output_filename = "pysam_ex2.sam"
+        reference_filename = "ex2.sam"
+
+        self.checkEcho( input_filename, reference_filename, output_filename,
+                        "r", "wh" )
+
+    def testReadWriteSamWithoutHeader( self ):
+        
+        input_filename = "ex2.sam"
+        output_filename = "pysam_ex2.sam"
+        reference_filename = "ex1.sam"
+
+        self.checkEcho( input_filename, reference_filename, output_filename,
+                        "r", "w" )
+
+    def testFetchFromClosedFile( self ):
+
+        samfile = pysam.Samfile( "ex1.bam", "rb" )
+        samfile.close()
+        self.assertRaises( ValueError, samfile.fetch, 'chr1', 100, 120)
+
+    def testPileupFromClosedFile( self ):
+
+        samfile = pysam.Samfile( "ex1.bam", "rb" )
+        samfile.close()
+        self.assertRaises( ValueError, samfile.pileup, 'chr1', 100, 120)
+
+    def testBinaryReadFromSamfile( self ):
+        pass
+        # needs to re-activated, see issue 19
+        #samfile = pysam.Samfile( "ex1.bam", "r" )
+        #samfile.fetch().next()
+
+    def testReadingFromFileWithoutIndex( self ):
+        '''read from bam file without index.'''
+
+        assert not os.path.exists( "ex2.bam.bai" )
+        samfile = pysam.Samfile( "ex2.bam", "rb" )
+        self.assertRaises( ValueError, samfile.fetch )
+        self.assertEqual( len(list( samfile.fetch(until_eof = True) )), 3270 )
+
+class TestIteratorRow(unittest.TestCase):
+
+    def setUp(self):
+        self.samfile=pysam.Samfile( "ex1.bam","rb" )
+
+    def checkRange( self, rnge ):
+        '''compare results from iterator with those from samtools.'''
+        ps = list(self.samfile.fetch(region=rnge))
+        sa = list(pysam.view( "ex1.bam", rnge , raw = True) )
+        self.assertEqual( len(ps), len(sa), "unequal number of results for range %s: %i != %i" % (rnge, len(ps), len(sa) ))
+        # check if the same reads are returned and in the same order
+        for line, pair in enumerate( zip( ps, sa ) ):
+            data = pair[1].split("\t")
+            self.assertEqual( pair[0].qname, data[0], "read id mismatch in line %i: %s != %s" % (line, pair[0].rname, data[0]) )
+
+    def testIteratePerContig(self):
+        '''check random access per contig'''
+        for contig in self.samfile.references:
+            self.checkRange( contig )
+
+    def testIterateRanges(self):
+        '''check random access per range'''
+        for contig, length in zip(self.samfile.references, self.samfile.lengths):
+            for start in range( 1, length, 90):
+                self.checkRange( "%s:%i-%i" % (contig, start, start + 90) ) # this includes empty ranges
+
+    def tearDown(self):
+        self.samfile.close()
+
+class TestIteratorRowAll(unittest.TestCase):
+
+    def setUp(self):
+        self.samfile=pysam.Samfile( "ex1.bam","rb" )
+
+    def testIterate(self):
+        '''compare results from iterator with those from samtools.'''
+        ps = list(self.samfile.fetch())
+        sa = list(pysam.view( "ex1.bam", raw = True) )
+        self.assertEqual( len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa) ))
+        # check if the same reads are returned
+        for line, pair in enumerate( zip( ps, sa ) ):
+            data = pair[1].split("\t")
+            self.assertEqual( pair[0].qname, data[0], "read id mismatch in line %i: %s != %s" % (line, pair[0].rname, data[0]) )
+
+    def tearDown(self):
+        self.samfile.close()
+
+class TestIteratorColumn(unittest.TestCase):
+    '''test iterator column against contents of ex4.bam.'''
+    
+    # note that samfile contains 1-based coordinates
+    # 1D means deletion with respect to reference sequence
+    # 
+    # expected number of reads per 0-based position of each contig.
+    # NOTE(review): the list for chr1 has 20 + 36 + 45 = 101 entries,
+    # the one for chr2 has 100 - confirm that the extra entry is intended.
+    mCoverages = { 'chr1' : [ 0 ] * 20 + [1] * 36 + [0] * (100 - 20 -35 ),
+                   'chr2' : [ 0 ] * 20 + [1] * 35 + [0] * (100 - 20 -35 ),
+                   }
+
+    def setUp(self):
+        self.samfile=pysam.Samfile( "ex4.bam","rb" )
+
+    def checkRange( self, rnge ):
+        '''compare the coverage of each pileup column in *rnge* with
+        the expected coverage in mCoverages.'''
+        # check if the same reads are returned and in the same order
+        for column in self.samfile.pileup(region=rnge):
+            thiscov = len(column.pileups)
+            refcov = self.mCoverages[self.samfile.getrname(column.tid)][column.pos]
+            self.assertEqual( thiscov, refcov, "wrong coverage at pos %s:%i %i should be %i" % (self.samfile.getrname(column.tid), column.pos, thiscov, refcov))
+
+    def testIterateAll(self):
+        '''check pileup without specifying a region'''
+        self.checkRange( None )
+
+    def testIteratePerContig(self):
+        '''check random access per contig'''
+        for contig in self.samfile.references:
+            self.checkRange( contig )
+
+    def testIterateRanges(self):
+        '''check random access per range'''
+        for contig, length in zip(self.samfile.references, self.samfile.lengths):
+            for start in range( 1, length, 90):
+                self.checkRange( "%s:%i-%i" % (contig, start, start + 90) ) # this includes empty ranges
+
+    def testInverse( self ):
+        '''test the inverse, is point-wise pileup accurate.'''
+        for contig, refseq in self.mCoverages.items():
+            # number of covered positions of this contig
+            refcolumns = sum(refseq)
+            for pos, refcov in enumerate( refseq ):
+                columns = list(self.samfile.pileup( contig, pos, pos+1) )
+                if refcov == 0:
+                    # if no read, no coverage
+                    self.assertEqual( len(columns), refcov, "wrong number of pileup columns returned for position %s:%i, %i should be %i" %(contig,pos,len(columns), refcov) )
+                elif refcov == 1:
+                    # one read, all columns of the read are returned
+                    self.assertEqual( len(columns), refcolumns, "pileup incomplete - %i should be %i " % (len(columns), refcolumns))
+                    
+    def tearDown(self):
+        self.samfile.close()
+    
+class TestAlignedReadFromBam(unittest.TestCase):
+
+    def setUp(self):
+        self.samfile=pysam.Samfile( "ex3.bam","rb" )
+        self.reads=list(self.samfile.fetch())
+
+    def testARqname(self):
+        self.assertEqual( self.reads[0].qname, "read_28833_29006_6945", "read name mismatch in read 1: %s != %s" % (self.reads[0].qname, "read_28833_29006_6945") )
+        self.assertEqual( self.reads[1].qname, "read_28701_28881_323b", "read name mismatch in read 2: %s != %s" % (self.reads[1].qname, "read_28701_28881_323b") )
+
+    def testARflag(self):
+        self.assertEqual( self.reads[0].flag, 99, "flag mismatch in read 1: %s != %s" % (self.reads[0].flag, 99) )
+        self.assertEqual( self.reads[1].flag, 147, "flag mismatch in read 2: %s != %s" % (self.reads[1].flag, 147) )
+
+    def testARrname(self):
+        self.assertEqual( self.reads[0].rname, 0, "chromosome/target id mismatch in read 1: %s != %s" % (self.reads[0].rname, 0) )
+        self.assertEqual( self.reads[1].rname, 1, "chromosome/target id mismatch in read 2: %s != %s" % (self.reads[1].rname, 1) )
+
+    def testARpos(self):
+        self.assertEqual( self.reads[0].pos, 33-1, "mapping position mismatch in read 1: %s != %s" % (self.reads[0].pos, 33-1) )
+        self.assertEqual( self.reads[1].pos, 88-1, "mapping position mismatch in read 2: %s != %s" % (self.reads[1].pos, 88-1) )
+
+    def testARmapq(self):
+        self.assertEqual( self.reads[0].mapq, 20, "mapping quality mismatch in read 1: %s != %s" % (self.reads[0].mapq, 20) )
+        self.assertEqual( self.reads[1].mapq, 30, "mapping quality mismatch in read 2: %s != %s" % (self.reads[1].mapq, 30) )
+
+    def testARcigar(self):
+        self.assertEqual( self.reads[0].cigar, [(0, 10), (2, 1), (0, 25)], "read name length mismatch in read 1: %s != %s" % (self.reads[0].cigar, [(0, 10), (2, 1), (0, 25)]) )
+        self.assertEqual( self.reads[1].cigar, [(0, 35)], "read name length mismatch in read 2: %s != %s" % (self.reads[1].cigar, [(0, 35)]) )
+
+    def testARmrnm(self):
+        self.assertEqual( self.reads[0].mrnm, 0, "mate reference sequence name mismatch in read 1: %s != %s" % (self.reads[0].mrnm, 0) )
+        self.assertEqual( self.reads[1].mrnm, 1, "mate reference sequence name mismatch in read 2: %s != %s" % (self.reads[1].mrnm, 1) )
+
+    def testARmpos(self):
+        self.assertEqual( self.reads[0].mpos, 200-1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].mpos, 200-1) )
+        self.assertEqual( self.reads[1].mpos, 500-1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].mpos, 500-1) )
+
+    def testARisize(self):
+        self.assertEqual( self.reads[0].isize, 167, "insert size mismatch in read 1: %s != %s" % (self.reads[0].isize, 167) )
+        self.assertEqual( self.reads[1].isize, 412, "insert size mismatch in read 2: %s != %s" % (self.reads[1].isize, 412) )
+
+    def testARseq(self):
+        '''compare seq of the first two reads with the expected sequences.'''
+        # The message for read 2 used to say "sequence size mismatch" although
+        # the sequences themselves are compared; both messages now agree.
+        self.assertEqual( self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 1: %s != %s" % (self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG") )
+        self.assertEqual( self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "sequence mismatch in read 2: %s != %s" % (self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA") )
+
+    def testARqual(self):
+        '''compare qual of the first two reads with the expected strings.'''
+        for idx, expected, template in (
+            (0, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", "quality string mismatch in read 1: %s != %s"),
+            (1, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "quality string mismatch in read 2: %s != %s")):
+            observed = self.reads[idx].qual
+            self.assertEqual( observed, expected, template % (observed, expected) )
+
+    def testPresentOptionalFields(self):
+        '''look up optional fields by tag with opt() and compare the values.'''
+        for idx, tag, expected, template in (
+            (0, 'NM', 1, "optional field mismatch in read 1, NM: %s != %s"),
+            (0, 'RG', 'L1', "optional field mismatch in read 1, RG: %s != %s"),
+            (1, 'RG', 'L2', "optional field mismatch in read 2, RG: %s != %s"),
+            (1, 'MF', 18, "optional field mismatch in read 2, MF: %s != %s")):
+            observed = self.reads[idx].opt( tag )
+            self.assertEqual( observed, expected, template % (observed, expected) )
+
+    def testPairedBools(self):
+        '''is_paired and is_proper_pair must be True for the first two reads.'''
+        for idx, attribute, template in (
+            (0, "is_paired", "is paired mismatch in read 1: %s != %s"),
+            (1, "is_paired", "is paired mismatch in read 2: %s != %s"),
+            (0, "is_proper_pair", "is proper pair mismatch in read 1: %s != %s"),
+            (1, "is_proper_pair", "is proper pair mismatch in read 2: %s != %s")):
+            observed = getattr( self.reads[idx], attribute )
+            self.assertEqual( observed, True, template % (observed, True) )
+
+    def testTags( self ):
+        '''compare the tags attribute of the first two reads with the
+        expected lists of (tag, value) pairs.
+        '''
+        expected_first = [('NM', 1), ('RG', 'L1'), ('PG', 'P1'), ('XT', 'U')]
+        expected_second = [('MF', 18), ('RG', 'L2'), ('PG', 'P2'), ('XT', 'R')]
+        self.assertEqual( self.reads[0].tags, expected_first )
+        self.assertEqual( self.reads[1].tags, expected_second )
+
+    def testOpt( self ):
+        '''opt("XT") returns the expected value for the first two reads.'''
+        for idx, expected in ( (0, "U"), (1, "R") ):
+            self.assertEqual( self.reads[idx].opt("XT"), expected )
+
+    def testMissingOpt( self ):
+        '''opt() must raise KeyError when asked for tag XP on the first read.'''
+        lookup = self.reads[0].opt
+        self.assertRaises( KeyError, lookup, "XP" )
+
+    def testEmptyOpt( self ):
+        '''opt() must raise KeyError when asked for tag XT on the third read.'''
+        # NOTE(review): presumably the third read carries no optional fields at all - confirm against ex3.sam
+        lookup = self.reads[2].opt
+        self.assertRaises( KeyError, lookup, "XT" )
+
+    def tearDown(self):
+        '''close self.samfile after each test.'''
+        handle = self.samfile
+        handle.close()
+
+class TestAlignedReadFromSam(TestAlignedReadFromBam):
+    '''Run the tests inherited from TestAlignedReadFromBam on ex3.sam.'''
+
+    def setUp(self):
+        '''open ex3.sam in mode "r" and keep all fetched reads in a list.'''
+        handle = pysam.Samfile( "ex3.sam","r" )
+        self.samfile = handle
+        self.reads = list( handle.fetch() )
+
+# needs to be implemented 
+# class TestAlignedReadFromSamWithoutHeader(TestAlignedReadFromBam):
+#
+#     def setUp(self):
+#         self.samfile=pysam.Samfile( "ex7.sam","r" )
+#         self.reads=list(self.samfile.fetch())
+
+class TestHeaderSam(unittest.TestCase):
+    '''Compare the header of ex3.sam with the expected dictionary.'''
+
+    # expected header content
+    header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
+                     {'LN': 1584, 'SN': 'chr2'}],
+              'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN":"name:with:colon"},
+                     {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN":"name:with:colon"}],
+              'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}],
+              'HD': {'VN': '1.0'},
+              'CO' : [ 'this is a comment', 'this is another comment'],
+              }
+
+    def compareHeaders( self, a, b ):
+        '''assert that every key of a is present in b with an equal value.
+
+        The check only runs in one direction; keys that exist only in b
+        are not detected.
+        '''
+        for key, value in a.iteritems():
+            self.assertTrue( key in b, "key '%s' not in '%s' " % (key,b) )
+            self.assertEqual( value, b[key] )
+
+    def setUp(self):
+        '''open ex3.sam in mode "r".'''
+        self.samfile = pysam.Samfile( "ex3.sam", "r" )
+
+    def testHeaders(self):
+        '''compare expected and parsed header in both directions.'''
+        # expected -> parsed catches missing or changed entries
+        self.compareHeaders( self.header, self.samfile.header )
+        # parsed -> expected catches entries that are not expected
+        self.compareHeaders( self.samfile.header, self.header )
+
+    def tearDown(self):
+        '''close the file opened in setUp.'''
+        self.samfile.close()
+
+class TestHeaderBam(TestHeaderSam):
+    '''Run the tests inherited from TestHeaderSam on ex3.bam.'''
+
+    def setUp(self):
+        '''open ex3.bam in mode "rb".'''
+        self.samfile = pysam.Samfile( "ex3.bam", "rb" )
+
+class TestUnmappedReads(unittest.TestCase):
+    '''fetch( until_eof = True ) must return two reads for ex5.sam and ex5.bam.'''
+
+    def countReads( self, filename, mode ):
+        '''return the number of reads yielded by fetch( until_eof = True ).
+
+        The file is closed even if opening succeeded but iterating
+        raised, so a failing test does not leak the handle.
+        '''
+        samfile = pysam.Samfile( filename, mode )
+        try:
+            return len(list(samfile.fetch( until_eof = True)))
+        finally:
+            samfile.close()
+
+    def testSAM(self):
+        self.assertEqual( self.countReads( "ex5.sam", "r" ), 2 )
+
+    def testBAM(self):
+        self.assertEqual( self.countReads( "ex5.bam", "rb" ), 2 )
+
+class TestPileupObjects(unittest.TestCase):
+    '''Check the pileup columns returned for single positions of ex1.bam.'''
+
+    def setUp(self):
+        self.samfile=pysam.Samfile( "ex1.bam","rb" )
+
+    def testPileupColumn(self):
+        '''check tid, pos and n of the columns for chr1:105 and chr2:1480.
+
+        The assertions only run for the column whose pos matches, so each
+        loop records whether that column was seen; otherwise the test
+        would pass without checking anything.
+        '''
+        seen = False
+        for pcolumn1 in self.samfile.pileup( region="chr1:105" ):
+            if pcolumn1.pos == 104:
+                seen = True
+                self.assertEqual( pcolumn1.tid, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.tid, 0) )
+                self.assertEqual( pcolumn1.pos, 105-1, "position mismatch in position 1: %s != %s" % (pcolumn1.pos, 105-1) )
+                self.assertEqual( pcolumn1.n, 2, "# reads mismatch in position 1: %s != %s" % (pcolumn1.n, 2) )
+        self.assertTrue( seen, "no pileup column with pos 104 returned for chr1:105" )
+
+        seen = False
+        for pcolumn2 in self.samfile.pileup( region="chr2:1480" ):
+            if pcolumn2.pos == 1479:
+                seen = True
+                # these messages used to say "position 1" (copy-paste)
+                self.assertEqual( pcolumn2.tid, 1, "chromosome/target id mismatch in position 2: %s != %s" % (pcolumn2.tid, 1) )
+                self.assertEqual( pcolumn2.pos, 1480-1, "position mismatch in position 2: %s != %s" % (pcolumn2.pos, 1480-1) )
+                self.assertEqual( pcolumn2.n, 12, "# reads mismatch in position 2: %s != %s" % (pcolumn2.n, 12) )
+        self.assertTrue( seen, "no pileup column with pos 1479 returned for chr2:1480" )
+
+    def testPileupRead(self):
+        '''check the number of entries in pileups for the column of chr1:105.'''
+        seen = False
+        for pcolumn1 in self.samfile.pileup( region="chr1:105" ):
+            if pcolumn1.pos == 104:
+                seen = True
+                self.assertEqual( len(pcolumn1.pileups), 2, "# reads aligned to column mismatch in position 1: %s != %s" % (len(pcolumn1.pileups), 2) )
+                # TODO: test additional properties of pcolumn1.pileups[0] here
+        self.assertTrue( seen, "no pileup column with pos 104 returned for chr1:105" )
+
+    def tearDown(self):
+        self.samfile.close()
+        
+        
+class TestExceptions(unittest.TestCase):
+    '''Missing files and invalid fetch() arguments must raise exceptions.'''
+
+    def setUp(self):
+        '''open ex1.bam in mode "rb".'''
+        self.samfile = pysam.Samfile( "ex1.bam", "rb" )
+
+    def testMissingFile(self):
+        '''opening a file that does not exist raises IOError in either mode.'''
+        for filename, mode in ( ("exdoesntexist.bam", "rb"),
+                                ("exdoesntexist.sam", "r"),
+                                ("exdoesntexist.bam", "r"),
+                                ("exdoesntexist.sam", "rb") ):
+            self.assertRaises( IOError, pysam.Samfile, filename, mode )
+
+    def testBadContig(self):
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, "chr88" )
+
+    def testMeaninglessCrap(self):
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, "skljf" )
+
+    def testBackwardsOrderNewFormat(self):
+        '''start after end, given as separate arguments.'''
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, 'chr1', 100, 10 )
+
+    def testBackwardsOrderOldFormat(self):
+        '''start after end, given as a region string.'''
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, region="chr1:100-10")
+
+    def testOutOfRangeNegativeNewFormat(self):
+        '''negative or zero coordinates, given as separate arguments.'''
+        for args in ( ("chr1", 5, -10),
+                      ("chr1", 5, 0),
+                      ("chr1", -5, -10) ):
+            self.assertRaises( ValueError, self.samfile.fetch, *args )
+
+    def testOutOfRangeNegativeOldFormat(self):
+        '''negative coordinates, given as a region string.'''
+        for region in ( "chr1:-5-10", "chr1:-5-0", "chr1:-5--10" ):
+            self.assertRaises( ValueError, self.samfile.fetch, region=region )
+
+    def testOutOfRangNewFormat(self):
+        '''coordinates of ten and eleven digits, given as separate arguments.'''
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, "chr1", 9999999999, 99999999999 )
+
+    def testOutOfRangeLargeNewFormat(self):
+        '''very large coordinates, given as separate arguments.'''
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, "chr1", 9999999999999999999999999999999, 9999999999999999999999999999999999999999 )
+
+    def testOutOfRangeLargeOldFormat(self):
+        '''very large coordinates, given as a single positional string.'''
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, "chr1:99999999999999999-999999999999999999" )
+
+    def tearDown(self):
+        '''close the file opened in setUp.'''
+        self.samfile.close()
+
+class TestFastaFile(unittest.TestCase):
+
+    mSequences = { 'chr1' :
+                       "CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCTGTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACCAAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCTCTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCAATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGCAGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAACAACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACACATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATACCATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTTTCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAATACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGAACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTGTGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAGTCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGCTTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTCTCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGGAGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATATTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTCTCCCTCGTCTTCTTA",
+                   'chr2' :
+                       "TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAGCTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCTTATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTTCAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTTAGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAGGAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATTTTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTAAGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATAAAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACCTCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATTAATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCAAATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGTAAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATATAACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAATACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGATGATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTGCGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATAGCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAAAAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAATTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAAAAAATATTTACAGTAACT",
+                   }
+
+    def setUp(self):
+        '''open ex1.fa as a pysam.Fastafile.'''
+        fasta = pysam.Fastafile( "ex1.fa" )
+        self.file = fasta
+
+    def testFetch(self):
+        '''fetch each sequence as a whole and in consecutive windows of 10.'''
+        window = 10
+        for name, expected in self.mSequences.items():
+            self.assertEqual( expected, self.file.fetch( name ) )
+            for start in range( 0, len(expected), window ):
+                end = start + window
+                self.assertEqual( expected[start:end], self.file.fetch( name, start, end ) )
+
+    def testFetchErrors( self ):
+        '''fetch() must raise ValueError for each of these argument lists.'''
+        for args in ( (),
+                      ("chr1", 0),
+                      ("chr1", -1, 10),
+                      ("chr1", 20, 10) ):
+            self.assertRaises( ValueError, self.file.fetch, *args )
+        # the following segfaults:
+        # self.assertRaises( IndexError, self.file.fetch, "chr12", )
+
+    def tearDown(self):
+        '''close the Fastafile opened in setUp.'''
+        fasta = self.file
+        fasta.close()
+
+
+class TestAlignedRead(unittest.TestCase):
+    '''tests to check if aligned read can be constructed
+    and manipulated.
+    '''
+
+    def checkFieldEqual( self, read1, read2, exclude = ()):
+        '''check if two reads are equal by comparing each field.
+
+        exclude lists the attribute names to skip. A single name may
+        also be passed as a plain string.
+        '''
+        # A bare string would be searched by substring ("seq" in "xseq"),
+        # so wrap it to get an exact name match.
+        if isinstance( exclude, str ): exclude = (exclude,)
+
+        for x in ("qname", "seq", "flag",
+                  "rname", "pos", "mapq", "cigar",
+                  "mrnm", "mpos", "isize", "qual",
+                  "is_paired", "is_proper_pair",
+                  "is_unmapped", "mate_is_unmapped",
+                  "is_reverse", "mate_is_reverse",
+                  "is_read1", "is_read2",
+                  "is_secondary", "is_qcfail",
+                  "is_duplicate", "bin"):
+            if x in exclude: continue
+            self.assertEqual( getattr(read1, x), getattr(read2,x), "attribute mismatch for %s: %s != %s" % 
+                              (x, getattr(read1, x), getattr(read2,x)))
+    
+    def testEmpty( self ):
+        '''check the attribute values of a freshly created read.'''
+        a = pysam.AlignedRead()
+        self.assertEqual( a.qname, None )
+        self.assertEqual( a.seq, None )
+        self.assertEqual( a.qual, None )
+        self.assertEqual( a.flag, 0 )
+        self.assertEqual( a.rname, 0 )
+        self.assertEqual( a.mapq, 0 )
+        self.assertEqual( a.cigar, None )
+        self.assertEqual( a.tags, None )
+        self.assertEqual( a.mrnm, 0 )
+        self.assertEqual( a.mpos, 0 )
+        self.assertEqual( a.isize, 0 )
+
+    def buildRead( self ):
+        '''build an example read.'''
+        
+        a = pysam.AlignedRead()
+        a.qname = "read_12345"
+        a.seq="ACGT" * 3
+        a.flag = 0
+        a.rname = 0
+        a.pos = 33
+        a.mapq = 20
+        a.cigar = ( (0,10), (2,1), (0,25) )
+        a.mrnm = 0
+        a.mpos=200
+        a.isize=167
+        # qual is assigned after seq (testUpdate treats qual as reset by a seq assignment)
+        a.qual="1234" * 3
+
+        return a
+
+    def testUpdate( self ):
+        '''check if updating fields affects other variable length data
+        '''
+        a = self.buildRead()
+        b = self.buildRead()
+
+        # check qname: shorter, longer, then restored
+        b.qname = "read_123"
+        self.checkFieldEqual( a, b, ("qname",) )
+        b.qname = "read_12345678"
+        self.checkFieldEqual( a, b, ("qname",) )
+        b.qname = "read_12345"
+        self.checkFieldEqual( a, b)
+
+        # check cigar: shorter, longer, then restored
+        b.cigar = ( (0,10), )
+        self.checkFieldEqual( a, b, ("cigar",) )
+        b.cigar = ( (0,10), (2,1), (0,25), (2,1), (0,25) )
+        self.checkFieldEqual( a, b, ("cigar",) )
+        b.cigar = ( (0,10), (2,1), (0,25) )
+        self.checkFieldEqual( a, b)
+
+        # check seq: qual stays excluded even after seq is restored
+        b.seq = "ACGT"
+        self.checkFieldEqual( a, b, ("seq", "qual") )
+        b.seq = "ACGT" * 10
+        self.checkFieldEqual( a, b, ("seq", "qual") )
+        b.seq = "ACGT" * 3
+        self.checkFieldEqual( a, b, ("qual",))
+
+        # reset qual
+        b = self.buildRead()
+
+        # check flags:
+        for x in (
+            "is_paired", "is_proper_pair",
+            "is_unmapped", "mate_is_unmapped",
+            "is_reverse", "mate_is_reverse",
+            "is_read1", "is_read2",
+            "is_secondary", "is_qcfail",
+            "is_duplicate"):
+            setattr( b, x, True )
+            self.assertEqual( getattr(b, x), True )
+            self.checkFieldEqual( a, b, ("flag", x,) )
+            setattr( b, x, False )
+            self.assertEqual( getattr(b, x), False )
+            self.checkFieldEqual( a, b )
+
+    def testLargeRead( self ):
+        '''check that a read with an 800 character seq and qual can be
+        assembled without raising.
+        '''
+        
+        a = pysam.AlignedRead()
+        a.qname = "read_12345"
+        a.seq="ACGT" * 200
+        a.flag = 0
+        a.rname = 0
+        a.pos = 33
+        a.mapq = 20
+        a.cigar = ( (0,10), (2,1), (0,25) )
+        a.mrnm = 0
+        a.mpos=200
+        a.isize=167
+        a.qual="1234" * 200
+
+
+class TestDeNovoConstruction(unittest.TestCase):
+    '''check BAM/SAM file construction using ex3.sam
+    
+    (note these are +1 coordinates):
+    
+    read_28833_29006_6945      99      chr1    33      20      10M1D25M        =       200     167     AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG     <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<     NM:i:1  RG:Z:L1
+    read_28701_28881_323b      147     chr2    88      30      35M     =       500     412     ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA     <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<     MF:i:18 RG:Z:L2
+
+    The reads built in setUp are compared with the reference files
+    ex6.bam and ex6.sam.
+    '''
+
+    header = { 'HD': {'VN': '1.0'},
+               'SQ': [{'LN': 1575, 'SN': 'chr1'}, 
+                      {'LN': 1584, 'SN': 'chr2'}], }
+
+    # reference files the constructed output is compared with
+    bamfile = "ex6.bam"
+    samfile = "ex6.sam"
+
+    def checkFieldEqual( self, read1, read2, exclude = ()):
+        '''check if two reads are equal by comparing each field.
+
+        exclude lists the attribute names to skip.
+        '''
+
+        for x in ("qname", "seq", "flag",
+                  "rname", "pos", "mapq", "cigar",
+                  "mrnm", "mpos", "isize", "qual",
+                  "bin",
+                  "is_paired", "is_proper_pair",
+                  "is_unmapped", "mate_is_unmapped",
+                  "is_reverse", "mate_is_reverse",
+                  "is_read1", "is_read2",
+                  "is_secondary", "is_qcfail",
+                  "is_duplicate"):
+            if x in exclude: continue
+            self.assertEqual( getattr(read1, x), getattr(read2,x), "attribute mismatch for %s: %s != %s" % 
+                              (x, getattr(read1, x), getattr(read2,x)))
+
+    def setUp( self ):
+        '''build the two reads from the class docstring; pos and mpos
+        are one less than the SAM columns shown there.
+        '''
+        
+        a = pysam.AlignedRead()
+        a.qname = "read_28833_29006_6945"
+        a.seq="AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
+        a.flag = 99
+        a.rname = 0
+        a.pos = 32
+        a.mapq = 20
+        a.cigar = ( (0,10), (2,1), (0,25) )
+        a.mrnm = 0
+        a.mpos=199
+        a.isize=167
+        a.qual="<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"
+        a.tags = ( ("NM", 1),
+                   ("RG", "L1") )
+
+        b = pysam.AlignedRead()
+        b.qname = "read_28701_28881_323b"
+        b.seq="ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"
+        b.flag = 147
+        b.rname = 1
+        b.pos = 87
+        b.mapq = 30
+        b.cigar = ( (0,35), )
+        b.mrnm = 1
+        b.mpos=499
+        b.isize=412
+        b.qual="<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"
+        b.tags = ( ("MF", 18),
+                   ("RG", "L2") )
+
+        self.reads = (a,b)
+
+    def checkReadsFromFile( self, filename, mode ):
+        '''compare the reads stored in filename with the reads built in setUp.
+
+        The file is closed even if a comparison fails.
+        '''
+        infile = pysam.Samfile( filename, mode )
+        try:
+            others = list(infile)
+            # zip() would silently ignore surplus reads on either side
+            self.assertEqual( len(others), len(self.reads),
+                              "number of reads mismatch in %s: %i != %i" % (filename, len(others), len(self.reads)) )
+            for other, denovo in zip( others, self.reads ):
+                self.checkFieldEqual( denovo, other )
+                self.assertEqual( denovo, other )
+        finally:
+            infile.close()
+
+    def testSAMWholeFile( self ):
+        '''write the reads to a SAM file and compare it with the reference file.'''
+        
+        tmpfilename = "tmp_%i.sam" % id(self)
+
+        outfile = pysam.Samfile( tmpfilename, "wh", header = self.header )
+
+        for x in self.reads: outfile.write( x )
+        outfile.close()
+        
+        self.assertTrue( checkBinaryEqual( tmpfilename, self.samfile ),
+                         "mismatch when construction SAM file, see %s %s" % (tmpfilename, self.samfile))
+        
+        # only reached on success: a mismatching file is kept for inspection
+        os.unlink( tmpfilename )
+
+    def testBAMPerRead( self ):
+        '''check if individual reads are binary equal.'''
+        self.checkReadsFromFile( self.bamfile, "rb" )
+
+    def testSAMPerRead( self ):
+        '''check if individual reads are binary equal.'''
+        self.checkReadsFromFile( self.samfile, "r" )
+            
+    def testBAMWholeFile( self ):
+        '''write the reads to a BAM file and compare it with the reference file.'''
+        
+        tmpfilename = "tmp_%i.bam" % id(self)
+
+        outfile = pysam.Samfile( tmpfilename, "wb", header = self.header )
+
+        for x in self.reads: outfile.write( x )
+        outfile.close()
+        
+        self.assertTrue( checkBinaryEqual( tmpfilename, self.bamfile ),
+                         "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile))
+        
+        # only reached on success: a mismatching file is kept for inspection
+        os.unlink( tmpfilename )
+
+
+# TODOS
+# 1. finish testing all properties within pileup objects
+# 2. check exceptions and bad input problems (missing files, optional fields that aren't present, etc...)
+
+if __name__ == "__main__":
+    # build data files
+    print "building data files"
+    retcode = subprocess.call( "make", shell=True)
+    if retcode != 0:
+        # keep going: the data files may already exist from an earlier
+        # build, but make the failure visible instead of ignoring it.
+        print "warning: make exited with status %i, data files may be missing or stale" % retcode
+    print "starting tests"
+    unittest.main()
diff --git a/tests/segfault_tests.py b/tests/segfault_tests.py
new file mode 100755 (executable)
index 0000000..ff32fec
--- /dev/null
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+'''unit testing code for pysam.'''
+
+import pysam
+import unittest
+import os
+import itertools
+import subprocess
+import shutil
+
+class TestExceptions(unittest.TestCase):
+    '''fetch() on ex1.bam must raise ValueError for out of range regions.'''
+
+    def setUp(self):
+        '''open ex1.bam in mode "rb".'''
+        self.samfile = pysam.Samfile( "ex1.bam", "rb" )
+
+    def testOutOfRangeNegativeNewFormat(self):
+        '''negative or zero coordinates, given as separate arguments.'''
+        for args in ( ("chr1", 5, -10),
+                      ("chr1", 5, 0),
+                      ("chr1", -5, -10) ):
+            self.assertRaises( ValueError, self.samfile.fetch, *args )
+
+    def testOutOfRangeNegativeOldFormat(self):
+        '''negative coordinates, given as a single positional string.'''
+        for region in ( "chr1:-5-10", "chr1:-5-0", "chr1:-5--10" ):
+            self.assertRaises( ValueError, self.samfile.fetch, region )
+
+    def testOutOfRangeLargeNewFormat(self):
+        '''very large coordinates, given as separate arguments.'''
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, "chr1", 99999999999999999, 999999999999999999 )
+
+    def testOutOfRangeLargeOldFormat(self):
+        '''very large coordinates, given as a single positional string.'''
+        fetch = self.samfile.fetch
+        self.assertRaises( ValueError, fetch, "chr1:99999999999999999-999999999999999999" )
+
+    def tearDown(self):
+        '''close the file opened in setUp.'''
+        self.samfile.close()
+
+if __name__ == "__main__":
+    # run the test cases of this module when executed as a script;
+    # unlike pysam_test.py no data files are built first
+    unittest.main()
+