Imported Upstream version 0.1.5c
author Charles Plessy <plessy@debian.org>
Tue, 8 Sep 2009 10:38:06 +0000 (19:38 +0900)
committer Charles Plessy <plessy@debian.org>
Tue, 8 Sep 2009 10:38:06 +0000 (19:38 +0900)
68 files changed:
AUTHORS [new file with mode: 0644]
COPYING [new file with mode: 0644]
ChangeLog [new file with mode: 0644]
INSTALL [new file with mode: 0644]
Makefile [new file with mode: 0644]
NEWS [new file with mode: 0644]
bam.c [new file with mode: 0644]
bam.h [new file with mode: 0644]
bam_aux.c [new file with mode: 0644]
bam_color.c [new file with mode: 0644]
bam_endian.h [new file with mode: 0644]
bam_import.c [new file with mode: 0644]
bam_index.c [new file with mode: 0644]
bam_lpileup.c [new file with mode: 0644]
bam_maqcns.c [new file with mode: 0644]
bam_maqcns.h [new file with mode: 0644]
bam_mate.c [new file with mode: 0644]
bam_md.c [new file with mode: 0644]
bam_pileup.c [new file with mode: 0644]
bam_plcmd.c [new file with mode: 0644]
bam_rmdup.c [new file with mode: 0644]
bam_rmdupse.c [new file with mode: 0644]
bam_sort.c [new file with mode: 0644]
bam_stat.c [new file with mode: 0644]
bam_tview.c [new file with mode: 0644]
bamtk.c [new file with mode: 0644]
bgzf.c [new file with mode: 0644]
bgzf.h [new file with mode: 0644]
bgzip.c [new file with mode: 0644]
examples/00README.txt [new file with mode: 0644]
examples/Makefile [new file with mode: 0644]
examples/calDepth.c [new file with mode: 0644]
examples/ex1.fa [new file with mode: 0644]
examples/ex1.sam.gz [new file with mode: 0644]
faidx.c [new file with mode: 0644]
faidx.h [new file with mode: 0644]
glf.c [new file with mode: 0644]
glf.h [new file with mode: 0644]
khash.h [new file with mode: 0644]
knetfile.c [new file with mode: 0644]
knetfile.h [new file with mode: 0644]
kseq.h [new file with mode: 0644]
ksort.h [new file with mode: 0644]
kstring.c [new file with mode: 0644]
kstring.h [new file with mode: 0644]
misc/Makefile [new file with mode: 0644]
misc/blast2sam.pl [new file with mode: 0755]
misc/bowtie2sam.pl [new file with mode: 0755]
misc/export2sam.pl [new file with mode: 0755]
misc/interpolate_sam.pl [new file with mode: 0755]
misc/maq2sam.c [new file with mode: 0644]
misc/md5.c [new file with mode: 0644]
misc/md5.h [new file with mode: 0644]
misc/md5fa.c [new file with mode: 0644]
misc/novo2sam.pl [new file with mode: 0755]
misc/samtools.pl [new file with mode: 0755]
misc/soap2sam.pl [new file with mode: 0755]
misc/wgsim.c [new file with mode: 0644]
misc/wgsim_eval.pl [new file with mode: 0755]
misc/zoom2sam.pl [new file with mode: 0755]
razf.c [new file with mode: 0644]
razf.h [new file with mode: 0644]
razip.c [new file with mode: 0644]
sam.c [new file with mode: 0644]
sam.h [new file with mode: 0644]
sam_view.c [new file with mode: 0644]
samtools.1 [new file with mode: 0644]
source.dot [new file with mode: 0644]

diff --git a/AUTHORS b/AUTHORS
new file mode 100644 (file)
index 0000000..435431c
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,16 @@
+Heng Li from the Sanger Institute wrote most of the initial source code
+of SAMtools and various converters.
+
+Bob Handsaker from the Broad Institute is a major contributor to the
+SAM/BAM specification. He designed and implemented the BGZF format, the
+underlying indexable compression format for the BAM format. BGZF does
+not support arithmetic between file offsets.
+
+Jue Ruan from the Beijing Genome Institute designed and implemented the
+RAZF format, an alternative indexable compression format. RAZF supports
+arithmetic between file offsets, at the cost of increased index file
+size and the full compatibility with gzip. RAZF is optional and only
+used in `faidx' for indexing RAZF compressed fasta files.
+
+Colin Hercus updated novo2sam.pl to support gapped alignment by
+novoalign.
diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..82fa2f4
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2008-2009 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/ChangeLog b/ChangeLog
new file mode 100644 (file)
index 0000000..3bf82a5
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,2099 @@
+------------------------------------------------------------------------
+r372 | lh3lh3 | 2009-07-07 09:49:27 +0100 (Tue, 07 Jul 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam.c
+
+ * samtools-0.1.4-23 (r372)
+ * keep header text if "view -t" is used (by Gerton)
+
+------------------------------------------------------------------------
+r371 | lh3lh3 | 2009-07-07 01:13:32 +0100 (Tue, 07 Jul 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/samtools.1
+
+update documentation
+
+------------------------------------------------------------------------
+r370 | bhandsaker | 2009-07-02 22:24:34 +0100 (Thu, 02 Jul 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+
+Introduced LIBPATH variable so this could be overridden to allow samtools to build correctly at the Broad.
+
+------------------------------------------------------------------------
+r369 | lh3lh3 | 2009-07-02 13:36:53 +0100 (Thu, 02 Jul 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-22 (r369)
+ * in pileup, optionally print E2 and U2
+ * remove the debugging code in bam_aux_get() (Drat!)
+
+------------------------------------------------------------------------
+r368 | lh3lh3 | 2009-07-02 11:32:26 +0100 (Thu, 02 Jul 2009) | 6 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bam_lpileup.c
+   M /trunk/samtools/bam_md.c
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bam_rmdup.c
+   M /trunk/samtools/bam_stat.c
+   M /trunk/samtools/bam_tview.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/faidx.c
+   M /trunk/samtools/faidx.h
+   M /trunk/samtools/glf.c
+
+ * samtools-0.1.4-21 (r368)
+ * propagate errors rather than exiting or failing an assertion. Assertions
+   should only be used for checking internal bugs, not for external input
+   inconsistency. I was just a bit lazy.
+ * small memory leak may be present on failure, though
+
+------------------------------------------------------------------------
+r367 | lh3lh3 | 2009-06-30 16:18:42 +0100 (Tue, 30 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/knetfile.c
+
+reduce the chance of blocking in FTP connection
+
+------------------------------------------------------------------------
+r366 | lh3lh3 | 2009-06-30 15:35:21 +0100 (Tue, 30 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/knetfile.c
+
+minor changes to knetfile: invalid fd equals -1 rather than 0
+
+------------------------------------------------------------------------
+r365 | lh3lh3 | 2009-06-30 14:04:30 +0100 (Tue, 30 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/knetfile.c
+   M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.4-20 (r365)
+ * download the BAM index file if it is not found in the current working directory.
+
+------------------------------------------------------------------------
+r364 | lh3lh3 | 2009-06-30 12:39:07 +0100 (Tue, 30 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/knetfile.c
+
+ * samtools-0.1.4-19 (r364)
+ * knetfile: report error when the file is not present on FTP
+
+------------------------------------------------------------------------
+r363 | lh3lh3 | 2009-06-29 23:23:32 +0100 (Mon, 29 Jun 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam_tview.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/bgzf.c
+   M /trunk/samtools/bgzf.h
+   M /trunk/samtools/knetfile.c
+   M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.4-18 (r363)
+ * knetfile: do not trigger network communication in FTP seek (lazy seek)
+ * bgzf: cache recent blocks (disabled by default)
+
+------------------------------------------------------------------------
+r362 | lh3lh3 | 2009-06-25 21:04:34 +0100 (Thu, 25 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bgzf.c
+
+write changelog
+
+------------------------------------------------------------------------
+r361 | lh3lh3 | 2009-06-25 21:03:10 +0100 (Thu, 25 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-17 (r361)
+ * if a file is given on FTP, search locally for the BAM index
+
+------------------------------------------------------------------------
+r360 | lh3lh3 | 2009-06-25 20:44:52 +0100 (Thu, 25 Jun 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/bgzf.c
+   M /trunk/samtools/bgzf.h
+   M /trunk/samtools/knetfile.c
+   M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.4-16 (r360)
+ * report more information in index when the input is not sorted
+ * change the behaviour of knet_seek() such that it returns 0 on success
+ * support knetfile library in BGZF
+
+------------------------------------------------------------------------
+r359 | lh3lh3 | 2009-06-25 17:10:55 +0100 (Thu, 25 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/knetfile.c
+   M /trunk/samtools/knetfile.h
+
+fixed bugs in knetfile.*
+
+------------------------------------------------------------------------
+r358 | lh3lh3 | 2009-06-25 13:53:19 +0100 (Thu, 25 Jun 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/knetfile.h
+
+this is the header file
+
+------------------------------------------------------------------------
+r357 | lh3lh3 | 2009-06-25 13:52:03 +0100 (Thu, 25 Jun 2009) | 3 lines
+Changed paths:
+   A /trunk/samtools/knetfile.c
+
+ * open a file at FTP
+ * preliminary version
+
+------------------------------------------------------------------------
+r354 | lh3lh3 | 2009-06-24 14:02:25 +0100 (Wed, 24 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-15 (r354)
+ * fixed a memory leak in bam_view1(), although samtools is not using this routine.
+
+------------------------------------------------------------------------
+r351 | lh3lh3 | 2009-06-18 00:16:26 +0100 (Thu, 18 Jun 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/faidx.c
+
+ * samtools-0.1.4-13 (r351)
+ * make faidx more tolerant to empty lines right before or after > lines
+ * hope this does not introduce new bugs...
+
+------------------------------------------------------------------------
+r350 | lh3lh3 | 2009-06-16 14:37:01 +0100 (Tue, 16 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-13 (r350)
+ * fixed a small memory leak in pileup, caused by recent modifications
+
+------------------------------------------------------------------------
+r347 | lh3lh3 | 2009-06-13 21:20:49 +0100 (Sat, 13 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-12 (r347)
+ * added `-S' to pileup, similar to `view -S' 
+
+------------------------------------------------------------------------
+r346 | lh3lh3 | 2009-06-13 17:52:31 +0100 (Sat, 13 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam_view.c
+   M /trunk/samtools/samtools.1
+
+ * samtools-0.1.4-11 (r346)
+ * allow to select a read group at view command-line
+
+------------------------------------------------------------------------
+r344 | lh3lh3 | 2009-06-13 14:06:24 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/examples/calDepth.c
+
+added more comments
+
+------------------------------------------------------------------------
+r343 | lh3lh3 | 2009-06-13 14:01:22 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/examples/calDepth.c
+
+nothing really
+
+------------------------------------------------------------------------
+r342 | lh3lh3 | 2009-06-13 13:58:48 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/examples/Makefile
+   A /trunk/samtools/examples/calDepth.c
+
+added an example of calculating read depth
+
+------------------------------------------------------------------------
+r341 | lh3lh3 | 2009-06-13 13:00:08 +0100 (Sat, 13 Jun 2009) | 6 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   A /trunk/samtools/bam_color.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bam_sort.c
+   M /trunk/samtools/bam_tview.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam.c
+   M /trunk/samtools/sam.h
+
+ * samtools-0.1.4-10 (r341)
+ * only include key APIs in libbam.a
+ * move color-specific routines to bam_color.c
+ * update documentations
+ * remove the support of -q in pileup
+
+------------------------------------------------------------------------
+r340 | lh3lh3 | 2009-06-13 11:17:14 +0100 (Sat, 13 Jun 2009) | 6 lines
+Changed paths:
+   M /trunk/samtools/INSTALL
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bam_tview.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/razf.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-9 (r340)
+ * added a warning to razf.c if zlib<1.2.2.1
+ * fixed a compilation warning
+ * fixed a segfault caused by @RG parsing
+ * detect NCURSES in bam_tview.c
+
+------------------------------------------------------------------------
+r339 | lh3lh3 | 2009-06-13 10:35:19 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/INSTALL
+
+update INSTALL
+
+------------------------------------------------------------------------
+r338 | lh3lh3 | 2009-06-13 00:15:24 +0100 (Sat, 13 Jun 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/kstring.h
+   M /trunk/samtools/sam.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-8 (r338)
+ * parse the @RG header lines and allow to choose library at the "samtools view"
+   command line
+
+------------------------------------------------------------------------
+r337 | lh3lh3 | 2009-06-12 21:25:50 +0100 (Fri, 12 Jun 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/bgzf.c
+   M /trunk/samtools/bgzf.h
+   M /trunk/samtools/sam.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-7 (r337)
+ * bgzf.c: support mode string "wu": uncompressed output
+ * "samtools view" support "-u" command-line option
+
+------------------------------------------------------------------------
+r336 | lh3lh3 | 2009-06-12 17:20:12 +0100 (Fri, 12 Jun 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/misc/Makefile
+   M /trunk/samtools/razf.c
+   M /trunk/samtools/razf.h
+   M /trunk/samtools/razip.c
+
+ * no changes to samtools itself
+ * remove zlib source codes
+ * make RAZF reading compatible with old version of zlib
+ * on old version of zlib, writing is not available
+
+------------------------------------------------------------------------
+r335 | lh3lh3 | 2009-06-12 16:47:33 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+   D /trunk/samtools/zlib
+
+remove zlib for simplification...
+
+------------------------------------------------------------------------
+r334 | lh3lh3 | 2009-06-12 15:43:36 +0100 (Fri, 12 Jun 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-6 (r334)
+ * do not export bam_aux_get_core() for Bio::DB::Sam because it has already
+   been implemented in that.
+ * this version works with the latest Bio::DB::Sam (20090612)
+
+------------------------------------------------------------------------
+r333 | lh3lh3 | 2009-06-12 15:33:42 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+
+update ChangeLog
+
+------------------------------------------------------------------------
+r332 | lh3lh3 | 2009-06-12 15:21:21 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/AUTHORS
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/misc/Makefile
+
+fixed minor things in Makefile
+
+------------------------------------------------------------------------
+r331 | lh3lh3 | 2009-06-12 15:07:05 +0100 (Fri, 12 Jun 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-5 (r331)
+ * no change to samtools itself. Version number is increased to reflect the
+   changes in the Makefile building system.
+
+------------------------------------------------------------------------
+r330 | lh3lh3 | 2009-06-12 15:03:38 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/AUTHORS
+   D /trunk/samtools/README
+
+update information...
+
+------------------------------------------------------------------------
+r329 | lh3lh3 | 2009-06-12 14:52:21 +0100 (Fri, 12 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/novo2sam.pl
+
+ * updated novoalign converter by Colin Hercus et al.
+ * this version works with indels
+
+------------------------------------------------------------------------
+r328 | lh3lh3 | 2009-06-12 14:50:53 +0100 (Fri, 12 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/INSTALL
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/misc/Makefile
+   M /trunk/samtools/zlib/Makefile
+
+ * update Makefile
+ * update INSTALL instruction
+
+------------------------------------------------------------------------
+r327 | lh3lh3 | 2009-06-12 14:18:29 +0100 (Fri, 12 Jun 2009) | 4 lines
+Changed paths:
+   A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.generic:325)
+   D /trunk/samtools/Makefile.am
+   D /trunk/samtools/Makefile.generic
+   D /trunk/samtools/Makefile.lite
+   D /trunk/samtools/autogen.sh
+   D /trunk/samtools/cleanup.sh
+   D /trunk/samtools/configure.ac
+   A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.generic:305)
+   D /trunk/samtools/misc/Makefile.am
+   D /trunk/samtools/misc/Makefile.generic
+   M /trunk/samtools/razf.c
+   A /trunk/samtools/zlib
+   A /trunk/samtools/zlib/Makefile
+   A /trunk/samtools/zlib/adler32.c
+   A /trunk/samtools/zlib/compress.c
+   A /trunk/samtools/zlib/crc32.c
+   A /trunk/samtools/zlib/crc32.h
+   A /trunk/samtools/zlib/deflate.c
+   A /trunk/samtools/zlib/deflate.h
+   A /trunk/samtools/zlib/gzio.c
+   A /trunk/samtools/zlib/infback.c
+   A /trunk/samtools/zlib/inffast.c
+   A /trunk/samtools/zlib/inffast.h
+   A /trunk/samtools/zlib/inffixed.h
+   A /trunk/samtools/zlib/inflate.c
+   A /trunk/samtools/zlib/inflate.h
+   A /trunk/samtools/zlib/inftrees.c
+   A /trunk/samtools/zlib/inftrees.h
+   A /trunk/samtools/zlib/trees.c
+   A /trunk/samtools/zlib/trees.h
+   A /trunk/samtools/zlib/uncompr.c
+   A /trunk/samtools/zlib/zconf.h
+   A /trunk/samtools/zlib/zlib.h
+   A /trunk/samtools/zlib/zutil.c
+   A /trunk/samtools/zlib/zutil.h
+   D /trunk/samtools/zutil.h
+
+ * added zlib-1.2.3 as razip requires that
+ * prepare to change back to the Makefile building system
+ * unfinished! (will be soon)
+
+------------------------------------------------------------------------
+r326 | lh3lh3 | 2009-06-12 14:12:03 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+Unfinished
+
+------------------------------------------------------------------------
+r325 | lh3lh3 | 2009-06-10 16:27:59 +0100 (Wed, 10 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-4 (r325)
+ * further avoid wrong consensus calls in repetitive regions.
+
+------------------------------------------------------------------------
+r324 | lh3lh3 | 2009-06-10 15:56:17 +0100 (Wed, 10 Jun 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam.c
+   M /trunk/samtools/sam.h
+
+ * samtools-0.1.4-3 (r324)
+ * make maqcns generate the correct call in repetitive regions.
+ * allow filtering on mapQ at the pileup command line
+
+------------------------------------------------------------------------
+r323 | lh3lh3 | 2009-06-10 10:04:21 +0100 (Wed, 10 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.3.2 (r322)
+ * indels and SNPs use different mapping quality threshold
+
+------------------------------------------------------------------------
+r322 | lh3lh3 | 2009-06-10 10:03:22 +0100 (Wed, 10 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/export2sam.pl
+
+fixed a typo
+
+------------------------------------------------------------------------
+r321 | lh3lh3 | 2009-06-09 09:21:48 +0100 (Tue, 09 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+just typo. no real change
+
+------------------------------------------------------------------------
+r320 | lh3lh3 | 2009-06-08 14:32:51 +0100 (Mon, 08 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+a little bit of code cleanup
+
+------------------------------------------------------------------------
+r319 | lh3lh3 | 2009-06-08 14:22:33 +0100 (Mon, 08 Jun 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.3.1
+ * change default parameters
+ * optionally print filtered variants
+
+------------------------------------------------------------------------
+r318 | lh3lh3 | 2009-06-08 14:14:26 +0100 (Mon, 08 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.3.0
+ * combine snpFilter and indelFilter
+
+------------------------------------------------------------------------
+r317 | lh3lh3 | 2009-06-08 11:31:42 +0100 (Mon, 08 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.3
+ * change a default parameter
+
+------------------------------------------------------------------------
+r316 | lh3lh3 | 2009-06-08 11:11:06 +0100 (Mon, 08 Jun 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_maqcns.h
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam.c
+
+ * samtools-0.1.4-2 (r316)
+ * pileup: cap mapping quality at 60 (by default)
+ * pileup: always calculate RMS mapq
+ * pileup: allow to output variant sites only
+
+------------------------------------------------------------------------
+r312 | lh3lh3 | 2009-06-04 13:01:10 +0100 (Thu, 04 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.2
+ * added pileup2fq
+
+------------------------------------------------------------------------
+r311 | lh3lh3 | 2009-06-03 09:40:40 +0100 (Wed, 03 Jun 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * in snpFilter, suppress non-SNP sites
+
+------------------------------------------------------------------------
+r310 | lh3lh3 | 2009-06-01 14:35:13 +0100 (Mon, 01 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.1
+ * fixed a typo
+
+------------------------------------------------------------------------
+r309 | lh3lh3 | 2009-06-01 14:04:39 +0100 (Mon, 01 Jun 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.0
+ * snpFilter
+
+------------------------------------------------------------------------
+r306 | lh3lh3 | 2009-05-28 11:49:35 +0100 (Thu, 28 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bgzf.c
+
+ * minor changes to bgzf: return NULL if fd == -1
+ * suggested by {kdj,jm18}@sanger.ac.uk
+
+------------------------------------------------------------------------
+r305 | lh3lh3 | 2009-05-28 11:16:08 +0100 (Thu, 28 May 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/misc/interpolate_sam.pl
+
+Script for paired-end pileup, contributed by Stephen Montgomery.
+
+------------------------------------------------------------------------
+r304 | lh3lh3 | 2009-05-28 11:08:49 +0100 (Thu, 28 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam.c
+
+ * samtools-0.1.4-1 (r304)
+ * fixed a minor bug in printing headers
+
+------------------------------------------------------------------------
+r297 | lh3lh3 | 2009-05-21 16:06:16 +0100 (Thu, 21 May 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/NEWS
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/misc/maq2sam.c
+   M /trunk/samtools/samtools.1
+
+Release samtools-0.1.4
+
+------------------------------------------------------------------------
+r296 | lh3lh3 | 2009-05-21 12:53:14 +0100 (Thu, 21 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-24 (r296)
+ * another similar bug in the indel caller
+
+------------------------------------------------------------------------
+r295 | lh3lh3 | 2009-05-21 12:50:28 +0100 (Thu, 21 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-23 (r295)
+ * fixed a critical bug in the indel caller
+
+------------------------------------------------------------------------
+r294 | lh3lh3 | 2009-05-20 13:00:20 +0100 (Wed, 20 May 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_stat.c
+
+added a missing header file
+
+------------------------------------------------------------------------
+r293 | lh3lh3 | 2009-05-19 23:44:25 +0100 (Tue, 19 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_tview.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-22 (r293)
+ * open tview in the dot-view mode by default
+
+------------------------------------------------------------------------
+r292 | lh3lh3 | 2009-05-18 21:01:23 +0100 (Mon, 18 May 2009) | 6 lines
+Changed paths:
+   M /trunk/samtools/samtools.1
+
+Added a note to the manual. Currently SAMtools uses unaligned words in
+several places. Although this does not cause bus errors for me, it may
+affect portability. Please see the "Bus error" wiki page for more
+information. Thanks also to James Bonfields for pointing this out.
+
+
+------------------------------------------------------------------------
+r286 | lh3lh3 | 2009-05-14 15:23:13 +0100 (Thu, 14 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-21 (286)
+ * declare bam_aux_get_core() in bam.h
+
+------------------------------------------------------------------------
+r276 | lh3lh3 | 2009-05-13 10:07:55 +0100 (Wed, 13 May 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-20 (r276)
+ * remove bam1_t::hash again. We need to modify the Perl API anyway to
+   make it work with the latest SVN.
+ * As is suggested by Tim, scan "{base}.bai" and "{base}.bam.bai" for index
+
+------------------------------------------------------------------------
+r275 | lh3lh3 | 2009-05-12 21:14:10 +0100 (Tue, 12 May 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-19 (r275)
+ * a minor change to the bam1_t struct: added back "void *hash" for the
+   backward compatibility with Bio::DB::Sam
+
+------------------------------------------------------------------------
+r273 | lh3lh3 | 2009-05-12 14:28:39 +0100 (Tue, 12 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_rmdupse.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-18 (r273)
+ * rmdupse: do not remove unmapped reads
+
+------------------------------------------------------------------------
+r272 | lh3lh3 | 2009-05-12 14:20:00 +0100 (Tue, 12 May 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_rmdupse.c
+
+change a parameter. It does nothing
+
+------------------------------------------------------------------------
+r271 | lh3lh3 | 2009-05-12 14:17:58 +0100 (Tue, 12 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/Makefile.am
+   M /trunk/samtools/Makefile.generic
+   M /trunk/samtools/Makefile.lite
+   A /trunk/samtools/bam_rmdupse.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/configure.ac
+
+ * samtools-0.1.3-17 (r271)
+ * added 'rmdupse' command
+
+------------------------------------------------------------------------
+r267 | lh3lh3 | 2009-05-05 22:31:41 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-16 (r267)
+ * in sam_view.c, changed g_flag_on based on the suggestion by Angie Hinrichs
+
+------------------------------------------------------------------------
+r266 | lh3lh3 | 2009-05-05 22:23:27 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-15 (r266)
+ * report an error if a non-* reference is present while @SQ is absent
+
+------------------------------------------------------------------------
+r265 | lh3lh3 | 2009-05-05 22:09:00 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-14 (r262)
+ * make samopen() recognize @SQ header lines
+
+------------------------------------------------------------------------
+r261 | lh3lh3 | 2009-05-05 15:10:30 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/bgzf.c
+   M /trunk/samtools/sam.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-13 (r260)
+ * report error for file I/O error
+
+------------------------------------------------------------------------
+r260 | lh3lh3 | 2009-05-05 15:01:16 +0100 (Tue, 05 May 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile.am
+
+update Makefile.am
+
+------------------------------------------------------------------------
+r259 | lh3lh3 | 2009-05-05 14:52:25 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam.c
+   M /trunk/samtools/sam.h
+
+ * samtools-0.1.3-12 (r259)
+ * use the new I/O interface in pileup
+
+------------------------------------------------------------------------
+r258 | lh3lh3 | 2009-05-05 14:33:22 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/Makefile.generic
+   M /trunk/samtools/Makefile.lite
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bamtk.c
+   A /trunk/samtools/sam.c
+   A /trunk/samtools/sam.h
+   A /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-11 (r258)
+ * unify the interface to BAM and SAM I/O
+
+------------------------------------------------------------------------
+r257 | lh3lh3 | 2009-05-05 09:53:35 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/Makefile.lite
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-10 (r257)
+ * allow hex with "pileup -m"
+
+------------------------------------------------------------------------
+r256 | lh3lh3 | 2009-05-04 19:16:50 +0100 (Mon, 04 May 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam_lpileup.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-9 (r256)
+ * fixed a bug in bam_lpileup.c
+ * I do not know if this also fixes the bug causing assertion failure in the tview
+
+------------------------------------------------------------------------
+r251 | lh3lh3 | 2009-04-28 13:53:23 +0100 (Tue, 28 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-8 (r251)
+ * fixed a bug when there are reads without coordinates
+
+------------------------------------------------------------------------
+r250 | lh3lh3 | 2009-04-28 13:43:33 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/AUTHORS
+   A /trunk/samtools/README
+   M /trunk/samtools/cleanup.sh
+
+added missing files
+
+------------------------------------------------------------------------
+r249 | lh3lh3 | 2009-04-28 13:37:16 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile.generic
+   M /trunk/samtools/Makefile.lite
+   M /trunk/samtools/configure.ac
+   M /trunk/samtools/misc/Makefile.generic
+
+improve large file support in compilation
+
+------------------------------------------------------------------------
+r248 | lh3lh3 | 2009-04-28 13:33:24 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/INSTALL
+
+update INSTALL
+
+------------------------------------------------------------------------
+r247 | lh3lh3 | 2009-04-28 13:28:50 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile.am
+   M /trunk/samtools/autogen.sh
+   M /trunk/samtools/cleanup.sh
+   M /trunk/samtools/configure.ac
+   A /trunk/samtools/misc/Makefile.am
+
+fixed various issues about the GNU building scripts
+
+------------------------------------------------------------------------
+r246 | lh3lh3 | 2009-04-28 13:10:23 +0100 (Tue, 28 Apr 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   D /trunk/samtools/Makefile
+   A /trunk/samtools/Makefile.am
+   A /trunk/samtools/Makefile.generic
+   A /trunk/samtools/autogen.sh
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bam_tview.c
+   M /trunk/samtools/bamtk.c
+   A /trunk/samtools/cleanup.sh
+   A /trunk/samtools/configure.ac
+   D /trunk/samtools/misc/Makefile
+   A /trunk/samtools/misc/Makefile.generic (from /trunk/samtools/misc/Makefile:245)
+
+ * samtools-0.1.3-7 (r246)
+ * incorporated revisions from Nils Homer
+ * enhanced support of displaying color-space reads
+
+------------------------------------------------------------------------
+r244 | lh3lh3 | 2009-04-25 11:49:40 +0100 (Sat, 25 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_md.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-6 (r244)
+ * fixed segfault for unmapped reads
+
+------------------------------------------------------------------------
+r243 | lh3lh3 | 2009-04-24 21:27:26 +0100 (Fri, 24 Apr 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_md.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-5 (r243)
+ * fixed a long existing bug which may cause memory leak
+ * check MD
+ * consensus calling now works with "=", but indel calling not
+
+------------------------------------------------------------------------
+r242 | lh3lh3 | 2009-04-24 20:44:46 +0100 (Fri, 24 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_md.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-4 (r242)
+ * fixed a memory leak
+
+------------------------------------------------------------------------
+r240 | lh3lh3 | 2009-04-24 16:40:18 +0100 (Fri, 24 Apr 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/Makefile.lite
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   A /trunk/samtools/bam_md.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-3 (r240)
+ * generate MD tag
+ * generate "=" bases
+ * the plain pileup now supports "=" bases, but consensus calling and glfgen may fail
+
+------------------------------------------------------------------------
+r239 | lh3lh3 | 2009-04-24 12:08:20 +0100 (Fri, 24 Apr 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-2 (r239)
+ * fixed bugs in bam_aux.c (these functions are never used by samtools)
+ * removed bam_aux_init()/bam_aux_destroy()
+ * added tagview for testing bam_aux
+
+------------------------------------------------------------------------
+r235 | lh3lh3 | 2009-04-21 23:17:39 +0100 (Tue, 21 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-1
+ * fixed a bug in pileup: the first read in a chromosome may not be printed
+
+------------------------------------------------------------------------
+r232 | lh3lh3 | 2009-04-16 15:25:43 +0100 (Thu, 16 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile.lite
+
+a missing file in Makefile.lite
+
+------------------------------------------------------------------------
+r227 | lh3lh3 | 2009-04-15 22:02:53 +0100 (Wed, 15 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/NEWS
+   M /trunk/samtools/bamtk.c
+
+Release samtools-0.1.3
+
+------------------------------------------------------------------------
+r223 | lh3lh3 | 2009-04-15 14:31:32 +0100 (Wed, 15 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-28
+ * make samtools more robust to weird input such as empty file
+
+------------------------------------------------------------------------
+r222 | lh3lh3 | 2009-04-15 14:05:33 +0100 (Wed, 15 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/NEWS
+   M /trunk/samtools/samtools.1
+
+prepare for release 0.1.3
+
+------------------------------------------------------------------------
+r221 | lh3lh3 | 2009-04-15 13:32:14 +0100 (Wed, 15 Apr 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/misc/blast2sam.pl
+
+convert NCBI-BLASTN to SAM
+
+------------------------------------------------------------------------
+r220 | lh3lh3 | 2009-04-15 13:18:19 +0100 (Wed, 15 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_lpileup.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-27
+ * fixed a small memory leak in tview
+
+------------------------------------------------------------------------
+r219 | lh3lh3 | 2009-04-15 13:00:08 +0100 (Wed, 15 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_rmdup.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-26
+ * fixed a bug in rmdup when there are unmapped reads
+
+------------------------------------------------------------------------
+r218 | lh3lh3 | 2009-04-14 22:28:58 +0100 (Tue, 14 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/NEWS
+
+proposed NEWS for the new release (not yet released)
+
+------------------------------------------------------------------------
+r216 | lh3lh3 | 2009-04-14 22:10:46 +0100 (Tue, 14 Apr 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.1.1
+ * improve indelFilter to avoid filtering true indels. The new filter relies
+   on the new pileup indel line implemented in samtools-0.1.2-25
+
+------------------------------------------------------------------------
+r215 | lh3lh3 | 2009-04-14 22:04:19 +0100 (Tue, 14 Apr 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/samtools.1
+
+ * samtools-0.1.2-25
+ * change the pileup indel line to show the number of alignments actually
+   containing indels
+
+------------------------------------------------------------------------
+r211 | lh3lh3 | 2009-04-13 12:07:13 +0100 (Mon, 13 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+
+update ChangeLog from "svn log"
+
+------------------------------------------------------------------------
+r210 | lh3lh3 | 2009-04-12 20:57:05 +0100 (Sun, 12 Apr 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bam_sort.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/kseq.h
+
+ * samtools-0.1.2-24
+ * in merge, gives a warning rather than error if the target sequence length is different
+ * allow empty header
+
+------------------------------------------------------------------------
+r209 | lh3lh3 | 2009-04-12 20:32:44 +0100 (Sun, 12 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-23
+ * recognize '*' at the QUAL field
+
+------------------------------------------------------------------------
+r208 | lh3lh3 | 2009-04-12 20:08:02 +0100 (Sun, 12 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/kseq.h
+
+ * samtools-0.1.2-22
+ * the field separator is now TAB only
+
+------------------------------------------------------------------------
+r207 | lh3lh3 | 2009-04-08 15:18:03 +0100 (Wed, 08 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/examples/ex1.sam.gz
+
+ * fixed the problem in the example alignment due to the bug in fixmate
+
+------------------------------------------------------------------------
+r206 | lh3lh3 | 2009-04-08 15:15:05 +0100 (Wed, 08 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_mate.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/misc/soap2sam.pl
+
+ * samtools-0.1.2-21
+ * fixed a nasty bug in `fixmate'
+
+------------------------------------------------------------------------
+r205 | lh3lh3 | 2009-04-08 10:57:08 +0100 (Wed, 08 Apr 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/bowtie2sam.pl
+   M /trunk/samtools/misc/soap2sam.pl
+   M /trunk/samtools/misc/wgsim_eval.pl
+
+make the script robust to the bugs in SOAP-2.1.7
+
+------------------------------------------------------------------------
+r200 | lh3lh3 | 2009-04-02 15:14:56 +0100 (Thu, 02 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_stat.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-20
+ * check if file is truncated in flagstat
+
+------------------------------------------------------------------------
+r199 | lh3lh3 | 2009-04-02 15:09:10 +0100 (Thu, 02 Apr 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-19
+ * print the header if requested
+
+------------------------------------------------------------------------
+r193 | lh3lh3 | 2009-03-27 15:09:50 +0000 (Fri, 27 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-18
+ * fixed a minor bug reported by Nils Homer
+
+------------------------------------------------------------------------
+r185 | lh3lh3 | 2009-03-24 11:50:32 +0000 (Tue, 24 Mar 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.std:184)
+   D /trunk/samtools/Makefile.std
+   A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.std:184)
+   D /trunk/samtools/misc/Makefile.std
+
+rename Makefile.std as Makefile. GNU building system is not ready and may take some time...
+
+------------------------------------------------------------------------
+r184 | lh3lh3 | 2009-03-24 10:36:38 +0000 (Tue, 24 Mar 2009) | 4 lines
+Changed paths:
+   D /trunk/samtools/Makefile
+   A /trunk/samtools/Makefile.std (from /trunk/samtools/Makefile:183)
+   M /trunk/samtools/bam_sort.c
+   M /trunk/samtools/bam_tview.c
+   M /trunk/samtools/bamtk.c
+   D /trunk/samtools/misc/Makefile
+   A /trunk/samtools/misc/Makefile.std (from /trunk/samtools/misc/Makefile:182)
+   M /trunk/samtools/samtools.1
+
+ * samtools-0.1.2-17
+ * incorporating Nils' changes
+ * rename Makefile to Makefile.std and prepare to add the GNU building systems (also by Nils)
+
+------------------------------------------------------------------------
+r183 | lh3lh3 | 2009-03-24 10:30:23 +0000 (Tue, 24 Mar 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_maqcns.h
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/kseq.h
+   A /trunk/samtools/kstring.c
+   A /trunk/samtools/kstring.h
+
+ * samtools-0.1.2-16
+ * made pileup take a list of proposed indels. An insertion is N at the moment.
+ * added my kstring library for somewhat complex parsing of the position list.
+
+------------------------------------------------------------------------
+r169 | lh3lh3 | 2009-03-12 13:40:14 +0000 (Thu, 12 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/soap2sam.pl
+
+ * soap2sam.pl-0.1.2
+ * more robust to truncated soap output
+
+------------------------------------------------------------------------
+r168 | lh3lh3 | 2009-03-11 10:49:00 +0000 (Wed, 11 Mar 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile.lite
+
+added bam_stat.o to Makefile.lite
+
+------------------------------------------------------------------------
+r167 | lh3lh3 | 2009-03-10 22:11:31 +0000 (Tue, 10 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-15
+ * generate RMS of mapQ instead of max mapQ
+
+------------------------------------------------------------------------
+r166 | lh3lh3 | 2009-03-10 22:06:45 +0000 (Tue, 10 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/glf.c
+   M /trunk/samtools/glf.h
+   M /trunk/samtools/misc/Makefile
+
+ * samtools-0.1.2-14
+ * implemented GLFv3
+
+------------------------------------------------------------------------
+r159 | lh3lh3 | 2009-03-03 11:26:08 +0000 (Tue, 03 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-13
+ * fixed a minor bug in displaying pileup
+
+------------------------------------------------------------------------
+r158 | lh3lh3 | 2009-03-03 11:24:16 +0000 (Tue, 03 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-12
+ * optionally print SAM header
+
+------------------------------------------------------------------------
+r153 | lh3lh3 | 2009-03-02 10:45:28 +0000 (Mon, 02 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/glf.c
+
+ * samtools-0.1.2-11
+ * use "GLF\3" as the magic for GLFv3 files
+
+------------------------------------------------------------------------
+r152 | lh3lh3 | 2009-03-02 10:39:09 +0000 (Mon, 02 Mar 2009) | 5 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/glf.c
+   M /trunk/samtools/glf.h
+
+ * samtools-0.1.2-10
+ * fixed a bug in import: core.bin is undefined for unmapped reads
+ * this bug can be alleviated (not completely solved) in bam_index.c
+ * update to GLFv3: pos is changed to offset for better compression
+
+------------------------------------------------------------------------
+r151 | lh3lh3 | 2009-03-01 15:18:43 +0000 (Sun, 01 Mar 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/wgsim.c
+
+ * wgsim-0.2.3
+ * fixed a bug in simulating indels
+
+------------------------------------------------------------------------
+r145 | lh3lh3 | 2009-02-26 19:43:57 +0000 (Thu, 26 Feb 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/misc/wgsim.c
+
+ * wgsim-0.2.2
+ * allow to print mismatch information as fastq comment. MAQ does
+   not like long read names.
+
+------------------------------------------------------------------------
+r141 | lh3lh3 | 2009-02-26 14:53:03 +0000 (Thu, 26 Feb 2009) | 6 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/misc/wgsim.c
+   M /trunk/samtools/misc/wgsim_eval.pl
+
+ * wgsim-0.2.1
+ * fixed a bug about color read coordinates
+ * fixed a bug in read names
+ * wgsim_eval.pl-0.1.3
+ * make the script work with color reads
+
+------------------------------------------------------------------------
+r140 | lh3lh3 | 2009-02-26 14:02:57 +0000 (Thu, 26 Feb 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/Makefile
+   M /trunk/samtools/misc/wgsim.c
+
+ * wgsim: added a note
+
+------------------------------------------------------------------------
+r139 | lh3lh3 | 2009-02-26 11:39:08 +0000 (Thu, 26 Feb 2009) | 7 lines
+Changed paths:
+   M /trunk/samtools/misc/wgsim.c
+   M /trunk/samtools/misc/wgsim_eval.pl
+
+ * wgsim-0.2.0
+ * considerable code clean up
+ * print number of substitutions/indels/errors on each read
+ * potentially support SOLiD simulation, though not tested at the moment
+ * wgsim_eval.pl-0.1.2
+ * change in accordance with wgsim
+
+------------------------------------------------------------------------
+r129 | lh3lh3 | 2009-02-18 22:23:27 +0000 (Wed, 18 Feb 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-9
+ * fixed a bug in bam_fetch, caused by completely contained adjacent chunks
+
+------------------------------------------------------------------------
+r128 | bhandsaker | 2009-02-18 19:06:57 +0000 (Wed, 18 Feb 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+
+Fix annoying segv when invalid region specified.
+
+------------------------------------------------------------------------
+r127 | lh3lh3 | 2009-02-17 10:49:55 +0000 (Tue, 17 Feb 2009) | 2 lines
+Changed paths:
+   D /trunk/samtools/misc/indel_filter.pl
+   A /trunk/samtools/misc/samtools.pl
+
+ * move indel_filter.pl to samtools.pl
+
+------------------------------------------------------------------------
+r126 | lh3lh3 | 2009-02-14 21:22:30 +0000 (Sat, 14 Feb 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_mate.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-7
+ * fixed a bug in fixmate: SE reads are flagged as BAM_FMUNMAP
+
+------------------------------------------------------------------------
+r125 | lh3lh3 | 2009-02-13 09:54:45 +0000 (Fri, 13 Feb 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_stat.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-7
+ * fixed a minor bug in flagstat
+
+------------------------------------------------------------------------
+r124 | lh3lh3 | 2009-02-12 11:15:32 +0000 (Thu, 12 Feb 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/misc/indel_filter.pl
+
+ * samtools-0.1.2-6
+ * improve indel caller by setting maximum window size
+
+------------------------------------------------------------------------
+r123 | lh3lh3 | 2009-02-12 10:30:29 +0000 (Thu, 12 Feb 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * output max mapping quality in indel line
+
+------------------------------------------------------------------------
+r122 | lh3lh3 | 2009-02-11 10:59:10 +0000 (Wed, 11 Feb 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/maq2sam.c
+
+fixed a bug in generating tag AM
+
+------------------------------------------------------------------------
+r121 | lh3lh3 | 2009-02-03 10:43:11 +0000 (Tue, 03 Feb 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+fixed a potential memory problem in indexing
+
+------------------------------------------------------------------------
+r120 | bhandsaker | 2009-02-02 15:52:52 +0000 (Mon, 02 Feb 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+
+Pass LIBS to recursive targets to facilitate building at Broad.
+
+------------------------------------------------------------------------
+r119 | lh3lh3 | 2009-02-02 10:12:15 +0000 (Mon, 02 Feb 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bam_stat.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-3
+ * fixed a bug in generating GLFv2 for indels
+ * improve flagstat report a little bit
+
+------------------------------------------------------------------------
+r118 | lh3lh3 | 2009-01-29 12:33:23 +0000 (Thu, 29 Jan 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   A /trunk/samtools/bam_stat.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-1
+ * added flagstat command
+
+------------------------------------------------------------------------
+r116 | lh3lh3 | 2009-01-28 13:31:12 +0000 (Wed, 28 Jan 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/NEWS
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/samtools.1
+
+Release SAMtools-0.1.2
+
+------------------------------------------------------------------------
+r115 | lh3lh3 | 2009-01-28 12:54:08 +0000 (Wed, 28 Jan 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/misc/indel_filter.pl
+
+Script for filtering indel results
+
+------------------------------------------------------------------------
+r114 | lh3lh3 | 2009-01-25 11:45:37 +0000 (Sun, 25 Jan 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/misc/zoom2sam.pl
+
+convert ZOOM to SAM
+
+------------------------------------------------------------------------
+r113 | lh3lh3 | 2009-01-24 14:25:07 +0000 (Sat, 24 Jan 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/misc/novo2sam.pl
+
+add a script to convert novo alignment to SAM
+
+------------------------------------------------------------------------
+r112 | lh3lh3 | 2009-01-23 20:57:39 +0000 (Fri, 23 Jan 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/ChangeLog.old
+   M /trunk/samtools/samtools.1
+
+update documentation and ChangeLog
+
+------------------------------------------------------------------------
+r111 | lh3lh3 | 2009-01-23 19:22:59 +0000 (Fri, 23 Jan 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_sort.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.1-19
+ * fixed a bug in "merge" command line
+
+------------------------------------------------------------------------
+r110 | lh3lh3 | 2009-01-22 15:36:48 +0000 (Thu, 22 Jan 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/Makefile
+   A /trunk/samtools/misc/bowtie2sam.pl (from /branches/dev/samtools/misc/bowtie2sam.pl:108)
+   M /trunk/samtools/misc/export2sam.pl
+   A /trunk/samtools/misc/soap2sam.pl (from /branches/dev/samtools/misc/soap2sam.pl:108)
+   A /trunk/samtools/misc/wgsim.c (from /branches/dev/samtools/misc/wgsim.c:108)
+   A /trunk/samtools/misc/wgsim_eval.pl (from /branches/dev/samtools/misc/wgsim_eval.pl:108)
+
+ * merge from branches/dev/
+ * all future development will happen here
+
+------------------------------------------------------------------------
+r109 | lh3lh3 | 2009-01-22 15:14:27 +0000 (Thu, 22 Jan 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/COPYING
+   M /trunk/samtools/ChangeLog
+   A /trunk/samtools/INSTALL (from /branches/dev/samtools/INSTALL:108)
+   M /trunk/samtools/Makefile
+   A /trunk/samtools/Makefile.lite (from /branches/dev/samtools/Makefile.lite:108)
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bam_lpileup.c
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_maqcns.h
+   A /trunk/samtools/bam_mate.c (from /branches/dev/samtools/bam_mate.c:108)
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bam_plcmd.c
+   A /trunk/samtools/bam_rmdup.c (from /branches/dev/samtools/bam_rmdup.c:108)
+   M /trunk/samtools/bam_sort.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/bgzf.h
+   M /trunk/samtools/examples/00README.txt
+   A /trunk/samtools/examples/Makefile (from /branches/dev/samtools/examples/Makefile:108)
+   D /trunk/samtools/examples/ex1.fa.fai
+   M /trunk/samtools/examples/ex1.sam.gz
+   M /trunk/samtools/faidx.c
+   A /trunk/samtools/glf.c (from /branches/dev/samtools/glf.c:108)
+   M /trunk/samtools/glf.h
+   M /trunk/samtools/misc/Makefile
+   M /trunk/samtools/misc/maq2sam.c
+   M /trunk/samtools/razf.c
+   M /trunk/samtools/source.dot
+
+ * Merge from branches/dev/
+ * all future development will happen here at trunk/
+
+------------------------------------------------------------------------
+r79 | bhandsaker | 2009-01-07 21:42:15 +0000 (Wed, 07 Jan 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_tview.c
+
+Fix problem with compiling without curses.
+
+------------------------------------------------------------------------
+r63 | lh3lh3 | 2008-12-22 15:58:02 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+   A /trunk/samtools (from /branches/dev/samtools:62)
+
+Create trunk copy
+
+------------------------------------------------------------------------
+r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/NEWS
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+Release samtools-0.1.1
+
+------------------------------------------------------------------------
+r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines
+Changed paths:
+   M /branches/dev/samtools/bam_aux.c
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/razf.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-66
+ * fixed a bug in razf.c: reset z_eof when razf_seek() is called
+ * fixed a memory leak in parsing a region
+ * changed pileup a little bit when -s is in use: output ^ and $
+ * when a bam is not indexed, output more meaningful error message
+ * fixed a bug in indexing for small alignment
+ * fixed a bug in the viewer when we come to the end of a reference file
+ * updated documentation
+ * prepare to release 0.1.1
+
+------------------------------------------------------------------------
+r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/examples
+   A /branches/dev/samtools/examples/00README.txt
+   A /branches/dev/samtools/examples/ex1.fa
+   A /branches/dev/samtools/examples/ex1.fa.fai
+   A /branches/dev/samtools/examples/ex1.sam.gz
+
+example
+
+------------------------------------------------------------------------
+r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/ChangeLog
+
+update ChangeLog
+
+------------------------------------------------------------------------
+r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/misc/export2sam.pl
+
+ * added comments
+ * fixed several bugs 
+
+------------------------------------------------------------------------
+r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/misc/export2sam.pl
+
+convert Export format to SAM; not thoroughly tested
+
+------------------------------------------------------------------------
+r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/source.dot
+
+ * samtools-0.1.0-65
+ * pileup: generate maq-like simple output
+ * pileup: allow to output pileup at required sites
+ * source.dot: source file relationship graph
+ * tview: fixed a minor bug
+
+------------------------------------------------------------------------
+r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines
+Changed paths:
+   D /branches/dev/samtools/misc/all2sam.pl
+
+remove all2sam.pl
+
+------------------------------------------------------------------------
+r54 | lh3lh3 | 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/COPYING
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/faidx.h
+   M /branches/dev/samtools/khash.h
+   M /branches/dev/samtools/kseq.h
+   M /branches/dev/samtools/ksort.h
+   M /branches/dev/samtools/samtools.1
+
+Added copyright information and a bit more documentation. No code change.
+
+------------------------------------------------------------------------
+r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-64
+ * improved efficiency of the indel caller for spliced alignments
+
+------------------------------------------------------------------------
+r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_aux.c
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-63
+ * a bit of code cleanup: reduce the dependency between source files
+
+------------------------------------------------------------------------
+r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-62
+ * fixed a memory leak
+
+------------------------------------------------------------------------
+r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/ChangeLog
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/samtools.1
+
+update documentation, ChangeLog and a comment
+
+------------------------------------------------------------------------
+r49 | lh3lh3 | 2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_maqcns.h
+   M /branches/dev/samtools/bam_pileup.c
+   A /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-61
+ * moved pileup command to a separate source file
+ * added indel caller
+ * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!)
+ * updated documentation
+
+------------------------------------------------------------------------
+r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-60
+ * fixed another bug in maqcns when there is a nearby deletion
+
+------------------------------------------------------------------------
+r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-59
+ * pileup: outputting consensus is now optional
+ * fixed a bug in glfgen. This bug also exists in maq's glfgen. However,
+   I am not quite sure why the previous version may have problem.
+
+------------------------------------------------------------------------
+r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-58
+ * add maq consensus to pileup. However, I will move this part to a new
+   command as strictly speaking, consensus calling is not part of pileup,
+   and imposing it would make it harder to generate for other language
+   bindings.
+
+------------------------------------------------------------------------
+r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bgzf.c
+
+Fix bug in tell() after reads that consume to the exact end of a block.
+
+------------------------------------------------------------------------
+r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-57
+ * fixed a bug in parser when there are auxiliary fields
+ * made the parser a bit more robust
+
+------------------------------------------------------------------------
+r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-56
+ * fixed a bug in bgzf (only reading is affected)
+ * fixed a typo in bam_index.c
+ * in bam_index.c, check potential bugs in the underlying I/O library
+
+------------------------------------------------------------------------
+r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-55
+ * tried to make pileup work with clipping (previously not), though NOT tested
+ * removed -v from pileup
+ * made pileup take the reference sequence
+
+------------------------------------------------------------------------
+r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-54
+ * in parser, recognize "=", rather than ",", as a match
+ * in parser, correctly parse "=" at the MRNM field.
+
+------------------------------------------------------------------------
+r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/maq2sam.c
+
+fixed a bug in handling maq flag 64 and 192
+
+------------------------------------------------------------------------
+r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/md5fa.c
+
+also calculate unordered md5sum check
+
+------------------------------------------------------------------------
+r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/md5fa.c
+
+fixed a minor bug when there are spaces in the sequence
+
+------------------------------------------------------------------------
+r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/md5fa.c
+
+fixed a potential memory leak
+
+------------------------------------------------------------------------
+r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+
+ * fixed a bug in import: bin is wrongly calculated
+
+------------------------------------------------------------------------
+r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/all2sam.pl
+
+nothing, really
+
+------------------------------------------------------------------------
+r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/kseq.h
+   M /branches/dev/samtools/misc/Makefile
+   A /branches/dev/samtools/misc/md5.c
+   A /branches/dev/samtools/misc/md5.h
+   A /branches/dev/samtools/misc/md5fa.c
+
+ * fixed two warnings in kseq.h
+ * added md5sum utilities
+
+------------------------------------------------------------------------
+r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/kseq.h
+   D /branches/dev/samtools/kstream.h
+
+ * samtools-0.1.0-52
+ * replace kstream with kseq. kseq is a superset of kstream. I need the
+   extra functions in kseq.h.
+ * also compile stand-alone faidx
+
+------------------------------------------------------------------------
+r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_sort.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-51
+ * sorting by read names is available
+
+------------------------------------------------------------------------
+r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bam_sort.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/misc/maq2sam.c
+
+ * samtools-0.1.0-50
+ * format change to meet the latest specification
+
+------------------------------------------------------------------------
+r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/misc/maq2sam.c
+
+ * minor change in maqcns: special care when n==0
+ * change maq2sam to meet the latest specification
+
+------------------------------------------------------------------------
+r27 | lh3lh3 | 2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+   M /branches/dev/samtools/razf.h
+
+considerable code clean up in razf
+
+------------------------------------------------------------------------
+r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/ChangeLog
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/faidx.c
+
+make RAZF optional in faidx.c
+
+------------------------------------------------------------------------
+r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_aux.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-49
+ * added routines for retrieving aux data, NOT TESTED YET!
+
+------------------------------------------------------------------------
+r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/bgzf.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-48
+ * bgzf: fixed a potential integer overflow on 32-bit machines
+ * maqcns: set the minimum combined quality as 0
+ * supporting hex strings
+
+------------------------------------------------------------------------
+r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-47
+ * fixed the bug in maqcns
+
+------------------------------------------------------------------------
+r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   A /branches/dev/samtools/bam_maqcns.c
+   A /branches/dev/samtools/bam_maqcns.h
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/glf.h
+
+ * samtools-0.1.0-46
+ * add MAQ consensus caller, currently BUGGY!
+
+------------------------------------------------------------------------
+r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-45
+ * tview: display padded alignment (but not P operation)
+ * better coordinates and reference sequence
+
+------------------------------------------------------------------------
+r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/ChangeLog
+
+new ChangeLog
+
+------------------------------------------------------------------------
+r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+   D /branches/dev/samtools/ChangeLog
+   A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6)
+
+Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from
+the log of my personal SVN repository.
+
+------------------------------------------------------------------------
+r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-44
+ * declare fseeko and ftello as some Linux may not do this by default and
+   missing these declarations will make bgzf buggy
+ * get rid of some harmless warnings
+ * use BGZF by default, now
+
+------------------------------------------------------------------------
+r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/razf.c
+
+ * samtools-0.1.0-43
+ * fixed a bug in razf_read()
+ * give more warnings when the file is truncated (or due to bugs in I/O library)
+
+------------------------------------------------------------------------
+r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bgzf.c
+
+fixed a bug in bgzf.c at the end of the file
+
+------------------------------------------------------------------------
+r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-42
+ * a lot happened to RAZF, although samtools itself is untouched. Better
+   also update the version number anyway to avoid confusion
+
+------------------------------------------------------------------------
+r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+
+a change from Jue, but I think it should not matter
+
+------------------------------------------------------------------------
+r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+
+fixed a potential bug in razf. However, it seems still buggy, just
+rarely happens, very rarely.
+
+------------------------------------------------------------------------
+r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+
+fixed a bug in razf, with the help of Jue
+
+------------------------------------------------------------------------
+r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bam_index.c
+
+remove a comment
+
+------------------------------------------------------------------------
+r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/razf.c
+   M /branches/dev/samtools/razf.h
+
+ * Jue has updated razf to realize Bob's scheme
+
+------------------------------------------------------------------------
+r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/samtools.1
+
+the manual page
+
+------------------------------------------------------------------------
+r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines
+Changed paths:
+   A /branches/dev/samtools/ChangeLog
+   A /branches/dev/samtools/Makefile
+   A /branches/dev/samtools/bam.c
+   A /branches/dev/samtools/bam.h
+   A /branches/dev/samtools/bam_aux.c
+   A /branches/dev/samtools/bam_endian.h
+   A /branches/dev/samtools/bam_import.c
+   A /branches/dev/samtools/bam_index.c
+   A /branches/dev/samtools/bam_lpileup.c
+   A /branches/dev/samtools/bam_pileup.c
+   A /branches/dev/samtools/bam_sort.c
+   A /branches/dev/samtools/bam_tview.c
+   A /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/bgzf.c
+   A /branches/dev/samtools/bgzf.h
+   A /branches/dev/samtools/bgzip.c
+   A /branches/dev/samtools/faidx.c
+   A /branches/dev/samtools/faidx.h
+   A /branches/dev/samtools/khash.h
+   A /branches/dev/samtools/ksort.h
+   A /branches/dev/samtools/kstream.h
+   A /branches/dev/samtools/misc
+   A /branches/dev/samtools/misc/Makefile
+   A /branches/dev/samtools/misc/all2sam.pl
+   A /branches/dev/samtools/misc/maq2sam.c
+   A /branches/dev/samtools/razf.c
+   A /branches/dev/samtools/razf.h
+   A /branches/dev/samtools/razip.c
+   A /branches/dev/samtools/zutil.h
+
+The initial version of samtools, replicated from my local SVN repository.
+The current version is: 0.1.0-42. All future development will happen here.
+
+------------------------------------------------------------------------
+r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools
+
+samtools (C version)
+
+------------------------------------------------------------------------
diff --git a/INSTALL b/INSTALL
new file mode 100644 (file)
index 0000000..f1cf7aa
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,29 @@
+System Requirements
+===================
+
+SAMtools depends on the zlib library <http://www.zlib.net>. The latest
+version 1.2.3 is preferred and with the latest version you can compile
+razip and use it to compress a FASTA file. SAMtools' faidx is able to
+index a razip-compressed FASTA file to save disk space. Older zlib also
+works with SAMtools, but razip cannot be compiled.
+
+The text-based viewer (tview) requires the GNU ncurses library
+<http://www.gnu.org/software/ncurses/>, which comes with Mac OS X and
+most of the modern Linux/Unix distributions. If you do not have this
+library installed, you can still compile the rest of SAMtools by
+manually modifying one line in Makefile.
+
+
+Compilation
+===========
+
+Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can
+compile razip with `make razip'.
+
+
+Installation
+============
+
+Simply copy `samtools' and other executables/scripts in `misc' to a
+location you want (e.g. a directory in your $PATH). No further
+configurations are required.
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..7bb4469
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,69 @@
+CC=                    gcc
+CXX=           g++
+CFLAGS=                -g -Wall -O2 #-m64 #-arch ppc
+CXXFLAGS=      $(CFLAGS)
+DFLAGS=                -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE #-D_NO_CURSES
+LOBJS=         bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
+                       bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o     \
+                       bam_sort.o
+AOBJS=         bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
+                       bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o     \
+                       bamtk.o
+PROG=          samtools
+INCLUDES=      
+SUBDIRS=       . misc
+LIBPATH=       
+
+.SUFFIXES:.c .o
+
+.c.o:
+               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+               @target=`echo $@ | sed s/-recur//`; \
+               wdir=`pwd`; \
+               list='$(SUBDIRS)'; for subdir in $$list; do \
+                       cd $$subdir; \
+                       $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+                               INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+                       cd $$wdir; \
+               done;
+
+all:$(PROG)
+
+lib:libbam.a
+
+libbam.a:$(LOBJS)
+               $(AR) -cru $@ $(LOBJS)
+
+### For the curses library: comment out `-lcurses' if you do not have curses installed. libbam is listed before -lm/-lz because it is the one that uses them.
+samtools:lib $(AOBJS)
+               $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -L. -lbam -lm -lcurses -lz
+
+razip:razip.o razf.o
+               $(CC) $(CFLAGS) -o $@ razf.o razip.o -lz
+
+bgzip:bgzip.o bgzf.o
+               $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz
+
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h kstring.h
+sam.o:sam.h bam.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h bam_maqcns.h
+bam_maqcns.o:bam.h ksort.h bam_maqcns.h
+bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
+glf.o:glf.h
+
+faidx.o:faidx.h razf.h khash.h
+faidx_main.o:faidx.h razf.h
+
+cleanlocal:
+               rm -fr gmon.out *.o a.out *.dSYM razip $(PROG) *~ *.a
+
+clean:cleanlocal-recur
diff --git a/NEWS b/NEWS
new file mode 100644 (file)
index 0000000..149c090
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,224 @@
+Beta Release 0.1.5 (7 July, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Support opening a BAM alignment on FTP. Users can now use "tview" to
+   view alignments at the NCBI ftp site. Please read manual for more
+   information.
+
+ * In library, propagate errors rather than exiting or triggering an
+   assertion failure.
+
+ * Simplified the building system and fixed compiling errors caused by
+   zlib<1.2.2.1.
+
+ * Fixed an issue about lost header information when a SAM is imported
+   with "view -t".
+
+ * Implemented "samtools.pl varFilter" which filters both SNPs and short
+   indels. This command replaces "indelFilter".
+
+ * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from
+   pileup output.
+
+ * In pileup, cap mapping quality at 60. This helps filtering when
+   different aligners are in use.
+
+ * In pileup, allow to output variant sites only.
+
+ * Made pileup generate correct calls in repetitive region. At the same
+   time, I am considering implementing a simplified model in SOAPsnp,
+   although this has not happened yet.
+
+ * In view, added '-u' option to output BAM without compression. This
+   option is preferred when the output is piped to other commands.
+
+ * In view, added '-l' and '-r' to get the alignments for one library or
+   read group. The "@RG" header lines are now partially parsed.
+
+ * Do not include command line utilities in libbam.a.
+
+ * Fixed memory leaks in pileup and bam_view1().
+
+ * Made faidx more tolerant to empty lines right before or after FASTA >
+   lines.
+
+
+Changes in other utilities:
+
+ * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign.
+
+
+This release involves several modifications to the key code base which
+may potentially introduce new bugs even though we have tried to minimize
+this by testing on several examples. Please let us know if you catch
+bugs.
+
+(0.1.5: 7 July 2009, r373)
+
+
+
+Beta Release 0.1.4 (21 May, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Added the 'rmdupse' command: removing duplicates for SE reads.
+
+ * Fixed a critical bug in the indel caller: clipped alignments are not
+   processed correctly.
+
+ * Fixed a bug in the tview: gapped alignment may be incorrectly
+   displayed.
+
+ * Unified the interface to BAM and SAM I/O. This is done by
+   implementing a wrapper on top of the old APIs and therefore old APIs
+   are still valid. The new I/O APIs also recognize the @SQ header
+   lines.
+
+ * Generate the MD tag.
+
+ * Generate "=" bases. However, the indel caller will not work when "="
+   bases are present.
+
+ * Enhanced support of color-read display (by Nils Homer).
+
+ * Implemented the GNU building system. However, currently the building
+   system does not generate libbam.a. We will improve this later. For
+   the time being, `make -f Makefile.generic' is preferred.
+
+ * Fixed a minor bug in pileup: the first read in a chromosome may be
+   skipped.
+
+ * Fixed bugs in bam_aux.c. These bugs do not affect other components as
+   they were not used previously.
+
+ * Output the 'SM' tag from maq2sam.
+
+(0.1.4: 21 May 2009, r297)
+
+
+
+Beta Release 0.1.3 (15 April, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in SAMtools:
+
+ * SAMtools is more consistent with the specification: a) '*' in the
+   QUAL field is allowed; b) the field separator is TAB only and SPACE
+   is treated as a character in a field; c) empty header is allowed.
+
+ * Implemented GLFv3 support in pileup.
+
+ * Fixed a severe bug in fixmate: strand information is wrongly
+   overwritten.
+
+ * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are
+   not correctly retrieved sometimes.
+
+ * Fixed a bug in rmdup: segfault if unmapped reads are present.
+
+ * Move indel_filter.pl to samtools.pl and improved the filtering by
+   checking the actual number of alignments containing indels. The indel
+   pileup line is also changed a little to make this filtration easier.
+
+ * Fixed a minor bug in indexing: the bin number of an unmapped read is
+   wrongly calculated.
+
+ * Added `flagstat' command to show statistics on the FLAG field.
+
+ * Improved indel caller by setting the maximum window size in local
+   realignment.
+
+Changes in other utilities:
+
+ * Fixed a bug in maq2sam: a tag name is obsolete.
+
+ * Improvement to wgsim: a) added support for SOLiD read simulation; b)
+   show the number of substitutions/indels/errors in read name; c)
+   considerable code clean up.
+
+ * Various converters: improved functionality in general.
+
+ * Updated the example SAM due to the previous bug in fixmate.
+
+(0.1.3: 15 April 2009, r227)
+
+
+
+Beta Release 0.1.2 (28 January, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in SAMtools:
+
+ * Implemented a Bayesian indel caller. The new caller generates scores
+   and genotype and is potentially more accurate than Maq's indel
+   caller. The pileup format is also changed accordingly.
+
+ * Implemented rmdup command: remove potential PCR duplicates. Note that
+   this command ONLY works for FR orientation and requires that ISIZE is
+   correctly set.
+
+ * Added fixmate command: fill in mate coordinates, ISIZE and mate
+   related flags from a name-sorted alignment.
+
+ * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved.
+
+ * Allow to select reads shown in the pileup output with a mask.
+
+ * Generate GLFv2 from pileup.
+
+ * Added two more flags for flagging PCR/optical duplicates and for QC
+   failure.
+
+ * Fixed a bug in sort command: name sorting for large alignment did not
+   work.
+
+ * Allow to completely disable RAZF (using Makefile.lite) as some people
+   have problems compiling it.
+
+ * Fixed a bug in import command when there are reads without
+   coordinates.
+
+ * Fixed a bug in tview: clipping broke the alignment viewer.
+
+ * Fixed a compiling error when _NO_CURSES is applied.
+
+ * Fixed a bug in merge command.
+
+Changes in other utilities:
+
+ * Added wgsim, a paired-end reads simulator. Wgsim was adapted from
+   maq's reads simulator. Colin Hercus further improved it to allow
+   longer indels.
+
+ * Added wgsim_eval.pl, a script that evaluates the accuracy of
+   alignment on reads generated by wgsim.
+
+ * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not
+   work properly when multiple hits are output.
+
+ * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will
+   be retained when multiple hits are present.
+
+ * Fixed a bug in export2sam.pl for QC reads.
+
+ * Support RG tag at MAQ->SAM converter.
+
+ * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and
+   indel are not properly handled, though.
+
+ * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the
+   default Illumina output.
+
+(0.1.2: 28 January 2009; r116)
+
+
+
+Beta Release 0.1.1 (22 December, 2008)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is the first public release of samtools. For more information,
+please check the manual page `samtools.1' and the samtools website
+http://samtools.sourceforge.net
\ No newline at end of file
diff --git a/bam.c b/bam.c
new file mode 100644 (file)
index 0000000..1ff4a5a
--- /dev/null
+++ b/bam.c
@@ -0,0 +1,290 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "bam_endian.h"
+#include "kstring.h"
+
+/* Non-zero when the host is big-endian; assigned from bam_is_big_endian()
+ * in bam_header_init(). The I/O routines below byte-swap when it is set. */
+int bam_is_be = 0;
+
+/**************************
+ * CIGAR related routines *
+ **************************/
+
+/*
+ * Scan the CIGAR of an alignment for the segment that covers reference
+ * position `pos` and record its boundaries in `reg`.
+ *
+ * The segment starts at the first M/D/I operation for which the reference
+ * coordinate before the operation plus the operation length exceeds `pos`;
+ * reg->tbeg, reg->qbeg and reg->cbeg then receive the reference coordinate,
+ * the query offset and the CIGAR index. Once started, reg->tend, reg->qend
+ * and reg->cend are overwritten after every soft clip, hard clip, reference
+ * skip and after the final operation.
+ *
+ * Returns 0 when a segment was found and -1 otherwise (reg is then left
+ * untouched).
+ */
+int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg)
+{
+       int32_t ref_pos = c->pos, query_pos = 0;
+       int found = 0;
+       unsigned i;
+       for (i = 0; i < c->n_cigar; ++i) {
+               const int op = cigar[i] & BAM_CIGAR_MASK;
+               const int len = cigar[i] >> BAM_CIGAR_SHIFT;
+               const int opens_segment = (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CINS);
+               const int closes_segment = (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CREF_SKIP || i == c->n_cigar - 1);
+               if (!found && opens_segment && ref_pos + len > pos) {
+                       reg->tbeg = ref_pos;
+                       reg->qbeg = query_pos;
+                       reg->cbeg = i;
+                       found = 1;
+               }
+               // advance the reference and/or query cursor past this operation
+               switch (op) {
+               case BAM_CMATCH:
+                       ref_pos += len; query_pos += len;
+                       break;
+               case BAM_CDEL:
+               case BAM_CREF_SKIP:
+                       ref_pos += len;
+                       break;
+               case BAM_CINS:
+               case BAM_CSOFT_CLIP:
+                       query_pos += len;
+                       break;
+               default:
+                       break;
+               }
+               if (found && closes_segment) {
+                       reg->tend = ref_pos;
+                       reg->qend = query_pos;
+                       reg->cend = i;
+               }
+       }
+       return found? 0 : -1;
+}
+
+/*
+ * Compute the reference coordinate just past the alignment: c->pos plus the
+ * lengths of all M, D and N (reference skip) CIGAR operations.
+ */
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
+{
+       uint32_t i, ref_end = c->pos;
+       for (i = 0; i < c->n_cigar; ++i) {
+               switch (cigar[i] & BAM_CIGAR_MASK) {
+               case BAM_CMATCH:
+               case BAM_CDEL:
+               case BAM_CREF_SKIP:
+                       // only these operations consume reference bases
+                       ref_end += cigar[i] >> BAM_CIGAR_SHIFT;
+                       break;
+               default:
+                       break;
+               }
+       }
+       return ref_end;
+}
+
+/*
+ * Compute the query length implied by a CIGAR: the summed lengths of all
+ * M, I and S (soft clip) operations.
+ */
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
+{
+       const uint32_t *p, *const end = cigar + c->n_cigar;
+       int32_t qlen = 0;
+       for (p = cigar; p != end; ++p) {
+               const int op = *p & BAM_CIGAR_MASK;
+               if (op != BAM_CMATCH && op != BAM_CINS && op != BAM_CSOFT_CLIP)
+                       continue; // operation does not consume query bases
+               qlen += *p >> BAM_CIGAR_SHIFT;
+       }
+       return qlen;
+}
+
+/********************
+ * BAM I/O routines *
+ ********************/
+
+/*
+ * Allocate a zero-filled header. As a side effect the global bam_is_be is
+ * refreshed from bam_is_big_endian(). Returns whatever calloc() returns.
+ */
+bam_header_t *bam_header_init()
+{
+       bam_header_t *h;
+       bam_is_be = bam_is_big_endian();
+       h = (bam_header_t*)calloc(1, sizeof(bam_header_t));
+       return h;
+}
+
+void bam_header_destroy(bam_header_t *header)
+{
+       int32_t i;
+       extern void bam_destroy_header_hash(bam_header_t *header);
+       if (header == 0) return;
+       if (header->target_name) {
+               for (i = 0; i < header->n_targets; ++i)
+                       free(header->target_name[i]);
+               free(header->target_name);
+               free(header->target_len);
+       }
+       free(header->text);
+#ifndef BAM_NO_HASH
+       if (header->rg2lib) bam_strmap_destroy(header->rg2lib);
+       bam_destroy_header_hash(header);
+#endif
+       free(header);
+}
+
+/*
+ * Read a BAM header from fp.
+ *
+ * The stream must start with the 4-byte magic "BAM\1", followed by the
+ * length of the plain text, the text itself, the number of reference
+ * sequences and, for each of them, the name length, the name and the
+ * sequence length. All 4-byte integers are byte-swapped when bam_is_be is
+ * set.
+ *
+ * Returns a header that the caller releases with bam_header_destroy(), or 0
+ * when the magic is wrong, the stream is truncated, a length field is
+ * negative or memory cannot be allocated. Nothing is leaked on failure.
+ */
+bam_header_t *bam_header_read(bamFile fp)
+{
+       bam_header_t *header;
+       char buf[4];
+       int32_t i, name_len;
+       // read "BAM1"
+       if (bam_read(fp, buf, 4) != 4) return 0;
+       if (strncmp(buf, "BAM\001", 4)) {
+               fprintf(stderr, "[bam_header_read] wrong header\n");
+               return 0;
+       }
+       header = bam_header_init();
+       if (header == 0) return 0;
+       // read plain text and the number of reference sequences
+       if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
+       if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+       if (header->l_text < 0) goto fail;
+       // one extra zeroed byte keeps the text NUL-terminated
+       header->text = (char*)calloc((size_t)header->l_text + 1, 1);
+       if (header->text == 0) goto fail;
+       if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail;
+       if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
+       if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+       if (header->n_targets < 0) goto fail;
+       // read reference sequence names and lengths
+       header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+       // calloc(0, ...) may legitimately return null, so only a non-empty request can fail
+       if (header->target_name == 0 && header->n_targets > 0) goto fail;
+       header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+       if (header->target_len == 0 && header->n_targets > 0) goto fail;
+       for (i = 0; i != header->n_targets; ++i) {
+               if (bam_read(fp, &name_len, 4) != 4) goto fail;
+               if (bam_is_be) bam_swap_endian_4p(&name_len);
+               if (name_len <= 0) goto fail;
+               // over-allocate by one zeroed byte so the name is always terminated
+               header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
+               if (header->target_name[i] == 0) goto fail;
+               if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
+               if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
+               if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+       }
+       return header;
+
+fail:
+       fprintf(stderr, "[bam_header_read] invalid or truncated header\n");
+       // unread name slots are still null from calloc, so destroy is safe here
+       bam_header_destroy(header);
+       return 0;
+}
+
+/*
+ * Write a BAM header to fp: the magic "BAM\1", the text length and text,
+ * the number of reference sequences and, for each of them, the name length
+ * (including the terminating NUL), the name and the sequence length. Every
+ * 4-byte integer is byte-swapped first when bam_is_be is set.
+ *
+ * Always returns 0; the results of the individual writes are not checked.
+ */
+int bam_header_write(bamFile fp, const bam_header_t *header)
+{
+       char magic[4];
+       int32_t i, name_len;
+       uint32_t word; // 4-byte value in on-disk byte order
+       // write "BAM1"
+       memcpy(magic, "BAM\001", 4);
+       bam_write(fp, magic, 4);
+       // write plain text and the number of reference sequences
+       word = header->l_text;
+       if (bam_is_be) word = bam_swap_endian_4(word);
+       bam_write(fp, &word, 4);
+       if (header->l_text) bam_write(fp, header->text, header->l_text);
+       word = header->n_targets;
+       if (bam_is_be) word = bam_swap_endian_4(word);
+       bam_write(fp, &word, 4);
+       // write sequence names and lengths
+       for (i = 0; i != header->n_targets; ++i) {
+               char *name = header->target_name[i];
+               name_len = strlen(name) + 1;
+               word = name_len;
+               if (bam_is_be) word = bam_swap_endian_4(word);
+               bam_write(fp, &word, 4);
+               bam_write(fp, name, name_len);
+               word = header->target_len[i];
+               if (bam_is_be) word = bam_swap_endian_4(word);
+               bam_write(fp, &word, 4);
+       }
+       return 0;
+}
+
+/*
+ * Byte-swap, in place, the multi-byte parts of the variable-length record
+ * data: every CIGAR word and every 2-, 4- or 8-byte auxiliary value.
+ * Auxiliary fields are walked as <2-byte key><type byte><value>; the type
+ * is compared case-insensitively. Applying the function twice restores the
+ * original bytes.
+ */
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
+{
+       uint32_t k, *cigar = (uint32_t*)(data + c->l_qname);
+       uint8_t *p = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+       uint8_t *const end = data + data_len;
+       for (k = 0; k < c->n_cigar; ++k) bam_swap_endian_4p(&cigar[k]);
+       while (p < end) {
+               uint8_t type;
+               p += 2; // skip key
+               type = toupper(*p);
+               ++p; // skip type
+               switch (type) {
+               case 'C':
+               case 'A':
+                       ++p;
+                       break;
+               case 'S':
+                       bam_swap_endian_2p(p); p += 2;
+                       break;
+               case 'I':
+               case 'F':
+                       bam_swap_endian_4p(p); p += 4;
+                       break;
+               case 'D':
+                       bam_swap_endian_8p(p); p += 8;
+                       break;
+               case 'Z':
+               case 'H':
+                       // NUL-terminated string: step over it and its terminator
+                       while (*p) ++p;
+                       ++p;
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+
+/*
+ * Read one alignment record from fp into b.
+ *
+ * A record is a 4-byte block length, the 32-byte packed core and
+ * block_len - 32 bytes of variable-length data. b->data is grown as needed
+ * (capacity rounded up to a power of two) and b->core, b->data_len and
+ * b->l_aux are updated. All integers are byte-swapped when bam_is_be is set.
+ *
+ * Returns the number of bytes consumed (4 + block_len) on success, or
+ *   -1  clean end of file (nothing could be read),
+ *   -2  truncated block length,
+ *   -3  truncated core,
+ *   -4  record body is truncated, has inconsistent lengths or could not be
+ *       allocated.
+ * On an allocation failure b->data is left valid and unchanged.
+ */
+int bam_read1(bamFile fp, bam1_t *b)
+{
+       bam1_core_t *c = &b->core;
+       int32_t block_len, ret, i;
+       uint32_t x[8];
+
+       assert(BAM_CORE_SIZE == 32);
+       if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+               if (ret == 0) return -1; // normal end-of-file
+               else return -2; // truncated
+       }
+       if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
+       if (bam_is_be) {
+               bam_swap_endian_4p(&block_len);
+               for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+       }
+       // a block shorter than the core would yield a negative data length
+       if (block_len < (int32_t)BAM_CORE_SIZE) return -4;
+       c->tid = x[0]; c->pos = x[1];
+       c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+       c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+       c->l_qseq = x[4];
+       c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+       b->data_len = block_len - BAM_CORE_SIZE;
+       if (b->m_data < b->data_len) {
+               uint32_t new_m = b->data_len;
+               uint8_t *new_data;
+               kroundup32(new_m);
+               // m_data is an int: refuse capacities it cannot represent
+               if (new_m > (uint32_t)INT32_MAX) return -4;
+               // keep the old buffer reachable if realloc fails
+               new_data = (uint8_t*)realloc(b->data, new_m);
+               if (new_data == 0) return -4;
+               b->data = new_data;
+               b->m_data = new_m;
+       }
+       if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+       // reject lengths that do not fit in the data actually read
+       if (c->l_qseq < 0 || c->l_qseq > b->data_len) return -4;
+       b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+       if (b->l_aux < 0) return -4;
+       if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
+       return 4 + block_len;
+}
+
+/*
+ * Write one alignment record to fp: the 4-byte block length
+ * (data_len + BAM_CORE_SIZE), the core packed into eight 32-bit words and
+ * the variable-length data.
+ *
+ * When bam_is_be is set, the length and core words are byte-swapped and
+ * `data` is swapped in place for the duration of the write, then swapped
+ * back before returning. The results of the writes are not checked.
+ *
+ * Returns 4 + block length.
+ */
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
+{
+       uint32_t core[8], block_len = data_len + BAM_CORE_SIZE, len_on_disk;
+       int i;
+       assert(BAM_CORE_SIZE == 32);
+       // pack the core; words 2 and 3 hold the bit-fields
+       core[0] = c->tid;
+       core[1] = c->pos;
+       core[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
+       core[3] = (uint32_t)c->flag<<16 | c->n_cigar;
+       core[4] = c->l_qseq;
+       core[5] = c->mtid;
+       core[6] = c->mpos;
+       core[7] = c->isize;
+       len_on_disk = block_len;
+       if (bam_is_be) {
+               for (i = 0; i < 8; ++i) bam_swap_endian_4p(core + i);
+               bam_swap_endian_4p(&len_on_disk);
+               swap_endian_data(c, data_len, data);
+       }
+       bam_write(fp, &len_on_disk, 4);
+       bam_write(fp, core, BAM_CORE_SIZE);
+       bam_write(fp, data, data_len);
+       // restore the caller's buffer to host byte order
+       if (bam_is_be) swap_endian_data(c, data_len, data);
+       return 4 + block_len;
+}
+
+/*
+ * Write alignment b to fp by handing its core and data to
+ * bam_write1_core(); returns that function's result.
+ */
+int bam_write1(bamFile fp, const bam1_t *b)
+{
+       const bam1_core_t *core = &b->core;
+       return bam_write1_core(fp, core, b->data_len, b->data);
+}
+
+/*
+ * Format alignment b as one SAM text line (without a trailing newline).
+ *
+ * Columns are written in order: query name, flag, reference name ('*' when
+ * tid is negative), pos + 1, mapping quality, CIGAR ('*' when empty), mate
+ * reference ('*' when mtid is negative, '=' when equal to tid), mpos + 1,
+ * isize, sequence, quality ('*' when the first quality byte is 0xff,
+ * otherwise each value + 33), then every auxiliary field as TAG:TYPE:VALUE.
+ * All integer aux types are printed with SAM type 'i'.
+ *
+ * Multi-byte aux values are copied out with memcpy because they sit at
+ * arbitrary offsets in b->data. A CIGAR operation code outside "MIDNSHP"
+ * is printed as '?'.
+ *
+ * Returns the string built by the kstring routines; bam_view1() releases
+ * it with free().
+ */
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
+{
+       uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
+       int i;
+       const bam1_core_t *c = &b->core;
+       kstring_t str;
+       str.l = str.m = 0; str.s = 0;
+
+       ksprintf(&str, "%s\t%d\t", bam1_qname(b), c->flag);
+       if (c->tid < 0) kputs("*\t", &str);
+       else ksprintf(&str, "%s\t", header->target_name[c->tid]);
+       ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual);
+       if (c->n_cigar == 0) kputc('*', &str);
+       else {
+               const uint32_t *cigar = bam1_cigar(b);
+               for (i = 0; i < c->n_cigar; ++i) {
+                       uint32_t op = cigar[i] & BAM_CIGAR_MASK;
+                       // the mask admits codes up to 15 but only 0..6 have a letter
+                       ksprintf(&str, "%u%c", cigar[i]>>BAM_CIGAR_SHIFT, op <= BAM_CPAD? "MIDNSHP"[op] : '?');
+               }
+       }
+       kputc('\t', &str);
+       if (c->mtid < 0) kputs("*\t", &str);
+       else if (c->mtid == c->tid) kputs("=\t", &str);
+       else ksprintf(&str, "%s\t", header->target_name[c->mtid]);
+       ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize);
+       for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
+       kputc('\t', &str);
+       if (t[0] == 0xff) kputc('*', &str);
+       else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+       s = bam1_aux(b);
+       while (s < b->data + b->data_len) {
+               uint8_t type, key[2];
+               key[0] = s[0]; key[1] = s[1];
+               s += 2; type = *s; ++s;
+               ksprintf(&str, "\t%c%c:", key[0], key[1]);
+               if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; }
+               else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; }
+               // 'c' is a signed byte: reinterpret it so negative values keep their sign
+               else if (type == 'c') { ksprintf(&str, "i:%d", *(int8_t*)s); ++s; }
+               else if (type == 'S') { uint16_t v; memcpy(&v, s, 2); ksprintf(&str, "i:%u", v); s += 2; }
+               else if (type == 's') { int16_t v; memcpy(&v, s, 2); ksprintf(&str, "i:%d", v); s += 2; }
+               else if (type == 'I') { uint32_t v; memcpy(&v, s, 4); ksprintf(&str, "i:%u", v); s += 4; }
+               else if (type == 'i') { int32_t v; memcpy(&v, s, 4); ksprintf(&str, "i:%d", v); s += 4; }
+               else if (type == 'f') { float v; memcpy(&v, s, 4); ksprintf(&str, "f:%g", v); s += 4; }
+               else if (type == 'd') { double v; memcpy(&v, s, 8); ksprintf(&str, "d:%lg", v); s += 8; }
+               else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; }
+       }
+       return str.s;
+}
+
+/*
+ * Print alignment b to stdout as one SAM line followed by a newline, then
+ * free the string returned by bam_format1().
+ */
+void bam_view1(const bam_header_t *header, const bam1_t *b)
+{
+       char *line;
+       line = bam_format1(header, b);
+       printf("%s\n", line);
+       free(line);
+}
diff --git a/bam.h b/bam.h
new file mode 100644 (file)
index 0000000..83c03ad
--- /dev/null
+++ b/bam.h
@@ -0,0 +1,714 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef BAM_BAM_H
+#define BAM_BAM_H
+
+/*!
+  @header
+
+  The BAM library provides I/O and various operations for manipulating
+  files in the BAM (Binary Alignment/Mapping) or SAM (Sequence
+  Alignment/Map) format. It now supports importing from or exporting to
+  TAM, sorting, merging, generating pileup, and quick retrieval of reads
+  overlapping a specified region.
+
+  @copyright Genome Research Ltd.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/* Selects the compressed-I/O backend behind the bam_* file macros below:
+   1 = RAZF (ignored when _NO_RAZF is defined), 2 = BGZF, 3 = RAZF through
+   its "*2" entry points. Each branch defines the bamFile handle type, the
+   open/dopen/close/read/write/tell/seek wrappers and one offset-style
+   macro (BAM_TRUE_OFFSET or BAM_VIRTUAL_OFFSET16). */
+#define _IOLIB 2
+
+#if _IOLIB == 1 && !defined(_NO_RAZF)
+#define BAM_TRUE_OFFSET
+#include "razf.h"
+/*! @abstract BAM file handler */
+typedef RAZF *bamFile;
+#define bam_open(fn, mode) razf_open(fn, mode)
+#define bam_dopen(fd, mode) razf_dopen(fd, mode)
+#define bam_close(fp) razf_close(fp)
+#define bam_read(fp, buf, size) razf_read(fp, buf, size)
+#define bam_write(fp, buf, size) razf_write(fp, buf, size)
+#define bam_tell(fp) razf_tell(fp)
+#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir)
+#elif _IOLIB == 2
+#define BAM_VIRTUAL_OFFSET16
+#include "bgzf.h"
+/*! @abstract BAM file handler */
+typedef BGZF *bamFile;
+#define bam_open(fn, mode) bgzf_open(fn, mode)
+#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
+#define bam_close(fp) bgzf_close(fp)
+#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
+#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
+#define bam_tell(fp) bgzf_tell(fp)
+#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
+#elif _IOLIB == 3
+#define BAM_VIRTUAL_OFFSET16
+#include "razf.h"
+/*! @abstract BAM file handler */
+typedef RAZF *bamFile;
+#define bam_open(fn, mode) razf_open2(fn, mode)
+#define bam_dopen(fd, mode) razf_dopen2(fd, mode)
+#define bam_close(fp) razf_close(fp)
+#define bam_read(fp, buf, size) razf_read(fp, buf, size)
+#define bam_write(fp, buf, size) razf_write(fp, buf, size)
+#define bam_tell(fp) razf_tell2(fp)
+#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir)
+#endif
+
+/*! @typedef
+  @abstract Structure for the alignment header.
+  @field n_targets   number of reference sequences
+  @field target_name names of the reference sequences
+  @field target_len  lengths of the reference sequences
+  @field hash        hash table for fast name lookup
+  @field rg2lib      hash table for @RG-ID -> LB lookup
+  @field l_text      length of the plain text in the header
+  @field text        plain text
+
+  @discussion Field hash points to null by default. It is a private
+  member. target_name and target_len are parallel arrays with n_targets
+  entries each. bam_header_read() allocates text with one extra zero
+  byte beyond l_text.
+ */
+typedef struct {
+       int32_t n_targets;
+       char **target_name;
+       uint32_t *target_len;
+       void *hash, *rg2lib;
+       int l_text;
+       char *text;
+} bam_header_t;
+
+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
+#define BAM_FPAIRED        1
+/*! @abstract the read is mapped in a proper pair */
+#define BAM_FPROPER_PAIR   2
+/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
+#define BAM_FUNMAP         4
+/*! @abstract the mate is unmapped */
+#define BAM_FMUNMAP        8
+/*! @abstract the read is mapped to the reverse strand */
+#define BAM_FREVERSE      16
+/*! @abstract the mate is mapped to the reverse strand */
+#define BAM_FMREVERSE     32
+/*! @abstract this is read1 */
+#define BAM_FREAD1        64
+/*! @abstract this is read2 */
+#define BAM_FREAD2       128
+/*! @abstract not primary alignment */
+#define BAM_FSECONDARY   256
+/*! @abstract QC failure */
+#define BAM_FQCFAIL      512
+/*! @abstract optical or PCR duplicate */
+#define BAM_FDUP        1024
+
+/*! @abstract default mask for pileup */
+#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
+
+#define BAM_CORE_SIZE   sizeof(bam1_core_t)
+
+/**
+ * Describing how CIGAR operation/length is packed in a 32-bit integer.
+ */
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+
+/*
+  CIGAR operations.
+ */
+/*! @abstract CIGAR: match */
+#define BAM_CMATCH      0
+/*! @abstract CIGAR: insertion to the reference */
+#define BAM_CINS        1
+/*! @abstract CIGAR: deletion from the reference */
+#define BAM_CDEL        2
+/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
+#define BAM_CREF_SKIP   3
+/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
+#define BAM_CSOFT_CLIP  4
+/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
+#define BAM_CHARD_CLIP  5
+/*! @abstract CIGAR: padding */
+#define BAM_CPAD        6
+
+/*! @typedef
+  @abstract Structure for core alignment information.
+  @field  tid     chromosome ID, defined by bam_header_t
+  @field  pos     0-based leftmost coordinate
+  @field  bin     bin calculated by bam_reg2bin()
+  @field  qual    mapping quality
+  @field  l_qname length of the query name
+  @field  flag    bitwise flag
+  @field  n_cigar number of CIGAR operations
+  @field  l_qseq  length of the query sequence (read)
+  @field  mtid    index into bam_header_t::target_name used for the mate
+                  reference column; negative values are printed as '*'
+  @field  mpos    0-based coordinate printed as mpos + 1 in the mate
+                  position column
+  @field  isize   value printed in the SAM column after the mate position
+                  (presumably the insert size -- confirm)
+
+  @discussion The strand is not a separate member: it is the
+  BAM_FREVERSE bit of flag, see bam1_strand(). The members occupy
+  exactly 32 bytes (BAM_CORE_SIZE), which bam_read1() and
+  bam_write1_core() assert.
+ */
+typedef struct {
+       int32_t tid;
+       int32_t pos;
+       uint32_t bin:16, qual:8, l_qname:8;
+       uint32_t flag:16, n_cigar:16;
+       int32_t l_qseq;
+       int32_t mtid;
+       int32_t mpos;
+       int32_t isize;
+} bam1_core_t;
+
+/*! @typedef
+  @abstract Structure for one alignment.
+  @field  core       core information about the alignment
+  @field  l_aux      length of auxiliary data
+  @field  data_len   current length of bam1_t::data
+  @field  m_data     maximum length of bam1_t::data
+  @field  data       all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+
+  @discussion Notes:
+   1. qname is zero tailing and core.l_qname includes the tailing '\0'.
+   2. l_qseq is calculated from the total length of an alignment block
+      on reading or from CIGAR.
+   3. The layout of data is the one implemented by the bam1_qname(),
+      bam1_cigar(), bam1_seq(), bam1_qual() and bam1_aux() macros: the
+      query name comes first and the CIGAR starts at offset l_qname.
+ */
+typedef struct {
+       bam1_core_t core;
+       int l_aux, data_len, m_data;
+       uint8_t *data;
+} bam1_t;
+
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
+#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
+
+/*! @function
+  @abstract  Get the CIGAR array
+  @param  b  pointer to an alignment
+  @return    pointer to the CIGAR array
+
+  @discussion In the CIGAR array, each element is a 32-bit integer. The
+  lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
+  length of a CIGAR.
+ */
+#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
+
+/*! @function
+  @abstract  Get the name of the query
+  @param  b  pointer to an alignment
+  @return    pointer to the name string, null terminated
+ */
+#define bam1_qname(b) ((char*)((b)->data))
+
+/*! @function
+  @abstract  Get query sequence
+  @param  b  pointer to an alignment
+  @return    pointer to sequence
+
+  @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+  8 for T and 15 for N. Two bases are packed in one byte with the base
+  at the higher 4 bits having smaller coordinate on the read. It is
+  recommended to use bam1_seqi() macro to get the base.
+ */
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
+
+/*! @function
+  @abstract  Get query quality
+  @param  b  pointer to an alignment
+  @return    pointer to quality string
+ */
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
+
+/*! @function
+  @abstract  Get a base on read
+  @param  s  Query sequence returned by bam1_seq()
+  @param  i  The i-th position, 0-based
+  @return    4-bit integer representing the base.
+ */
+#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+
+/*! @function
+  @abstract  Get auxiliary data
+  @param  b  pointer to an alignment
+  @return    pointer to the concatenated auxiliary data
+ */
+#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
+
+#ifndef kroundup32
+/*! @function
+  @abstract  Round an integer to the next closest power-2 integer.
+  @param  x  integer to be rounded (in place)
+  @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*!
+  @abstract Whether the machine is big-endian; modified only in
+  bam_header_init().
+ */
+extern int bam_is_be;
+
+/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
+extern unsigned char bam_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
+extern char *bam_nt16_rev_table;
+
+extern char bam_nt16_nt4_table[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /*! @abstract TAM file handler */
+       typedef struct __tamFile_t *tamFile;
+
+       /*!
+         @abstract   Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
+         @param  fn  SAM file name
+         @return     SAM file handler
+        */
+       tamFile sam_open(const char *fn);
+
+       /*!
+         @abstract   Close a SAM file handler
+         @param  fp  SAM file handler
+        */
+       void sam_close(tamFile fp);
+
+       /*!
+         @abstract      Read one alignment from a SAM file handler
+         @param  fp     SAM file handler
+         @param  header header information (ordered names of chromosomes)
+         @param  b      read alignment; all members in b will be updated
+         @return        0 if successful; otherwise negative
+        */
+       int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
+
+       /*!
+         @abstract       Read header information from a TAB-delimited list file.
+         @param  fn_list file name for the list
+         @return         a pointer to the header structure
+
+         @discussion Each line in this file consists of chromosome name and
+         the length of chromosome.
+        */
+       bam_header_t *sam_header_read2(const char *fn_list);
+
+       /*!
+         @abstract       Read header from a SAM file (if present)
+         @param  fp      SAM file handler
+         @return         pointer to header struct; 0 if no @SQ lines available
+        */
+       bam_header_t *sam_header_read(tamFile fp);
+
+       /*!
+         @abstract       Parse @SQ lines and update a header struct
+         @param  h       pointer to the header struct to be updated
+         @return         number of target sequences
+
+         @discussion bam_header_t::{n_targets,target_len,target_name} will
+         be destroyed in the first place.
+        */
+       int sam_header_parse(bam_header_t *h);
+
+       /*!
+         @abstract       Parse @RG lines and update a header struct
+         @param  h       pointer to the header struct to be updated
+         @return         number of @RG lines
+
+         @discussion bam_header_t::rg2lib will be destroyed in the first
+         place.
+        */
+       int sam_header_parse_rg(bam_header_t *h);
+
+#define sam_write1(header, b) bam_view1(header, b)
+
+       int bam_strmap_put(void *strmap, const char *rg, const char *lib);
+       const char *bam_strmap_get(const void *strmap, const char *rg);
+       void *bam_strmap_dup(const void*);
+       void *bam_strmap_init();
+       void bam_strmap_destroy(void *strmap);
+
+       /*!
+         @abstract Initialize a header structure.
+         @return   the pointer to the header structure
+
+         @discussion This function also modifies the global variable
+         bam_is_be.
+        */
+       bam_header_t *bam_header_init();
+
+       /*!
+         @abstract        Destroy a header structure.
+         @param  header  pointer to the header
+        */
+       void bam_header_destroy(bam_header_t *header);
+
+       /*!
+         @abstract   Read a header structure from BAM.
+         @param  fp  BAM file handler, opened by bam_open()
+         @return     pointer to the header structure
+
+         @discussion The file position indicator must be placed at the
+         beginning of the file. Upon success, the position indicator will
+         be set at the start of the first alignment.
+        */
+       bam_header_t *bam_header_read(bamFile fp);
+
+       /*!
+         @abstract      Write a header structure to BAM.
+         @param  fp     BAM file handler
+         @param  header pointer to the header structure
+         @return        always 0 currently
+        */
+       int bam_header_write(bamFile fp, const bam_header_t *header);
+
+       /*!
+         @abstract   Read an alignment from BAM.
+         @param  fp  BAM file handler
+         @param  b   read alignment; all members are updated.
+         @return     number of bytes read from the file
+
+         @discussion The file position indicator must be
+         placed right before an alignment. Upon success, this function
+         will set the position indicator to the start of the next
+         alignment. This function is not affected by the machine
+         endianness.
+        */
+       int bam_read1(bamFile fp, bam1_t *b);
+
+       /*!
+         @abstract Write an alignment to BAM.
+         @param  fp       BAM file handler
+         @param  c        pointer to the bam1_core_t structure
+         @param  data_len total length of variable size data related to
+                          the alignment
+         @param  data     pointer to the concatenated data
+         @return          number of bytes written to the file
+
+         @discussion This function is not affected by the machine
+         endianness.
+        */
+       int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
+
+       /*!
+         @abstract   Write an alignment to BAM.
+         @param  fp  BAM file handler
+         @param  b   alignment to write
+         @return     number of bytes written to the file
+
+         @discussion It is equivalent to:
+           bam_write1_core(fp, &b->core, b->data_len, b->data)
+        */
+       int bam_write1(bamFile fp, const bam1_t *b);
+
+       /*! @function
+         @abstract  Initiate a pointer to bam1_t struct
+        */
+#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
+
+       /*! @function
+         @abstract  Free the memory allocated for an alignment.
+         @param  b  pointer to an alignment
+        */
+#define bam_destroy1(b) do {           \
+               free((b)->data); free(b);       \
+       } while (0)
+
+       /*!
+         @abstract       Format a BAM record in the SAM format
+         @param  header  pointer to the header structure
+         @param  b       alignment to print
+         @return         a pointer to the SAM string
+        */
+       char *bam_format1(const bam_header_t *header, const bam1_t *b);
+
+       /*! @typedef
+         @abstract Structure for one alignment covering the pileup position.
+         @field  b      pointer to the alignment
+         @field  qpos   position of the read base at the pileup site, 0-based
+         @field  indel  indel length; 0 for no indel, positive for ins and negative for del
+         @field  is_del 1 iff the base on the padded read is a deletion
+         @field  level  the level of the read in the "viewer" mode
+         @field  is_head 1-bit flag; presumably set when the pileup site is the
+                        first base of the read -- confirm against bam_pileup.c
+         @field  is_tail 1-bit flag; presumably set when the pileup site is the
+                        last base of the read -- confirm against bam_pileup.c
+
+         @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+         difference between the two functions is that the former does not
+         set bam_pileup1_t::level, while the latter does. Level helps the
+         implementation of alignment viewers, but calculating this has some
+         overhead.
+        */
+       typedef struct {
+               bam1_t *b;
+               int32_t qpos;
+               int indel, level;
+               uint32_t is_del:1, is_head:1, is_tail:1;
+       } bam_pileup1_t;
+
+       struct __bam_plbuf_t;
+       /*! @abstract pileup buffer */
+       typedef struct __bam_plbuf_t bam_plbuf_t;
+
+       void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+
+       /*! @typedef
+         @abstract    Type of function to be called by bam_plbuf_push().
+         @param  tid  chromosome ID as is defined in the header
+         @param  pos  start coordinate of the alignment, 0-based
+         @param  n    number of elements in pl array
+         @param  pl   array of alignments
+         @param  data user provided data
+         @discussion  See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
+        */
+       typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
+
+       /*!
+         @abstract     Reset a pileup buffer for another pileup process
+         @param  buf   the pileup buffer to be reset
+        */
+       void bam_plbuf_reset(bam_plbuf_t *buf);
+
+       /*!
+         @abstract     Initialize a buffer for pileup.
+         @param  func  function to be called by bam_pileup_core()
+         @param  data  user provided data
+         @return       pointer to the pileup buffer
+        */
+       bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
+
+       /*!
+         @abstract    Destroy a pileup buffer.
+         @param  buf  pointer to the pileup buffer
+        */
+       void bam_plbuf_destroy(bam_plbuf_t *buf);
+
+       /*!
+         @abstract    Push an alignment to the pileup buffer.
+         @param  b    alignment to be pushed
+         @param  buf  pileup buffer
+         @see         bam_plbuf_init()
+         @return      always 0 currently
+
+         @discussion If all the alignments covering a particular site have
+         been collected, this function will call the user defined function
+         as is provided to bam_plbuf_init(). The coordinate of the site and
+         all the alignments will be transferred to the user defined
+         function as function parameters.
+        
+         When all the alignments are pushed to the buffer, this function
+         needs to be called with b equal to NULL. This will flush the
+         buffer. A pileup buffer can only be reused when bam_plbuf_reset()
+         is called.
+        */
+       int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
+
+       struct __bam_lplbuf_t;
+       typedef struct __bam_lplbuf_t bam_lplbuf_t;
+
+       void bam_lplbuf_reset(bam_lplbuf_t *buf);
+
+       /*! @abstract  bam_plbuf_init() equivalent with level calculated. */
+       bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
+
+       /*! @abstract  bam_plbuf_destroy() equivalent with level calculated. */
+       void bam_lplbuf_destroy(bam_lplbuf_t *tv);
+
+       /*! @abstract  bam_plbuf_push() equivalent with level calculated. */
+       int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
+
+       /*! @abstract  bam_plbuf_file() equivalent with level calculated. */
+       int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
+
+       struct __bam_index_t;
+       typedef struct __bam_index_t bam_index_t;
+
+       /*!
+         @abstract   Build index for a BAM file.
+         @discussion Index file "fn.bai" will be created.
+         @param  fn  name of the BAM file
+         @return     always 0 currently
+        */
+       int bam_index_build(const char *fn);
+
+       /*!
+         @abstract   Load index from file "fn.bai".
+         @param  fn  name of the BAM file (NOT the index file)
+         @return     pointer to the index structure
+        */
+       bam_index_t *bam_index_load(const char *fn);
+
+       /*!
+         @abstract    Destroy an index structure.
+         @param  idx  pointer to the index structure
+        */
+       void bam_index_destroy(bam_index_t *idx);
+
+       /*! @typedef
+         @abstract      Type of function to be called by bam_fetch().
+         @param  b     the alignment
+         @param  data  user provided data
+        */
+       typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
+
+       /*!
+         @abstract Retrieve the alignments that overlap the specified
+         region.
+
+         @discussion A user defined function will be called for each
+         retrieved alignment ordered by its start position.
+
+         @param  fp    BAM file handle
+         @param  idx   pointer to the alignment index
+         @param  tid   chromosome ID as is defined in the header
+         @param  beg   start coordinate, 0-based
+         @param  end   end coordinate, 0-based
+         @param  data  user provided data (will be transferred to func)
+         @param  func  user defined function
+        */
+       int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
+
+       /*!
+         @abstract       Parse a region in the format: "chr2:100,000-200,000".
+         @discussion     bam_header_t::hash will be initialized if empty.
+         @param  header  pointer to the header structure
+         @param  str     string to be parsed
+         @param  ref_id  the returned chromosome ID
+         @param  begin   the returned start coordinate
+         @param  end     the returned end coordinate
+         @return         0 on success; -1 on failure
+        */
+       int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+
+       /*!
+         @abstract       Retrieve data of a tag
+         @param  b       pointer to an alignment struct
+         @param  tag     two-character tag to be retrieved
+
+         @return  pointer to the type and data. The first character is the
+         type that can be 'iIsScCdfAZH'.
+
+         @discussion  Use bam_aux2?() series to convert the returned data to
+         the corresponding type.
+       */
+       uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
+
+       int32_t bam_aux2i(const uint8_t *s);
+       float bam_aux2f(const uint8_t *s);
+       double bam_aux2d(const uint8_t *s);
+       char bam_aux2A(const uint8_t *s);
+       char *bam_aux2Z(const uint8_t *s);
+
+       void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
+
+       uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
+
+       /*!  
+         @abstract Calculate the rightmost coordinate of an alignment on the
+         reference genome.
+
+         @param  c      pointer to the bam1_core_t structure
+         @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
+         @return        the rightmost coordinate, 0-based
+       */
+       uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
+
+       /*!
+         @abstract      Calculate the length of the query sequence from CIGAR.
+         @param  c      pointer to the bam1_core_t structure
+         @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
+         @return        length of the query sequence
+       */
+       int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
+
+       /* Three begin/end pairs; bam_segreg() takes a pointer to one of these
+        * as its last argument.
+        * NOTE(review): presumably q = query, t = target/reference and
+        * c = CIGAR index -- inferred from the field names only; confirm
+        * against the bam_segreg() implementation. */
+       typedef struct {
+               int32_t qbeg, qend;
+               int32_t tbeg, tend;
+               int32_t cbeg, cend;
+       } bam_segreg_t;
+
+       int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*!
+  @abstract    Calculate the smallest bin that contains a region [beg,end).
+  @param  beg  start of the region, 0-based
+  @param  end  end of the region, 0-based (exclusive)
+  @return      bin number; 0 when no finer level holds the whole region
+ */
+static inline int bam_reg2bin(uint32_t beg, uint32_t end)
+{
+       // one entry per level, finest first: window shift and first bin number
+       static const int shift[5]  = { 14, 17, 20, 23, 26 };
+       static const int offset[5] = { 4681, 585, 73, 9, 1 };
+       const uint32_t last = end - 1; // last position covered by the region
+       int lv;
+       for (lv = 0; lv < 5; ++lv) {
+               if ((beg >> shift[lv]) == (last >> shift[lv]))
+                       return offset[lv] + (int)(beg >> shift[lv]);
+       }
+       return 0;
+}
+
+/*!
+  @abstract     Copy an alignment into an existing alignment struct
+  @param  bdst  destination alignment struct; its data buffer is reused
+                and enlarged when it is smaller than the source buffer
+  @param  bsrc  source alignment struct
+  @return       pointer to the destination alignment struct
+ */
+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+{
+       // the struct assignment below clobbers data/m_data, so keep the
+       // destination's own buffer and capacity aside first
+       uint8_t *buf = bdst->data;
+       int cap = bdst->m_data;
+       if (bsrc->m_data > cap) { // grow to the source capacity, rounded by kroundup32()
+               cap = bsrc->m_data;
+               kroundup32(cap);
+               buf = (uint8_t*)realloc(buf, cap);
+       }
+       memcpy(buf, bsrc->data, bsrc->data_len); // variable-length part
+       *bdst = *bsrc;                           // fixed-size part
+       bdst->data = buf;
+       bdst->m_data = cap;
+       return bdst;
+}
+
+/*!
+  @abstract     Duplicate an alignment
+  @param  src   source alignment struct
+  @return       pointer to a newly allocated alignment whose data buffer
+                is exactly src->data_len bytes long
+ */
+static inline bam1_t *bam_dup1(const bam1_t *src)
+{
+       bam1_t *dup = bam_init1();
+       *dup = *src; // copies the fixed-size part; data is replaced just below
+       dup->m_data = src->data_len;
+       dup->data = (uint8_t*)calloc(src->data_len, 1);
+       memcpy(dup->data, src->data, src->data_len);
+       return dup;
+}
+
+#endif
diff --git a/bam_aux.c b/bam_aux.c
new file mode 100644 (file)
index 0000000..7482500
--- /dev/null
+++ b/bam_aux.c
@@ -0,0 +1,232 @@
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+typedef char *str_p;
+KHASH_MAP_INIT_STR(s, int)
+KHASH_MAP_INIT_STR(r2l, str_p)
+
+/* Append one auxiliary field to the end of b->data.
+ * The field is written as tag[0], tag[1], type, followed by len bytes copied
+ * from data; data_len and l_aux both grow by 3 + len and the buffer is
+ * enlarged when necessary. */
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+       const int tail = b->data_len; // offset at which the new field starts
+       const int added = 3 + len;    // 2 tag bytes + 1 type byte + payload
+       uint8_t *dst;
+       b->data_len += added;
+       b->l_aux += added;
+       if (b->data_len > b->m_data) {
+               b->m_data = b->data_len;
+               kroundup32(b->m_data);
+               b->data = (uint8_t*)realloc(b->data, b->m_data);
+       }
+       dst = b->data + tail;
+       dst[0] = tag[0];
+       dst[1] = tag[1];
+       dst[2] = type;
+       memcpy(dst + 3, data, len);
+}
+
+/* Alias of bam_aux_get() taking a non-const alignment; simply forwards the
+ * call and returns its result. */
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
+{
+       return bam_aux_get(b, tag);
+}
+
+/* Look up an auxiliary field by its two-character tag.
+ * Scans the auxiliary area of b field by field. Returns a pointer to the
+ * type byte of the first field whose tag matches (the value follows it),
+ * or 0 when no field carries that tag. */
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+{
+       uint8_t *s;
+       int y = tag[0]<<8 | tag[1];
+       s = bam1_aux(b);
+       while (s < b->data + b->data_len) {
+               int type, x = (int)s[0]<<8 | s[1];
+               s += 2;
+               if (x == y) return s;
+               // not the wanted tag: step over the type byte and its payload
+               type = toupper(*s); ++s;
+               if (type == 'C' || type == 'A') ++s; // 1-byte payload; 'A' must be skipped too or the scan goes out of step
+               else if (type == 'S') s += 2;
+               else if (type == 'I' || type == 'F') s += 4;
+               else if (type == 'D') s += 8;
+               else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } // NUL-terminated string
+       }
+       return 0;
+}
+
+/* Build header->hash, a map from target name to target index, unless it
+ * already exists. The keys are the header's own target_name pointers. */
+void bam_init_header_hash(bam_header_t *header)
+{
+       khash_t(s) *tbl;
+       int idx, absent;
+       if (header->hash != 0) return; // already built
+       tbl = kh_init(s);
+       header->hash = tbl;
+       for (idx = 0; idx < header->n_targets; ++idx) {
+               khiter_t slot = kh_put(s, tbl, header->target_name[idx], &absent);
+               kh_value(tbl, slot) = idx;
+       }
+}
+
+/* Destroy the name-to-index table stored in header->hash, if there is one.
+ * header->hash itself is left untouched. */
+void bam_destroy_header_hash(bam_header_t *header)
+{
+       khash_t(s) *tbl = (khash_t(s)*)header->hash;
+       if (tbl == 0) return;
+       kh_destroy(s, tbl);
+}
+
+/* Return the index of seq_name in header->hash, or -1 when the name is not
+ * present. The table is used as is; it is not built here. */
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
+{
+       khash_t(s) *tbl = (khash_t(s)*)header->hash;
+       khint_t slot = kh_get(s, tbl, seq_name);
+       if (slot == kh_end(tbl)) return -1;
+       return kh_value(tbl, slot);
+}
+
+/* Parse a region string such as "chr2:100,000-200,000".
+ * Commas and white space are ignored. The reference name is looked up in
+ * header->hash, which is built on demand. *begin is returned 0-based; a
+ * bare reference name or a missing end coordinate yields *end = 1<<29.
+ * Returns 0 on success, -1 when the name is unknown (*ref_id is then -1)
+ * or when begin is greater than end. */
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end)
+{
+       char *s, *p;
+       int i, l, k;
+       khiter_t iter;
+       khash_t(s) *h;
+
+       bam_init_header_hash(header);
+       h = (khash_t(s)*)header->hash;
+
+       l = strlen(str);
+       s = (char*)malloc(l+1);
+       /* squeeze out "," and white space */
+       for (i = k = 0; i != l; ++i)
+               if (str[i] != ',' && !isspace((unsigned char)str[i])) s[k++] = str[i]; // <ctype.h> needs an unsigned char value
+       s[k] = 0;
+       for (i = 0; i != k; ++i) if (s[i] == ':') break;
+       s[i] = 0; // terminate the name at ':' (no-op when there is none)
+       iter = kh_get(s, h, s); /* get the ref_id */
+       if (iter == kh_end(h)) { // name not found
+               *ref_id = -1; free(s);
+               return -1;
+       }
+       *ref_id = kh_value(h, iter);
+       if (i == k) { /* no ':' -- the whole sequence, which is a valid region */
+               *begin = 0; *end = 1<<29; free(s);
+               return 0;
+       }
+       for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+       *begin = atoi(p);
+       if (i < k) {
+               p = s + i + 1;
+               *end = atoi(p);
+       } else *end = 1<<29; // open-ended region
+       if (*begin > 0) --*begin; // 1-based input to 0-based output
+       free(s);
+       if (*begin > *end) {
+               fprintf(stderr, "[bam_parse_region] invalid region.\n");
+               return -1;
+       }
+       return 0;
+}
+
+/* Decode an integer auxiliary value.
+ * s points at the type byte (as returned by bam_aux_get()). Types
+ * 'c', 'C', 's', 'S', 'i' and 'I' are widened to int32_t; 0 is returned
+ * when s is NULL or the type is anything else. */
+int32_t bam_aux2i(const uint8_t *s)
+{
+       int type;
+       if (s == 0) return 0;
+       type = *s++;
+       // multi-byte values are copied out with memcpy(): they follow a
+       // 3-byte tag/type prefix, so s is generally not suitably aligned
+       if (type == 'c') return (int32_t)*(const int8_t*)s;
+       else if (type == 'C') return (int32_t)*s;
+       else if (type == 's') { int16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+       else if (type == 'S') { uint16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+       else if (type == 'i' || type == 'I') { int32_t v; memcpy(&v, s, sizeof v); return v; }
+       else return 0;
+}
+
+/* Decode a float auxiliary value.
+ * s points at the type byte (as returned by bam_aux_get()). Returns the
+ * value when the type is 'f'; 0.0 when s is NULL or the type differs. */
+float bam_aux2f(const uint8_t *s)
+{
+       float v;
+       if (s == 0) return 0.0; // must be tested before s is dereferenced
+       if (*s != 'f') return 0.0;
+       memcpy(&v, s + 1, sizeof v); // payload may be unaligned, so copy it out
+       return v;
+}
+
+/* Decode a double auxiliary value.
+ * s points at the type byte (as returned by bam_aux_get()). Returns the
+ * value when the type is 'd'; 0.0 when s is NULL or the type differs. */
+double bam_aux2d(const uint8_t *s)
+{
+       double v;
+       if (s == 0) return 0.0; // must be tested before s is dereferenced
+       if (*s != 'd') return 0.0;
+       memcpy(&v, s + 1, sizeof v); // payload may be unaligned, so copy it out
+       return v;
+}
+
+/* Decode a single-character auxiliary value.
+ * s points at the type byte (as returned by bam_aux_get()). Returns the
+ * character when the type is 'A'; 0 when s is NULL or the type differs. */
+char bam_aux2A(const uint8_t *s)
+{
+       if (s == 0) return 0; // must be tested before s is dereferenced
+       if (*s != 'A') return 0;
+       return *(const char*)(s + 1);
+}
+
+/* Return the string payload of an auxiliary value.
+ * s points at the type byte (as returned by bam_aux_get()). For types 'Z'
+ * and 'H' the result points at the byte after the type, inside the same
+ * buffer (nothing is copied); 0 when s is NULL or the type differs. */
+char *bam_aux2Z(const uint8_t *s)
+{
+       if (s == 0) return 0; // must be tested before s is dereferenced
+       if (*s == 'Z' || *s == 'H') return (char*)(s + 1);
+       return 0;
+}
+
+/******************
+ * rg2lib related *
+ ******************/
+
+/* Insert the pair rg -> lib into the string map rg2lib.
+ * Both strings are duplicated. When rg is already present the map is left
+ * unchanged and a message is printed to stderr.
+ * Returns 1 when rg2lib is NULL, 0 otherwise. */
+int bam_strmap_put(void *rg2lib, const char *rg, const char *lib)
+{
+       khash_t(r2l) *map = (khash_t(r2l)*)rg2lib;
+       char *own_key;
+       khint_t slot;
+       int is_new;
+       if (map == 0) return 1;
+       own_key = strdup(rg);
+       slot = kh_put(r2l, map, own_key, &is_new);
+       if (!is_new) { // key already stored: drop our copy
+               fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg);
+               free(own_key);
+               return 0;
+       }
+       kh_val(map, slot) = strdup(lib);
+       return 0;
+}
+
+/* Return the value stored for rg in the string map rg2lib, or 0 when the
+ * map is NULL or has no such key. The returned string belongs to the map. */
+const char *bam_strmap_get(const void *rg2lib, const char *rg)
+{
+       const khash_t(r2l) *map = (const khash_t(r2l)*)rg2lib;
+       khint_t slot;
+       if (map == 0) return 0;
+       slot = kh_get(r2l, map, rg);
+       if (slot == kh_end(map)) return 0;
+       return (const char*)kh_val(map, slot);
+}
+
+/* Deep-copy a string map: every key and every value is duplicated into a
+ * freshly created map. Returns 0 when rg2lib is NULL. */
+void *bam_strmap_dup(const void *rg2lib)
+{
+       const khash_t(r2l) *src = (const khash_t(r2l)*)rg2lib;
+       khash_t(r2l) *copy;
+       khint_t i;
+       int absent;
+       if (src == 0) return 0;
+       copy = kh_init(r2l);
+       for (i = kh_begin(src); i < kh_end(src); ++i) {
+               char *key;
+               khint_t slot;
+               if (!kh_exist(src, i)) continue; // empty or deleted bucket
+               key = strdup(kh_key(src, i));
+               slot = kh_put(r2l, copy, key, &absent);
+               kh_val(copy, slot) = strdup(kh_val(src, i));
+       }
+       return copy;
+}
+
+/* Create an empty string map (a khash table of kind r2l) and return it as
+ * an opaque pointer for use with the other bam_strmap_*() functions. */
+void *bam_strmap_init()
+{
+       return (void*)kh_init(r2l);
+}
+
+/* Free a string map together with every key and value it holds.
+ * A NULL map is accepted and ignored. */
+void bam_strmap_destroy(void *rg2lib)
+{
+       khash_t(r2l) *map = (khash_t(r2l)*)rg2lib;
+       khint_t i;
+       if (map == 0) return;
+       for (i = kh_begin(map); i < kh_end(map); ++i) {
+               if (!kh_exist(map, i)) continue; // empty or deleted bucket
+               free((char*)kh_key(map, i));
+               free(kh_val(map, i));
+       }
+       kh_destroy(r2l, map);
+}
diff --git a/bam_color.c b/bam_color.c
new file mode 100644 (file)
index 0000000..75aedd6
--- /dev/null
@@ -0,0 +1,127 @@
+#include <ctype.h>
+#include "bam.h"
+
+/*!
+ @abstract     Get the color encoding the previous and current base
+ @param b      pointer to an alignment
+ @param i      The i-th position, 0-based
+ @return       color
+
+ @discussion   Returns 0 if no color information (CS tag) is found.
+ */
+char bam_aux_getCSi(bam1_t *b, int i)
+{
+       uint8_t *aux = bam_aux_get(b, "CS");
+       char *colors;
+
+       // no CS tag on this alignment
+       if (aux == 0) return 0;
+
+       colors = bam_aux2Z(aux);
+       if (bam1_strand(b)) {
+               // reverse strand: count back from the last character
+               i = strlen(colors) - 1 - i;
+       } else {
+               // forward strand: step over the leading adaptor character
+               ++i;
+       }
+       return colors[i];
+}
+
+/*!
+ @abstract     Get the quality of the color encoding the previous and current base
+ @param b      pointer to an alignment
+ @param i      The i-th position, 0-based
+ @return       color quality
+
+ @discussion   Returns 0 if no color quality information (CQ tag) is found.
+ */
+char bam_aux_getCQi(bam1_t *b, int i)
+{
+       uint8_t *aux = bam_aux_get(b, "CQ");
+       char *quals;
+
+       // no CQ tag on this alignment
+       if (aux == 0) return 0;
+
+       quals = bam_aux2Z(aux);
+       if (bam1_strand(b)) {
+               // reverse strand: count back from the last character
+               i = strlen(quals) - 1 - i;
+       }
+       return quals[i];
+}
+
+/* Map a nucleotide letter (either case) to a small integer:
+ * A -> 0, C -> 1, G -> 2, T -> 3, anything else -> 4. */
+char bam_aux_nt2int(char a)
+{
+       const int up = toupper(a);
+       if (up == 'A') return 0;
+       if (up == 'C') return 1;
+       if (up == 'G') return 2;
+       if (up == 'T') return 3;
+       return 4;
+}
+
+/* Return the color character for the transition between bases a and b:
+ * '0'..'3', chosen by XOR-ing the two base codes from bam_aux_nt2int(),
+ * or '4' when either base is not one of A, C, G, T. */
+char bam_aux_ntnt2cs(char a, char b)
+{
+       const char first = bam_aux_nt2int(a);
+       const char second = bam_aux_nt2int(b);
+       if (first == 4 || second == 4) return '4';
+       return "0123"[(int)(first ^ second)];
+}
+
+/*!
+ @abstract     Get the color error profile at the given position
+ @param b      pointer to an alignment
+ @param i      The i-th position, 0-based
+ @return       the original color if the color was an error, '-' (dash) otherwise
+
+ @discussion   Returns 0 if no color information (CS tag) is found. The
+ color read from the CS string is compared with the color recomputed from
+ the two adjacent bases of the stored sequence.
+ */
+char bam_aux_getCEi(bam1_t *b, int i)
+{
+       int cs_i;
+       uint8_t *c = bam_aux_get(b, "CS");
+       char *cs = NULL;
+       char prev_b, cur_b;
+       char cur_color, cor_color;
+
+       // return 0 if the CS tag was not found
+       if(0 == c) return 0;
+       
+       cs = bam_aux2Z(c);
+
+       // adjust for strandedness and leading adaptor
+       if(bam1_strand(b)) { //reverse strand
+               cs_i = strlen(cs) - 1 - i;
+               // get current color
+               cur_color = cs[cs_i];
+               // get previous base: cs[0] when the color index is 0, otherwise the base at i+1
+               prev_b = (0 == cs_i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+               // get current base
+               cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; 
+       }
+       else {
+               // forward strand: cs[0] is skipped, so the color for base i is cs[i+1]
+               cs_i=i+1;
+               // get current color
+               cur_color = cs[cs_i];
+               // get previous base: cs[0] for the first base, otherwise the base at i-1
+               prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+               // get current base
+               cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+       }
+
+       // corrected color
+       cor_color = bam_aux_ntnt2cs(prev_b, cur_b);
+
+       // report the stored color only when it disagrees with the recomputed one
+       if(cur_color == cor_color) { 
+               return '-';
+       }
+       else {
+               return cur_color;
+       }
+}
diff --git a/bam_endian.h b/bam_endian.h
new file mode 100644 (file)
index 0000000..0fc74a8
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef BAM_ENDIAN_H
+#define BAM_ENDIAN_H
+
+#include <stdint.h>
+
+/* Return 1 when the first byte in memory of the long value 1 is zero
+ * (big-endian host), 0 otherwise. */
+static inline int bam_is_big_endian()
+{
+       const long probe = 1;
+       const char *first = (const char *)&probe;
+       return *first == 0;
+}
+/* Return v with its two bytes exchanged. */
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+       const unsigned lo = v & 0xFFU;
+       const unsigned hi = (v >> 8) & 0xFFU;
+       return (uint16_t)((lo << 8) | hi);
+}
+/* Byte-swap, in place, the 16-bit value x points to; returns x. */
+static inline void *bam_swap_endian_2p(void *x)
+{
+       uint16_t *word = (uint16_t*)x;
+       *word = bam_swap_endian_2(*word);
+       return x;
+}
+/* Return v with its four bytes in reverse order. */
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+       return (v >> 24)
+               | ((v >> 8) & 0x0000FF00U)
+               | ((v << 8) & 0x00FF0000U)
+               | (v << 24);
+}
+/* Byte-swap, in place, the 32-bit value x points to; returns x. */
+static inline void *bam_swap_endian_4p(void *x)
+{
+       uint32_t *word = (uint32_t*)x;
+       *word = bam_swap_endian_4(*word);
+       return x;
+}
+/* Return v with its eight bytes in reverse order. */
+static inline uint64_t bam_swap_endian_8(uint64_t v)
+{
+       uint64_t out = 0;
+       int n;
+       // peel bytes off the low end of v and push them onto the low end of out
+       for (n = 0; n < 8; ++n) {
+               out = (out << 8) | (v & 0xFFU);
+               v >>= 8;
+       }
+       return out;
+}
+/* Byte-swap, in place, the 64-bit value x points to; returns x. */
+static inline void *bam_swap_endian_8p(void *x)
+{
+       uint64_t *word = (uint64_t*)x;
+       *word = bam_swap_endian_8(*word);
+       return x;
+}
+
+#endif
diff --git a/bam_import.c b/bam_import.c
new file mode 100644 (file)
index 0000000..fccaa02
--- /dev/null
@@ -0,0 +1,475 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#include "kstring.h"
+#include "bam.h"
+#include "kseq.h"
+#include "khash.h"
+
+KSTREAM_INIT(gzFile, gzread, 8192)
+KHASH_MAP_INIT_STR(ref, uint64_t)
+
+void bam_init_header_hash(bam_header_t *header);
+void bam_destroy_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+/* Lookup table from a character code (0..255) to a 4-bit base code.
+ * '=' maps to 0; 'A', 'C', 'G', 'T' map to 1, 2, 4, 8 (as do the digits
+ * '0'..'3'); the other listed letters map to the remaining values in both
+ * upper and lower case; every unlisted character maps to 15. */
+unsigned char bam_nt16_table[256] = {
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+        1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+/* Inverse of bam_nt16_table: the character for each 4-bit base code
+ * (index 0 is '=', index 15 is 'N'). */
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
+
+/* State of a text alignment file being read. */
+struct __tamFile_t {
+       gzFile fp;        // underlying zlib file handle
+       kstream_t *ks;    // buffered stream the readers call ks_getuntil() on
+       kstring_t *str;   // scratch buffer holding the most recently read field
+       uint64_t n_lines; // lines consumed so far; passed to parse_error() for messages
+       int is_first;     // set by sam_header_read(); tells sam_read1() that str already holds the first field
+};
+
+/* Read a file through zlib ("-" means stdin) into an array of
+ * heap-allocated strings, one per line; a trailing '\r' is removed from
+ * each line. The number of lines is stored in *_n and the array is
+ * returned (it stays 0 when no line was read). The array and every string
+ * in it are allocated here and handed to the caller.
+ * NOTE(review): the result of gzopen()/gzdopen() is not checked before use.
+ * NOTE(review): the loop stops as soon as ks_getuntil() returns a value
+ * <= 0 -- presumably that also happens on an empty line; confirm in kseq.h. */
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
+{
+       char **list = 0, *s;
+       int n = 0, dret, m = 0;
+       gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+       kstream_t *ks;
+       kstring_t *str;
+       str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, '\n', str, &dret) > 0) {
+               if (n == m) { // list is full: start at 16 slots, then double
+                       m = m? m << 1 : 16;
+                       list = (char**)realloc(list, m * sizeof(char*));
+               }
+               if (str->s[str->l-1] == '\r') // strip the CR of a CRLF line ending
+                       str->s[--str->l] = '\0';
+               s = list[n++] = (char*)calloc(str->l + 1, 1);
+               strcpy(s, str->s);
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       *_n = n;
+       return list;
+}
+
+/* Build a header from a name -> packed-value table.
+ * Each value holds the target index in its low 32 bits and the target
+ * length in its high 32 bits. The name pointers stored in the header are
+ * the table's key pointers themselves (they are not duplicated). The
+ * header's own name -> index hash is initialised before returning. */
+static bam_header_t *hash2header(const kh_ref_t *hash)
+{
+       bam_header_t *header;
+       khiter_t k;
+       header = bam_header_init();
+       header->n_targets = kh_size(hash);
+       header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
+       header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
+       for (k = kh_begin(hash); k != kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       int i = (int)kh_value(hash, k); // low 32 bits: target index
+                       header->target_name[i] = (char*)kh_key(hash, k);
+                       header->target_len[i] = kh_value(hash, k)>>32; // high 32 bits: length
+               }
+       }
+       bam_init_header_hash(header);
+       return header;
+}
+/* Build a header from a list file ("-" means stdin) whose lines start with
+ * a sequence name followed by its length; the rest of each line is ignored.
+ * Sequences are numbered in the order they are read. Asserts that the file
+ * could be opened and prints the number of sequences loaded to stderr.
+ * NOTE(review): the delimiter 0 passed to ks_getuntil() presumably means
+ * "any white space" -- confirm in kseq.h. */
+bam_header_t *sam_header_read2(const char *fn)
+{
+       bam_header_t *header;
+       int c, dret, ret;
+       gzFile fp;
+       kstream_t *ks;
+       kstring_t *str;
+       kh_ref_t *hash;
+       khiter_t k;
+       hash = kh_init(ref);
+       fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+       assert(fp);
+       ks = ks_init(fp);
+       str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       while (ks_getuntil(ks, 0, str, &dret) > 0) {
+               char *s = strdup(str->s); // the name; ends up in header->target_name via hash2header()
+               int len, i;
+               i = kh_size(hash); // index of this sequence = number of entries stored so far
+               ks_getuntil(ks, 0, str, &dret);
+               len = atoi(str->s);
+               k = kh_put(ref, hash, s, &ret);
+               kh_value(hash, k) = (uint64_t)len<<32 | i; // pack length (high) and index (low)
+               if (dret != '\n') // discard the remainder of the line
+                       while ((c = ks_getc(ks)) != '\n' && c != -1);
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+       header = hash2header(hash);
+       kh_destroy(ref, hash); // the key strings are not freed here; the header still points at them
+       return header;
+}
+/* Make sure b->data can hold at least size bytes and return b->data.
+ * When the buffer is too small, the capacity becomes size rounded up by
+ * kroundup32() and the buffer is reallocated. */
+static inline uint8_t *alloc_data(bam1_t *b, int size)
+{
+       if (size <= b->m_data) return b->data; // already large enough
+       b->m_data = size;
+       kroundup32(b->m_data);
+       b->data = (uint8_t*)realloc(b->data, b->m_data);
+       return b->data;
+}
+/* Report a parse error for the given line number on stderr and abort(). */
+static inline void parse_error(int64_t n_lines, const char * __restrict msg)
+{
+       const long long line_no = (long long)n_lines;
+       fprintf(stderr, "Parse error at line %lld: %s\n", line_no, msg);
+       abort();
+}
+/* Append str->l + 1 bytes of str->s to header->text and keep the text
+ * NUL-terminated. The extra byte is the delimiter that the caller has
+ * stored at str->s[str->l]. The text buffer is reallocated whenever the
+ * rounded-up size needed after the append exceeds the rounded-up current
+ * length. */
+static inline void append_text(bam_header_t *header, kstring_t *str)
+{
+       int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
+       kroundup32(x); kroundup32(y); // x: rounded current length, y: rounded size required
+       if (x < y) header->text = (char*)realloc(header->text, y);
+       strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
+       header->l_text += str->l + 1;
+       header->text[header->l_text] = 0;
+}
+
+/* Rebuild h->rg2lib from the @RG records found in h->text.
+ * Any existing map is destroyed first. For each "@RG" occurrence the text
+ * after "ID:" and after "LB:" (up to a tab, CR, LF or NUL) is taken as key
+ * and value. Parsing stops with a message on stderr at the first record
+ * for which no ID or LB tag can be found.
+ * Returns the number of pairs handed to bam_strmap_put(); when that is 0,
+ * h->rg2lib is left as 0. Returns 0 immediately when h is NULL.
+ * NOTE(review): strstr() searches from the record to the end of the text,
+ * so a tag may be picked up from a later line -- confirm this is intended. */
+int sam_header_parse_rg(bam_header_t *h)
+{
+       kstring_t *rgid, *rglib;
+       char *p, *q, *s, *r;
+       int n = 0;
+
+       // free
+       if (h == 0) return 0;
+       bam_strmap_destroy(h->rg2lib); h->rg2lib = 0;
+       if (h->l_text < 3) return 0;
+       // parse @RG lines
+       h->rg2lib = bam_strmap_init();
+       rgid = calloc(1, sizeof(kstring_t));
+       rglib = calloc(1, sizeof(kstring_t));
+       s = h->text;
+       while ((s = strstr(s, "@RG")) != 0) {
+               // store the pair collected from the previous record, if complete
+               if (rgid->l && rglib->l) {
+                       bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+                       ++n;
+               }
+               rgid->l = rglib->l = 0;
+               s += 3;
+               r = s; // r tracks the furthest position parsed in this record
+               if ((p = strstr(s, "ID:")) != 0) {
+                       q = p + 3;
+                       for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+                       kputsn(q, p - q, rgid);
+               } else {
+                       fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n");
+                       break;
+               }
+               if (r < p) r = p;
+               if ((p = strstr(s, "LB:")) != 0) {
+                       q = p + 3;
+                       for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+                       kputsn(q, p - q, rglib);
+               } else {
+                       fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n");
+                       break;
+               }
+               if (r < p) r = p;
+               s = r + 3; // resume the search for "@RG" past what was parsed
+       }
+       // store the pair collected from the last record
+       if (rgid->l && rglib->l) {
+               bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+               ++n;
+       }
+       free(rgid->s); free(rgid);
+       free(rglib->s); free(rglib);
+       if (n == 0) {
+               bam_strmap_destroy(h->rg2lib);
+               h->rg2lib = 0;
+       }
+       return n;
+}
+
+/* Rebuild the target list of h from the @SQ records found in h->text.
+ * The previous target_len and target_name arrays are released first. For
+ * each "@SQ" occurrence the text after "SN:" (up to a tab, CR, LF or NUL)
+ * becomes the target name and the number after "LN:" its length. The @RG
+ * records are parsed afterwards with sam_header_parse_rg().
+ * Returns the number of targets; 0 when the text holds no @SQ record or
+ * when a record lacks SN or LN, in which case the target list is empty. */
+int sam_header_parse(bam_header_t *h)
+{
+       int i;
+       char *s, *p, *q, *r;
+
+       // free
+       free(h->target_len); free(h->target_name);
+       h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+       if (h->l_text < 3) return 0;
+       // count number of @SQ
+       s = h->text;
+       while ((s = strstr(s, "@SQ")) != 0) {
+               ++h->n_targets;
+               s += 3;
+       }
+       if (h->n_targets == 0) return 0;
+       h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+       h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
+       // parse @SQ lines
+       i = 0;
+       s = h->text;
+       while ((s = strstr(s, "@SQ")) != 0) {
+               s += 3;
+               r = s; // r tracks the furthest position parsed in this record
+               if ((p = strstr(s, "SN:")) != 0) {
+                       q = p + 3;
+                       for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+                       h->target_name[i] = (char*)calloc(p - q + 1, 1);
+                       strncpy(h->target_name[i], q, p - q); // buffer is zero-filled and one byte longer, so it stays terminated
+               } else goto header_err_ret;
+               if (r < p) r = p;
+               if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
+               else goto header_err_ret;
+               if (r < p) r = p;
+               s = r + 3;
+               ++i;
+       }
+       sam_header_parse_rg(h);
+       return h->n_targets;
+
+header_err_ret:
+       fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n");
+       // release the names allocated so far; the array came from calloc(),
+       // so slots that were never filled are NULL and free() ignores them
+       for (i = 0; i < h->n_targets; ++i) free(h->target_name[i]);
+       free(h->target_len); free(h->target_name);
+       h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+       return 0;
+}
+
+/* Read the leading '@' lines of a text alignment file into a new header.
+ * Every line whose first field starts with '@' is appended to the header
+ * text; the loop ends at the first field that does not (or at end of
+ * input). The header is then parsed with sam_header_parse() and its name
+ * hash is built. fp->is_first is set so that sam_read1() reuses the field
+ * left in fp->str by the last ks_getuntil() call instead of reading a new
+ * one. */
+bam_header_t *sam_header_read(tamFile fp)
+{
+       int ret, dret;
+       bam_header_t *header = bam_header_init();
+       kstring_t *str = fp->str;
+       while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
+               str->s[str->l] = dret; // note that str->s is NOT null terminated!!
+               append_text(header, str);
+               if (dret != '\n') { // the field ended on a tab: copy the rest of the line as well
+                       ret = ks_getuntil(fp->ks, '\n', str, &dret);
+                       str->s[str->l] = '\n'; // NOT null terminated!!
+                       append_text(header, str);
+               }
+               ++fp->n_lines;
+       }
+       sam_header_parse(header);
+       bam_init_header_hash(header);
+       fp->is_first = 1;
+       return header;
+}
+
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+       int ret, doff, doff0, dret, z = 0;
+       bam1_core_t *c = &b->core;
+       kstring_t *str = fp->str;
+       kstream_t *ks = fp->ks;
+
+       if (fp->is_first) {
+               fp->is_first = 0;
+               ret = str->l;
+       } else {
+               do { // special consideration for empty lines
+                       ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+                       if (ret >= 0) z += str->l + 1;
+               } while (ret == 0);
+       }
+       if (ret < 0) return -1;
+       ++fp->n_lines;
+       doff = 0;
+
+       { // name
+               c->l_qname = strlen(str->s) + 1;
+               memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
+               doff += c->l_qname;
+       }
+       { // flag, tid, pos, qual
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s);
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+               if (c->tid < 0 && strcmp(str->s, "*")) {
+                       if (header->n_targets == 0) {
+                               fprintf(stderr, "[sam_read1] missing header? Abort!\n");
+                               exit(1);
+                       } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
+               }
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
+               if (ret < 0) return -2;
+       }
+       { // cigar
+               char *s, *t;
+               int i, op;
+               long x;
+               c->n_cigar = 0;
+               if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
+               z += str->l + 1;
+               if (str->s[0] != '*') {
+                       for (s = str->s; *s; ++s) {
+                               if (isalpha(*s)) ++c->n_cigar;
+                               else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
+                       }
+                       b->data = alloc_data(b, doff + c->n_cigar * 4);
+                       for (i = 0, s = str->s; i != c->n_cigar; ++i) {
+                               x = strtol(s, &t, 10);
+                               op = toupper(*t);
+                               if (op == 'M') op = BAM_CMATCH;
+                               else if (op == 'I') op = BAM_CINS;
+                               else if (op == 'D') op = BAM_CDEL;
+                               else if (op == 'N') op = BAM_CREF_SKIP;
+                               else if (op == 'S') op = BAM_CSOFT_CLIP;
+                               else if (op == 'H') op = BAM_CHARD_CLIP;
+                               else if (op == 'P') op = BAM_CPAD;
+                               else parse_error(fp->n_lines, "invalid CIGAR operation");
+                               s = t + 1;
+                               bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
+                       }
+                       if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
+                       c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b)));
+                       doff += c->n_cigar * 4;
+               } else c->bin = bam_reg2bin(c->pos, c->pos + 1);
+       }
+       { // mtid, mpos, isize
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+               c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+               c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+               ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+               c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
+               if (ret < 0) return -4;
+       }
+       { // seq and qual
+               int i;
+               uint8_t *p;
+               if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
+               z += str->l + 1;
+               c->l_qseq = strlen(str->s);
+               if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
+                       parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+               p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+               bzero(p, (c->l_qseq+1)/2);
+               for (i = 0; i < c->l_qseq; ++i)
+                       p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);
+               if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
+               z += str->l + 1;
+               if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
+                       parse_error(fp->n_lines, "sequence and quality are inconsistent");
+               p += (c->l_qseq+1)/2;
+               if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
+               else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
+               doff += c->l_qseq + (c->l_qseq+1)/2;
+       }
+       doff0 = doff;
+       if (dret != '\n' && dret != '\r') { // aux
+               while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
+                       uint8_t *s, type, key[2];
+                       z += str->l + 1;
+                       if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
+                               parse_error(fp->n_lines, "missing colon in auxiliary data");
+                       key[0] = str->s[0]; key[1] = str->s[1];
+                       type = str->s[3];
+                       s = alloc_data(b, doff + 3) + doff;
+                       s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
+                       if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
+                               s = alloc_data(b, doff + 2) + doff;
+                               *s++ = 'A'; *s = str->s[5];
+                               doff += 2;
+                       } else if (type == 'I' || type == 'i') {
+                               long long x;
+                               s = alloc_data(b, doff + 5) + doff;
+                               x = (long long)atoll(str->s + 5);
+                               if (x < 0) {
+                                       if (x >= -127) {
+                                               *s++ = 'c'; *(int8_t*)s = (int8_t)x;
+                                               s += 1; doff += 2;
+                                       } else if (x >= -32767) {
+                                               *s++ = 's'; *(int16_t*)s = (int16_t)x;
+                                               s += 2; doff += 3;
+                                       } else {
+                                               *s++ = 'i'; *(int32_t*)s = (int32_t)x;
+                                               s += 4; doff += 5;
+                                               if (x < -2147483648ll)
+                                                       fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
+                                                                       (long long)fp->n_lines, x);
+                                       }
+                               } else {
+                                       if (x <= 255) {
+                                               *s++ = 'C'; *s++ = (uint8_t)x;
+                                               doff += 2;
+                                       } else if (x <= 65535) {
+                                               *s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+                                               s += 2; doff += 3;
+                                       } else {
+                                               *s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+                                               s += 4; doff += 5;
+                                               if (x > 4294967295ll)
+                                                       fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
+                                                                       (long long)fp->n_lines, x);
+                                       }
+                               }
+                       } else if (type == 'f') {
+                               s = alloc_data(b, doff + 5) + doff;
+                               *s++ = 'f';
+                               *(float*)s = (float)atof(str->s + 5);
+                               s += 4; doff += 5;
+                       } else if (type == 'd') {
+                               s = alloc_data(b, doff + 9) + doff;
+                               *s++ = 'd';
+                               *(float*)s = (float)atof(str->s + 9);
+                               s += 8; doff += 9;
+                       } else if (type == 'Z' || type == 'H') {
+                               int size = 1 + (str->l - 5) + 1;
+                               if (type == 'H') { // check whether the hex string is valid
+                                       int i;
+                                       if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+                                       for (i = 0; i < str->l - 5; ++i) {
+                                               int c = toupper(str->s[5 + i]);
+                                               if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
+                                                       parse_error(fp->n_lines, "invalid hex character");
+                                       }
+                               }
+                               s = alloc_data(b, doff + size) + doff;
+                               *s++ = type;
+                               memcpy(s, str->s + 5, str->l - 5);
+                               s[str->l - 5] = 0;
+                               doff += size;
+                       } else parse_error(fp->n_lines, "unrecognized type");
+                       if (dret == '\n' || dret == '\r') break;
+               }
+       }
+       b->l_aux = doff - doff0;
+       b->data_len = doff;
+       return z;
+}
+
+/*
+ * Open a SAM text stream for reading.
+ *
+ * fn: path handed to zlib's gzopen(), or "-" to read from stdin.
+ * Returns a newly allocated handle that must be released with sam_close(),
+ * or 0 when the stream cannot be opened or memory is exhausted.
+ */
+tamFile sam_open(const char *fn)
+{
+       tamFile fp;
+       gzFile gzfp;
+       // open the stream first so that a failure never yields a half-built handle
+       gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+       if (gzfp == 0) return 0;
+       fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+       if (fp == 0) {
+               gzclose(gzfp);
+               return 0;
+       }
+       fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       if (fp->str == 0) {
+               gzclose(gzfp);
+               free(fp);
+               return 0;
+       }
+       fp->fp = gzfp;
+       fp->ks = ks_init(fp->fp);
+       return fp;
+}
+
+/*
+ * Release a handle created by sam_open(): the line reader, the zlib
+ * stream, the scratch string and the handle itself. A null handle is
+ * ignored.
+ */
+void sam_close(tamFile fp)
+{
+       if (fp == 0) return;
+       ks_destroy(fp->ks);
+       gzclose(fp->fp);
+       free(fp->str->s);
+       free(fp->str);
+       free(fp);
+}
diff --git a/bam_index.c b/bam_index.c
new file mode 100644 (file)
index 0000000..72ef270
--- /dev/null
@@ -0,0 +1,551 @@
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+#include "knetfile.h"
+
+/*!
+  @header
+
+  Alignment indexing. Before indexing, BAM must be sorted based on the
+  leftmost coordinate of alignments. In indexing, BAM uses two indices:
+  a UCSC binning index and a simple linear index. The binning index is
+  efficient for alignments spanning long distance, while the auxiliary
+  linear index helps to reduce unnecessary seek calls especially for
+  short alignments.
+
+  The UCSC binning scheme was suggested by Richard Durbin and Lincoln
+  Stein and is explained by Kent et al. (2002). In this scheme, each bin
+  represents a contiguous genomic region which can be fully contained in
+  another bin; each alignment is associated with a bin which represents
+  the smallest region containing the entire alignment. The binning
+  scheme is essentially another representation of R-tree. A distinct bin
+  uniquely corresponds to a distinct internal node in a R-tree. Bin A is
+  a child of Bin B if region A is contained in B.
+
+  In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
+  0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
+  585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
+  find the alignments overlapping a region [rbeg,rend), we need to
+  calculate the list of bins that may overlap the region and test
+  the alignments in the bins to confirm the overlaps. If the specified
+  region is short, typically only a few alignments in six bins need to
+  be retrieved. The overlapping alignments can be quickly fetched.
+
+ */
+
+// Byte distance below which two chunks are merged into one; in this file it
+// is only referenced when BAM_TRUE_OFFSET is defined.
+#define BAM_MIN_CHUNK_GAP 32768
+// log2 of the linear-index window size; 1<<14 is the size of minimum bin.
+#define BAM_LIDX_SHIFT    14
+
+// One chunk of the BAM file: u is the file offset where it begins and v the
+// offset where it ends (see insert_offset()).
+typedef struct {
+       uint64_t u, v;
+} pair64_t;
+
+// Order chunks by their begin offset; instantiates the "off" sort routines.
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(off, pair64_t, pair64_lt)
+
+// Growable array of chunks belonging to one bin: n entries used out of m
+// allocated.
+typedef struct {
+       uint32_t m, n;
+       pair64_t *list;
+} bam_binlist_t;
+
+// Linear index of one reference: offset[w] is a file offset recorded for the
+// w-th window of 1<<BAM_LIDX_SHIFT bp (0 when nothing was recorded); n
+// entries are in use out of m allocated.
+typedef struct {
+       int32_t n, m;
+       uint64_t *offset;
+} bam_lidx_t;
+
+// Hash table type "i": bin number -> list of chunks stored in that bin.
+KHASH_MAP_INIT_INT(i, bam_binlist_t)
+
+// Complete index of a BAM file, with one entry per reference sequence.
+struct __bam_index_t {
+       int32_t n;            // number of reference sequences
+       khash_t(i) **index;   // binning index, one hash table per reference
+       bam_lidx_t *index2;   // linear index, one per reference
+};
+
+/*
+ * Append the chunk [beg,end) to the chunk list of `bin` in hash table h.
+ * The list is created on first use of the bin and doubles in capacity
+ * whenever it is full.
+ */
+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
+{
+       int absent;
+       khint_t itr;
+       bam_binlist_t *chunks;
+       pair64_t *slot;
+       itr = kh_put(i, h, bin, &absent);
+       chunks = &kh_value(h, itr);
+       if (absent) { // first chunk of this bin: start with room for one pair
+               chunks->n = 0; chunks->m = 1;
+               chunks->list = (pair64_t*)calloc(chunks->m, 16);
+       }
+       if (chunks->n == chunks->m) { // full: double the capacity
+               chunks->m <<= 1;
+               chunks->list = (pair64_t*)realloc(chunks->list, chunks->m * 16);
+       }
+       slot = &chunks->list[chunks->n++];
+       slot->u = beg;
+       slot->v = end;
+}
+
+/*
+ * Update the linear index of one reference with alignment b, which starts
+ * at file offset `offset`.
+ *
+ * For every window of 1<<BAM_LIDX_SHIFT bp after the first one that the
+ * alignment reaches, `offset` is stored unless an offset has already been
+ * recorded for that window. The array grows as needed and new slots are
+ * zeroed.
+ */
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
+{
+       int i, beg, end;
+       beg = b->core.pos >> BAM_LIDX_SHIFT;
+       end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+       if (index2->m < end + 1) {
+               int old_m = index2->m;
+               index2->m = end + 1;
+               kroundup32(index2->m);
+               index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+               memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
+       }
+       for (i = beg + 1; i <= end; ++i)
+               if (index2->offset[i] == 0) index2->offset[i] = offset;
+       // never shrink: a later alignment may end before an earlier, longer one,
+       // and the windows already recorded must stay part of the index
+       if (index2->n < end + 1) index2->n = end + 1;
+}
+
+/*
+ * Coalesce neighbouring chunks inside every bin of every reference, in
+ * place. The body is compiled only when BAM_TRUE_OFFSET or
+ * BAM_VIRTUAL_OFFSET16 is defined; otherwise this function does nothing.
+ */
+static void merge_chunks(bam_index_t *idx)
+{
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+       khash_t(i) *index;
+       int i, l, m;
+       khint_t k;
+       for (i = 0; i < idx->n; ++i) {
+               index = idx->index[i];
+               for (k = kh_begin(index); k != kh_end(index); ++k) {
+                       bam_binlist_t *p;
+                       if (!kh_exist(index, k)) continue;
+                       p = &kh_value(index, k);
+                       m = 0; // index of the last chunk kept so far
+                       for (l = 1; l < p->n; ++l) {
+#ifdef BAM_TRUE_OFFSET
+                               // true offsets: merge when the gap is below BAM_MIN_CHUNK_GAP bytes
+                               if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
+#else
+                               // otherwise: merge when both offsets agree above their low 16 bits
+                               if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
+#endif
+                               else p->list[++m] = p->list[l];
+                       } // ~for(l)
+                       p->n = m + 1;
+               } // ~for(k)
+       } // ~for(i)
+#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+}
+
+/*
+ * Build an in-memory index by scanning an open BAM file from its header to
+ * the end (or to the first record with a negative tid).
+ *
+ * The file must be sorted by position within each reference; the process
+ * exits with status 1 when it is not, or when the file offset does not
+ * advance between records. The returned index is released with
+ * bam_index_destroy().
+ */
+bam_index_t *bam_index_core(bamFile fp)
+{
+       bam1_t *b;
+       bam_header_t *h;
+       int i, ret;
+       bam_index_t *idx;
+       uint32_t last_bin, save_bin;
+       int32_t last_coor, last_tid, save_tid;
+       bam1_core_t *c;
+       uint64_t save_off, last_off;
+
+       idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+       b = (bam1_t*)calloc(1, sizeof(bam1_t));
+       h = bam_header_read(fp);
+       c = &b->core;
+
+       // one hash table and one linear index per reference sequence
+       idx->n = h->n_targets;
+       bam_header_destroy(h);
+       idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+       for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
+       idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+
+       // save_* describe the chunk being accumulated; last_* describe the previous record
+       save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
+       save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+       while ((ret = bam_read1(fp, b)) >= 0) {
+               if (last_tid != c->tid) { // change of chromosomes
+                       last_tid = c->tid;
+                       last_bin = 0xffffffffu;
+               } else if (last_coor > c->pos) {
+                       fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
+                                       bam1_qname(b), last_coor, c->pos, c->tid+1);
+                       exit(1);
+               }
+               // bins below 4681 are larger than the 16Kbp minimum, so only those records feed the linear index
+               if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+               if (c->bin != last_bin) { // then possibly write the binning index
+                       if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+                               insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
+                       save_off = last_off;
+                       save_bin = last_bin = c->bin;
+                       save_tid = c->tid;
+                       if (save_tid < 0) break; // record without a reference: stop indexing here
+               }
+               if (bam_tell(fp) <= last_off) {
+                       fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
+                                       (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
+                       exit(1);
+               }
+               last_off = bam_tell(fp);
+               last_coor = b->core.pos;
+       }
+       // flush the chunk that was still open when the loop ended
+       if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+       merge_chunks(idx);
+       if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
+       free(b->data); free(b);
+       return idx;
+}
+
+/*
+ * Free an index together with every per-reference hash table, chunk list
+ * and linear-index array it owns. A null index is ignored.
+ */
+void bam_index_destroy(bam_index_t *idx)
+{
+       int tid;
+       if (idx == 0) return;
+       for (tid = 0; tid < idx->n; ++tid) {
+               khash_t(i) *bins = idx->index[tid];
+               khint_t itr;
+               for (itr = kh_begin(bins); itr != kh_end(bins); ++itr) {
+                       if (!kh_exist(bins, itr)) continue;
+                       free(kh_value(bins, itr).list);
+               }
+               kh_destroy(i, bins);
+               free(idx->index2[tid].offset);
+       }
+       free(idx->index);
+       free(idx->index2);
+       free(idx);
+}
+
+/*
+ * Serialise an index to fp.
+ *
+ * Layout: the magic "BAI\1", the number of references, then for each
+ * reference the number of bins, each bin as (bin number, chunk count,
+ * chunk pairs), followed by the linear index as (count, offsets). When
+ * bam_is_be is set every value is byte-swapped for writing; arrays are
+ * swapped in place and swapped back afterwards so idx is left unchanged.
+ * The stream is flushed but not closed.
+ */
+void bam_index_save(const bam_index_t *idx, FILE *fp)
+{
+       int32_t i, size;
+       khint_t k;
+       fwrite("BAI\1", 1, 4, fp);
+       if (bam_is_be) {
+               uint32_t x = idx->n;
+               fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+       } else fwrite(&idx->n, 4, 1, fp);
+       for (i = 0; i < idx->n; ++i) {
+               khash_t(i) *index = idx->index[i];
+               bam_lidx_t *index2 = idx->index2 + i;
+               // write binning index
+               size = kh_size(index);
+               if (bam_is_be) { // big endian
+                       uint32_t x = size;
+                       fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+               } else fwrite(&size, 4, 1, fp);
+               for (k = kh_begin(index); k != kh_end(index); ++k) {
+                       if (kh_exist(index, k)) {
+                               bam_binlist_t *p = &kh_value(index, k);
+                               if (bam_is_be) { // big endian
+                                       uint32_t x;
+                                       x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+                                       x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+                                       // swap the chunk list in place, write it, then restore it
+                                       for (x = 0; (int)x < p->n; ++x) {
+                                               bam_swap_endian_8p(&p->list[x].u);
+                                               bam_swap_endian_8p(&p->list[x].v);
+                                       }
+                                       fwrite(p->list, 16, p->n, fp);
+                                       for (x = 0; (int)x < p->n; ++x) {
+                                               bam_swap_endian_8p(&p->list[x].u);
+                                               bam_swap_endian_8p(&p->list[x].v);
+                                       }
+                               } else {
+                                       fwrite(&kh_key(index, k), 4, 1, fp);
+                                       fwrite(&p->n, 4, 1, fp);
+                                       fwrite(p->list, 16, p->n, fp);
+                               }
+                       }
+               }
+               // write linear index (index2)
+               if (bam_is_be) {
+                       int x = index2->n;
+                       fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+               } else fwrite(&index2->n, 4, 1, fp);
+               if (bam_is_be) { // big endian
+                       int x;
+                       // same swap / write / restore sequence as for the chunk lists
+                       for (x = 0; (int)x < index2->n; ++x)
+                               bam_swap_endian_8p(&index2->offset[x]);
+                       fwrite(index2->offset, 8, index2->n, fp);
+                       for (x = 0; (int)x < index2->n; ++x)
+                               bam_swap_endian_8p(&index2->offset[x]);
+               } else fwrite(index2->offset, 8, index2->n, fp);
+       }
+       fflush(fp);
+}
+
+/*
+ * Parse an index in the layout written by bam_index_save() from an open
+ * stream.
+ *
+ * fp: stream positioned at the start of the index. The stream is never
+ *     closed here; the caller keeps ownership and closes it.
+ * Returns a new index (release with bam_index_destroy()), or 0 when fp is
+ * null or the magic number is missing or wrong.
+ */
+static bam_index_t *bam_index_load_core(FILE *fp)
+{
+       int i;
+       char magic[4];
+       bam_index_t *idx;
+       if (fp == 0) {
+               fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+               return 0;
+       }
+       // a short read would leave magic[] indeterminate, so check the count first;
+       // fp is left open on failure because the caller closes it
+       if (fread(magic, 1, 4, fp) != 4 || strncmp(magic, "BAI\1", 4)) {
+               fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+               return 0;
+       }
+       idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+       fread(&idx->n, 4, 1, fp);
+       if (bam_is_be) bam_swap_endian_4p(&idx->n);
+       idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+       idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+       for (i = 0; i < idx->n; ++i) {
+               khash_t(i) *index;
+               bam_lidx_t *index2 = idx->index2 + i;
+               uint32_t key, size;
+               khint_t k;
+               int j, ret;
+               bam_binlist_t *p;
+               index = idx->index[i] = kh_init(i);
+               // load binning index
+               fread(&size, 4, 1, fp);
+               if (bam_is_be) bam_swap_endian_4p(&size);
+               for (j = 0; j < (int)size; ++j) {
+                       fread(&key, 4, 1, fp);
+                       if (bam_is_be) bam_swap_endian_4p(&key);
+                       k = kh_put(i, index, key, &ret);
+                       p = &kh_value(index, k);
+                       fread(&p->n, 4, 1, fp);
+                       if (bam_is_be) bam_swap_endian_4p(&p->n);
+                       p->m = p->n;
+                       p->list = (pair64_t*)malloc(p->m * 16);
+                       fread(p->list, 16, p->n, fp);
+                       if (bam_is_be) {
+                               int x;
+                               for (x = 0; x < p->n; ++x) {
+                                       bam_swap_endian_8p(&p->list[x].u);
+                                       bam_swap_endian_8p(&p->list[x].v);
+                               }
+                       }
+               }
+               // load linear index
+               fread(&index2->n, 4, 1, fp);
+               if (bam_is_be) bam_swap_endian_4p(&index2->n);
+               index2->m = index2->n;
+               index2->offset = (uint64_t*)calloc(index2->m, 8);
+               fread(index2->offset, index2->n, 8, fp);
+               if (bam_is_be)
+                       for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+       }
+       return idx;
+}
+
+/*
+ * Load the index that accompanies a BAM file from the local file system.
+ *
+ * _fn: name of the BAM file. For an "ftp://" URL only the part after the
+ *      last '/' is used, i.e. the index is looked up in the working
+ *      directory.
+ * "<name>.bai" is tried first; if that cannot be opened and the name ends
+ * in "bam", the name with its last letter replaced by 'i' ("x.bam" ->
+ * "x.bai") is tried next.
+ * Returns the loaded index, or 0 when no index file could be read.
+ */
+bam_index_t *bam_index_load_local(const char *_fn)
+{
+       FILE *fp;
+       char *fnidx, *fn;
+       size_t len;
+
+       if (strstr(_fn, "ftp://") == _fn) {
+               const char *p;
+               int l = strlen(_fn);
+               for (p = _fn + l - 1; p >= _fn; --p)
+                       if (*p == '/') break;
+               fn = strdup(p + 1);
+       } else fn = strdup(_fn);
+       len = strlen(fn);
+       fnidx = (char*)calloc(len + 5, 1);
+       strcpy(fnidx, fn); strcat(fnidx, ".bai");
+       fp = fopen(fnidx, "rb"); // the index is binary data
+       if (fp == 0) { // try "{base}.bai"
+               // compare the real suffix: searching for the first "bam" fails for
+               // names such as "bam_sorted.bam" and misbehaves for very short names
+               if (len >= 3 && strcmp(fn + len - 3, "bam") == 0) {
+                       strcpy(fnidx, fn);
+                       fnidx[len-1] = 'i';
+                       fp = fopen(fnidx, "rb");
+               }
+       }
+       free(fnidx); free(fn);
+       if (fp) {
+               bam_index_t *idx = bam_index_load_core(fp);
+               fclose(fp);
+               return idx;
+       } else return 0;
+}
+
+/*
+ * Copy a remote "ftp://" file into the working directory, under the name
+ * that follows the last '/' of the URL. URLs with any other prefix are
+ * ignored. Failures are reported on stderr; nothing is returned.
+ */
+static void download_from_remote(const char *url)
+{
+       const int buf_size = 1 * 1024 * 1024;
+       const char *fn;
+       FILE *fp;
+       uint8_t *buf;
+       knetFile *fp_remote;
+       int l;
+       if (strstr(url, "ftp://") != url) return;
+       l = strlen(url);
+       for (fn = url + l - 1; fn >= url; --fn)
+               if (*fn == '/') break;
+       ++fn; // fn now points to the file name
+       fp_remote = knet_open(url, "r");
+       if (fp_remote == 0) {
+               fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+               return;
+       }
+       if ((fp = fopen(fn, "wb")) == 0) {
+               fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+               knet_close(fp_remote);
+               return;
+       }
+       buf = (uint8_t*)calloc(buf_size, 1);
+       if (buf != 0) {
+               // stop on end of stream and also on a negative count, which must
+               // never reach fwrite() as a (huge) unsigned length
+               while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+                       fwrite(buf, 1, l, fp);
+               free(buf);
+       }
+       fclose(fp);
+       knet_close(fp_remote);
+}
+
+/*
+ * Load the index of a BAM file.
+ *
+ * A local index is tried first. If none is found and fn is an "ftp://"
+ * URL, "<fn>.bai" is downloaded into the working directory and the local
+ * lookup is repeated.
+ * Returns the index, or 0 (after a message on stderr) when it cannot be
+ * loaded.
+ */
+bam_index_t *bam_index_load(const char *fn)
+{
+       bam_index_t *idx;
+       idx = bam_index_load_local(fn);
+       if (idx == 0 && strstr(fn, "ftp://") == fn) {
+               char *fnidx = calloc(strlen(fn) + 5, 1);
+               strcat(strcpy(fnidx, fn), ".bai");
+               fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
+               download_from_remote(fnidx);
+               free(fnidx); // the URL string is no longer needed once the download is done
+               idx = bam_index_load_local(fn);
+       }
+       if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
+       return idx;
+}
+
+/*
+ * Index the BAM file fn and write the index to a file.
+ *
+ * _fnidx: name of the index file, or 0 to use "<fn>.bai".
+ * Returns 0 on success and -1 when the BAM file cannot be opened or the
+ * index file cannot be created.
+ */
+int bam_index_build2(const char *fn, const char *_fnidx)
+{
+       char *fnidx;
+       FILE *fpidx;
+       bamFile fp;
+       bam_index_t *idx;
+       if ((fp = bam_open(fn, "r")) == 0) {
+               fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
+               return -1;
+       }
+       idx = bam_index_core(fp);
+       bam_close(fp);
+       if (_fnidx == 0) {
+               fnidx = (char*)calloc(strlen(fn) + 5, 1);
+               strcpy(fnidx, fn); strcat(fnidx, ".bai");
+       } else fnidx = strdup(_fnidx);
+       fpidx = fopen(fnidx, "wb"); // the index is binary data
+       if (fpidx == 0) {
+               fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
+               bam_index_destroy(idx); // do not leak the index that was just built
+               free(fnidx);
+               return -1;
+       }
+       bam_index_save(idx, fpidx);
+       bam_index_destroy(idx);
+       fclose(fpidx);
+       free(fnidx);
+       return 0;
+}
+
+/* Index fn and store the result under the default name "<fn>.bai". */
+int bam_index_build(const char *fn)
+{
+       int status = bam_index_build2(fn, 0);
+       return status;
+}
+
+/*
+ * Entry point of "samtools index".
+ *
+ * argv[1] is the BAM file and the optional argv[2] the index file name.
+ * Returns 0 on success and 1 on a usage error or when the index could not
+ * be built, so that failures are visible in the exit status.
+ */
+int bam_index(int argc, char *argv[])
+{
+       int ret;
+       if (argc < 2) {
+               fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
+               return 1;
+       }
+       if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
+       else ret = bam_index_build(argv[1]);
+       return ret < 0? 1 : 0;
+}
+
+#define MAX_BIN 37450 // =(8^6-1)/7+1
+
+/*
+ * List the bins that may contain alignments overlapping [beg,end).
+ *
+ * list: output array with room for MAX_BIN bin numbers.
+ * Returns the number of bins written. An `end` of 0 yields no bins, and
+ * `end` is clamped to 2^29, the span of bin 0; without these two guards
+ * the decrement below wraps around or the loops produce bin numbers
+ * beyond the scheme and write past the end of `list`.
+ */
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
+{
+       int i = 0, k;
+       if (end == 0) return 0;
+       if (end > 1u<<29) end = 1u<<29;
+       --end; // last position covered by the half-open region
+       list[i++] = 0;
+       for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
+       for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
+       for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
+       for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
+       for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+       return i;
+}
+
+/*
+ * Return non-zero when alignment b overlaps the half-open region
+ * [beg,end). An alignment without CIGAR operations is treated as covering
+ * the single position b->core.pos.
+ */
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+       uint32_t aln_beg = b->core.pos;
+       uint32_t aln_end;
+       if (b->core.n_cigar) aln_end = bam_calend(&b->core, bam1_cigar(b));
+       else aln_end = b->core.pos + 1;
+       return (aln_beg < end && aln_end > beg);
+}
+
+/*
+ * Call func(b, data) for every alignment on reference `tid` that overlaps
+ * the region [beg,end).
+ *
+ * The candidate chunks are gathered from all bins that may overlap the
+ * region, chunks ending at or before the linear-index offset of the window
+ * containing `beg` are dropped, and the remaining chunks are sorted and
+ * de-overlapped before the file is read.
+ * Always returns 0.
+ */
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+       uint16_t *bins;
+       int i, n_bins, n_off;
+       pair64_t *off;
+       khint_t k;
+       khash_t(i) *index;
+       uint64_t min_off;
+
+       bins = (uint16_t*)calloc(MAX_BIN, 2);
+       n_bins = reg2bins(beg, end, bins);
+       index = idx->index[tid];
+       min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+       // first pass: upper bound on the number of chunks
+       for (i = n_off = 0; i < n_bins; ++i) {
+               if ((k = kh_get(i, index, bins[i])) != kh_end(index))
+                       n_off += kh_value(index, k).n;
+       }
+       if (n_off == 0) {
+               free(bins); return 0;
+       }
+       off = (pair64_t*)calloc(n_off, 16);
+       // second pass: keep the chunks that end after min_off
+       for (i = n_off = 0; i < n_bins; ++i) {
+               if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
+                       int j;
+                       bam_binlist_t *p = &kh_value(index, k);
+                       for (j = 0; j < p->n; ++j)
+                               if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+               }
+       }
+       free(bins);
+       if (n_off == 0) {
+               // every chunk was filtered out; without this check the code below
+               // would treat the zeroed off[0] as a chunk and seek to offset 0
+               free(off); return 0;
+       }
+       {
+               bam1_t *b;
+               int l, ret, n_seeks;
+               uint64_t curr_off;
+               b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               ks_introsort(off, n_off, off);
+               // resolve completely contained adjacent blocks
+               for (i = 1, l = 0; i < n_off; ++i)
+                       if (off[l].v < off[i].v)
+                               off[++l] = off[i];
+               n_off = l + 1;
+               // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+               for (i = 1; i < n_off; ++i)
+                       if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+               { // merge adjacent blocks
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+                       for (i = 1, l = 0; i < n_off; ++i) {
+#ifdef BAM_TRUE_OFFSET
+                               if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
+#else
+                               if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+#endif
+                               else off[++l] = off[i];
+                       }
+                       n_off = l + 1;
+#endif
+               }
+               // retrieve alignments
+               n_seeks = 0; i = -1; curr_off = 0;
+               for (;;) {
+                       if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
+                               if (i == n_off - 1) break; // no more chunks
+                               if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+                               if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+                                       bam_seek(fp, off[i+1].u, SEEK_SET);
+                                       curr_off = bam_tell(fp);
+                                       ++n_seeks;
+                               }
+                               ++i;
+                       }
+                       if ((ret = bam_read1(fp, b)) > 0) {
+                               curr_off = bam_tell(fp);
+                               if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
+                               else if (is_overlap(beg, end, b)) func(b, data);
+                       } else break; // end of file
+               }
+//             fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
+               bam_destroy1(b);
+       }
+       free(off);
+       return 0;
+}
diff --git a/bam_lpileup.c b/bam_lpileup.c
new file mode 100644 (file)
index 0000000..425290e
--- /dev/null
@@ -0,0 +1,214 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "bam.h"
+#include "ksort.h"
+
+#define TV_GAP 2 /* countdown value mp_free() stores in the cnt field of a recycled node */
+
+typedef struct __freenode_t { // one released level, kept in a singly linked list
+       uint32_t level:28, cnt:4; // level: the released level; cnt: countdown, the slot is handed out only when it is 0
+       struct __freenode_t *next; // following node; the last node of the list (next == 0) is an empty sentinel
+} freenode_t, *freenode_p;
+
+#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) /* order by cnt, ties broken by level */
+KSORT_INIT(node, freenode_p, freenode_lt) // provides ks_introsort(node, ...) used in tview_func()
+
+/* Memory pool, similar to the one in bam_pileup.c */
+typedef struct { // recycler for freenode_t objects
+       int cnt, n, max; // cnt: nodes handed out and not yet returned; n: nodes parked in buf; max: capacity of buf
+       freenode_t **buf; // stack of returned nodes waiting to be reused
+} mempool_t;
+
+static mempool_t *mp_init() // returns a zero-filled pool: nothing handed out, no recycle stack yet
+{
+       return (mempool_t*)calloc(1, sizeof(mempool_t)); // the result is not checked for allocation failure
+}
+static void mp_destroy(mempool_t *mp) // frees the parked nodes, the stack array, then the pool itself
+{
+       int i = mp->n;
+       while (i-- > 0) free(mp->buf[i]); // nodes still handed out (counted by cnt) are not touched
+       free(mp->buf); free(mp);
+}
+static inline freenode_t *mp_alloc(mempool_t *mp) // hands out one node, reusing a parked one when available
+{
+       mp->cnt += 1; // one more node outstanding
+       if (mp->n != 0) return mp->buf[--mp->n]; // pop the most recently parked node
+       return (freenode_t*)calloc(1, sizeof(freenode_t)); // stack empty: make a fresh zeroed node
+}
+static inline void mp_free(mempool_t *mp, freenode_t *p) // parks node p on the recycle stack
+{
+       p->next = 0; p->cnt = TV_GAP; --mp->cnt; // unlink it, restart its countdown, one node fewer outstanding
+       if (mp->n == mp->max) { // stack full: first grow to 256 slots, afterwards double
+               mp->max = mp->max == 0? 256 : mp->max<<1;
+               mp->buf = (freenode_t**)realloc(mp->buf, mp->max * sizeof(freenode_t*));
+       }
+       mp->buf[mp->n++] = p;
+}
+
+/* core part */
+struct __bam_lplbuf_t { // state of the level-assigning pileup buffer
+       int max, n_cur, n_pre; // max: capacity of cur_level/pre_level; n_cur: entries in cur_level; n_pre: entries carried over in pre_level
+       int max_level, *cur_level, *pre_level; // max_level: highest level seen in the last column; cur_level/pre_level: per-entry levels of this/the previous column
+       mempool_t *mp; // recycler for freenode_t nodes
+       freenode_t **aux, *head, *tail; // aux: scratch array used for sorting; head/tail: list of released levels, tail is the empty sentinel
+       int n_nodes, m_aux; // n_nodes: nodes in the list, sentinel excluded; m_aux: capacity of aux
+       bam_pileup_f func; // user callback invoked by tview_func()
+       void *user_data; // opaque pointer handed to func
+       bam_plbuf_t *plbuf; // underlying pileup buffer, created with tview_func as its callback
+};
+
+void bam_lplbuf_reset(bam_lplbuf_t *buf) // resets the wrapped pileup buffer and forgets all level bookkeeping
+{
+       freenode_t *node, *nxt;
+       bam_plbuf_reset(buf->plbuf);
+       for (node = buf->head; node->next; node = nxt) { // recycle every node except the sentinel
+               nxt = node->next; // mp_free() clears node->next, so remember it first
+               mp_free(buf->mp, node);
+       }
+       buf->head = buf->tail; // the list is just the sentinel again
+       buf->max_level = 0;
+       buf->n_cur = 0;
+       buf->n_pre = 0;
+       buf->n_nodes = 0;
+}
+
+static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) // pileup callback: assigns a level to each of the n entries of pl[], then forwards the column to tv->func
+{ // data is the owning bam_lplbuf_t; tid and pos are passed through unchanged; always returns 0
+       bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
+       freenode_t *p;
+       int i, l, max_level;
+       // allocate memory if necessary
+       if (tv->max < n) { // enlarge both level arrays to hold at least n entries
+               tv->max = n;
+               kroundup32(tv->max);
+               tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
+               tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
+       }
+       tv->n_cur = n;
+       // update cnt
+       for (p = tv->head; p->next; p = p->next) // every released level except the trailing sentinel
+               if (p->cnt > 0) --p->cnt;
+       // calculate cur_level[]
+       max_level = 0;
+       for (i = l = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i; // shadows the freenode_t *p declared above
+               if (p->is_head) { // no level was carried over for this entry
+                       if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
+                               freenode_t *p = tv->head->next;
+                               tv->cur_level[i] = tv->head->level;
+                               mp_free(tv->mp, tv->head);
+                               tv->head = p;
+                               --tv->n_nodes;
+                       } else tv->cur_level[i] = ++tv->max_level; // no slot is ready: open a new level
+               } else {
+                       tv->cur_level[i] = tv->pre_level[l++]; // keep the level it had in the previous column
+                       if (p->is_tail) { // then return a free slot
+                               tv->tail->level = tv->cur_level[i]; // the old sentinel becomes the new list entry ...
+                               tv->tail->next = mp_alloc(tv->mp); // ... and a new sentinel is appended behind it
+                               tv->tail = tv->tail->next;
+                               ++tv->n_nodes;
+                       }
+               }
+               if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
+               ((bam_pileup1_t*)p)->level = tv->cur_level[i]; // const is cast away to publish the level to the user callback
+       }
+       assert(l == tv->n_pre); // every carried-over level must have been consumed
+       tv->func(tid, pos, n, pl, tv->user_data); // the callback's return value is ignored
+       // sort the linked list
+       if (tv->n_nodes) {
+               freenode_t *q;
+               if (tv->n_nodes + 1 > tv->m_aux) { // enlarge; the extra element holds the sentinel
+                       tv->m_aux = tv->n_nodes + 1;
+                       kroundup32(tv->m_aux);
+                       tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
+               }
+               for (p = tv->head, i = l = 0; p->next;) {
+                       if (p->level > max_level) { // then discard this entry
+                               q = p->next;
+                               mp_free(tv->mp, p);
+                               p = q;
+                       } else {
+                               tv->aux[i++] = p;
+                               p = p->next;
+                       }
+               }
+               tv->aux[i] = tv->tail; // add a proper tail for the loop below
+               tv->n_nodes = i;
+               if (tv->n_nodes) {
+                       ks_introsort(node, tv->n_nodes, tv->aux); // order defined by freenode_lt: cnt first, then level
+                       for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; // relink in sorted order, ending at the sentinel
+                       tv->head = tv->aux[0];
+               } else tv->head = tv->tail; // everything was discarded
+       }
+       // clean up
+       tv->max_level = max_level;
+       memcpy(tv->pre_level, tv->cur_level, tv->n_cur * sizeof(int)); // element size matches the realloc() calls above
+       // squeeze out terminated levels
+       for (i = l = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               if (!p->is_tail)
+                       tv->pre_level[l++] = tv->pre_level[i];
+       }
+       tv->n_pre = l; // number of levels the next column must consume
+/*
+       fprintf(stderr, "%d\t", pos+1);
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               if (p->is_head) fprintf(stderr, "^");
+               if (p->is_tail) fprintf(stderr, "$");
+               fprintf(stderr, "%d,", p->level);
+       }
+       fprintf(stderr, "\n");
+*/
+       return 0;
+}
+
+bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) // creates a buffer that calls func(..., data) for every column
+{
+       bam_lplbuf_t *lb;
+       lb = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
+       lb->func = func;
+       lb->user_data = data;
+       lb->mp = mp_init();
+       lb->head = lb->tail = mp_alloc(lb->mp); // the list starts as a lone sentinel node
+       lb->plbuf = bam_plbuf_init(tview_func, lb); // tview_func receives lb as its data argument
+       return lb;
+}
+
+void bam_lplbuf_destroy(bam_lplbuf_t *tv) // releases the buffer and everything it owns
+{
+       freenode_t *node, *nxt;
+       free(tv->cur_level); free(tv->pre_level);
+       bam_plbuf_destroy(tv->plbuf);
+       free(tv->aux);
+       for (node = tv->head; node->next; node = nxt) { // hand every listed node back to the pool ...
+               nxt = node->next;
+               mp_free(tv->mp, node);
+       }
+       mp_free(tv->mp, node); // ... and finally the sentinel
+       assert(tv->mp->cnt == 0); // nothing may remain outstanding
+       mp_destroy(tv->mp);
+       free(tv);
+}
+
+int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) // forwards b to the wrapped pileup buffer
+{
+       return bam_plbuf_push(b, tv->plbuf); // returns whatever bam_plbuf_push() returns
+}
+
+int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) // pushes every record read from fp through a level-assigning pileup
+{
+       bam1_t *rec = (bam1_t*)calloc(1, sizeof(bam1_t)); // zeroed record, reused for every read
+       bam_lplbuf_t *lbuf = bam_lplbuf_init(func, func_data);
+       bam_plbuf_set_mask(lbuf->plbuf, mask);
+       for (;;) { // feed records until bam_read1() reports a negative value
+               if (bam_read1(fp, rec) < 0) break;
+               bam_lplbuf_push(rec, lbuf);
+       }
+       bam_lplbuf_push(0, lbuf); // a null record is pushed once the input is exhausted
+       bam_lplbuf_destroy(lbuf);
+       free(rec->data);
+       free(rec);
+       return 0; // always 0
+}
diff --git a/bam_maqcns.c b/bam_maqcns.c
new file mode 100644 (file)
index 0000000..464288a
--- /dev/null
@@ -0,0 +1,526 @@
+#include <math.h>
+#include "bam.h"
+#include "bam_maqcns.h"
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint32_t) // provides ks_introsort(uint32_t, ...) used in this file
+
+#define MAX_WINDOW 33 /* bam_maqindel() keeps its left/right boundary within this distance of pos */
+
+typedef struct __bmc_aux_t { // scratch buffer reused by bam_maqcns_glfgen()
+       int max; // capacity of info, in elements
+       uint32_t *info; // one packed word per usable pileup entry
+} bmc_aux_t;
+
+typedef struct { // per-column accumulators, indexed by base 0..3
+       float esum[4], fsum[4]; // esum: sum of fk-weighted qualities; fsum: sum of the fk weights
+       uint32_t c[4]; // number of entries counted for each base
+       uint32_t rms_mapQ; // root mean square of the capped mapping qualities
+} glf_call_aux_t;
+
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; // 4-bit code -> 0..3 for codes 1, 2, 4, 8; every other code maps to 4
+
+/*
+  P(<b1,b2>) = \theta \sum_{i=1}^{N-1} 1/i
+  P(D|<b1,b2>) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^{n_2}(1-k/N)^{n_1} + (k/N)^{n_1}(1-k/N)^{n_2}]
+  p_k = 1/k / \sum_{i=1}^{N-1} 1/i
+ */
+static void cal_het(bam_maqcns_t *aa) // rebuilds aa->lhet (256x256 table indexed by n1<<8|n2) and sets aa->q_r
+{
+       int k, n1, n2;
+       double sum_harmo; // harmonic sum \sum_{k=1}^{n_hap-1} 1/k
+       double poly_rate;
+       double p1 = 0.0, p3 = 0.0; // just for testing; assigned below but never read
+
+       free(aa->lhet); // drop the table of a previous call, if any
+       aa->lhet = (double*)calloc(256 * 256, sizeof(double));
+       sum_harmo = 0.0;
+       for (k = 1; k <= aa->n_hap - 1; ++k)
+               sum_harmo += 1.0 / k;
+       for (n1 = 0; n1 < 256; ++n1) {
+               for (n2 = 0; n2 < 256; ++n2) {
+                       long double sum = 0.0;
+                       double lC = lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1}, as a natural log
+                       for (k = 1; k <= aa->n_hap - 1; ++k) {
+                               double pk = 1.0 / k / sum_harmo; // p_k
+                               double log1 = log((double)k/aa->n_hap); // ln(k/N)
+                               double log2 = log(1.0 - (double)k/aa->n_hap); // ln(1 - k/N)
+                               sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2));
+                       }
+                       aa->lhet[n1<<8|n2] = lC + logl(sum); // natural-log value stored for the count pair (n1, n2)
+                       if (n1 == 17 && n2 == 3) p3 = lC + logl(expl(logl(0.5) * 20));
+                       if (n1 == 19 && n2 == 1) p1 = lC + logl(expl(logl(0.5) * 20));
+               }
+       }
+       poly_rate = aa->het_rate * sum_harmo;
+       aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate)); // 4.343 ~ 10/ln(10), i.e. -10*log10(...)
+}
+
+/** initialize the helper structure */
+static void cal_coef(bam_maqcns_t *aa) // rebuilds aa->fk (256 entries) and aa->coef (indexed by q<<16|n<<8|k)
+{
+       int k, n, q;
+       long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256];
+       double *lC; // ln of binomial coefficients, indexed by n<<8|k; freed before returning
+
+       lC = (double*)calloc(256 * 256, sizeof(double));
+       // aa->lhet is allocated and initialized by cal_het()
+       free(aa->fk); free(aa->coef); // drop the tables of a previous call, if any
+       aa->fk = (double*)calloc(256, sizeof(double));
+       aa->coef = (double*)calloc(256*256*64, sizeof(double));
+       aa->fk[0] = fk2[0] = 1.0;
+       for (n = 1; n != 256; ++n) {
+               aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta; // decays from 1 towards eta as n grows
+               fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands
+       }
+       for (n = 1; n != 256; ++n)
+               for (k = 1; k <= n; ++k) // entries with k == 0 keep the 0.0 written by calloc()
+                       lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+       for (q = 1; q != 64; ++q) { // rows for q == 0 are left at zero
+               double e = pow(10.0, -q/10.0); // error probability for quality q
+               double le = log(e);
+               double le1 = log(1.0-e);
+               for (n = 1; n != 256; ++n) {
+                       double *coef = aa->coef + (q<<16|n<<8); // row of this (q, n) pair
+                       sum_a[n+1] = 0.0;
+                       for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_i \epsilon^i (1-\epsilon)^{n-i}
+                               sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1);
+                               b[k] = sum_a[k+1] / sum_a[k];
+                               if (b[k] > 0.99) b[k] = 0.99; // cap the ratio
+                       }
+                       for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k})
+                               q_c[k] = -4.343 * fk2[k] * logl(b[k] / e);
+                       for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i
+                       for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9
+                               tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k]))); // b[k]^fk2[k] written with expl/logl instead of powl
+                               coef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk}
+                       }
+               }
+       }
+       free(lC);
+}
+
+bam_maqcns_t *bam_maqcns_init() // allocates a caller object holding the default model parameters
+{
+       bam_maqcns_t *cns;
+       cns = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t));
+       cns->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t)); // empty scratch buffer, grown on demand
+       cns->n_hap = 2;
+       cns->cap_mapQ = 60;
+       cns->het_rate = 0.001;
+       cns->theta = 0.85;
+       cns->eta = 0.03;
+       return cns; // lhet, fk and coef stay null until bam_maqcns_prepare() is called
+}
+
+void bam_maqcns_prepare(bam_maqcns_t *bm) // (re)builds the lookup tables from the current parameters of bm
+{
+       cal_coef(bm); cal_het(bm); // cal_coef fills fk and coef; cal_het fills lhet and q_r
+}
+
+void bam_maqcns_destroy(bam_maqcns_t *bm) // releases bm together with its tables and scratch buffer
+{
+       if (!bm) return; // a null handle is accepted
+       free(bm->aux->info); free(bm->aux);
+       free(bm->coef); free(bm->fk); free(bm->lhet); free(bm);
+}
+
+glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm) // builds the genotype-likelihood record for one column of _n pileup entries; the result is heap-allocated
+{ // reads bm->fk, bm->coef and bm->lhet, i.e. the tables built by bam_maqcns_prepare()
+       glf_call_aux_t *b;
+       int i, j, k, w[8], c, n;
+       glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t));
+       float p[16], min_p = 1e30; // p[a<<2|b]: score of genotype (a, b); the minimum is subtracted at the end
+       uint64_t rms;
+
+       g->ref_base = ref_base;
+       if (_n == 0) return g; // empty column: everything except ref_base stays zero
+
+       // construct aux array
+       if (bm->aux->max < _n) { // grow the scratch buffer
+               bm->aux->max = _n;
+               kroundup32(bm->aux->max);
+               bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max);
+       }
+       for (i = n = 0; i < _n; ++i) { // n counts the entries that survive the filter
+               const bam_pileup1_t *p = pl + i; // shadows the float p[16] above
+               uint32_t q, x = 0, qq;
+               if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip deletions and unmapped reads
+               q = (uint32_t)bam1_qual(p->b)[p->qpos];
+               x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; // bit 18: strand, bits 8-15: base quality, bits 0-7: mapping quality
+               if (p->b->core.qual < q) q = p->b->core.qual;
+               x |= q << 24; // bits 24-31: min(base quality, mapping quality), the primary sort key
+               qq = bam1_seqi(bam1_seq(p->b), p->qpos);
+               q = bam_nt16_nt4_table[qq? qq : ref_base]; // a zero read code falls back to ref_base
+               if (!p->is_del && q < 4) x |= 1 << 21 | q << 16; // bit 21: base is one of 0..3, bits 16-17: that base
+               bm->aux->info[n++] = x;
+       }
+       ks_introsort(uint32_t, n, bm->aux->info);
+       // generate esum and fsum
+       b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t));
+       for (k = 0; k != 8; ++k) w[k] = 0; // w[strand<<2|base]: entries already seen for that strand/base
+       rms = 0;
+       for (j = n - 1; j >= 0; --j) { // calculate esum and fsum, walking from the largest packed word down
+               uint32_t info = bm->aux->info[j];
+               int tmp;
+               if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); // lift a quality below 4 to 4 when the low six base-quality bits are non-zero
+               k = info>>16&7;
+               if (info>>24 > 0) { // zero-quality entries contribute only to rms
+                       b->esum[k&3] += bm->fk[w[k]] * (info>>24);
+                       b->fsum[k&3] += bm->fk[w[k]];
+                       if (w[k] < 0xff) ++w[k]; // fk has 256 entries
+                       ++b->c[k&3];
+               }
+               tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ;
+               rms += tmp * tmp;
+       }
+       b->rms_mapQ = n > 0? (uint8_t)(sqrt((double)rms / n) + .499) : 0; // n == 0 when every entry was filtered; avoid 0/0
+       // rescale ->c[]
+       for (j = c = 0; j != 4; ++j) c += b->c[j];
+       if (c > 255) { // NOTE(review): four equal counts each round to 64, so c can still be 256 here - confirm the table indices below tolerate that
+               for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5);
+               for (j = c = 0; j != 4; ++j) c += b->c[j];
+       }
+       // generate likelihood
+       for (j = 0; j != 4; ++j) {
+               // homozygous
+               float tmp1, tmp3;
+               int tmp2, bar_e;
+               for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) { // totals over the bases other than j
+                       if (j == k) continue;
+                       tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k];
+               }
+               if (tmp2) {
+                       bar_e = (int)(tmp1 / tmp3 + 0.5); // weighted mean quality of the non-j bases
+                       if (bar_e < 4) bar_e = 4; // should not happen
+                       if (bar_e > 63) bar_e = 63; // coef has 64 quality rows
+                       p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+               } else p[j<<2|j] = 0.0; // all the bases are j
+               // heterozygous
+               for (k = j + 1; k < 4; ++k) {
+                       for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) { // totals over the bases that are neither j nor k
+                               if (i == j || i == k) continue;
+                               tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i];
+                       }
+                       if (tmp2) {
+                               bar_e = (int)(tmp1 / tmp3 + 0.5);
+                               if (bar_e < 4) bar_e = 4;
+                               if (bar_e > 63) bar_e = 63;
+                               p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+                       } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k
+               }
+               // clamp negative scores of row j to zero
+               for (k = 0; k != 4; ++k)
+                       if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0;
+       }
+
+       { // fix p[k<<2|k]
+               float max1, max2, min1, min2;
+               int max_k, min_k;
+               max_k = min_k = -1;
+               max1 = max2 = -1.0; min1 = min2 = 1e30;
+               for (k = 0; k < 4; ++k) { // largest and second largest esum
+                       if (b->esum[k] > max1) {
+                               max2 = max1; max1 = b->esum[k]; max_k = k;
+                       } else if (b->esum[k] > max2) max2 = b->esum[k];
+               }
+               for (k = 0; k < 4; ++k) { // smallest and second smallest homozygous score
+                       if (p[k<<2|k] < min1) {
+                               min2 = min1; min1 = p[k<<2|k]; min_k = k;
+                       } else if (p[k<<2|k] < min2) min2 = p[k<<2|k];
+               }
+               if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) // make the base with the largest esum the best homozygote by at least 1
+                       p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0;
+       }
+
+       // convert necessary information to glf1_t
+       g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ;
+       g->depth = n > 16777215? 16777215 : n; // capped at 2^24-1
+       for (j = 0; j != 4; ++j)
+               for (k = j; k < 4; ++k)
+                       if (p[j<<2|k] < min_p) min_p = p[j<<2|k];
+       g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5);
+       for (j = c = 0; j != 4; ++j) // ten entries, upper triangle in row order
+               for (k = j; k < 4; ++k)
+                       g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5);
+
+       free(b);
+       return g;
+}
+
+uint32_t glf2cns(const glf1_t *g, int q_r) // packs the best two genotypes of g into cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2
+{ // q_r is added to every heterozygous genotype before ranking
+       int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1;
+       uint32_t x = 0;
+       for (i = k = 0; i < 4; ++i) // unpack the ten lk[] values into a 4x4 table
+               for (j = i; j < 4; ++j) {
+                       tmp[j<<2|i] = -1; // mirror cell is marked unused ...
+                       tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r); // ... and then overwritten when i == j
+               }
+       for (i = 0; i < 16; ++i) { // track the three smallest values and the two best cells
+               if (tmp[i] < 0) continue;
+               if (tmp[i] < min) {
+                       min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i;
+               } else if (tmp[i] < min2) {
+                       min3 = min2; min2 = tmp[i]; min_g2 = i;
+               } else if (tmp[i] < min3) min3 = tmp[i];
+       }
+       x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xfU << 28; // one bit per allele of the best genotype; unsigned constant avoids shifting into the sign bit
+       x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xfU << 24; // same encoding for the runner-up
+       x |= (uint32_t)g->max_mapQ << 16;
+       x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8; // gap between best and second best, capped at 255
+       x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? min3 - min2 : 255) : 0xff; // gap between second and third best, capped at 255
+       return x;
+}
+
+uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) // consensus word for one column, in the format produced by glf2cns()
+{
+       glf1_t *glf;
+       uint32_t cns;
+       if (n == 0) // nothing piled up: both genotype nibbles are set to 0xf
+               return 0xfU<<28 | 0xfU<<24;
+       glf = bam_maqcns_glfgen(n, pl, 0xf, bm); // 0xf is passed as the reference base
+       cns = glf2cns(glf, (int)(bm->q_r + 0.5)); // q_r rounded to the nearest integer
+       free(glf);
+       return cns;
+}
+
+/************** *****************/
+
+bam_maqindel_opt_t *bam_maqindel_opt_init() // allocates the indel-caller options with their default values
+{
+       bam_maqindel_opt_t *opt;
+       opt = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
+       opt->r_indel = 0.00015;
+       opt->q_indel = 40;
+       opt->ambi_thres = 10; // this and the next two are the "hidden parameters" of bam_maqindel_opt_t
+       opt->indel_err = 4;
+       opt->mm_penalty = 3;
+       return opt;
+}
+
+void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) // releases a result returned by bam_maqindel()
+{
+       if (mir == 0) return; // a null result is accepted
+       free(mir->s[0]); free(mir->s[1]); free(mir); // both indel strings, then the record itself
+}
+
+#define MINUS_CONST 0x10000000 /* bias added to signed indel lengths so they can be stored and sorted as uint32_t */
+
+bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, // evaluates candidate indels right after pos from the n pileup entries in pl
+                                                                int _n_types, int *_types) // _types[0.._n_types): extra candidate lengths proposed by the caller (zeros are ignored)
+{ // returns 0 when no candidate exists, otherwise a heap-allocated result for bam_maqindel_ret_destroy()
+       int i, j, n_types, *types, left, right;
+       bam_maqindel_ret_t *ret = 0;
+       // if there is no proposed indel, check if there is an indel from the alignment
+       if (_n_types == 0) {
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+               }
+               if (i == n) return 0; // no indel
+       }
+       { // calculate how many types of indels are available (set n_types and types)
+               int m;
+               uint32_t *aux;
+               aux = (uint32_t*)calloc(n + _n_types + 1, 4);
+               m = 0;
+               aux[m++] = MINUS_CONST; // zero indel is always a type
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
+                               aux[m++] = MINUS_CONST + p->indel;
+               }
+               if (_n_types) // then also add this to aux[]
+                       for (i = 0; i < _n_types; ++i)
+                               if (_types[i]) aux[m++] = MINUS_CONST + _types[i];
+               ks_introsort(uint32_t, m, aux); // ascending, so deletions come before 0 and insertions after
+               // squeeze out identical types
+               for (i = 1, n_types = 1; i < m; ++i)
+                       if (aux[i] != aux[i-1]) ++n_types;
+               types = (int*)calloc(n_types, sizeof(int));
+               j = 0;
+               types[j++] = aux[0] - MINUS_CONST; // remove the bias again
+               for (i = 1; i < m; ++i) {
+                       if (aux[i] != aux[i-1])
+                               types[j++] = aux[i] - MINUS_CONST;
+               }
+               free(aux);
+       }
+       { // calculate left and right boundary
+               bam_segreg_t seg;
+               left = 0x7fffffff; right = 0;
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP)) {
+                               bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg);
+                               if (seg.tbeg < left) left = seg.tbeg;
+                               if (seg.tend > right) right = seg.tend;
+                       }
+               }
+               if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW; // clip the window on both sides of pos
+               if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW;
+       }
+       { // the core part
+               char *ref2, *inscns = 0; // ref2: reference with one candidate applied; inscns: consensus of inserted bases, one row per type
+               int k, l, *score, *pscore, max_ins = types[n_types-1]; // types[] is sorted, so the last entry is the longest insertion (if positive)
+               ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
+               if (max_ins > 0) { // get the consensus of inserted sequences
+                       int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); // counts indexed by [type][offset][base]
+                       // count occurrences
+                       for (i = 0; i < n_types; ++i) {
+                               if (types[i] <= 0) continue; // not insertion
+                               for (j = 0; j < n; ++j) {
+                                       const bam_pileup1_t *p = pl + j;
+                                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) {
+                                               for (k = 1; k <= p->indel; ++k) { // inserted bases follow qpos in the read
+                                                       int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)];
+                                                       if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c];
+                                               }
+                                       }
+                               }
+                       }
+                       // construct the consensus of inserted sequence
+                       inscns = (char*)calloc(n_types * max_ins, sizeof(char));
+                       for (i = 0; i < n_types; ++i) {
+                               for (j = 0; j < types[i]; ++j) { // runs zero times for deletions and the zero type
+                                       int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4;
+                                       for (k = 0; k < 4; ++k) {
+                                               if (ia[k] > max) {
+                                                       max = ia[k];
+                                                       max_k = k;
+                                               }
+                                       }
+                                       inscns[i*max_ins + j] = max? 1<<max_k : 15; // 4-bit code of the most frequent base, 15 when none was counted
+                               }
+                       }
+                       free(inscns_aux);
+               }
+               // calculate score
+               score = (int*)calloc(n_types * n, sizeof(int)); // NOTE(review): score[] is filled below but never read afterwards; only pscore[] feeds the result
+               pscore = (int*)calloc(n_types * n, sizeof(int)); // per type and read: sum of base qualities at mismatches
+               for (i = 0; i < n_types; ++i) {
+                       // write ref2
+                       for (k = 0, j = left; j <= pos; ++j)
+                               ref2[k++] = bam_nt16_table[(int)ref[j]];
+                       if (types[i] <= 0) j += -types[i]; // deletion: skip the deleted reference bases
+                       else for (l = 0; l < types[i]; ++l) // insertion: splice in the consensus
+                                        ref2[k++] = inscns[i*max_ins + l];
+                       for (; j < right && ref[j]; ++j)
+                               ref2[k++] = bam_nt16_table[(int)ref[j]];
+                       // calculate score for each read
+                       for (j = 0; j < n; ++j) {
+                               const bam_pileup1_t *p = pl + j;
+                               uint32_t *cigar;
+                               bam1_core_t *c = &p->b->core;
+                               int s, ps;
+                               bam_segreg_t seg;
+                               if (c->flag&BAM_FUNMAP) continue; // unmapped reads keep score 0
+                               cigar = bam1_cigar(p->b);
+                               bam_segreg(pos, c, cigar, &seg);
+                               for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
+                                       int cq = bam1_seqi(bam1_seq(p->b), l), ct;
+                                       // in the following line, "<" will happen if reads are too long
+                                       ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15;
+                                       if (cq < 15 && ct < 15) { // code 15 on either side is skipped
+                                               s += cq == ct? 1 : -mi->mm_penalty;
+                                               if (cq != ct) ps += bam1_qual(p->b)[l];
+                                       }
+                               }
+                               score[i*n + j] = s; pscore[i*n + j] = ps;
+                               if (types[i] != 0) { // then try the other way to calculate the score
+                                       for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
+                                               int cq = bam1_seqi(bam1_seq(p->b), l), ct;
+                                               ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15;
+                                               if (cq < 15 && ct < 15) {
+                                                       s += cq == ct? 1 : -mi->mm_penalty;
+                                                       if (cq != ct) ps += bam1_qual(p->b)[l];
+                                               }
+                                       }
+                               }
+                               if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
+                               if (pscore[i*n+j] > ps) pscore[i*n+j] = ps; // and the lower of the two penalties
+                               if (types[i] != 0) score[i*n+j] -= mi->indel_err;
+                               //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j,
+                               //         score[i*n+j], pscore[i*n+j]);
+                       }
+               }
+               { // get final result
+                       int *sum, max1, max2, max1_i, max2_i;
+                       // pick up the best two score
+                       sum = (int*)calloc(n_types, sizeof(int));
+                       for (i = 0; i < n_types; ++i)
+                               for (j = 0; j < n; ++j)
+                                       sum[i] += -pscore[i*n+j]; // negated penalty total, so larger is better
+                       max1 = max2 = -0x7fffffff; max1_i = max2_i = -1;
+                       for (i = 0; i < n_types; ++i) {
+                               if (sum[i] > max1) {
+                                       max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i;
+                               } else if (sum[i] > max2) {
+                                       max2 = sum[i]; max2_i = i;
+                               }
+                       }
+                       free(sum);
+                       // write ret
+                       ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));
+                       ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; // NOTE(review): max2_i is still -1 when n_types == 1 (all proposed types zero and no read indel) - confirm callers cannot reach that
+                       ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); // sign character + bases + terminator
+                       ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1);
+                       // write indel sequence
+                       if (ret->indel1 > 0) {
+                               ret->s[0][0] = '+';
+                               for (k = 0; k < ret->indel1; ++k)
+                                       ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
+                       } else if (ret->indel1 < 0) {
+                               ret->s[0][0] = '-';
+                               for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) // deleted bases are copied from the reference after pos
+                                       ret->s[0][k+1] = ref[pos + k + 1];
+                       } else ret->s[0][0] = '*'; // '*' stands for "no indel"
+                       if (ret->indel2 > 0) {
+                               ret->s[1][0] = '+';
+                               for (k = 0; k < ret->indel2; ++k)
+                                       ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
+                       } else if (ret->indel2 < 0) {
+                               ret->s[1][0] = '-';
+                               for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)
+                                       ret->s[1][k+1] = ref[pos + k + 1];
+                       } else ret->s[1][0] = '*';
+                       // write count
+                       for (i = 0; i < n; ++i) { // unmapped reads are counted here too; cnt_ambi is never written
+                               const bam_pileup1_t *p = pl + i;
+                               if (p->indel == ret->indel1) ++ret->cnt1;
+                               else if (p->indel == ret->indel2) ++ret->cnt2;
+                               else ++ret->cnt_anti;
+                       }
+                       // write gl[]
+                       ret->gl[0] = ret->gl[1] = 0;
+                       for (j = 0; j < n; ++j) {
+                               int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
+                               //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2);
+                               if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel; // per-read difference capped at q_indel
+                               else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel;
+                       }
+               }
+               free(score); free(pscore); free(ref2); free(inscns);
+       }
+       { // call genotype
+               int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);
+               int min1, min2, min1_i;
+               q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; // NOTE(review): both arms are 0, so this is just gl[0]
+               q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; // likewise just gl[1]
+               q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel; // both arms are 1: n*3 + qr_indel
+               min1 = min2 = 0x7fffffff; min1_i = -1;
+               for (i = 0; i < 3; ++i) { // smallest and second smallest of q[]
+                       if (q[i] < min1) {
+                               min2 = min1; min1 = q[i]; min1_i = i;
+                       } else if (q[i] < min2) min2 = q[i];
+               }
+               ret->gt = min1_i; // index of the smallest q[]
+               ret->q_cns = min2 - min1;
+               // set q_ref
+               if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3;
+               else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2];
+               if (ret->q_ref < 0) ret->q_ref = 0;
+       }
+       free(types);
+       return ret;
+}
diff --git a/bam_maqcns.h b/bam_maqcns.h
new file mode 100644 (file)
index 0000000..36704d7
--- /dev/null
@@ -0,0 +1,55 @@
+#ifndef BAM_MAQCNS_H
+#define BAM_MAQCNS_H
+
+#include "glf.h"
+
+struct __bmc_aux_t;
+
+// Settings and working tables of the consensus caller. The pileup command
+// obtains one instance from bam_maqcns_init() and fills the user-tunable
+// fields from its command line.
+typedef struct {
+       // het_rate is set by pileup option -r, theta by option -T
+       float het_rate, theta;
+       // n_hap is set by option -N; cap_mapQ (option -M) is the ceiling applied to
+       // each read's mapping quality before it enters the RMS mapping quality
+       int n_hap, cap_mapQ;
+
+       // NOTE(review): nothing in the visible code assigns the fields below;
+       // presumably they are derived by bam_maqcns_prepare() -- confirm in bam_maqcns.c
+       float eta, q_r;
+       double *fk, *coef;
+       double *lhet;
+       struct __bmc_aux_t *aux;
+} bam_maqcns_t;
+
+// Options of the indel caller; an instance is obtained from bam_maqindel_opt_init().
+typedef struct {
+       // upper bound on the score difference one read may add to gl[0]/gl[1]
+       // (pileup option -I)
+       int q_indel;
+       // pileup option -G; bam_maqindel() turns it into an integer penalty with
+       // (int)(-4.343 * log(r_indel) + 0.5) when calling the genotype
+       float r_indel;
+       // hidden parameters, unchangeable from command line
+       int mm_penalty, indel_err, ambi_thres;
+} bam_maqindel_opt_t;
+
+// Result of bam_maqindel(); release it with bam_maqindel_ret_destroy().
+typedef struct {
+       // the two candidate indel lengths (written to GLF as indel_len[0] and [1])
+       int indel1, indel2;
+       // read counts: cnt2 counts reads whose indel equals indel2, cnt_anti counts
+       // reads matching neither candidate; cnt1 presumably counts reads matching
+       // indel1 (that branch is outside the visible code -- confirm)
+       int cnt1, cnt2, cnt_ambi, cnt_anti;
+       // allele strings for the two candidates; the string "*" is treated by the
+       // pileup code as "no indel", and s[i]+1 is what gets stored as the GLF indel
+       // sequence
+       char *s[2];
+       //
+       // gt: index (0, 1 or 2) of the smallest of the three genotype scores; the
+       // pileup output prints s[gt]/s[gt] for gt < 2 and s[0]/s[1] for gt == 2.
+       // gl[0], gl[1]: accumulated per-read score differences, each capped at q_indel
+       int gt, gl[2];
+       // q_cns: second-best score minus best score; q_ref: clamped to be >= 0
+       int q_cns, q_ref;
+} bam_maqindel_ret_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       // Life cycle of the consensus caller object.
+       bam_maqcns_t *bam_maqcns_init();
+       void bam_maqcns_prepare(bam_maqcns_t *bm);
+       void bam_maqcns_destroy(bam_maqcns_t *bm);
+       // Returns a glf1_t record for the n reads in pl; the pileup code releases
+       // the returned pointer with free().
+       glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm);
+       // The packed layout documented on the next line is the one the pileup code
+       // decodes from the value returned by bam_maqcns_call().
+       uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm);
+       // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2
+       uint32_t glf2cns(const glf1_t *g, int q_r);
+
+       // Indel caller. bam_maqindel() may return a null pointer (callers test the
+       // result before use); a non-null result must be released with
+       // bam_maqindel_ret_destroy(). _types points to _n_types caller-proposed
+       // indel lengths; pass 0, 0 when there are none.
+       bam_maqindel_opt_t *bam_maqindel_opt_init();
+       bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+                                                                        int _n_types, int *_types);
+       void bam_maqindel_ret_destroy(bam_maqindel_ret_t*);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bam_mate.c b/bam_mate.c
new file mode 100644 (file)
index 0000000..61f808a
--- /dev/null
@@ -0,0 +1,70 @@
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+
+// Clear the mate information of a read for which no mate was found: the mate
+// reference/position and insert size are reset and, for reads flagged as
+// paired, the mate is marked unmapped and the proper-pair and mate-reverse
+// bits are dropped.
+static void bam_mating_singleton(bam1_t *b)
+{
+       b->core.mtid = -1; b->core.mpos = -1; b->core.isize = 0;
+       if (b->core.flag & BAM_FPAIRED) {
+               b->core.flag |= BAM_FMUNMAP;
+               b->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
+       }
+}
+
+// Copy the header and all records from `in` to `out`, filling in the mate
+// fields (mtid, mpos, isize and the mate-related flag bits) of every two
+// consecutive records that carry the same query name. Records without such a
+// neighbour are written as singletons.
+// currently, this function ONLY works if each read has one hit
+void bam_mating_core(bamFile in, bamFile out)
+{
+       bam_header_t *header;
+       bam1_t *b[2];
+       int curr, has_prev;
+
+       header = bam_header_read(in);
+       bam_header_write(out, header);
+
+       b[0] = bam_init1();
+       b[1] = bam_init1();
+       curr = 0; has_prev = 0;
+       // b[] is a two-slot ring: b[curr] receives the new record while b[1-curr]
+       // still holds the previous, not yet written one (when has_prev is set)
+       while (bam_read1(in, b[curr]) >= 0) {
+               bam1_t *cur = b[curr], *pre = b[1-curr];
+               if (has_prev) {
+                       if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
+                               cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
+                               pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
+                               if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
+                                       && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)))
+                               {
+                                       // insert size is measured between the 5' ends: the alignment
+                                       // end for reverse-strand reads, the start otherwise
+                                       uint32_t cur5, pre5;
+                                       cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos;
+                                       pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos;
+                                       cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
+                               } else cur->core.isize = pre->core.isize = 0;
+                               if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
+                               else cur->core.flag &= ~BAM_FMREVERSE;
+                               if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
+                               else pre->core.flag &= ~BAM_FMREVERSE;
+                               if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
+                               if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
+                               bam_write1(out, pre);
+                               bam_write1(out, cur);
+                               has_prev = 0; // both records are consumed
+                       } else { // unpaired or singleton
+                               bam_mating_singleton(pre);
+                               bam_write1(out, pre);
+                               // has_prev stays set: cur becomes the pending record
+                       }
+               } else has_prev = 1;
+               curr = 1 - curr;
+       }
+       if (has_prev) { // the last record has no partner: treat it like any other singleton
+               bam_mating_singleton(b[1-curr]);
+               bam_write1(out, b[1-curr]);
+       }
+       bam_header_destroy(header);
+       bam_destroy1(b[0]);
+       bam_destroy1(b[1]);
+}
+
+// Entry point of "samtools fixmate": opens the input and the output ("-"
+// selects stdin/stdout respectively) and runs bam_mating_core() on them.
+// Returns 0 on success and 1 on a usage error or when a file cannot be opened.
+int bam_mating(int argc, char *argv[])
+{
+       bamFile in, out;
+       if (argc < 3) {
+               fprintf(stderr, "samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n");
+               return 1;
+       }
+       in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+       if (in == 0) { // a failed open would otherwise go straight into bam_header_read()
+               fprintf(stderr, "[bam_mating] fail to open '%s' for reading\n", argv[1]);
+               return 1;
+       }
+       out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+       if (out == 0) {
+               fprintf(stderr, "[bam_mating] fail to open '%s' for writing\n", argv[2]);
+               bam_close(in);
+               return 1;
+       }
+       bam_mating_core(in, out);
+       bam_close(in); bam_close(out);
+       return 0;
+}
diff --git a/bam_md.c b/bam_md.c
new file mode 100644 (file)
index 0000000..a20f9b3
--- /dev/null
+++ b/bam_md.c
@@ -0,0 +1,117 @@
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+#include "faidx.h"
+#include "bam.h"
+#include "kstring.h"
+
+// Compute the MD string of alignment `b` against the NUL-terminated reference
+// sequence `ref` (indexed by 0-based reference position).
+//  - If `b` has no MD tag, the computed string is appended as MD:Z.
+//  - If `b` already has one and is_equal is 0, nothing is done.
+//  - If `b` already has one and is_equal is set, the existing tag is compared
+//    (case-insensitively) with the computed one and a warning is printed on
+//    a mismatch; the tag itself is left untouched.
+// When is_equal is set, read bases identical to the reference are overwritten
+// with the 4-bit code 0 in the packed sequence.
+// Unmapped reads and a null `ref` are ignored.
+void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+{
+       uint8_t *seq = bam1_seq(b);
+       uint32_t *cigar = bam1_cigar(b);
+       bam1_core_t *c = &b->core;
+       int i, x, y, u = 0; // x: reference position; y: query position; u: current run of matches
+       kstring_t *str;
+       uint8_t *old_md;
+
+       if (ref == 0) return; // no reference sequence available for this target
+       old_md = bam_aux_get(b, "MD");
+       if (c->flag & BAM_FUNMAP) return;
+       if (old_md && !is_equal) return; // no need to add MD
+       str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+               int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+               if (op == BAM_CMATCH) {
+                       for (j = 0; j < l; ++j) {
+                               int z = y + j;
+                               int c1, c2;
+                               if (ref[x+j] == 0) break; // out of boundary
+                               c1 = bam1_seqi(seq, z);
+                               // index through unsigned char: a plain char may be negative for
+                               // bytes >= 0x80 and would then address memory before the table
+                               c2 = bam_nt16_table[(unsigned char)ref[x+j]];
+                               if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) {
+                                       if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+                                       ++u;
+                               } else {
+                                       ksprintf(str, "%d", u);
+                                       kputc(ref[x+j], str);
+                                       u = 0;
+                               }
+                       }
+                       if (j < l) break; // ran off the end of the reference
+                       x += l; y += l;
+               } else if (op == BAM_CDEL) {
+                       ksprintf(str, "%d", u);
+                       kputc('^', str);
+                       for (j = 0; j < l; ++j) {
+                               if (ref[x+j] == 0) break;
+                               kputc(ref[x+j], str);
+                       }
+                       u = 0;
+                       if (j < l) break;
+                       x += l;
+               } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                       y += l;
+               } else if (op == BAM_CREF_SKIP) {
+                       x += l;
+               }
+       }
+       ksprintf(str, "%d", u); // trailing match count; also guarantees str->s is allocated
+       if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+       else {
+               int is_diff = 0;
+               // old_md points at the type byte of the tag; the string starts at old_md+1
+               if (strlen((char*)old_md+1) == str->l) {
+                       for (i = 0; i < str->l; ++i)
+                               if (toupper(old_md[i+1]) != toupper((unsigned char)str->s[i]))
+                                       break;
+                       if (i < str->l) is_diff = 1;
+               } else is_diff = 1;
+               if (is_diff)
+                       fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' != '%s'\n", bam1_qname(b), old_md+1, str->s);
+       }
+       free(str->s); free(str);
+}
+
+// Entry point of "fillmd": reads alignments from <aln.bam> ("-" for stdin),
+// runs bam_fillmd1() on every record that has a reference id, and writes all
+// records as BAM to stdout. Option -e is passed on as is_equal.
+// Returns 0 on success and 1 on a usage error or when the reference or one of
+// the streams cannot be opened.
+int bam_fillmd(int argc, char *argv[])
+{
+       int c, is_equal = 0, tid = -2, ret, len;
+       bamFile fp, fpout = 0;
+       bam_header_t *header;
+       faidx_t *fai;
+       char *ref = 0;
+       bam1_t *b;
+
+       while ((c = getopt(argc, argv, "e")) >= 0) {
+               switch (c) {
+               case 'e': is_equal = 1; break;
+               default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+               }
+       }
+       if (optind + 1 >= argc) {
+               fprintf(stderr, "Usage: bam fillmd [-e] <aln.bam> <ref.fasta>\n");
+               return 1;
+       }
+       // load the reference before anything is written, so that an unusable
+       // reference does not leave a header-only output behind
+       fai = fai_load(argv[optind+1]);
+       if (fai == 0) {
+               fprintf(stderr, "[bam_fillmd] fail to load the reference '%s'\n", argv[optind+1]);
+               return 1;
+       }
+       fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       if (fp == 0) { // explicit check: assert() disappears under NDEBUG
+               fprintf(stderr, "[bam_fillmd] fail to open '%s' for reading\n", argv[optind]);
+               fai_destroy(fai);
+               return 1;
+       }
+       fpout = bam_dopen(fileno(stdout), "w");
+       if (fpout == 0) {
+               fprintf(stderr, "[bam_fillmd] fail to open the standard output for writing\n");
+               bam_close(fp);
+               fai_destroy(fai);
+               return 1;
+       }
+       header = bam_header_read(fp);
+       bam_header_write(fpout, header);
+
+       b = bam_init1();
+       while ((ret = bam_read1(fp, b)) >= 0) {
+               if (b->core.tid >= 0) {
+                       if (tid != b->core.tid) { // switched to another reference sequence
+                               free(ref);
+                               ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
+                               tid = b->core.tid;
+                               if (ref == 0) // reported once per switch; records are still copied
+                                       fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]);
+                       }
+                       if (ref) bam_fillmd1(b, ref, is_equal);
+               }
+               bam_write1(fpout, b);
+       }
+       bam_destroy1(b);
+
+       free(ref);
+       fai_destroy(fai);
+       bam_header_destroy(header);
+       bam_close(fp); bam_close(fpout);
+       return 0;
+}
diff --git a/bam_pileup.c b/bam_pileup.c
new file mode 100644 (file)
index 0000000..3ffd528
--- /dev/null
@@ -0,0 +1,214 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include "sam.h"
+
+// Node of the singly linked list of buffered alignments used by the pileup
+// buffer; nodes are recycled through the memory pool below.
+typedef struct __linkbuf_t {
+       bam1_t b; // private copy of the alignment (filled with bam_copy1())
+       uint32_t beg, end; // beg: b.core.pos; end: value of bam_calend() for b
+       struct __linkbuf_t *next; // following node, 0 for the last one
+} lbnode_t;
+
+/* --- BEGIN: Memory pool */
+
+// Free list of lbnode_t objects.
+typedef struct {
+       // cnt: nodes handed out by mp_alloc() and not yet given back to mp_free();
+       // n: nodes currently parked in buf; max: number of slots allocated for buf
+       int cnt, n, max;
+       lbnode_t **buf; // stack of recycled nodes, buf[0..n-1]
+} mempool_t;
+
+// Create an empty pool: all counters are zero and no free list is allocated yet.
+static mempool_t *mp_init()
+{
+       return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+// Release every node parked in the pool (including the alignment data each
+// node owns), then the free list and the pool itself.
+static void mp_destroy(mempool_t *mp)
+{
+       int i = 0;
+       while (i < mp->n) {
+               lbnode_t *node = mp->buf[i++];
+               free(node->b.data);
+               free(node);
+       }
+       free(mp->buf);
+       free(mp);
+}
+// Hand out a node: the most recently recycled one when the free list is not
+// empty, a freshly zeroed one otherwise. Recycled nodes are returned as they
+// were left by mp_free().
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+       lbnode_t *node;
+       ++mp->cnt;
+       if (mp->n != 0) node = mp->buf[--mp->n];
+       else node = (lbnode_t*)calloc(1, sizeof(lbnode_t));
+       return node;
+}
+// Give a node back to the pool. The node keeps its alignment buffer so that it
+// can be reused; only its link is cleared.
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+       p->next = 0; // clear lbnode_t::next here
+       --mp->cnt;
+       if (mp->n == mp->max) { // free list full or not allocated yet: grow it
+               if (mp->max) mp->max <<= 1;
+               else mp->max = 256;
+               mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
+       }
+       mp->buf[mp->n] = p;
+       ++mp->n;
+}
+
+/* --- END: Memory pool */
+
+/* --- BEGIN: Auxiliary functions */
+
+// Locate reference position `pos` inside the alignment p->b by walking its
+// CIGAR, and fill in p->qpos, p->indel, p->is_del, p->is_head and p->is_tail.
+// Returns 0 for an unmapped read (p is then left untouched) or when `pos`
+// falls inside a reference skip, meaning the read must not enter the pileup;
+// returns 1 otherwise.
+// The read must start at or before `pos` and extend beyond it on the
+// reference; both conditions are asserted.
+static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
+{
+       unsigned k;
+       bam1_t *b = p->b;
+       bam1_core_t *c = &b->core;
+       // x: reference coordinate of the current CIGAR operation; y: query offset
+       uint32_t x = c->pos, y = 0;
+       // is_restart: the previous operation was a clip or a reference skip (or
+       // there was none), so a match starting at pos begins a new segment
+       int ret = 1, is_restart = 1;
+
+       if (c->flag&BAM_FUNMAP) return 0; // unmapped read
+       assert(x <= pos); // otherwise a bug
+       p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
+       for (k = 0; k < c->n_cigar; ++k) {
+               int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
+               int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
+               if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip
+                       if (x + l > pos) { // overlap with pos
+                               p->indel = p->is_del = 0;
+                               p->qpos = y + (pos - x);
+                               if (x == pos && is_restart) p->is_head = 1;
+                               if (x + l - 1 == pos) { // come to the end of a match
+                                       if (k < c->n_cigar - 1) { // there are additional operation(s)
+                                               uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
+                                               int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
+                                               // an indel that follows is reported at the last matched
+                                               // base: negative length for a deletion, positive for an insertion
+                                               if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
+                                               else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
+                                               if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
+                                                       p->is_tail = 1; // tail
+                                       } else p->is_tail = 1; // this is the last operation; set tail
+                               }
+                       }
+                       x += l; y += l;
+               } else if (op == BAM_CDEL) { // then set ->is_del
+                       if (x + l > pos) {
+                               p->indel = 0; p->is_del = 1;
+                               p->qpos = y + (pos - x);
+                       }
+                       x += l;
+               } else if (op == BAM_CREF_SKIP) x += l;
+               else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+               is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
+               if (x > pos) { // the operation just processed covers pos: stop here
+                       if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
+                       break;
+               }
+       }
+       assert(x > pos); // otherwise a bug
+       return ret;
+}
+
+/* --- END: Auxiliary functions */
+
+// State of the incremental pileup engine driven by bam_plbuf_push().
+struct __bam_plbuf_t {
+       mempool_t *mp; // recycler for list nodes
+       // head..tail: list of buffered alignments in input order. tail is always an
+       // unused node that receives the next pushed alignment; dummy is a scratch
+       // predecessor of head used while unlinking nodes.
+       lbnode_t *head, *tail, *dummy;
+       bam_pileup_f func; // callback invoked for every position covered by at least one read
+       void *func_data; // passed through to func unchanged
+       // tid/pos: position the pileup has advanced to; max_tid/max_pos: coordinates
+       // of the last alignment accepted (-1 before the first one)
+       int32_t tid, pos, max_tid, max_pos;
+       int max_pu, is_eof; // max_pu: capacity of pu; is_eof: set once a null alignment is pushed
+       bam_pileup1_t *pu; // per-position array handed to func
+       int flag_mask; // alignments with any of these flag bits set are ignored
+};
+
+// Drop every buffered alignment and rewind the buffer to its initial position,
+// so that a new stream of alignments can be pushed.
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{
+       lbnode_t *node, *succ;
+       buf->max_tid = buf->max_pos = -1;
+       buf->tid = buf->pos = 0;
+       buf->is_eof = 0;
+       // recycle all nodes in front of the (always unused) tail node
+       node = buf->head;
+       while (node->next) {
+               succ = node->next; // mp_free() clears node->next, so save it first
+               mp_free(buf->mp, node);
+               node = succ;
+       }
+       buf->head = buf->tail;
+}
+
+// Select which alignments are skipped by bam_plbuf_push(): a negative mask
+// restores the default, any other value is combined with BAM_FUNMAP so that
+// unmapped reads are always skipped.
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+       buf->flag_mask = (mask < 0)? BAM_DEF_MASK : (BAM_FUNMAP | mask);
+}
+
+// Create a pileup buffer that reports every covered position to `func`,
+// passing `data` through. The buffer starts with the default flag mask and
+// must be released with bam_plbuf_destroy().
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+       bam_plbuf_t *pb = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t));
+       pb->func = func;
+       pb->func_data = data;
+       pb->flag_mask = BAM_DEF_MASK;
+       pb->max_tid = -1;
+       pb->max_pos = -1;
+       pb->mp = mp_init();
+       // the list starts as a single unused node that is both head and tail
+       pb->tail = mp_alloc(pb->mp);
+       pb->head = pb->tail;
+       pb->dummy = mp_alloc(pb->mp);
+       return pb;
+}
+
+// Release a pileup buffer. Nodes still linked behind head at this point are
+// not given back to the pool; their number is reported on stderr.
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+       mempool_t *pool = buf->mp;
+       mp_free(pool, buf->dummy);
+       mp_free(pool, buf->head);
+       if (pool->cnt != 0) {
+               fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", pool->cnt);
+       }
+       mp_destroy(pool);
+       free(buf->pu);
+       free(buf);
+}
+
+// Feed one coordinate-sorted alignment into the pileup buffer, or signal the
+// end of the input with b == 0. Every position that can no longer be reached
+// by future alignments is piled up and reported through buf->func.
+// Alignments without a reference id, or with a flag bit of buf->flag_mask set,
+// are ignored.
+// Returns 0 normally and 1 if the buffered list is found to be out of order.
+// Calls abort() when `b` sorts before the previously pushed alignment.
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{
+       if (b) { // fill buffer
+               if (b->core.tid < 0) return 0;
+               if (b->core.flag & buf->flag_mask) return 0;
+               bam_copy1(&buf->tail->b, b);
+               buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
+               // sorted means: a later reference, or the same reference at the same or a
+               // later position. (Testing "tid >= max_tid" first would accept any
+               // position on the same reference and never reach the position test.)
+               if (b->core.tid < buf->max_tid
+                       || (b->core.tid == buf->max_tid && (int32_t)buf->tail->beg < buf->max_pos))
+               {
+                       fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n");
+                       abort();
+               }
+               buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
+               // keep the copy only if it can still contribute to a position not yet
+               // reported; otherwise the tail node is simply overwritten by the next push
+               if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) {
+                       buf->tail->next = mp_alloc(buf->mp);
+                       buf->tail = buf->tail->next;
+               }
+       } else buf->is_eof = 1;
+       // report all positions strictly before the start of the last pushed read
+       // (or everything that is left once the end of the input has been signalled)
+       while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) {
+               int n_pu = 0;
+               lbnode_t *p, *q;
+               buf->dummy->next = buf->head;
+               // q trails p so that p can be unlinked; the unused tail node (next == 0)
+               // is never visited
+               for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) {
+                       if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
+                               q->next = p->next; mp_free(buf->mp, p); p = q;
+                       } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
+                               if (n_pu == buf->max_pu) { // then double the capacity
+                                       buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
+                                       buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
+                               }
+                               buf->pu[n_pu].b = &p->b;
+                               if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
+                       }
+               }
+               buf->head = buf->dummy->next; // dummy->next may be changed
+               if (n_pu) { // then call user defined function
+                       buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data);
+               }
+               // update tid and pos
+               if (buf->head->next) {
+                       if (buf->tid > buf->head->b.core.tid) {
+                               fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
+                               return 1;
+                       }
+               }
+               if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
+                       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
+               } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
+                       buf->pos = buf->head->beg; // jump to the next position
+               } else ++buf->pos; // scan contiguously
+               if (buf->is_eof && buf->head->next == 0) break;
+       }
+       return 0;
+}
diff --git a/bam_plcmd.c b/bam_plcmd.c
new file mode 100644 (file)
index 0000000..5d5506f
--- /dev/null
@@ -0,0 +1,385 @@
+#include <math.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "sam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+#include "khash.h"
+#include "glf.h"
+#include "kstring.h"
+
+typedef int *indel_list_t;
+KHASH_MAP_INIT_INT64(64, indel_list_t)
+
+#define BAM_PLF_SIMPLE     0x01
+#define BAM_PLF_CNS        0x02
+#define BAM_PLF_INDEL_ONLY 0x04
+#define BAM_PLF_GLF        0x08
+#define BAM_PLF_VAR_ONLY   0x10
+#define BAM_PLF_2ND        0x20
+
+// Per-run state of the pileup command, handed to the pileup callbacks as
+// their `data` argument.
+typedef struct {
+       bam_header_t *h; // header of the input; supplies target names and lengths
+       bam_maqcns_t *c; // consensus caller settings
+       bam_maqindel_opt_t *ido; // indel caller options
+       faidx_t *fai; // reference index used to fetch sequences; 0 if not available
+       khash_t(64) *hash; // optional site filter keyed by tid<<32|pos; 0 means all sites
+       uint32_t format; // bitwise OR of the BAM_PLF_* flags
+       // cache of the reference currently in use: its id, its length and, for GLF
+       // output, the last position written (used to compute offsets)
+       int tid, len, last_pos;
+       int mask; // read filter mask (option -m)
+       char *ref; // sequence of reference `tid`, or 0
+       glfFile fp_glf; // for glf output only
+} pu_data_t;
+
+char **__bam_get_lines(const char *fn, int *_n);
+void bam_init_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+// Parse one line of the position list and record it in `hash`.
+// The first field is a reference name and the second a 1-based position; any
+// following fields of the form +N / -N are proposed indel lengths.
+// The key is tid<<32 | 0-based position. The value is 0, or an int array
+// {count, indel_1, ..., indel_count} when proposed indels are present.
+// `str` is split in place by ksplit_core(); the caller keeps ownership of it.
+static void load_pos_line(khash_t(64) *hash, bam_header_t *h, char *str, int *max_fields, int **fields)
+{
+       int chr, n_fields, ret, j;
+       khint_t k;
+       uint64_t x;
+       n_fields = ksplit_core(str, 0, max_fields, fields);
+       if (n_fields < 2) return;
+       chr = bam_get_tid(h, str + (*fields)[0]);
+       if (chr < 0) {
+               fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + (*fields)[0]);
+               return;
+       }
+       // narrow the position to 32 bits before combining: atoi()-1 is negative for
+       // a position of 0 or an unparsable field, and its sign extension to 64
+       // bits would otherwise overwrite the tid half of the key
+       x = (uint64_t)chr << 32 | (uint32_t)(atoi(str + (*fields)[1]) - 1);
+       k = kh_put(64, hash, x, &ret);
+       if (ret == 0) { // key already present: keep the first definition
+               fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+(*fields)[0], str+(*fields)[1]);
+               return;
+       }
+       kh_val(hash, k) = 0;
+       // count the leading run of well-formed indel fields
+       for (j = 2; j < n_fields; ++j) {
+               char *s = str + (*fields)[j];
+               if ((*s != '+' && *s != '-') || !isdigit((unsigned char)s[1])) break;
+       }
+       if (j > 2) { // update kh_val()
+               int *q, y, z;
+               q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));
+               q[0] = j - 2; z = j; y = 1; // q[0] holds the number of indels that follow
+               for (j = 2; j < z; ++j)
+                       q[y++] = atoi(str + (*fields)[j]);
+       }
+}
+
+// Load the list of sites in file `fn` into a hash table keyed by
+// tid<<32 | 0-based position (see load_pos_line() for the values).
+// Lines with fewer than two fields, with an unknown reference name or with an
+// already loaded position are skipped.
+// The caller owns the returned table and the arrays stored in it.
+static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
+{
+       char **list;
+       int i, n, *fields, max_fields;
+       khash_t(64) *hash;
+       bam_init_header_hash(h);
+       list = __bam_get_lines(fn, &n);
+       hash = kh_init(64);
+       max_fields = 0; fields = 0;
+       for (i = 0; i < n; ++i) {
+               load_pos_line(hash, h, list[i], &max_fields, &fields);
+               free(list[i]); // released on every path, including skipped lines
+       }
+       free(list); free(fields);
+       return hash;
+}
+
+// an analogy to pileup_func() below
+// Pileup callback for GLF output: writes one substitution record for the site
+// and, when the indel caller returns a result, one indel record.
+// A reference index is mandatory (the program exits without one). When
+// d->hash is set, sites not listed in it are skipped.
+// Always returns 0.
+static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{
+       pu_data_t *d = (pu_data_t*)data;
+       bam_maqindel_ret_t *r = 0;
+       int rb, *proposed_indels = 0;
+       glf1_t *g;
+       glf3_t *g3;
+
+       if (d->fai == 0) {
+               fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n");
+               exit(1);
+       }
+       if (d->hash) { // only output a list of sites
+               khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+               if (k == kh_end(d->hash)) return 0;
+               proposed_indels = kh_val(d->hash, k);
+       }
+       g3 = glf3_init1();
+       if (d->fai && (int)tid != d->tid) {
+               if (d->ref) { // then write the end mark
+                       g3->rtype = GLF3_RTYPE_END;
+                       glf3_write1(d->fp_glf, g3);
+               }
+               glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference
+               free(d->ref);
+               d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+               d->tid = tid;
+               d->last_pos = 0;
+       }
+       // read the base as unsigned char: it is used as a table index below, and a
+       // plain char may be negative for bytes >= 0x80
+       rb = (d->ref && (int)pos < d->len)? (unsigned char)d->ref[pos] : 'N';
+       g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c);
+       memcpy(g3, g, sizeof(glf1_t)); // the glf1_t fields are copied into the start of the record
+       g3->rtype = GLF3_RTYPE_SUB;
+       g3->offset = pos - d->last_pos;
+       d->last_pos = pos;
+       glf3_write1(d->fp_glf, g3);
+       // call indels only when the reference sequence could be fetched, as
+       // pileup_func() does
+       if (d->ref) {
+               if (proposed_indels) // the first element gives the size of the array
+                       r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+               else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
+       }
+       if (r) { // then write indel line
+               int het = 3 * n, min;
+               min = het;
+               if (min > r->gl[0]) min = r->gl[0];
+               if (min > r->gl[1]) min = r->gl[1];
+               // g3 is reused for the indel record: likelihoods are stored relative to
+               // the smallest of the three and saturate at 255
+               g3->ref_base = 0;
+               g3->rtype = GLF3_RTYPE_INDEL;
+               memset(g3->lk, 0, 10);
+               g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255;
+               g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255;
+               g3->lk[2] = het - min < 255? het - min : 255;
+               g3->offset = 0;
+               g3->indel_len[0] = r->indel1;
+               g3->indel_len[1] = r->indel2;
+               g3->min_lk = min < 255? min : 255;
+               g3->max_len = (abs(r->indel1) > abs(r->indel2)? abs(r->indel1) : abs(r->indel2)) + 1;
+               g3->indel_seq[0] = strdup(r->s[0]+1); // skip the first character of the allele string
+               g3->indel_seq[1] = strdup(r->s[1]+1);
+               glf3_write1(d->fp_glf, g3);
+               bam_maqindel_ret_destroy(r);
+       }
+       free(g);
+       glf3_destroy1(g3);
+       return 0;
+}
+
+// Pileup callback for text output: prints one line for reference position
+// `pos` of target `tid`, covered by the n reads in pu, followed by an indel
+// line when the indel caller was run and returned a result.
+// The output depends on d->format (BAM_PLF_* flags); with BAM_PLF_GLF the work
+// is delegated to glt3_func(). When d->hash is set, sites not listed in it are
+// skipped. Always returns 0 in the text mode.
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{
+       pu_data_t *d = (pu_data_t*)data;
+       bam_maqindel_ret_t *r = 0;
+       int i, j, rb, rms_mapq = -1, *proposed_indels = 0;
+       uint64_t rms_aux;
+       uint32_t cns = 0;
+
+       // if GLF is required, suppress -c completely
+       if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data);
+       // if d->hash is initialized, only output the sites in the hash table
+       if (d->hash) {
+               khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+               if (k == kh_end(d->hash)) return 0;
+               proposed_indels = kh_val(d->hash, k);
+       }
+       // update d->ref if necessary
+       if (d->fai && (int)tid != d->tid) {
+               free(d->ref);
+               d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+               d->tid = tid;
+       }
+       // read the base as unsigned char: rb is used as a table index and passed to
+       // toupper() below, and a plain char may be negative for bytes >= 0x80
+       rb = (d->ref && (int)pos < d->len)? (unsigned char)d->ref[pos] : 'N';
+       // when the indel-only mode is asked for, return if no reads mapped with indels
+       if (d->format & BAM_PLF_INDEL_ONLY) {
+               for (i = 0; i < n; ++i)
+                       if (pu[i].indel != 0) break;
+               if (i == n) return 0;
+       }
+       // call the consensus and indel
+       if (d->format & BAM_PLF_CNS) // call consensus
+               cns = bam_maqcns_call(n, pu, d->c);
+       if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref) { // call indels
+               if (proposed_indels) // the first element gives the size of the array
+                       r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+               else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
+       }
+       // when only variant sites are asked for, test if the site is a variant
+       if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) {
+               if (!(bam_nt16_table[rb] != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP
+                       if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel
+                               if (r) bam_maqindel_ret_destroy(r);
+                               return 0;
+                       }
+               }
+       }
+       // print the first 3 columns
+       printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
+       // print consensus information if required
+       if (d->format & BAM_PLF_CNS) {
+               // cns packs: consensus base (bits 28-31), second base (24-27), RMS mapping
+               // quality (16-23) and two quality values (8-15 and 0-7)
+               int ref_q, rb4 = bam_nt16_table[rb];
+               ref_q = 0;
+               if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP
+                       ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff);
+                       if (ref_q > 255) ref_q = 255;
+               }
+               rms_mapq = cns>>16&0xff;
+               printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq);
+       }
+       // print pileup sequences
+       printf("%d\t", n);
+       rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pu + i;
+               int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
+               rms_aux += tmp * tmp;
+               // start of a read segment: '^' followed by the mapping quality as a character
+               if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
+               if (!p->is_del) {
+                       int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+                       // '.'/',' for a base equal to the reference on the forward/reverse
+                       // strand; otherwise the base itself, upper/lower case by strand
+                       if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+                       else c = bam1_strand(p->b)? tolower(c) : toupper(c);
+                       putchar(c);
+                       if (p->indel > 0) { // insertion: bases are taken from the read
+                               printf("+%d", p->indel);
+                               for (j = 1; j <= p->indel; ++j) {
+                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                               }
+                       } else if (p->indel < 0) { // deletion: bases are taken from the reference
+                               printf("%d", p->indel);
+                               for (j = 1; j <= -p->indel; ++j) {
+                                       c = (d->ref && (int)pos+j < d->len)? (unsigned char)d->ref[pos+j] : 'N';
+                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                               }
+                       }
+               } else putchar('*');
+               if (p->is_tail) putchar('$');
+       }
+       // finalize rms_mapq
+       rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499);
+       if (rms_mapq < 0) rms_mapq = rms_aux;
+       putchar('\t');
+       // print quality
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pu + i;
+               int c = bam1_qual(p->b)[p->qpos] + 33;
+               if (c > 126) c = 126;
+               putchar(c);
+       }
+       if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities
+               const unsigned char *q;
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pu + i;
+                       q = bam_aux_get(p->b, "E2");
+                       putchar(q? q[p->qpos + 1] : 'N'); // +1 skips the type byte of the tag
+               }
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pu + i;
+                       q = bam_aux_get(p->b, "U2");
+                       putchar(q? q[p->qpos + 1] : '!');
+               }
+       }
+       // print mapping quality if -s is flagged on the command line
+       if (d->format & BAM_PLF_SIMPLE) {
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       int c = pu[i].b->core.qual + 33;
+                       if (c > 126) c = 126;
+                       putchar(c);
+               }
+       }
+       putchar('\n');
+       // print the indel line if r has been calculated. This only happens if:
+       // a) -c or -i are flagged, AND b) the reference sequence is available
+       if (r) {
+               printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1);
+               if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]);
+               else printf("%s/%s\t", r->s[0], r->s[1]);
+               printf("%d\t%d\t", r->q_cns, r->q_ref);
+               printf("%d\t%d\t", rms_mapq, n);
+               printf("%s\t%s\t", r->s[0], r->s[1]);
+               //printf("%d\t%d\t", r->gl[0], r->gl[1]);
+               printf("%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_anti);
+               bam_maqindel_ret_destroy(r);
+       }
+       return 0;
+}
+
+// samtools pileup: parse the options, run pileup_func() over the input; returns 0 on success and 1 on any error.
+int bam_pileup(int argc, char *argv[])
+{
+       int c, is_SAM = 0, ret = 1; // ret only becomes 0 once the pileup has run
+       char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
+       pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
+       d->tid = -1; d->mask = BAM_DEF_MASK;
+       d->c = bam_maqcns_init();
+       d->ido = bam_maqindel_opt_init();
+       while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2")) >= 0) {
+               switch (c) {
+               case 's': d->format |= BAM_PLF_SIMPLE; break;
+               case 't': fn_list = strdup(optarg); break;
+               case 'l': fn_pos = strdup(optarg); break;
+               case 'f': fn_fa = strdup(optarg); break;
+               case 'T': d->c->theta = atof(optarg); break;
+               case 'N': d->c->n_hap = atoi(optarg); break;
+               case 'r': d->c->het_rate = atof(optarg); break;
+               case 'M': d->c->cap_mapQ = atoi(optarg); break;
+               case 'c': d->format |= BAM_PLF_CNS; break;
+               case 'i': d->format |= BAM_PLF_INDEL_ONLY; break;
+               case 'v': d->format |= BAM_PLF_VAR_ONLY; break;
+               case 'm': d->mask = strtol(optarg, 0, 0); break;
+               case 'g': d->format |= BAM_PLF_GLF; break;
+               case '2': d->format |= BAM_PLF_2ND; break;
+               case 'I': d->ido->q_indel = atoi(optarg); break;
+               case 'G': d->ido->r_indel = atof(optarg); break;
+               case 'S': is_SAM = 1; break;
+               default: fprintf(stderr, "Unrecognized option '-%c'.\n", optopt); goto end; // getopt() returns '?' here; the offending letter is in optopt
+               }
+       }
+       if (fn_list) is_SAM = 1;
+       if (optind == argc) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:  samtools pileup [options] <in.bam>|<in.sam>\n\n");
+               fprintf(stderr, "Option: -s        simple (yet incomplete) pileup format\n");
+               fprintf(stderr, "        -S        the input is in SAM\n");
+               fprintf(stderr, "        -2        output the 2nd best call and quality\n");
+               fprintf(stderr, "        -i        only show lines/consensus with indels\n");
+               fprintf(stderr, "        -m INT    filtering reads with bits in INT [%d]\n", d->mask);
+               fprintf(stderr, "        -M INT    cap mapping quality at INT [%d]\n", d->c->cap_mapQ);
+               fprintf(stderr, "        -t FILE   list of reference sequences (assume the input is in SAM)\n");
+               fprintf(stderr, "        -l FILE   list of sites at which pileup is output\n");
+               fprintf(stderr, "        -f FILE   reference sequence in the FASTA format\n\n");
+               fprintf(stderr, "        -c        output the maq consensus sequence\n");
+               fprintf(stderr, "        -v        print variants only (for -c)\n");
+               fprintf(stderr, "        -g        output in the GLFv3 format (suppressing -c/-i/-s)\n");
+               fprintf(stderr, "        -T FLOAT  theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
+               fprintf(stderr, "        -N INT    number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap);
+               fprintf(stderr, "        -r FLOAT  prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate);
+               fprintf(stderr, "        -G FLOAT  prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel);
+               fprintf(stderr, "        -I INT    phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel);
+               fprintf(stderr, "\n");
+               goto end;
+       }
+       if (fn_fa) d->fai = fai_load(fn_fa);
+       if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling
+       if (d->format & BAM_PLF_GLF) { // for glf output
+               glf3_header_t *h;
+               h = glf3_header_init();
+               d->fp_glf = bgzf_fdopen(fileno(stdout), "w");
+               glf3_header_write(d->fp_glf, h);
+               glf3_header_destroy(h);
+       }
+       if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)))
+               fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n");
+       {
+               samfile_t *fp;
+               fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0);
+               if (fp == 0 || fp->header == 0) { // NOTE(review): a non-null fp without header is not closed here; confirm samclose() accepts it
+                       fprintf(stderr, "[bam_pileup] fail to read the header: non-existing file or wrong format.\n");
+                       goto end;
+               }
+               d->h = fp->header;
+               if (fn_pos) d->hash = load_pos(fn_pos, d->h);
+               sampileup(fp, d->mask, pileup_func, d);
+               samclose(fp); // d->h will be destroyed here
+       }
+       ret = 0;
+end: // single cleanup path; d was calloc()'ed, so members that were never set are still 0 here
+       if (d->fp_glf) bgzf_close(d->fp_glf);
+       if (d->hash) { // free the hash table
+               khint_t k;
+               for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k)
+                       if (kh_exist(d->hash, k)) free(kh_val(d->hash, k));
+               kh_destroy(64, d->hash);
+       }
+       free(fn_pos); free(fn_list); free(fn_fa);
+       if (d->fai) fai_destroy(d->fai);
+       bam_maqcns_destroy(d->c);
+       free(d->ido); free(d->ref); free(d);
+       return ret;
+}
diff --git a/bam_rmdup.c b/bam_rmdup.c
new file mode 100644 (file)
index 0000000..1fa6cad
--- /dev/null
@@ -0,0 +1,144 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+#include "bam.h"
+
+typedef bam1_t *bam1_p;
+#include "khash.h"
+KHASH_SET_INIT_STR(name)
+KHASH_MAP_INIT_INT64(pos, bam1_p)
+
+#define BUFFER_SIZE 0x40000 // sizing unit: both hash tables are pre-sized to multiples of it, and best_hash is cleared once it holds more entries than this
+
+typedef struct { // growable array of alignments waiting to be written by dump_best()
+       int n, max; // n: entries in use; max: allocated slots in a
+       bam1_t **a; // the entries; grown by stack_insert()
+} tmp_stack_t;
+
+static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
+{ // append b to the stack; the array starts at 0x10000 slots and doubles whenever it is full
+       if (stack->n == stack->max) {
+               stack->max = (stack->max == 0)? 0x10000 : stack->max * 2;
+               stack->a = (bam1_t**)realloc(stack->a, stack->max * sizeof(bam1_t*));
+       }
+       stack->a[stack->n++] = b;
+}
+
+static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
+{ // write every stacked alignment to out, release it, then leave the stack empty
+       int idx;
+       for (idx = 0; idx != stack->n; ++idx) {
+               bam1_t *rec = stack->a[idx];
+               bam_write1(out, rec); bam_destroy1(rec);
+       }
+       stack->n = 0;
+       if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash); // keep the position table from growing without bound
+}
+
+static void clear_del_set(khash_t(name) *del_set)
+{ // the keys are strdup()'ed read names: free each one, then reset the set to empty
+       khint_t slot;
+       for (slot = kh_begin(del_set); slot < kh_end(del_set); ++slot) {
+               if (kh_exist(del_set, slot)) free((char*)kh_key(del_set, slot));
+       }
+       kh_clear(name, del_set);
+}
+
+void bam_rmdup_core(bamFile in, bamFile out)
+{ // copy `in` to `out`, keeping for each (position, insert size) only the pair whose head has the highest mapping quality
+       bam_header_t *header;
+       bam1_t *b;
+       int last_tid = -1, last_pos = -1;
+       uint64_t n_checked = 0, n_removed = 0;
+       tmp_stack_t stack; // heads kept for the current position, written by dump_best()
+       khint_t k;
+       khash_t(pos) *best_hash; // pos<<32|isize -> best head seen so far
+       khash_t(name) *del_set; // names of removed heads, so that their tails are dropped as well
+
+       best_hash = kh_init(pos);
+       del_set = kh_init(name);
+       b = bam_init1();
+       memset(&stack, 0, sizeof(tmp_stack_t));
+       header = bam_header_read(in);
+       bam_header_write(out, header);
+
+       kh_resize(name, del_set, 4 * BUFFER_SIZE);
+       kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
+       while (bam_read1(in, b) >= 0) {
+               bam1_core_t *c = &b->core;
+               if (c->tid != last_tid || last_pos != c->pos) { // moved to a new position: the stacked heads are final
+                       dump_best(&stack, best_hash, out); // write the result
+                       if (c->tid != last_tid) {
+                               kh_clear(pos, best_hash);
+                               if (kh_size(del_set)) { // check
+                                       fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (unsigned long long)kh_size(del_set));
+                                       clear_del_set(del_set);
+                               }
+                               if ((int)c->tid == -1) { // append unmapped reads
+                                       bam_write1(out, b);
+                                       while (bam_read1(in, b) >= 0) bam_write1(out, b);
+                                       break;
+                               }
+                               last_tid = c->tid;
+                               fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
+                       }
+               }
+               if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
+                       bam_write1(out, b); // not a pair mapped to one reference: passed through untouched
+               } else if (c->isize > 0) { // paired, head
+                       uint64_t key = (uint64_t)c->pos<<32 | c->isize;
+                       int ret;
+                       ++n_checked;
+                       k = kh_put(pos, best_hash, key, &ret);
+                       if (ret == 0) { // found in best_hash
+                               bam1_t *p = kh_val(best_hash, k);
+                               ++n_removed;
+                               if (p->core.qual < c->qual) { // the current alignment is better
+                                       kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
+                                       bam_copy1(p, b); // replaced as b
+                               } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
+                               if (ret == 0) // NOTE(review): the strdup()'ed name presumably was not stored in this case and leaks; confirm against khash.h
+                                       fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+                       } else { // not found in best_hash
+                               kh_val(best_hash, k) = bam_dup1(b);
+                               stack_insert(&stack, kh_val(best_hash, k));
+                       }
+               } else { // paired, tail
+                       k = kh_get(name, del_set, bam1_qname(b));
+                       if (k != kh_end(del_set)) { // its head was removed: drop the tail too
+                               free((char*)kh_key(del_set, k));
+                               kh_del(name, del_set, k);
+                       } else bam_write1(out, b);
+               }
+               last_pos = c->pos;
+       }
+       dump_best(&stack, best_hash, out);
+
+       bam_header_destroy(header);
+       clear_del_set(del_set);
+       kh_destroy(name, del_set);
+       kh_destroy(pos, best_hash);
+       free(stack.a);
+       bam_destroy1(b);
+       fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
+                       n_checked? (double)n_removed/n_checked : 0.0); // no pair checked: report 0 instead of dividing by zero
+}
+int bam_rmdup(int argc, char *argv[])
+{ // samtools rmdup <input.srt.bam> <output.bam>; "-" selects stdin/stdout; returns 0 on success, 1 on error
+       bamFile in, out = 0;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
+               return 1;
+       }
+       in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+       if (in) out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); // do not create the output for an unreadable input
+       if (in == 0 || out == 0) {
+               fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+               if (in) bam_close(in); // only the output failed: do not leak the input handle
+               return 1;
+       }
+       bam_rmdup_core(in, out);
+       bam_close(in); bam_close(out);
+       return 0;
+}
diff --git a/bam_rmdupse.c b/bam_rmdupse.c
new file mode 100644 (file)
index 0000000..df03717
--- /dev/null
@@ -0,0 +1,177 @@
+#include <math.h>
+#include "sam.h"
+#include "khash.h"
+
+typedef struct { // growable list of buffer indices that share one hash key in rmdupse_buf()
+       int n, m; // n: indices stored; m: allocated capacity of a
+       int *a; // indices into buffer_t.buf
+} listelem_t;
+
+KHASH_MAP_INIT_INT(32, listelem_t)
+
+#define BLOCK_SIZE 65536 // number of records fill_buf() appends before it considers stopping
+
+typedef struct { // one buffered alignment
+       bam1_t *b; // the alignment; set to 0 once it has been written or removed as a duplicate
+       int rpos, score; // rpos: position key of a mapped reverse-strand read, -1 otherwise; score: quality-based rank, -1 when unmapped or already processed
+} elem_t;
+
+typedef struct { // the working buffer shared by fill_buf(), rmdupse_buf() and dump_buf()
+       int n, max, x; // n: elements in use; max: allocated elements; x: upper bound of the elements rmdupse_buf() processes, or -1 for all of them
+       elem_t *buf;
+} buffer_t;
+
+static int fill_buf(samfile_t *in, buffer_t *buf) // drops the leading empty cells, appends newly read records and returns the resulting buf->n
+{
+       int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
+       bam1_t *b = bam_init1();
+       bam1_core_t *c = &b->core;
+       // squeeze out the empty cells at the beginning
+       for (i = 0; i < buf->n; ++i)
+               if (buf->buf[i].b) break;
+       if (i < buf->n) { // squeeze
+               if (i > 0) {
+                       memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
+                       buf->n = buf->n - i;
+               }
+       } else buf->n = 0; // every cell was empty
+       // calculate min_rpos
+       for (i = 0; i < buf->n; ++i) {
+               elem_t *e = buf->buf + i;
+               if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
+                       min_rpos = buf->buf[i].rpos;
+       }
+       // fill the buffer
+       buf->x = -1; // no boundary recorded yet
+       last_tid = buf->n? buf->buf[0].b->core.tid : -1; // reference of the first retained record, -1 when the buffer is empty
+       capacity = buf->n + BLOCK_SIZE;
+       while ((ret = samread(in, b)) >= 0) {
+               elem_t *e;
+               uint8_t *qual = bam1_qual(b);
+               int is_mapped;
+               if (last_tid < 0) last_tid = c->tid;
+               if (c->tid != last_tid) {
+                       if (buf->x < 0) buf->x = buf->n; // index of the first record that lies on a different reference
+               }
+               if (buf->n >= buf->max) { // enlarge
+                       buf->max = buf->max? buf->max<<1 : 8;
+                       buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
+               }
+               e = &buf->buf[buf->n++];
+               e->b = bam_dup1(b);
+               e->rpos = -1; e->score = 0;
+               for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1; // sum of (base quality + 1) over the read
+               e->score = (double)e->score / sqrt(c->l_qseq + 1); // normalised by sqrt(length + 1) and truncated to int
+               is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
+               if (!is_mapped) e->score = -1; // rmdupse_buf() skips negative scores, so unmapped reads are never deduplicated
+               if (is_mapped && (c->flag & BAM_FREVERSE)) { // only mapped reverse-strand reads get an rpos
+                       e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
+                       if (min_rpos > e->rpos) min_rpos = e->rpos;
+               }
+               if (buf->n >= capacity) { // quota reached: extend it only while the record still starts at or before the smallest rpos
+                       if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
+                       else break;
+               }
+       }
+       if (ret >= 0 && buf->x < 0) buf->x = buf->n; // stopped before the end of the input: the boundary is the end of the buffer
+       bam_destroy1(b);
+       return buf->n;
+}
+
+static void rmdupse_buf(buffer_t *buf)
+{ // group the first `upper` elements by (strand, position), keep the best-scoring read of each group and destroy the others
+       khash_t(32) *h;
+       uint32_t key;
+       khint_t k;
+       int mpos, i, upper;
+       listelem_t *p;
+       // mpos bounds the positions that are handled in this round; without a boundary at the buffer end everything qualifies
+       mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
+       upper = (buf->x < 0)? buf->n : buf->x;
+       // fill the hash table
+       h = kh_init(32);
+       for (i = 0; i < upper; ++i) {
+               elem_t *e = buf->buf + i;
+               int ret;
+               if (e->score < 0) continue; // unmapped or already processed
+               if (e->rpos >= 0) { // reverse strand: keyed by rpos, low bit set
+                       if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
+                       else continue;
+               } else { // forward strand: keyed by the start position, low bit clear
+                       if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
+                       else continue;
+               }
+               k = kh_put(32, h, key, &ret);
+               p = &kh_val(h, k);
+               if (ret == 0) { // present in the hash table
+                       if (p->n == p->m) {
+                               p->m <<= 1;
+                               p->a = (int*)realloc(p->a, p->m * sizeof(int));
+                       }
+                       p->a[p->n++] = i;
+               } else {
+                       p->m = p->n = 1;
+                       p->a = (int*)calloc(p->m, sizeof(int));
+                       p->a[0] = i;
+               }
+       }
+       // rmdup
+       for (k = kh_begin(h); k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       int max, maxi;
+                       p = &kh_val(h, k);
+                       // get the max; start below the smallest stored score (0) so that a group of zero-score reads still keeps one
+                       for (i = 0, max = -1, maxi = -1; i < p->n; ++i) {
+                               if (buf->buf[p->a[i]].score > max) {
+                                       max = buf->buf[p->a[i]].score;
+                                       maxi = i;
+                               }
+                       }
+                       // mark the elements
+                       for (i = 0; i < p->n; ++i) {
+                               buf->buf[p->a[i]].score = -1;
+                               if (i != maxi) {
+                                       bam_destroy1(buf->buf[p->a[i]].b);
+                                       buf->buf[p->a[i]].b = 0;
+                               }
+                       }
+                       // free
+                       free(p->a);
+               }
+       }
+       kh_destroy(32, h);
+}
+
+static void dump_buf(buffer_t *buf, samfile_t *out)
+{ // write and release the leading run of processed elements
+       int idx;
+       // an element counts as processed when its score is -1;
+       // the first element with any other score ends the run
+       for (idx = 0; idx < buf->n && buf->buf[idx].score == -1; ++idx) {
+               bam1_t *rec = buf->buf[idx].b;
+               if (rec == 0) continue; // removed as a duplicate, nothing to write
+               samwrite(out, rec);
+               bam_destroy1(rec);
+               buf->buf[idx].b = 0;
+       }
+}
+
+int bam_rmdupse(int argc, char *argv[])
+{ // samtools rmdupse <in.bam> <out.bam>; returns 0 on success, 1 on a usage or open error
+       samfile_t *in, *out; buffer_t *buf;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
+               return 1;
+       }
+       in = samopen(argv[1], "rb", 0);
+       out = (in && in->header)? samopen(argv[2], "wb", in->header) : 0; // never dereference a failed open
+       if (out == 0) {
+               fprintf(stderr, "[bam_rmdupse] fail to open the input or the output file\n");
+               if (in && in->header) samclose(in);
+               return 1;
+       }
+       buf = calloc(1, sizeof(buffer_t));
+       while (fill_buf(in, buf)) { rmdupse_buf(buf); dump_buf(buf, out); }
+       samclose(in); samclose(out); free(buf->buf); free(buf);
+       return 0;
+}
diff --git a/bam_sort.c b/bam_sort.c
new file mode 100644 (file)
index 0000000..402792a
--- /dev/null
@@ -0,0 +1,257 @@
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "bam.h"
+#include "ksort.h"
+
+static int g_is_by_qname = 0;
+
+static inline int strnum_cmp(const char *a, const char *b)
+{ // natural-order comparison: runs of digits compare as numbers, anything else byte by byte; returns -1, 0 or 1
+       char *pa = (char*)a, *pb = (char*)b; // non-const only because strtol() takes a char** end pointer
+       while (*pa && *pb) {
+               // <ctype.h> functions are undefined for negative arguments, so plain char must go through unsigned char
+               if (isdigit((unsigned char)*pa) && isdigit((unsigned char)*pb)) {
+                       long ai, bi;
+                       ai = strtol(pa, &pa, 10);
+                       bi = strtol(pb, &pb, 10);
+                       if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0;
+               } else {
+                       if (*pa != *pb) break;
+                       ++pa; ++pb;
+               }
+       }
+       if (*pa == *pb) // equal so far: the string that needed fewer characters (e.g. fewer leading zeros) sorts first
+               return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0;
+       return *pa<*pb? -1 : *pa>*pb? 1 : 0;
+}
+
+#define HEAP_EMPTY 0xffffffffffffffffull // sort key marking a heap entry whose input file has no more records
+
+typedef struct { // one heap entry per input file of bam_merge_core()
+       int i; // index of the input file this entry reads from
+       uint64_t pos; // sort key: tid<<32 | pos<<1 | strand, or HEAP_EMPTY
+       bam1_t *b; // the record most recently read from that file
+} heap1_t;
+
+static inline int heap_lt(const heap1_t a, const heap1_t b)
+{
+       // deliberately inverted ("a sorts after b"), presumably so that the ksort heap keeps the smallest entry at its root
+       int t = g_is_by_qname? strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)) : 0;
+       if (t != 0) return t > 0;
+       return a.pos > b.pos;
+}
+
+KSORT_INIT(heap, heap1_t, heap_lt)
+
+/*!
+  @abstract    Merge multiple sorted BAM.
+  @param  by_qname whether to sort by query name
+  @param  out  output BAM file name ("-" for the standard output)
+  @param  n    number of files to be merged
+  @param  fn   names of files to be merged
+
+  @discussion Padding information may NOT be correctly maintained. This
+  function is NOT thread safe. It calls exit(1) when a file cannot be opened.
+ */
+void bam_merge_core(int by_qname, const char *out, int n, char * const *fn)
+{
+       bamFile fpout, *fp;
+       heap1_t *heap;
+       bam_header_t *hout = 0;
+       int i, j;
+
+       g_is_by_qname = by_qname;
+       fp = (bamFile*)calloc(n, sizeof(bamFile));
+       heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+       for (i = 0; i != n; ++i) {
+               heap1_t *h;
+               bam_header_t *hin;
+               if ((fp[i] = bam_open(fn[i], "r")) == 0) { fprintf(stderr, "[bam_merge_core] fail to open file '%s'. Abort!\n", fn[i]); exit(1); } // not inside assert(): NDEBUG would drop the open itself
+               hin = bam_header_read(fp[i]);
+               if (i == 0) hout = hin; // the header of the first file becomes the output header
+               else { // validate multiple baf
+                       if (hout->n_targets != hin->n_targets) {
+                               fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]);
+                               exit(1);
+                       }
+                       for (j = 0; j < hout->n_targets; ++j) {
+                               if (strcmp(hout->target_name[j], hin->target_name[j])) {
+                                       fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n",
+                                                       hout->target_name[j], hin->target_name[j], fn[i]);
+                                       exit(1);
+                               }
+                               if (hout->target_len[j] != hin->target_len[j])
+                                       fprintf(stderr, "[bam_merge_core] different target sequence length: %d != %d in file '%s'. Continue.\n",
+                                                       hout->target_len[j], hin->target_len[j], fn[i]);
+                       }
+                       bam_header_destroy(hin);
+               }
+               h = heap + i;
+               h->i = i;
+               h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               if (bam_read1(fp[i], h->b) >= 0)
+                       h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b);
+               else h->pos = HEAP_EMPTY;
+       }
+       fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
+       if (fpout == 0) { fprintf(stderr, "[bam_merge_core] fail to create file '%s'. Abort!\n", out); exit(1); }
+       bam_header_write(fpout, hout);
+       bam_header_destroy(hout);
+
+       ks_heapmake(heap, n, heap);
+       while (heap->pos != HEAP_EMPTY) { // the root holds the next record to write
+               bam1_t *b = heap->b;
+               bam_write1_core(fpout, &b->core, b->data_len, b->data);
+               if ((j = bam_read1(fp[heap->i], b)) >= 0)
+                       heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b);
+               else if (j == -1) heap->pos = HEAP_EMPTY;
+               else { fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); heap->pos = HEAP_EMPTY; } // retire the file too, or its stale record would be written again and again
+               ks_heapadjust(heap, 0, n, heap);
+       }
+
+       for (i = 0; i != n; ++i) {
+               bam_close(fp[i]);
+               free(heap[i].b->data);
+               free(heap[i].b);
+       }
+       bam_close(fpout);
+       free(fp); free(heap);
+}
+int bam_merge(int argc, char *argv[])
+{ // command-line front end of bam_merge_core(); returns 0 after merging, 1 after printing the usage
+       int c, n_in, is_by_qname = 0;
+       // "-n" is the only option that has an effect
+       while ((c = getopt(argc, argv, "n")) >= 0)
+               if (c == 'n') is_by_qname = 1;
+       // the first remaining argument is the output, all the others are inputs
+       n_in = argc - optind - 1;
+       if (n_in < 2) {
+               fprintf(stderr, "Usage: samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]\n");
+               return 1;
+       }
+       bam_merge_core(is_by_qname, argv[optind], n_in, argv + optind + 1);
+       return 0;
+}
+
+typedef bam1_t *bam1_p;
+
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{ // ordering used by the in-memory sort: query name first when g_is_by_qname is set, then tid<<32|pos
+       uint64_t ka = (uint64_t)a->core.tid<<32|a->core.pos, kb = (uint64_t)b->core.tid<<32|b->core.pos;
+       int t = g_is_by_qname? strnum_cmp(bam1_qname(a), bam1_qname(b)) : 0;
+       if (t != 0) return t < 0;
+       return ka < kb;
+}
+KSORT_INIT(sort, bam1_p, bam1_lt)
+
+static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h)
+{ // sort the first k records of buf and write them, after header h, to prefix.NNNN.bam (n >= 0) or prefix.bam (n < 0)
+       char *name;
+       int i;
+       bamFile fp;
+       ks_mergesort(sort, k, buf, 0);
+       name = (char*)calloc(strlen(prefix) + 20, 1); // 20 spare bytes cover ".", the number, ".bam" and the terminator
+       if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n);
+       else sprintf(name, "%s.bam", prefix);
+       if ((fp = bam_open(name, "w")) == 0) { fprintf(stderr, "[sort_blocks] fail to create file '%s'. Abort!\n", name); exit(1); } // not inside assert(): NDEBUG would drop the open itself
+       free(name);
+       bam_header_write(fp, h);
+       for (i = 0; i < k; ++i)
+               bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data);
+       bam_close(fp);
+}
+
+/*!
+  @abstract Sort an unsorted BAM file based on the chromosome order
+  and the leftmost position of an alignment
+
+  @param  is_by_qname whether to sort by query name
+  @param  fn       name of the file to be sorted ("-" for the standard input)
+  @param  prefix   prefix of the output and the temporary files; upon
+                          success, prefix.bam will be written.
+  @param  max_mem  approximate maximum memory (very inaccurate)
+
+  @discussion It may create multiple temporary subalignment files
+  and then merge them by calling bam_merge_core(). This function is
+  NOT thread safe. It calls exit(1) when the input cannot be opened.
+ */
+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
+{
+       int n, ret, k, i;
+       size_t mem, n_slots;
+       bam_header_t *header;
+       bamFile fp;
+       bam1_t *b, **buf;
+
+       g_is_by_qname = is_by_qname;
+       n = k = 0; mem = 0;
+       fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+       if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file '%s'. Abort!\n", fn); exit(1); }
+       header = bam_header_read(fp);
+       n_slots = max_mem / BAM_CORE_SIZE + 1; buf = (bam1_t**)calloc(n_slots, sizeof(bam1_t*)); // +1: buf[k] is touched before the memory check, even when max_mem < BAM_CORE_SIZE; assumes every record adds at least BAM_CORE_SIZE to mem (TODO confirm)
+       // write sub files
+       for (;;) {
+               if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); // records are allocated lazily and reused across blocks
+               b = buf[k];
+               if ((ret = bam_read1(fp, b)) < 0) break;
+               mem += ret;
+               ++k;
+               if (mem >= max_mem) { // the block is full: sort it and spill it to a temporary file
+                       sort_blocks(n++, k, buf, prefix, header);
+                       mem = 0; k = 0;
+               }
+       }
+       if (ret != -1)
+               fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+       if (n == 0) sort_blocks(-1, k, buf, prefix, header); // everything fitted in memory: write prefix.bam directly
+       else { // then merge
+               char **fns, *fnout;
+               fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1);
+               sort_blocks(n++, k, buf, prefix, header);
+               fnout = (char*)calloc(strlen(prefix) + 20, 1);
+               sprintf(fnout, "%s.bam", prefix);
+               fns = (char**)calloc(n, sizeof(char*));
+               for (i = 0; i < n; ++i) {
+                       fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+                       sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+               }
+               bam_merge_core(is_by_qname, fnout, n, fns);
+               free(fnout);
+               for (i = 0; i < n; ++i) {
+                       unlink(fns[i]); // the temporary files are removed once merged
+                       free(fns[i]);
+               }
+               free(fns);
+       }
+       for (k = 0; (size_t)k < n_slots; ++k) {
+               if (buf[k]) {
+                       free(buf[k]->data);
+                       free(buf[k]);
+               }
+       }
+       free(buf);
+       bam_header_destroy(header);
+       bam_close(fp);
+}
+
+int bam_sort(int argc, char *argv[])
+{ // command-line front end of bam_sort_core(); returns 0 after sorting, 1 after printing the usage
+       size_t max_mem = 500000000;
+       int c, is_by_qname = 0;
+       while ((c = getopt(argc, argv, "nm:")) >= 0) {
+               switch (c) {
+               case 'n': is_by_qname = 1; break;
+               case 'm': if (atol(optarg) > 0) max_mem = atol(optarg); else fprintf(stderr, "[bam_sort] ignore invalid -m '%s'\n", optarg); break; // a value <= 0 would turn into 0 or a huge size_t
+               }
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "Usage: samtools sort [-n] [-m <maxMem>] <in.bam> <out.prefix>\n");
+               return 1;
+       }
+       bam_sort_core(is_by_qname, argv[optind], argv[optind+1], max_mem);
+       return 0;
+}
diff --git a/bam_stat.c b/bam_stat.c
new file mode 100644 (file)
index 0000000..c1c4a43
--- /dev/null
@@ -0,0 +1,78 @@
+#include <unistd.h>
+#include <assert.h>
+#include "bam.h"
+
+typedef struct { // counters filled in by flagstat_loop(), one increment per matching record
+       long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; // all records; BAM_FUNMAP clear; BAM_FPAIRED set; paired with BAM_FUNMAP and BAM_FMUNMAP both clear; paired with BAM_FPROPER_PAIR set
+       long long n_sgltn, n_read1, n_read2; // paired records with BAM_FMUNMAP, BAM_FREAD1, BAM_FREAD2 set respectively
+       long long n_qcfail, n_dup; // BAM_FQCFAIL set; BAM_FDUP set
+       long long n_diffchr, n_diffhigh; // counted in n_pair_map and mtid != tid; the same with qual >= 5
+} bam_flagstat_t;
+
+#define flagstat_loop(s, c) do { /* fold the flags of one record core (c) into the counters (s) */ \
+               ++(s)->n_reads; \
+               if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \
+               if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \
+               if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \
+               if ((c)->flag & BAM_FPAIRED) { \
+                       ++(s)->n_pair_all; \
+                       if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \
+                       if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \
+                       if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \
+                       if ((c)->flag & BAM_FMUNMAP) ++(s)->n_sgltn; \
+                       if (((c)->flag & (BAM_FUNMAP|BAM_FMUNMAP)) == 0) { /* the read and its mate are both mapped */ \
+                               ++(s)->n_pair_map; \
+                               if ((c)->tid != (c)->mtid) { \
+                                       ++(s)->n_diffchr; \
+                                       if ((c)->qual >= 5) ++(s)->n_diffhigh; \
+                               } \
+                       } \
+               } \
+       } while (0)
+
+bam_flagstat_t *bam_flagstat_core(bamFile fp)
+{ // tally the flag statistics of every record read from fp; the result is calloc()'ed and handed to the caller
+       bam_flagstat_t *stats = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+       bam1_t *rec = bam_init1();
+       int ret;
+       for (;;) {
+               ret = bam_read1(fp, rec);
+               if (ret < 0) break;
+               flagstat_loop(stats, &rec->core);
+       }
+       bam_destroy1(rec);
+       // any negative value other than -1 is reported as a possibly truncated file
+       if (ret != -1)
+               fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+       return stats;
+}
+int bam_flagstat(int argc, char *argv[])
+{ // samtools flagstat <in.bam>: print the flag statistics of a BAM file ("-" for the standard input); returns 0 on success, 1 on error
+       bamFile fp;
+       bam_header_t *header;
+       bam_flagstat_t *s;
+       if (argc == optind) {
+               fprintf(stderr, "Usage: samtools flagstat <in.bam>\n");
+               return 1;
+       }
+       fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       if (fp == 0) { fprintf(stderr, "[bam_flagstat] fail to open file '%s'\n", argv[optind]); return 1; } // an explicit check survives NDEBUG, unlike assert()
+       header = bam_header_read(fp);
+       s = bam_flagstat_core(fp);
+       printf("%lld in total\n", s->n_reads);
+       printf("%lld QC failure\n", s->n_qcfail);
+       printf("%lld duplicates\n", s->n_dup);
+       printf("%lld mapped (%.2f%%)\n", s->n_mapped, s->n_reads? (float)s->n_mapped / s->n_reads * 100.0 : 0.0); // empty input: 0.00% instead of a division by zero
+       printf("%lld paired in sequencing\n", s->n_pair_all);
+       printf("%lld read1\n", s->n_read1);
+       printf("%lld read2\n", s->n_read2);
+       printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, s->n_pair_all? (float)s->n_pair_good / s->n_pair_all * 100.0 : 0.0);
+       printf("%lld with itself and mate mapped\n", s->n_pair_map);
+       printf("%lld singletons (%.2f%%)\n", s->n_sgltn, s->n_pair_all? (float)s->n_sgltn / s->n_pair_all * 100.0 : 0.0);
+       printf("%lld with mate mapped to a different chr\n", s->n_diffchr);
+       printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh);
+       free(s);
+       bam_header_destroy(header);
+       bam_close(fp);
+       return 0;
+}
diff --git a/bam_tview.c b/bam_tview.c
new file mode 100644 (file)
index 0000000..be2579c
--- /dev/null
@@ -0,0 +1,379 @@
+#ifndef _NO_CURSES
+#include <curses.h>
+#ifdef NCURSES_VERSION
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include "bam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+
+char bam_aux_getCEi(bam1_t *b, int i);
+char bam_aux_getCSi(bam1_t *b, int i);
+char bam_aux_getCQi(bam1_t *b, int i);
+
+#define TV_MIN_ALNROW 2
+#define TV_MAX_GOTO  40
+#define TV_LOW_MAPQ  10
+
+#define TV_COLOR_MAPQ   0
+#define TV_COLOR_BASEQ  1
+#define TV_COLOR_NUCL   2
+#define TV_COLOR_COL    3
+#define TV_COLOR_COLQ   4
+
+#define TV_BASE_NUCL 0
+#define TV_BASE_COLOR_SPACE 1
+
+/* State shared by all routines of the text alignment viewer. */
+typedef struct {
+       int mrow;               /* screen height, filled in by getmaxyx() */
+       int mcol;               /* screen width, filled in by getmaxyx() */
+       WINDOW *wgoto;          /* pop-up window used by the "Goto:" prompt */
+       WINDOW *whelp;          /* pop-up window used by the help screen */
+
+       bam_index_t *idx;       /* index loaded with bam_index_load() */
+       bam_lplbuf_t *lplbuf;   /* pileup buffer that calls tv_pl_func() */
+       bam_header_t *header;   /* header read from fp */
+       bamFile fp;             /* the open alignment file */
+       int curr_tid;           /* target id currently displayed */
+       int left_pos;           /* reference position drawn in the leftmost column */
+       faidx_t *fai;           /* reference index; null when no FASTA was given */
+       bam_maqcns_t *bmc;      /* consensus caller state */
+
+       int ccol;               /* next screen column to be drawn */
+       int last_pos;           /* last reference position already drawn */
+       int row_shift;          /* vertical scroll offset applied to read rows */
+       int base_for;           /* one of TV_BASE_* */
+       int color_for;          /* one of TV_COLOR_* */
+       int is_dot;             /* non-zero: show matching bases as '.' or ',' */
+       int l_ref;              /* length of ref as reported by fai_fetch() */
+       int ins;                /* non-zero: draw insertion columns */
+       char *ref;              /* reference sequence for the visible window */
+} tview_t;
+
+/*
+ * Pileup callback registered with bam_lplbuf_init(): draws one reference
+ * position, plus any insertion columns that follow it, onto the screen.
+ *
+ *   tid   target id of the position (not used here)
+ *   pos   reference coordinate being piled up (labels are printed as pos+1)
+ *   n     number of entries in pl
+ *   pl    per-read pileup records covering pos
+ *   data  the tview_t handed to bam_lplbuf_init()
+ *
+ * Row 0 carries coordinate labels, row 1 the reference, row 2 the consensus
+ * call; reads are drawn at TV_MIN_ALNROW + level - row_shift.  tv->ccol and
+ * tv->last_pos are advanced as columns are emitted.  Always returns 0.
+ */
+int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+       tview_t *tv = (tview_t*)data;
+       int i, j, c, rb, attr, max_ins = 0;
+       uint32_t call = 0;
+       if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
+       // print reference
+       rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
+       // fill the uncovered positions between the previous column and this one
+       for (i = tv->last_pos + 1; i < pos; ++i) {
+               if (i%10 == 0) mvprintw(0, tv->ccol, "%-d", i+1);
+               // same bound as for rb: the fetched reference may be shorter than the gap
+               c = (tv->ref && i - tv->left_pos < tv->l_ref)? tv->ref[i - tv->left_pos] : 'N';
+               mvaddch(1, tv->ccol++, c);
+       }
+       if (pos%10 == 0) mvprintw(0, tv->ccol, "%-d", pos+1);
+       // print consensus
+       call = bam_maqcns_call(n, pl, tv->bmc);
+       attr = A_UNDERLINE;
+       c = ",ACMGRSVTWYHKDBN"[call>>28&0xf];
+       i = (call>>8&0xff)/10+1;
+       if (i > 4) i = 4;
+       attr |= COLOR_PAIR(i);
+       if (c == toupper(rb)) c = '.';
+       attron(attr);
+       mvaddch(2, tv->ccol, c);
+       attroff(attr);
+       if(tv->ins) {
+               // calculate maximum insert
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
+               }
+       }
+       // core loop: column j == 0 is the aligned base, j > 0 are insertion columns
+       for (j = 0; j <= max_ins; ++j) {
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       int row = TV_MIN_ALNROW + p->level - tv->row_shift;
+                       if (j == 0) {
+                               if (!p->is_del) {
+                                       if (tv->base_for == TV_BASE_COLOR_SPACE &&
+                                                       (c = bam_aux_getCSi(p->b, p->qpos))) {
+                                               // c already holds the color fetched by the test above
+                                               // assume that if we found one color, we will be able to get the color error
+                                               if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.';
+                                       }
+                                       else {
+                                               c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+                                               if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+                                       }
+                               } else c = '*';
+                       } else { // padding
+                               if (j > p->indel) c = '*';
+                               else { // insertion
+                                       if (tv->base_for ==  TV_BASE_NUCL) {
+                                               c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+                                       }
+                                       else {
+                                               c = bam_aux_getCSi(p->b, p->qpos + j);
+                                               if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.';
+                                       }
+                               }
+                       }
+                       if (row > TV_MIN_ALNROW && row < tv->mrow) {
+                               int x;
+                               attr = 0;
+                               if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
+                                               || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE;
+                               if (tv->color_for == TV_COLOR_BASEQ) {
+                                       x = bam1_qual(p->b)[p->qpos]/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               } else if (tv->color_for == TV_COLOR_MAPQ) {
+                                       x = p->b->core.qual/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               } else if (tv->color_for == TV_COLOR_NUCL) {
+                                       x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5;
+                                       attr |= COLOR_PAIR(x);
+                               } else if(tv->color_for == TV_COLOR_COL) {
+                                       x = 0;
+                                       switch(bam_aux_getCSi(p->b, p->qpos)) {
+                                               case '0': x = 0; break;
+                                               case '1': x = 1; break;
+                                               case '2': x = 2; break;
+                                               case '3': x = 3; break;
+                                               case '4': x = 4; break;
+                                               default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break;
+                                       }
+                                       x+=5;
+                                       attr |= COLOR_PAIR(x);
+                               } else if(tv->color_for == TV_COLOR_COLQ) {
+                                       x = bam_aux_getCQi(p->b, p->qpos);
+                                       if(0 == x) x = bam1_qual(p->b)[p->qpos];
+                                       x = x/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               }
+                               attron(attr);
+                               // reverse-strand reads are shown in lower case
+                               mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c));
+                               attroff(attr);
+                       }
+               }
+               // reference row: the base itself for j == 0, a highlighted '*' pad for insertion columns
+               c = j? '*' : rb;
+               if (c == '*') {
+                       attr = COLOR_PAIR(8);
+                       attron(attr);
+                       mvaddch(1, tv->ccol++, c);
+                       attroff(attr);
+               } else mvaddch(1, tv->ccol++, c);
+       }
+       tv->last_pos = pos;
+       return 0;
+}
+
+tview_t *tv_init(const char *fn, const char *fn_fa)
+{
+       tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t));
+       tv->is_dot = 1;
+       tv->idx = bam_index_load(fn);
+       if (tv->idx == 0) exit(1);
+       tv->fp = bam_open(fn, "r");
+       bgzf_set_cache_size(tv->fp, 8 * 1024 *1024);
+       assert(tv->fp);
+       tv->header = bam_header_read(tv->fp);
+       tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
+       if (fn_fa) tv->fai = fai_load(fn_fa);
+       tv->bmc = bam_maqcns_init();
+       tv->ins = 1;
+       bam_maqcns_prepare(tv->bmc);
+
+       initscr();
+       keypad(stdscr, TRUE);
+       clear();
+       noecho();
+       cbreak();
+#ifdef NCURSES_VERSION
+       getmaxyx(stdscr, tv->mrow, tv->mcol);
+#else
+       tv->mrow = 80; tv->mcol = 40;
+#endif
+       tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
+       tv->whelp = newwin(27, 40, 5, 5);
+       tv->color_for = TV_COLOR_MAPQ;
+       start_color();
+       init_pair(1, COLOR_BLUE, COLOR_BLACK);
+       init_pair(2, COLOR_GREEN, COLOR_BLACK);
+       init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+       init_pair(4, COLOR_WHITE, COLOR_BLACK);
+       init_pair(5, COLOR_GREEN, COLOR_BLACK);
+       init_pair(6, COLOR_CYAN, COLOR_BLACK);
+       init_pair(7, COLOR_YELLOW, COLOR_BLACK);
+       init_pair(8, COLOR_RED, COLOR_BLACK);
+       init_pair(9, COLOR_BLUE, COLOR_BLACK);
+       return tv;
+}
+
+/*
+ * Tear down the curses screen and release everything owned by tv,
+ * including tv itself.
+ */
+void tv_destroy(tview_t *tv)
+{
+       /* curses side */
+       delwin(tv->wgoto);
+       delwin(tv->whelp);
+       endwin();
+
+       /* alignment side */
+       bam_lplbuf_destroy(tv->lplbuf);
+       bam_maqcns_destroy(tv->bmc);
+       bam_index_destroy(tv->idx);
+       if (tv->fai != 0) {
+               fai_destroy(tv->fai);
+       }
+       free(tv->ref);
+       bam_header_destroy(tv->header);
+       bam_close(tv->fp);
+       free(tv);
+}
+
+/*
+ * bam_fetch() callback: forward each fetched alignment to the viewer's
+ * pileup buffer.  data is the tview_t passed to bam_fetch().  Always returns 0.
+ */
+int tv_fetch_func(const bam1_t *b, void *data)
+{
+       bam_lplbuf_push(b, ((tview_t*)data)->lplbuf);
+       return 0;
+}
+
+/*
+ * Redraw the screen so that reference position pos of target tid sits in the
+ * leftmost column.  When a FASTA index is loaded, the reference for the
+ * visible window is re-fetched first.  Always returns 0.
+ */
+int tv_draw_aln(tview_t *tv, int tid, int pos)
+{
+       // start from an empty screen and rewind the drawing cursor
+       clear();
+       tv->curr_tid = tid;
+       tv->left_pos = pos;
+       tv->last_pos = pos - 1;
+       tv->ccol = 0;
+       // refresh the reference covering the visible window
+       if (tv->fai != 0) {
+               const char *chr = tv->header->target_name[tid];
+               char *region;
+               free(tv->ref);
+               // 30 extra bytes hold ':', '-', both numbers and the terminator
+               region = (char*)calloc(strlen(chr) + 30, 1);
+               sprintf(region, "%s:%d-%d", chr, pos + 1, pos + tv->mcol);
+               tv->ref = fai_fetch(tv->fai, region, &tv->l_ref);
+               free(region);
+       }
+       // feed the overlapping alignments through the pileup buffer; the null push ends the batch
+       bam_lplbuf_reset(tv->lplbuf);
+       bam_fetch(tv->fp, tv->idx, tid, pos, pos + tv->mcol, tv, tv_fetch_func);
+       bam_lplbuf_push(0, tv->lplbuf);
+       return 0;
+}
+
+/*
+ * Run the "Goto:" prompt in tv->wgoto.
+ *
+ * Keys are collected into a buffer of at most TV_MAX_GOTO characters.
+ * Enter parses the buffer with bam_parse_region(); on success *tid and *pos
+ * are updated and the function returns, otherwise the prompt stays open.
+ * Backspace deletes one character, ctrl-W clears the buffer and Escape
+ * leaves *tid and *pos untouched.
+ */
+static void tv_win_goto(tview_t *tv, int *tid, int *pos)
+{
+       char str[256];
+       int i, l = 0;
+       str[0] = '\0'; // so that Enter as the very first key parses an empty string
+       wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
+       mvwprintw(tv->wgoto, 1, 2, "Goto: ");
+       for (;;) {
+               int c = wgetch(tv->wgoto);
+               wrefresh(tv->wgoto);
+               if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
+                       if (l > 0) --l; // never step in front of the buffer
+               } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
+                       int _tid = -1, _beg, _end;
+                       bam_parse_region(tv->header, str, &_tid, &_beg, &_end);
+                       if (_tid >= 0) {
+                               *tid = _tid; *pos = _beg;
+                               return;
+                       }
+               } else if (c >= 0 && c <= 0xff && isgraph(c)) { // <ctype.h> is only defined for unsigned char values and EOF
+                       if (l < TV_MAX_GOTO) str[l++] = c;
+               } else if (c == '\027') l = 0;
+               else if (c == '\033') return;
+               str[l] = '\0';
+               // blank the input field, then show the current buffer
+               for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
+               mvwprintw(tv->wgoto, 1, 8, "%s", str);
+       }
+}
+
+/*
+ * Show the key-binding help in tv->whelp and wait for one key press.
+ */
+static void tv_win_help(tview_t *tv) {
+       /* One entry per window row, starting at row 1; a null entry leaves the row blank. */
+       static const char *const text[] = {
+               "        -=-    Help    -=- ",
+               0,
+               "?          This window",
+               "Arrows     Small scroll movement",
+               "h,j,k,l    Small scroll movement",
+               "H,J,K,L    Large scroll movement",
+               "ctrl-H     Scroll 1k left",
+               "ctrl-L     Scroll 1k right",
+               "space      Scroll one screen",
+               "backspace  Scroll back one screen",
+               "g          Go to specific location",
+               "m          Color for mapping qual",
+               "n          Color for nucleotide",
+               "b          Color for base quality",
+               "c          Color for cs color",
+               "z          Color for cs qual",
+               ".          Toggle on/off dot view",
+               "N          Turn on nt view",
+               "C          Turn on cs view",
+               "i          Toggle on/off ins",
+               "q          Exit",
+               0,
+               "Underline:      Secondary or orphan",
+               "Blue:    0-9    Green: 10-19",
+               "Yellow: 20-29   White: >=30"
+       };
+       WINDOW *win = tv->whelp;
+       int k, n_rows = (int)(sizeof(text) / sizeof(text[0]));
+       wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
+       for (k = 0; k < n_rows; ++k) {
+               if (text[k] != 0) mvwprintw(win, k + 1, 2, "%s", text[k]);
+       }
+       wrefresh(win);
+       wgetch(win);
+}
+
+void tv_loop(tview_t *tv)
+{
+       int tid, pos;
+       tid = tv->curr_tid; pos = tv->left_pos;
+       while (1) {
+               int c = getch();
+               //if(256 < c) {c = 1 + (c%256);} // Terminal was displaying ctrl-H as 263 via ssh from Mac OS X 10.5 computer 
+               switch (c) {
+                       case '?': tv_win_help(tv); break;
+                       case '\033':
+                       case 'q': goto end_loop;
+                       case 'g': tv_win_goto(tv, &tid, &pos); break;
+                       case 'm': tv->color_for = TV_COLOR_MAPQ; break;
+                       case 'b': tv->color_for = TV_COLOR_BASEQ; break;
+                       case 'n': tv->color_for = TV_COLOR_NUCL; break;
+                       case 'c': tv->color_for = TV_COLOR_COL; break;
+                       case 'z': tv->color_for = TV_COLOR_COLQ; break;
+                       case KEY_LEFT:
+                       case 'h': --pos; break;
+                       case KEY_RIGHT:
+                       case 'l': ++pos; break;
+                       case KEY_SLEFT:
+                       case 'H': pos -= 20; break;
+                       case KEY_SRIGHT:
+                       case 'L': pos += 20; break;
+                       case '.': tv->is_dot = !tv->is_dot; break;
+                       case 'N': tv->base_for = TV_BASE_NUCL; break;
+                       case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
+                       case 'i': tv->ins = !tv->ins; break;
+                       case '\010': pos -= 1000; break;
+                       case '\014': pos += 1000; break;
+                       case ' ': pos += tv->mcol; break;
+                       case KEY_UP:
+                       case 'j': --tv->row_shift; break;
+                       case KEY_DOWN:
+                       case 'k': ++tv->row_shift; break;
+                       case KEY_BACKSPACE:
+                       case '\177': pos -= tv->mcol; break;
+#ifdef KEY_RESIZE
+                       case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
+#endif
+                       default: continue;
+               }
+               if (pos < 0) pos = 0;
+               if (tv->row_shift < 0) tv->row_shift = 0;
+               tv_draw_aln(tv, tid, pos);
+       }
+end_loop:
+       return;
+}
+
+/*
+ * Entry point of the `tview' command: argv[1] is the alignment file and the
+ * optional argv[2] a reference FASTA.  Returns 1 after printing the usage
+ * when no file is given, 0 once the viewer has been closed.
+ */
+int bam_tview_main(int argc, char *argv[])
+{
+       tview_t *tv;
+       if (argc < 2) {
+               // the program presents itself as "samtools" in every other usage message
+               fprintf(stderr, "Usage: samtools tview <aln.bam> [ref.fasta]\n");
+               return 1;
+       }
+       tv = tv_init(argv[1], (argc == 2)? 0 : argv[2]);
+       tv_draw_aln(tv, 0, 0);
+       tv_loop(tv);
+       tv_destroy(tv);
+       return 0;
+}
+#else // #ifdef NCURSES_VERSION
+#warning "The ncurses library is unavailable; tview is disabled."
+/* Stand-in used when ncurses is unavailable: report that and return 1. */
+int bam_tview_main(int argc, char *argv[])
+{
+       fputs("[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n", stderr);
+       return 1;
+}
+#endif
+#endif // #ifndef _NO_CURSES
diff --git a/bamtk.c b/bamtk.c
new file mode 100644 (file)
index 0000000..3386836
--- /dev/null
+++ b/bamtk.c
@@ -0,0 +1,118 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include "bam.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "0.1.5c (r385)"
+#endif
+
+int bam_taf2baf(int argc, char *argv[]);
+int bam_pileup(int argc, char *argv[]);
+int bam_merge(int argc, char *argv[]);
+int bam_index(int argc, char *argv[]);
+int bam_sort(int argc, char *argv[]);
+int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+int bam_rmdupse(int argc, char *argv[]);
+int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
+
+int main_samview(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
+
+int faidx_main(int argc, char *argv[]);
+int glf3_view_main(int argc, char *argv[]);
+
+/*
+ * Entry point of the `tagview' command: for every alignment in argv[1]
+ * ("-" for standard input) that carries the two-character auxiliary tag
+ * argv[2], print the read name, the flag and the tag value to stdout.
+ *
+ * Returns 1 after a message on stderr when arguments are missing, the input
+ * cannot be opened or the header cannot be read; returns 0 otherwise.
+ */
+int bam_tagview(int argc, char *argv[])
+{
+       bamFile fp;
+       bam_header_t *header;
+       bam1_t *b;
+       char tag[2];
+       int ret;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: samtools tagview <in.bam> <tag>\n");
+               return 1;
+       }
+       fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r");
+       // report open failures explicitly instead of relying on assert(), which vanishes under NDEBUG
+       if (fp == 0) {
+               fprintf(stderr, "[bam_tagview] fail to open \"%s\" for reading.\n", argv[1]);
+               return 1;
+       }
+       header = bam_header_read(fp);
+       if (header == 0) {
+               fprintf(stderr, "[bam_tagview] fail to read the BAM header. Abort!\n");
+               bam_close(fp); // do not leak the handle on the error path
+               return 1;
+       }
+       tag[0] = argv[2][0];
+       tag[1] = tag[0]? argv[2][1] : '\0'; // an empty argument has nothing after its terminator
+       b = (bam1_t*)calloc(1, sizeof(bam1_t));
+       while ((ret = bam_read1(fp, b)) >= 0) {
+               uint8_t *d = bam_aux_get(b, tag);
+               if (d) {
+                       printf("%s\t%d\t", bam1_qname(b), b->core.flag);
+                       // d[0] is the type code of the tag value
+                       if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d));
+                       else if (d[0] == 'f') printf("%f\n", bam_aux2f(d));
+                       else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d));
+                       else if (d[0] == 'A') printf("%c\n", bam_aux2A(d));
+                       else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d));
+                       else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d));
+                       else printf("\n");
+               }
+       }
+       if (ret < -1) fprintf(stderr, "[bam_tagview] truncated file? Continue anyway. (%d)\n", ret);
+       free(b->data); free(b);
+       bam_header_destroy(header);
+       bam_close(fp);
+       return 0;
+}
+
+static int usage()
+{
+       fprintf(stderr, "\n");
+       fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n");
+       fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
+       fprintf(stderr, "Usage:   samtools <command> [options]\n\n");
+       fprintf(stderr, "Command: import      import from SAM (obsolete; use `view')\n");
+       fprintf(stderr, "         view        export to the text format\n");
+       fprintf(stderr, "         sort        sort alignment file\n");
+       fprintf(stderr, "         merge       merge multiple sorted alignment files\n");
+       fprintf(stderr, "         pileup      generate pileup output\n");
+       fprintf(stderr, "         faidx       index/extract FASTA\n");
+#ifndef _NO_CURSES
+       fprintf(stderr, "         tview       text alignment viewer\n");
+#endif
+       fprintf(stderr, "         index       index alignment\n");
+       fprintf(stderr, "         fixmate     fix mate information\n");
+       fprintf(stderr, "         rmdup       remove PCR duplicates\n");
+       fprintf(stderr, "         glfview     print GLFv3 file\n");
+       fprintf(stderr, "         flagstat    simple stats\n");
+       fprintf(stderr, "         fillmd      fill the MD tag and change identical base to =\n");
+       fprintf(stderr, "\n");
+       return 1;
+}
+
+int main(int argc, char *argv[])
+{
+       if (argc < 2) return usage();
+       if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1);
+       else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1);
+       else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1);
+       else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1);
+       else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);
+       else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);
+       else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
+       else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
+       else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
+       else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1);
+       else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
+       else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
+       else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);
+       else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
+#ifndef _NO_CURSES
+       else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
+#endif
+       else {
+               fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
+               return 1;
+       }
+       return 0;       
+}
diff --git a/bgzf.c b/bgzf.c
new file mode 100644 (file)
index 0000000..fe4e31d
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,634 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+/*
+  2009-06-29 by lh3: cache recent uncompressed blocks.
+  2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
+  2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+#include "khash.h"
+typedef struct { // value type of the `cache' hash map; the file's changelog says it caches recent uncompressed blocks
+       int size; // NOTE(review): presumably the number of valid bytes in block -- confirm against the cache code
+       uint8_t *block; // NOTE(review): presumably the cached uncompressed data -- confirm
+       int64_t end_offset; // NOTE(review): presumably the file offset just past the cached block -- confirm
+} cache_t;
+KHASH_MAP_INIT_INT64(cache, cache_t)
+
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+
+typedef int8_t byte;
+
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
+static const int MAX_BLOCK_SIZE = 64 * 1024;
+
+static const int BLOCK_HEADER_LENGTH = 18;
+static const int BLOCK_FOOTER_LENGTH = 8;
+
+static const int GZIP_ID1 = 31;
+static const int GZIP_ID2 = 139;
+static const int CM_DEFLATE = 8;
+static const int FLG_FEXTRA = 4;
+static const int OS_UNKNOWN = 255;
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2;
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8;
+
+
+/*
+ * Store a 16-bit value into buffer[0..1], least significant byte first.
+ *
+ * Declared static: under C99 and later a plain `inline' definition provides
+ * no external definition, so any call that is not inlined fails to link.
+ */
+static inline
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+}
+
+/*
+ * Read a 16-bit value stored least significant byte first in buffer[0..1].
+ *
+ * Declared static: under C99 and later a plain `inline' definition provides
+ * no external definition, so any call that is not inlined fails to link.
+ */
+static inline
+int
+unpackInt16(const uint8_t* buffer)
+{
+    return (buffer[0] | (buffer[1] << 8));
+}
+
+/*
+ * Store a 32-bit value into buffer[0..3], least significant byte first.
+ *
+ * Declared static: under C99 and later a plain `inline' definition provides
+ * no external definition, so any call that is not inlined fails to link.
+ */
+static inline
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+    buffer[2] = value >> 16;
+    buffer[3] = value >> 24;
+}
+
+/*
+ * Return the smaller of x and y.
+ *
+ * Declared static: under C99 and later a plain `inline' definition provides
+ * no external definition, so any call that is not inlined fails to link.
+ * It also keeps this very common name out of the global namespace.
+ */
+static inline
+int
+min(int x, int y)
+{
+    return (x < y) ? x : y;
+}
+
+static
+void
+report_error(BGZF* fp, const char* message) {
+    fp->error = message;
+}
+
+/*
+ * Allocate a zero-initialised BGZF for reading, with both block buffers
+ * sized MAX_BLOCK_SIZE and an empty block cache.
+ */
+static BGZF *bgzf_read_init()
+{
+    BGZF *reader = calloc(1, sizeof(BGZF));
+
+    reader->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+    reader->uncompressed_block_size = MAX_BLOCK_SIZE;
+    reader->compressed_block = malloc(MAX_BLOCK_SIZE);
+    reader->compressed_block_size = MAX_BLOCK_SIZE;
+
+    reader->cache = kh_init(cache);
+    reader->cache_size = 0;
+    return reader;
+}
+
+/*
+ * Wrap file descriptor fd in a BGZF opened for reading.
+ * Returns 0 when the descriptor cannot be turned into a stream.
+ */
+static
+BGZF*
+open_read(int fd)
+{
+    BGZF* reader;
+#ifdef _USE_KNETFILE
+    knetFile *stream = knet_dopen(fd, "r");
+#else
+    FILE* stream = fdopen(fd, "r");
+#endif
+    if (stream == 0) {
+        return 0;
+    }
+    reader = bgzf_read_init();
+    reader->open_mode = 'r';
+    reader->file_descriptor = fd;
+#ifdef _USE_KNETFILE
+    reader->x.fpr = stream;
+#else
+    reader->file = stream;
+#endif
+    return reader;
+}
+
+/*
+ * Wrap file descriptor fd in a BGZF opened for writing.
+ *
+ * is_uncompressed is stored in the handle as given.  The uncompressed block
+ * is left null here.  Returns 0 when the descriptor cannot be turned into a
+ * stream or memory runs out.
+ */
+static
+BGZF*
+open_write(int fd, bool is_uncompressed)
+{
+    FILE* file = fdopen(fd, "w");
+    BGZF* fp;
+    if (file == 0) return 0;
+    // calloc so that fields not assigned below (e.g. cache, cache_size) are zero rather than indeterminate
+    fp = calloc(1, sizeof(BGZF));
+    // the stream is deliberately not closed on failure: that would also close the caller's fd
+    if (fp == 0) return 0;
+    fp->file_descriptor = fd;
+    fp->open_mode = 'w';
+    fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
+#ifdef _USE_KNETFILE
+    fp->x.fpw = file;
+#else
+    fp->file = file;
+#endif
+    fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+    fp->uncompressed_block = NULL;
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    if (fp->compressed_block == 0) {
+        free(fp);
+        return 0;
+    }
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+/*
+ * Open path as a BGZF handle.
+ *
+ * mode starting with 'r'/'R' opens for reading; 'w'/'W' creates or truncates
+ * the file for writing, and a 'u' anywhere in mode requests uncompressed
+ * output.  The returned handle is marked as owning its file.  Returns NULL
+ * when the file cannot be opened, the handle cannot be set up, or mode is
+ * not recognised.
+ */
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{
+    BGZF* fp = NULL;
+    if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
+#ifdef _USE_KNETFILE
+        knetFile *file = knet_open(path, mode);
+        if (file == 0) return 0;
+        fp = bgzf_read_init();
+        fp->file_descriptor = -1;
+        fp->open_mode = 'r';
+        fp->x.fpr = file;
+#else
+        int oflag = O_RDONLY;
+        int fd = open(path, oflag);
+        if (fd == -1) return 0;
+        fp = open_read(fd);
+        if (fp == NULL) close(fd); /* do not leak the descriptor opened above */
+#endif
+    } else if (mode[0] == 'w' || mode[0] == 'W') {
+        int oflag = O_WRONLY | O_CREAT | O_TRUNC;
+        int fd = open(path, oflag, 0644);
+        if (fd == -1) return 0;
+        fp = open_write(fd, strstr(mode, "u")? 1 : 0);
+        if (fp == NULL) close(fd); /* do not leak the descriptor opened above */
+    }
+    if (fp != NULL) {
+        fp->owned_file = 1;
+    }
+    return fp;
+}
+
+/*
+ * Wrap an already open file descriptor in a BGZF handle.
+ *
+ * The first character of mode selects reading ('r'/'R') or writing
+ * ('w'/'W'); a 'u' anywhere in a write mode requests uncompressed output.
+ * Returns a null pointer for fd == -1 or any other mode.
+ */
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{
+    if (fd == -1) return 0;
+    switch (mode[0]) {
+    case 'r':
+    case 'R':
+        return open_read(fd);
+    case 'w':
+    case 'W':
+        return open_write(fd, strstr(mode, "u") != 0);
+    default:
+        return NULL;
+    }
+}
+
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{
+    // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+    // Also adds an extra field that stores the compressed block length.
+
+    byte* buffer = fp->compressed_block;
+    int buffer_size = fp->compressed_block_size;
+
+    // Init gzip header
+    buffer[0] = GZIP_ID1;
+    buffer[1] = GZIP_ID2;
+    buffer[2] = CM_DEFLATE;
+    buffer[3] = FLG_FEXTRA;
+    buffer[4] = 0; // mtime
+    buffer[5] = 0;
+    buffer[6] = 0;
+    buffer[7] = 0;
+    buffer[8] = 0;
+    buffer[9] = OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN;
+    buffer[11] = 0;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+    buffer[15] = 0;
+    buffer[16] = 0; // placeholder for block length
+    buffer[17] = 0;
+
+    // loop to retry for blocks that do not compress enough
+    int input_length = block_length;
+    int compressed_length = 0;
+    while (1) {
+               int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
+        z_stream zs;
+        zs.zalloc = NULL;
+        zs.zfree = NULL;
+        zs.next_in = fp->uncompressed_block;
+        zs.avail_in = input_length;
+        zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+        zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+        int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
+                                  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (status != Z_OK) {
+            report_error(fp, "deflate init failed");
+            return -1;
+        }
+        status = deflate(&zs, Z_FINISH);
+        if (status != Z_STREAM_END) {
+            deflateEnd(&zs);
+            if (status == Z_OK) {
+                // Not enough space in buffer.
+                // Can happen in the rare case the input doesn't compress enough.
+                // Reduce the amount of input until it fits.
+                input_length -= 1024;
+                if (input_length <= 0) {
+                    // should never happen
+                    report_error(fp, "input reduction failed");
+                    return -1;
+                }
+                continue;
+            }
+            report_error(fp, "deflate failed");
+            return -1;
+        }
+        status = deflateEnd(&zs);
+        if (status != Z_OK) {
+            report_error(fp, "deflate end failed");
+            return -1;
+        }
+        compressed_length = zs.total_out;
+        compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if (compressed_length > MAX_BLOCK_SIZE) {
+            // should never happen
+            report_error(fp, "deflate overflow");
+            return -1;
+        }
+        break;
+    }
+
+    packInt16((uint8_t*)&buffer[16], compressed_length-1);
+    uint32_t crc = crc32(0L, NULL, 0L);
+    crc = crc32(crc, fp->uncompressed_block, input_length);
+    packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+    packInt32((uint8_t*)&buffer[compressed_length-4], input_length);