--- /dev/null
+Heng Li from the Sanger Institute wrote most of the initial source code
+of SAMtools and various converters.
+
+Bob Handsaker from the Broad Institute is a major contributor to the
+SAM/BAM specification. He designed and implemented the BGZF format, the
+underlying indexable compression format for the BAM format. BGZF does
+not support arithmetic between file offsets.
+
+Jue Ruan from the Beijing Genome Institute designed and implemented the
+RAZF format, an alternative indexable compression format. RAZF supports
+arithmetic between file offsets, at the cost of increased index file
+size and the loss of full compatibility with gzip. RAZF is optional and
+only used in `faidx' for indexing RAZF-compressed FASTA files.
+
+Colin Hercus updated novo2sam.pl to support gapped alignment by
+novoalign.
--- /dev/null
+The MIT License
+
+Copyright (c) 2008-2009 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
--- /dev/null
+------------------------------------------------------------------------
+r372 | lh3lh3 | 2009-07-07 09:49:27 +0100 (Tue, 07 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+
+ * samtools-0.1.4-23 (r372)
+ * keep header text if "view -t" is used (by Gerton)
+
+------------------------------------------------------------------------
+r371 | lh3lh3 | 2009-07-07 01:13:32 +0100 (Tue, 07 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/samtools.1
+
+update documentation
+
+------------------------------------------------------------------------
+r370 | bhandsaker | 2009-07-02 22:24:34 +0100 (Thu, 02 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+
+Introduced LIBPATH variable so this could be overridden to allow samtools to build correctly at the Broad.
+
+------------------------------------------------------------------------
+r369 | lh3lh3 | 2009-07-02 13:36:53 +0100 (Thu, 02 Jul 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-22 (r369)
+ * in pileup, optionally print E2 and U2
+ * remove the debugging code in bam_aux_get() (Drat!)
+
+------------------------------------------------------------------------
+r368 | lh3lh3 | 2009-07-02 11:32:26 +0100 (Thu, 02 Jul 2009) | 6 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bam_lpileup.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bam_rmdup.c
+ M /trunk/samtools/bam_stat.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/faidx.c
+ M /trunk/samtools/faidx.h
+ M /trunk/samtools/glf.c
+
+ * samtools-0.1.4-21 (r368)
+ * propagate errors rather than exiting or reporting an assertion failure.
+ Assertions should only be used for checking internal bugs, not for external
+ input inconsistency. I was just a bit lazy.
+ * small memory leak may be present on failure, though
+
+------------------------------------------------------------------------
+r367 | lh3lh3 | 2009-06-30 16:18:42 +0100 (Tue, 30 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+
+reduce the chance of blocking in FTP connection
+
+------------------------------------------------------------------------
+r366 | lh3lh3 | 2009-06-30 15:35:21 +0100 (Tue, 30 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+
+minor changes to knetfile: invalid fd equals -1 rather than 0
+
+------------------------------------------------------------------------
+r365 | lh3lh3 | 2009-06-30 14:04:30 +0100 (Tue, 30 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.4-20 (r365)
+ * download the BAM index file if it is not found in the current working directory.
+
+------------------------------------------------------------------------
+r364 | lh3lh3 | 2009-06-30 12:39:07 +0100 (Tue, 30 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/knetfile.c
+
+ * samtools-0.1.4-19 (r364)
+ * knetfile: report error when the file is not present on FTP
+
+------------------------------------------------------------------------
+r363 | lh3lh3 | 2009-06-29 23:23:32 +0100 (Mon, 29 Jun 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.c
+ M /trunk/samtools/bgzf.h
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.4-18 (r363)
+ * knetfile: do not trigger network communication in FTP seek (lazy seek)
+ * bgzf: cache recent blocks (disabled by default)
+
+------------------------------------------------------------------------
+r362 | lh3lh3 | 2009-06-25 21:04:34 +0100 (Thu, 25 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bgzf.c
+
+write changelog
+
+------------------------------------------------------------------------
+r361 | lh3lh3 | 2009-06-25 21:03:10 +0100 (Thu, 25 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-17 (r361)
+ * if a file is given on FTP, search locally for the BAM index
+
+------------------------------------------------------------------------
+r360 | lh3lh3 | 2009-06-25 20:44:52 +0100 (Thu, 25 Jun 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.c
+ M /trunk/samtools/bgzf.h
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.4-16 (r360)
+ * report more information in index when the input is not sorted
+ * change the behaviour of knet_seek() such that it returns 0 on success
+ * support knetfile library in BGZF
+
+------------------------------------------------------------------------
+r359 | lh3lh3 | 2009-06-25 17:10:55 +0100 (Thu, 25 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/knetfile.h
+
+fixed bugs in knetfile.*
+
+------------------------------------------------------------------------
+r358 | lh3lh3 | 2009-06-25 13:53:19 +0100 (Thu, 25 Jun 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/knetfile.h
+
+this is the header file
+
+------------------------------------------------------------------------
+r357 | lh3lh3 | 2009-06-25 13:52:03 +0100 (Thu, 25 Jun 2009) | 3 lines
+Changed paths:
+ A /trunk/samtools/knetfile.c
+
+ * open a file at FTP
+ * preliminary version
+
+------------------------------------------------------------------------
+r354 | lh3lh3 | 2009-06-24 14:02:25 +0100 (Wed, 24 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-15 (r354)
+ * fixed a memory leak in bam_view1(), although samtools is not using this routine.
+
+------------------------------------------------------------------------
+r351 | lh3lh3 | 2009-06-18 00:16:26 +0100 (Thu, 18 Jun 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/faidx.c
+
+ * samtools-0.1.4-14 (r351)
+ * make faidx more tolerant to empty lines right before or after > lines
+ * hope this does not introduce new bugs...
+
+------------------------------------------------------------------------
+r350 | lh3lh3 | 2009-06-16 14:37:01 +0100 (Tue, 16 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-13 (r350)
+ * fixed a small memory leak in pileup, caused by recent modifications
+
+------------------------------------------------------------------------
+r347 | lh3lh3 | 2009-06-13 21:20:49 +0100 (Sat, 13 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-12 (r347)
+ * added `-S' to pileup, similar to `view -S'
+
+------------------------------------------------------------------------
+r346 | lh3lh3 | 2009-06-13 17:52:31 +0100 (Sat, 13 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam_view.c
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.4-11 (r346)
+ * allow to select a read group at view command-line
+
+------------------------------------------------------------------------
+r344 | lh3lh3 | 2009-06-13 14:06:24 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/examples/calDepth.c
+
+added more comments
+
+------------------------------------------------------------------------
+r343 | lh3lh3 | 2009-06-13 14:01:22 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/examples/calDepth.c
+
+nothing really
+
+------------------------------------------------------------------------
+r342 | lh3lh3 | 2009-06-13 13:58:48 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/examples/Makefile
+ A /trunk/samtools/examples/calDepth.c
+
+added an example of calculating read depth
+
+------------------------------------------------------------------------
+r341 | lh3lh3 | 2009-06-13 13:00:08 +0100 (Sat, 13 Jun 2009) | 6 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ A /trunk/samtools/bam_color.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam.h
+
+ * samtools-0.1.4-10 (r341)
+ * only include key APIs in libbam.a
+ * move color-specific routines to bam_color.c
+ * update documentations
+ * remove the support of -q in pileup
+
+------------------------------------------------------------------------
+r340 | lh3lh3 | 2009-06-13 11:17:14 +0100 (Sat, 13 Jun 2009) | 6 lines
+Changed paths:
+ M /trunk/samtools/INSTALL
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/razf.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-9 (r340)
+ * added a warning to razf.c if zlib<1.2.2.1
+ * fixed a compilation warning
+ * fixed a segfault caused by @RG parsing
+ * detect NCURSES in bam_tview.c
+
+------------------------------------------------------------------------
+r339 | lh3lh3 | 2009-06-13 10:35:19 +0100 (Sat, 13 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/INSTALL
+
+update INSTALL
+
+------------------------------------------------------------------------
+r338 | lh3lh3 | 2009-06-13 00:15:24 +0100 (Sat, 13 Jun 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/kstring.h
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-8 (r338)
+ * parse the @RG header lines and allow to choose library at the "samtools view"
+ command line
+
+------------------------------------------------------------------------
+r337 | lh3lh3 | 2009-06-12 21:25:50 +0100 (Fri, 12 Jun 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.c
+ M /trunk/samtools/bgzf.h
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.4-7 (r337)
+ * bgzf.c: support mode string "wu": uncompressed output
+ * "samtools view" support "-u" command-line option
+
+------------------------------------------------------------------------
+r336 | lh3lh3 | 2009-06-12 17:20:12 +0100 (Fri, 12 Jun 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/misc/Makefile
+ M /trunk/samtools/razf.c
+ M /trunk/samtools/razf.h
+ M /trunk/samtools/razip.c
+
+ * no changes to samtools itself
+ * remove zlib source codes
+ * make RAZF reading compatible with old version of zlib
+ * on old version of zlib, writing is not available
+
+------------------------------------------------------------------------
+r335 | lh3lh3 | 2009-06-12 16:47:33 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+ D /trunk/samtools/zlib
+
+remove zlib for simplification...
+
+------------------------------------------------------------------------
+r334 | lh3lh3 | 2009-06-12 15:43:36 +0100 (Fri, 12 Jun 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-6 (r334)
+ * do not export bam_aux_get_core() for Bio::DB::Sam because it has already
+ been implemented in that.
+ * this version works with the latest Bio::DB::Sam (20090612)
+
+------------------------------------------------------------------------
+r333 | lh3lh3 | 2009-06-12 15:33:42 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+
+update ChangeLog
+
+------------------------------------------------------------------------
+r332 | lh3lh3 | 2009-06-12 15:21:21 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/AUTHORS
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/misc/Makefile
+
+fixed minor things in Makefile
+
+------------------------------------------------------------------------
+r331 | lh3lh3 | 2009-06-12 15:07:05 +0100 (Fri, 12 Jun 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-5 (r331)
+ * no change to samtools itself. Version number is increased to reflect the
+ changes in the Makefile building system.
+
+------------------------------------------------------------------------
+r330 | lh3lh3 | 2009-06-12 15:03:38 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/AUTHORS
+ D /trunk/samtools/README
+
+update information...
+
+------------------------------------------------------------------------
+r329 | lh3lh3 | 2009-06-12 14:52:21 +0100 (Fri, 12 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/novo2sam.pl
+
+ * updated novoalign converter by Colin Hercus et al.
+ * this version works with indels
+
+------------------------------------------------------------------------
+r328 | lh3lh3 | 2009-06-12 14:50:53 +0100 (Fri, 12 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/INSTALL
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/misc/Makefile
+ M /trunk/samtools/zlib/Makefile
+
+ * update Makefile
+ * update INSTALL instruction
+
+------------------------------------------------------------------------
+r327 | lh3lh3 | 2009-06-12 14:18:29 +0100 (Fri, 12 Jun 2009) | 4 lines
+Changed paths:
+ A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.generic:325)
+ D /trunk/samtools/Makefile.am
+ D /trunk/samtools/Makefile.generic
+ D /trunk/samtools/Makefile.lite
+ D /trunk/samtools/autogen.sh
+ D /trunk/samtools/cleanup.sh
+ D /trunk/samtools/configure.ac
+ A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.generic:305)
+ D /trunk/samtools/misc/Makefile.am
+ D /trunk/samtools/misc/Makefile.generic
+ M /trunk/samtools/razf.c
+ A /trunk/samtools/zlib
+ A /trunk/samtools/zlib/Makefile
+ A /trunk/samtools/zlib/adler32.c
+ A /trunk/samtools/zlib/compress.c
+ A /trunk/samtools/zlib/crc32.c
+ A /trunk/samtools/zlib/crc32.h
+ A /trunk/samtools/zlib/deflate.c
+ A /trunk/samtools/zlib/deflate.h
+ A /trunk/samtools/zlib/gzio.c
+ A /trunk/samtools/zlib/infback.c
+ A /trunk/samtools/zlib/inffast.c
+ A /trunk/samtools/zlib/inffast.h
+ A /trunk/samtools/zlib/inffixed.h
+ A /trunk/samtools/zlib/inflate.c
+ A /trunk/samtools/zlib/inflate.h
+ A /trunk/samtools/zlib/inftrees.c
+ A /trunk/samtools/zlib/inftrees.h
+ A /trunk/samtools/zlib/trees.c
+ A /trunk/samtools/zlib/trees.h
+ A /trunk/samtools/zlib/uncompr.c
+ A /trunk/samtools/zlib/zconf.h
+ A /trunk/samtools/zlib/zlib.h
+ A /trunk/samtools/zlib/zutil.c
+ A /trunk/samtools/zlib/zutil.h
+ D /trunk/samtools/zutil.h
+
+ * added zlib-1.2.3 as razip requires that
+ * prepare to change back to the Makefile building system
+ * unfinished! (will be soon)
+
+------------------------------------------------------------------------
+r326 | lh3lh3 | 2009-06-12 14:12:03 +0100 (Fri, 12 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+Unfinished
+
+------------------------------------------------------------------------
+r325 | lh3lh3 | 2009-06-10 16:27:59 +0100 (Wed, 10 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.4-4 (r325)
+ * further avoid wrong consensus calls in repetitive regions.
+
+------------------------------------------------------------------------
+r324 | lh3lh3 | 2009-06-10 15:56:17 +0100 (Wed, 10 Jun 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam.h
+
+ * samtools-0.1.4-3 (r324)
+ * make maqcns generate the correct call in repetitive regions.
+ * allow filtering on mapQ at the pileup command line
+
+------------------------------------------------------------------------
+r323 | lh3lh3 | 2009-06-10 10:04:21 +0100 (Wed, 10 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.3.2 (r322)
+ * indels and SNPs use different mapping quality threshold
+
+------------------------------------------------------------------------
+r322 | lh3lh3 | 2009-06-10 10:03:22 +0100 (Wed, 10 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/export2sam.pl
+
+fixed a typo
+
+------------------------------------------------------------------------
+r321 | lh3lh3 | 2009-06-09 09:21:48 +0100 (Tue, 09 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+just typo. no real change
+
+------------------------------------------------------------------------
+r320 | lh3lh3 | 2009-06-08 14:32:51 +0100 (Mon, 08 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+a little bit of code cleanup
+
+------------------------------------------------------------------------
+r319 | lh3lh3 | 2009-06-08 14:22:33 +0100 (Mon, 08 Jun 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.3.1
+ * change default parameters
+ * optionally print filtered variants
+
+------------------------------------------------------------------------
+r318 | lh3lh3 | 2009-06-08 14:14:26 +0100 (Mon, 08 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.3.0
+ * combine snpFilter and indelFilter
+
+------------------------------------------------------------------------
+r317 | lh3lh3 | 2009-06-08 11:31:42 +0100 (Mon, 08 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.3
+ * change a default parameter
+
+------------------------------------------------------------------------
+r316 | lh3lh3 | 2009-06-08 11:11:06 +0100 (Mon, 08 Jun 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_maqcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+
+ * samtools-0.1.4-2 (r316)
+ * pileup: cap mapping quality at 60 (by default)
+ * pileup: always calculate RMS mapq
+ * pileup: allow to output variant sites only
+
+------------------------------------------------------------------------
+r312 | lh3lh3 | 2009-06-04 13:01:10 +0100 (Thu, 04 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.2
+ * added pileup2fq
+
+------------------------------------------------------------------------
+r311 | lh3lh3 | 2009-06-03 09:40:40 +0100 (Wed, 03 Jun 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * in snpFilter, suppress non-SNP sites
+
+------------------------------------------------------------------------
+r310 | lh3lh3 | 2009-06-01 14:35:13 +0100 (Mon, 01 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.1
+ * fixed a typo
+
+------------------------------------------------------------------------
+r309 | lh3lh3 | 2009-06-01 14:04:39 +0100 (Mon, 01 Jun 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.2.0
+ * snpFilter
+
+------------------------------------------------------------------------
+r306 | lh3lh3 | 2009-05-28 11:49:35 +0100 (Thu, 28 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bgzf.c
+
+ * minor changes to bgzf: return NULL if fd == -1
+ * suggested by {kdj,jm18}@sanger.ac.uk
+
+------------------------------------------------------------------------
+r305 | lh3lh3 | 2009-05-28 11:16:08 +0100 (Thu, 28 May 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/misc/interpolate_sam.pl
+
+Script for paired-end pileup, contributed by Stephen Montgomery.
+
+------------------------------------------------------------------------
+r304 | lh3lh3 | 2009-05-28 11:08:49 +0100 (Thu, 28 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+
+ * samtools-0.1.4-1 (r304)
+ * fixed a minor bug in printing headers
+
+------------------------------------------------------------------------
+r297 | lh3lh3 | 2009-05-21 16:06:16 +0100 (Thu, 21 May 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/maq2sam.c
+ M /trunk/samtools/samtools.1
+
+Release samtools-0.1.4
+
+------------------------------------------------------------------------
+r296 | lh3lh3 | 2009-05-21 12:53:14 +0100 (Thu, 21 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-24 (r296)
+ * another similar bug in the indel caller
+
+------------------------------------------------------------------------
+r295 | lh3lh3 | 2009-05-21 12:50:28 +0100 (Thu, 21 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-23 (r295)
+ * fixed a critical bug in the indel caller
+
+------------------------------------------------------------------------
+r294 | lh3lh3 | 2009-05-20 13:00:20 +0100 (Wed, 20 May 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_stat.c
+
+added a missing header file
+
+------------------------------------------------------------------------
+r293 | lh3lh3 | 2009-05-19 23:44:25 +0100 (Tue, 19 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-22 (r293)
+ * open tview in the dot-view mode by default
+
+------------------------------------------------------------------------
+r292 | lh3lh3 | 2009-05-18 21:01:23 +0100 (Mon, 18 May 2009) | 6 lines
+Changed paths:
+ M /trunk/samtools/samtools.1
+
+Added a note to the manual. Currently SAMtools uses unaligned words in
+several places. Although this does not cause bus errors for me, it may
+affect portability. Please see the "Bus error" wiki page for more
+information. Also thanks to James Bonfield for pointing this out.
+
+
+------------------------------------------------------------------------
+r286 | lh3lh3 | 2009-05-14 15:23:13 +0100 (Thu, 14 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-21 (r286)
+ * declare bam_aux_get_core() in bam.h
+
+------------------------------------------------------------------------
+r276 | lh3lh3 | 2009-05-13 10:07:55 +0100 (Wed, 13 May 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-20 (r276)
+ * remove bam1_t::hash again. We need to modify the Perl API anyway to
+ make it work with the latest SVN.
+ * As is suggested by Tim, scan "{base}.bai" and "{base}.bam.bai" for index
+
+------------------------------------------------------------------------
+r275 | lh3lh3 | 2009-05-12 21:14:10 +0100 (Tue, 12 May 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-19 (r275)
+ * a minor change to the bam1_t struct: added back "void *hash" for the
+ backward compatibility with Bio::DB::Sam
+
+------------------------------------------------------------------------
+r273 | lh3lh3 | 2009-05-12 14:28:39 +0100 (Tue, 12 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_rmdupse.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-18 (r273)
+ * rmdupse: do not remove unmapped reads
+
+------------------------------------------------------------------------
+r272 | lh3lh3 | 2009-05-12 14:20:00 +0100 (Tue, 12 May 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_rmdupse.c
+
+change a parameter. It does nothing
+
+------------------------------------------------------------------------
+r271 | lh3lh3 | 2009-05-12 14:17:58 +0100 (Tue, 12 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile.am
+ M /trunk/samtools/Makefile.generic
+ M /trunk/samtools/Makefile.lite
+ A /trunk/samtools/bam_rmdupse.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/configure.ac
+
+ * samtools-0.1.3-17 (r271)
+ * added 'rmdupse' command
+
+------------------------------------------------------------------------
+r267 | lh3lh3 | 2009-05-05 22:31:41 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-16 (r267)
+ * in sam_view.c, changed g_flag_on based on the suggestion by Angie Hinrichs
+
+------------------------------------------------------------------------
+r266 | lh3lh3 | 2009-05-05 22:23:27 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-15 (r266)
+ * report an error if a non-* reference is present while @SQ is absent
+
+------------------------------------------------------------------------
+r265 | lh3lh3 | 2009-05-05 22:09:00 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-14 (r262)
+ * make samopen() recognize @SQ header lines
+
+------------------------------------------------------------------------
+r261 | lh3lh3 | 2009-05-05 15:10:30 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-13 (r260)
+ * report error for file I/O error
+
+------------------------------------------------------------------------
+r260 | lh3lh3 | 2009-05-05 15:01:16 +0100 (Tue, 05 May 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile.am
+
+update Makefile.am
+
+------------------------------------------------------------------------
+r259 | lh3lh3 | 2009-05-05 14:52:25 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam.h
+
+ * samtools-0.1.3-12 (r259)
+ * use the new I/O interface in pileup
+
+------------------------------------------------------------------------
+r258 | lh3lh3 | 2009-05-05 14:33:22 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile.generic
+ M /trunk/samtools/Makefile.lite
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ A /trunk/samtools/sam.c
+ A /trunk/samtools/sam.h
+ A /trunk/samtools/sam_view.c
+
+ * samtools-0.1.3-11 (r258)
+ * unify the interface to BAM and SAM I/O
+
+------------------------------------------------------------------------
+r257 | lh3lh3 | 2009-05-05 09:53:35 +0100 (Tue, 05 May 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile.lite
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-10 (r257)
+ * allow hex with "pileup -m"
+
+------------------------------------------------------------------------
+r256 | lh3lh3 | 2009-05-04 19:16:50 +0100 (Mon, 04 May 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_lpileup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-9 (r256)
+ * fixed a bug in bam_lpileup.c
+ * I do not know if this also fixes the bug causing assertion failure in the tview
+
+------------------------------------------------------------------------
+r251 | lh3lh3 | 2009-04-28 13:53:23 +0100 (Tue, 28 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-8 (r251)
+ * fixed a bug when there are reads without coordinates
+
+------------------------------------------------------------------------
+r250 | lh3lh3 | 2009-04-28 13:43:33 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/AUTHORS
+ A /trunk/samtools/README
+ M /trunk/samtools/cleanup.sh
+
+added missing files
+
+------------------------------------------------------------------------
+r249 | lh3lh3 | 2009-04-28 13:37:16 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile.generic
+ M /trunk/samtools/Makefile.lite
+ M /trunk/samtools/configure.ac
+ M /trunk/samtools/misc/Makefile.generic
+
+improve large file support in compilation
+
+------------------------------------------------------------------------
+r248 | lh3lh3 | 2009-04-28 13:33:24 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/INSTALL
+
+update INSTALL
+
+------------------------------------------------------------------------
+r247 | lh3lh3 | 2009-04-28 13:28:50 +0100 (Tue, 28 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile.am
+ M /trunk/samtools/autogen.sh
+ M /trunk/samtools/cleanup.sh
+ M /trunk/samtools/configure.ac
+ A /trunk/samtools/misc/Makefile.am
+
+fixed various issues about the GNU building scripts
+
+------------------------------------------------------------------------
+r246 | lh3lh3 | 2009-04-28 13:10:23 +0100 (Tue, 28 Apr 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ D /trunk/samtools/Makefile
+ A /trunk/samtools/Makefile.am
+ A /trunk/samtools/Makefile.generic
+ A /trunk/samtools/autogen.sh
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ A /trunk/samtools/cleanup.sh
+ A /trunk/samtools/configure.ac
+ D /trunk/samtools/misc/Makefile
+ A /trunk/samtools/misc/Makefile.generic (from /trunk/samtools/misc/Makefile:245)
+
+ * samtools-0.1.3-7 (r246)
+ * incorporated revisions from Nils Homer
+ * enhanced support of displaying color-space reads
+
+------------------------------------------------------------------------
+r244 | lh3lh3 | 2009-04-25 11:49:40 +0100 (Sat, 25 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-6 (r244)
+ * fixed segfault for unmapped reads
+
+------------------------------------------------------------------------
+r243 | lh3lh3 | 2009-04-24 21:27:26 +0100 (Fri, 24 Apr 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-5 (r243)
+ * fixed a long existing bug which may cause memory leak
+ * check MD
+ * consensus calling now works with "=", but indel calling not
+
+------------------------------------------------------------------------
+r242 | lh3lh3 | 2009-04-24 20:44:46 +0100 (Fri, 24 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-4 (r242)
+ * fixed a memory leak
+
+------------------------------------------------------------------------
+r240 | lh3lh3 | 2009-04-24 16:40:18 +0100 (Fri, 24 Apr 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/Makefile.lite
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ A /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-3 (r240)
+ * generate MD tag
+ * generate "=" bases
+ * the plain pileup now supports "=" bases, but consensus calling and glfgen may fail
+
+------------------------------------------------------------------------
+r239 | lh3lh3 | 2009-04-24 12:08:20 +0100 (Fri, 24 Apr 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-2 (r239)
+ * fixed bugs in bam_aux.c (these functions were never used by samtools)
+ * removed bam_aux_init()/bam_aux_destroy()
+ * added tagview for testing bam_aux
+
+------------------------------------------------------------------------
+r235 | lh3lh3 | 2009-04-21 23:17:39 +0100 (Tue, 21 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.3-1
+ * fixed a bug in pileup: the first read in a chromosome may not be printed
+
+------------------------------------------------------------------------
+r232 | lh3lh3 | 2009-04-16 15:25:43 +0100 (Thu, 16 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile.lite
+
+a missing file in Makefile.lite
+
+------------------------------------------------------------------------
+r227 | lh3lh3 | 2009-04-15 22:02:53 +0100 (Wed, 15 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bamtk.c
+
+Release samtools-0.1.3
+
+------------------------------------------------------------------------
+r223 | lh3lh3 | 2009-04-15 14:31:32 +0100 (Wed, 15 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-28
+ * make samtools more robust to weird input such as empty file
+
+------------------------------------------------------------------------
+r222 | lh3lh3 | 2009-04-15 14:05:33 +0100 (Wed, 15 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/samtools.1
+
+prepare for release 0.1.3
+
+------------------------------------------------------------------------
+r221 | lh3lh3 | 2009-04-15 13:32:14 +0100 (Wed, 15 Apr 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/misc/blast2sam.pl
+
+convert NCBI-BLASTN to SAM
+
+------------------------------------------------------------------------
+r220 | lh3lh3 | 2009-04-15 13:18:19 +0100 (Wed, 15 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_lpileup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-27
+ * fixed a small memory leak in tview
+
+------------------------------------------------------------------------
+r219 | lh3lh3 | 2009-04-15 13:00:08 +0100 (Wed, 15 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_rmdup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-26
+ * fixed a bug in rmdup when there are unmapped reads
+
+------------------------------------------------------------------------
+r218 | lh3lh3 | 2009-04-14 22:28:58 +0100 (Tue, 14 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+
+proposed NEWS for the new release (have not yet)
+
+------------------------------------------------------------------------
+r216 | lh3lh3 | 2009-04-14 22:10:46 +0100 (Tue, 14 Apr 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools.pl-0.1.1
+ * improve indelFilter to avoid filtering true indels. The new filter relies
+ on the new pileup indel line implemented in samtools-0.1.2-25
+
+------------------------------------------------------------------------
+r215 | lh3lh3 | 2009-04-14 22:04:19 +0100 (Tue, 14 Apr 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.2-25
+ * change the pileup indel line to show the number of alignments actually
+ containing indels
+
+------------------------------------------------------------------------
+r211 | lh3lh3 | 2009-04-13 12:07:13 +0100 (Mon, 13 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+
+update ChangeLog from "svn log"
+
+------------------------------------------------------------------------
+r210 | lh3lh3 | 2009-04-12 20:57:05 +0100 (Sun, 12 Apr 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/kseq.h
+
+ * samtools-0.1.2-24
+ * in merge, gives a warning rather than error if the target sequence length is different
+ * allow empty header
+
+------------------------------------------------------------------------
+r209 | lh3lh3 | 2009-04-12 20:32:44 +0100 (Sun, 12 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-23
+ * recognize '*' at the QUAL field
+
+------------------------------------------------------------------------
+r208 | lh3lh3 | 2009-04-12 20:08:02 +0100 (Sun, 12 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/kseq.h
+
+ * samtools-0.1.2-22
+ * the field separator is TAB only, now
+
+------------------------------------------------------------------------
+r207 | lh3lh3 | 2009-04-08 15:18:03 +0100 (Wed, 08 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/examples/ex1.sam.gz
+
+ * fixed the problem in the example alignment due to the bug in fixmate
+
+------------------------------------------------------------------------
+r206 | lh3lh3 | 2009-04-08 15:15:05 +0100 (Wed, 08 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_mate.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/soap2sam.pl
+
+ * samtools-0.1.2-21
+ * fixed a nasty bug in `fixmate'
+
+------------------------------------------------------------------------
+r205 | lh3lh3 | 2009-04-08 10:57:08 +0100 (Wed, 08 Apr 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/bowtie2sam.pl
+ M /trunk/samtools/misc/soap2sam.pl
+ M /trunk/samtools/misc/wgsim_eval.pl
+
+make the script robust to the bugs in SOAP-2.1.7
+
+------------------------------------------------------------------------
+r200 | lh3lh3 | 2009-04-02 15:14:56 +0100 (Thu, 02 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_stat.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-20
+ * check if file is truncated in flagstat
+
+------------------------------------------------------------------------
+r199 | lh3lh3 | 2009-04-02 15:09:10 +0100 (Thu, 02 Apr 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-19
+ * print the header if requested
+
+------------------------------------------------------------------------
+r193 | lh3lh3 | 2009-03-27 15:09:50 +0000 (Fri, 27 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-18
+ * fixed a minor bug reported by Nils Homer
+
+------------------------------------------------------------------------
+r185 | lh3lh3 | 2009-03-24 11:50:32 +0000 (Tue, 24 Mar 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.std:184)
+ D /trunk/samtools/Makefile.std
+ A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.std:184)
+ D /trunk/samtools/misc/Makefile.std
+
+rename Makefile.std as Makefile. GNU building system is not ready and may take some time...
+
+------------------------------------------------------------------------
+r184 | lh3lh3 | 2009-03-24 10:36:38 +0000 (Tue, 24 Mar 2009) | 4 lines
+Changed paths:
+ D /trunk/samtools/Makefile
+ A /trunk/samtools/Makefile.std (from /trunk/samtools/Makefile:183)
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ D /trunk/samtools/misc/Makefile
+ A /trunk/samtools/misc/Makefile.std (from /trunk/samtools/misc/Makefile:182)
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.2-17
+ * incorporating Nils' changes
+ * rename Makefile to Makefile.std and prepare to add the GNU building systems (also by Nils)
+
+------------------------------------------------------------------------
+r183 | lh3lh3 | 2009-03-24 10:30:23 +0000 (Tue, 24 Mar 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_maqcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/kseq.h
+ A /trunk/samtools/kstring.c
+ A /trunk/samtools/kstring.h
+
+ * samtools-0.1.2-16
+ * made pileup take a list of proposed indels. An insertion is N at the moment.
+ * added my kstring library for a bit complex parsing of the position list.
+
+------------------------------------------------------------------------
+r169 | lh3lh3 | 2009-03-12 13:40:14 +0000 (Thu, 12 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/soap2sam.pl
+
+ * soap2sam.pl-0.1.2
+ * more robust to truncated soap output
+
+------------------------------------------------------------------------
+r168 | lh3lh3 | 2009-03-11 10:49:00 +0000 (Wed, 11 Mar 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile.lite
+
+added bam_stat.o to Makefile.lite
+
+------------------------------------------------------------------------
+r167 | lh3lh3 | 2009-03-10 22:11:31 +0000 (Tue, 10 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-15
+ * generate RMS of mapQ instead of max mapQ
+
+------------------------------------------------------------------------
+r166 | lh3lh3 | 2009-03-10 22:06:45 +0000 (Tue, 10 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/glf.c
+ M /trunk/samtools/glf.h
+ M /trunk/samtools/misc/Makefile
+
+ * samtools-0.1.2-14
+ * implemented GLFv3
+
+------------------------------------------------------------------------
+r159 | lh3lh3 | 2009-03-03 11:26:08 +0000 (Tue, 03 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-13
+ * fixed a minor bug in displaying pileup
+
+------------------------------------------------------------------------
+r158 | lh3lh3 | 2009-03-03 11:24:16 +0000 (Tue, 03 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-12
+ * optionally print SAM header
+
+------------------------------------------------------------------------
+r153 | lh3lh3 | 2009-03-02 10:45:28 +0000 (Mon, 02 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/glf.c
+
+ * samtools-0.1.2-11
+ * use "GLF\3" as the magic for GLFv3 files
+
+------------------------------------------------------------------------
+r152 | lh3lh3 | 2009-03-02 10:39:09 +0000 (Mon, 02 Mar 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/glf.c
+ M /trunk/samtools/glf.h
+
+ * samtools-0.1.2-10
+ * fixed a bug in import: core.bin is undefined for unmapped reads
+ * this bug can be alleviated (not completely solved) in bam_index.c
+ * update to GLFv3: pos is changed to offset for better compression
+
+------------------------------------------------------------------------
+r151 | lh3lh3 | 2009-03-01 15:18:43 +0000 (Sun, 01 Mar 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/wgsim.c
+
+ * wgsim-0.2.3
+ * fixed a bug in simulating indels
+
+------------------------------------------------------------------------
+r145 | lh3lh3 | 2009-02-26 19:43:57 +0000 (Thu, 26 Feb 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/misc/wgsim.c
+
+ * wgsim-0.2.2
+ * allow to print mismatch information as fastq comment. MAQ does
+ not like long read names.
+
+------------------------------------------------------------------------
+r141 | lh3lh3 | 2009-02-26 14:53:03 +0000 (Thu, 26 Feb 2009) | 6 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/misc/wgsim.c
+ M /trunk/samtools/misc/wgsim_eval.pl
+
+ * wgsim-0.2.1
+ * fixed a bug about color read coordinates
+ * fixed a bug in read names
+ * wgsim_eval.pl-0.1.3
+ * make the script work with color reads
+
+------------------------------------------------------------------------
+r140 | lh3lh3 | 2009-02-26 14:02:57 +0000 (Thu, 26 Feb 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/Makefile
+ M /trunk/samtools/misc/wgsim.c
+
+ * wgsim: added a note
+
+------------------------------------------------------------------------
+r139 | lh3lh3 | 2009-02-26 11:39:08 +0000 (Thu, 26 Feb 2009) | 7 lines
+Changed paths:
+ M /trunk/samtools/misc/wgsim.c
+ M /trunk/samtools/misc/wgsim_eval.pl
+
+ * wgsim-0.2.0
+ * considerable code clean up
+ * print number of substitutions/indels/errors on each read
+ * potentially support SOLiD simulation, though not tested at the moment
+ * wgsim_eval.pl-0.1.2
+ * changed in accordance with wgsim
+
+------------------------------------------------------------------------
+r129 | lh3lh3 | 2009-02-18 22:23:27 +0000 (Wed, 18 Feb 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-9
+ * fixed a bug in bam_fetch, caused by completely contained adjacent chunks
+
+------------------------------------------------------------------------
+r128 | bhandsaker | 2009-02-18 19:06:57 +0000 (Wed, 18 Feb 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+
+Fix annoying segv when invalid region specified.
+
+------------------------------------------------------------------------
+r127 | lh3lh3 | 2009-02-17 10:49:55 +0000 (Tue, 17 Feb 2009) | 2 lines
+Changed paths:
+ D /trunk/samtools/misc/indel_filter.pl
+ A /trunk/samtools/misc/samtools.pl
+
+ * move indel_filter.pl to samtools.pl
+
+------------------------------------------------------------------------
+r126 | lh3lh3 | 2009-02-14 21:22:30 +0000 (Sat, 14 Feb 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_mate.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-7
+ * fixed a bug in fixmate: SE reads are flagged as BAM_FMUNMAP
+
+------------------------------------------------------------------------
+r125 | lh3lh3 | 2009-02-13 09:54:45 +0000 (Fri, 13 Feb 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_stat.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-7
+ * fixed a minor bug in flagstat
+
+------------------------------------------------------------------------
+r124 | lh3lh3 | 2009-02-12 11:15:32 +0000 (Thu, 12 Feb 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/indel_filter.pl
+
+ * samtools-0.1.2-6
+ * improve indel caller by setting maximum window size
+
+------------------------------------------------------------------------
+r123 | lh3lh3 | 2009-02-12 10:30:29 +0000 (Thu, 12 Feb 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * output max mapping quality in indel line
+
+------------------------------------------------------------------------
+r122 | lh3lh3 | 2009-02-11 10:59:10 +0000 (Wed, 11 Feb 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/maq2sam.c
+
+fixed a bug in generating tag AM
+
+------------------------------------------------------------------------
+r121 | lh3lh3 | 2009-02-03 10:43:11 +0000 (Tue, 03 Feb 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bamtk.c
+
+fixed a potential memory problem in indexing
+
+------------------------------------------------------------------------
+r120 | bhandsaker | 2009-02-02 15:52:52 +0000 (Mon, 02 Feb 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+
+Pass LIBS to recursive targets to facilitate building at Broad.
+
+------------------------------------------------------------------------
+r119 | lh3lh3 | 2009-02-02 10:12:15 +0000 (Mon, 02 Feb 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bam_stat.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-3
+ * fixed a bug in generating GLFv2 for indels
+ * improve flagstat report a little bit
+
+------------------------------------------------------------------------
+r118 | lh3lh3 | 2009-01-29 12:33:23 +0000 (Thu, 29 Jan 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ A /trunk/samtools/bam_stat.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.2-1
+ * added flagstat command
+
+------------------------------------------------------------------------
+r116 | lh3lh3 | 2009-01-28 13:31:12 +0000 (Wed, 28 Jan 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+Release SAMtools-0.1.2
+
+------------------------------------------------------------------------
+r115 | lh3lh3 | 2009-01-28 12:54:08 +0000 (Wed, 28 Jan 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/misc/indel_filter.pl
+
+Script for filtering indel results
+
+------------------------------------------------------------------------
+r114 | lh3lh3 | 2009-01-25 11:45:37 +0000 (Sun, 25 Jan 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/misc/zoom2sam.pl
+
+convert ZOOM to SAM
+
+------------------------------------------------------------------------
+r113 | lh3lh3 | 2009-01-24 14:25:07 +0000 (Sat, 24 Jan 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/misc/novo2sam.pl
+
+add a script to convert novo alignment to SAM
+
+------------------------------------------------------------------------
+r112 | lh3lh3 | 2009-01-23 20:57:39 +0000 (Fri, 23 Jan 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/ChangeLog.old
+ M /trunk/samtools/samtools.1
+
+update documentation and ChangeLog
+
+------------------------------------------------------------------------
+r111 | lh3lh3 | 2009-01-23 19:22:59 +0000 (Fri, 23 Jan 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.1-19
+ * fixed a bug in "merge" command line
+
+------------------------------------------------------------------------
+r110 | lh3lh3 | 2009-01-22 15:36:48 +0000 (Thu, 22 Jan 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/misc/Makefile
+ A /trunk/samtools/misc/bowtie2sam.pl (from /branches/dev/samtools/misc/bowtie2sam.pl:108)
+ M /trunk/samtools/misc/export2sam.pl
+ A /trunk/samtools/misc/soap2sam.pl (from /branches/dev/samtools/misc/soap2sam.pl:108)
+ A /trunk/samtools/misc/wgsim.c (from /branches/dev/samtools/misc/wgsim.c:108)
+ A /trunk/samtools/misc/wgsim_eval.pl (from /branches/dev/samtools/misc/wgsim_eval.pl:108)
+
+ * merge from branches/dev/
+ * all future development will happen here
+
+------------------------------------------------------------------------
+r109 | lh3lh3 | 2009-01-22 15:14:27 +0000 (Thu, 22 Jan 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/COPYING
+ M /trunk/samtools/ChangeLog
+ A /trunk/samtools/INSTALL (from /branches/dev/samtools/INSTALL:108)
+ M /trunk/samtools/Makefile
+ A /trunk/samtools/Makefile.lite (from /branches/dev/samtools/Makefile.lite:108)
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bam_lpileup.c
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_maqcns.h
+ A /trunk/samtools/bam_mate.c (from /branches/dev/samtools/bam_mate.c:108)
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bam_plcmd.c
+ A /trunk/samtools/bam_rmdup.c (from /branches/dev/samtools/bam_rmdup.c:108)
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.h
+ M /trunk/samtools/examples/00README.txt
+ A /trunk/samtools/examples/Makefile (from /branches/dev/samtools/examples/Makefile:108)
+ D /trunk/samtools/examples/ex1.fa.fai
+ M /trunk/samtools/examples/ex1.sam.gz
+ M /trunk/samtools/faidx.c
+ A /trunk/samtools/glf.c (from /branches/dev/samtools/glf.c:108)
+ M /trunk/samtools/glf.h
+ M /trunk/samtools/misc/Makefile
+ M /trunk/samtools/misc/maq2sam.c
+ M /trunk/samtools/razf.c
+ M /trunk/samtools/source.dot
+
+ * Merge from branches/dev/
+ * all future development will happen here at trunk/
+
+------------------------------------------------------------------------
+r79 | bhandsaker | 2009-01-07 21:42:15 +0000 (Wed, 07 Jan 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_tview.c
+
+Fix problem with compiling without curses.
+
+------------------------------------------------------------------------
+r63 | lh3lh3 | 2008-12-22 15:58:02 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ A /trunk/samtools (from /branches/dev/samtools:62)
+
+Create trunk copy
+
+------------------------------------------------------------------------
+r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/NEWS
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+Release samtools-0.1.1
+
+------------------------------------------------------------------------
+r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines
+Changed paths:
+ M /branches/dev/samtools/bam_aux.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/razf.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-66
+ * fixed a bug in razf.c: reset z_eof when razf_seek() is called
+ * fixed a memory leak in parsing a region
+ * changed pileup a little bit when -s is in use: output ^ and $
+ * when a bam is not indexed, output more meaningful error message
+ * fixed a bug in indexing for small alignment
+ * fixed a bug in the viewer when we come to the end of a reference file
+ * updated documentation
+ * prepare to release 0.1.1
+
+------------------------------------------------------------------------
+r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/examples
+ A /branches/dev/samtools/examples/00README.txt
+ A /branches/dev/samtools/examples/ex1.fa
+ A /branches/dev/samtools/examples/ex1.fa.fai
+ A /branches/dev/samtools/examples/ex1.sam.gz
+
+example
+
+------------------------------------------------------------------------
+r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+
+update ChangeLog
+
+------------------------------------------------------------------------
+r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/misc/export2sam.pl
+
+ * added comments
+ * fixed several bugs
+
+------------------------------------------------------------------------
+r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/misc/export2sam.pl
+
+convert Export format to SAM; not thoroughly tested
+
+------------------------------------------------------------------------
+r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/source.dot
+
+ * samtools-0.1.0-65
+ * pileup: generate maq-like simple output
+ * pileup: allow to output pileup at required sites
+ * source.dot: source file relationship graph
+ * tview: fixed a minor bug
+
+------------------------------------------------------------------------
+r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines
+Changed paths:
+ D /branches/dev/samtools/misc/all2sam.pl
+
+remove all2sam.pl
+
+------------------------------------------------------------------------
+r54 | lh3lh3 | 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/COPYING
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/faidx.h
+ M /branches/dev/samtools/khash.h
+ M /branches/dev/samtools/kseq.h
+ M /branches/dev/samtools/ksort.h
+ M /branches/dev/samtools/samtools.1
+
+Added copyright information and a bit more documentation. No code change.
+
+------------------------------------------------------------------------
+r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-64
+ * improved efficiency of the indel caller for spliced alignments
+
+------------------------------------------------------------------------
+r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_aux.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-63
+ * a bit of code cleanup: reduce the dependency between source files
+
+------------------------------------------------------------------------
+r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-62
+ * fixed a memory leak
+
+------------------------------------------------------------------------
+r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/samtools.1
+
+update documentation, ChangeLog and a comment
+
+------------------------------------------------------------------------
+r49 | lh3lh3 | 2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_maqcns.h
+ M /branches/dev/samtools/bam_pileup.c
+ A /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-61
+ * moved pileup command to a separate source file
+ * added indel caller
+ * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!)
+ * updated documentation
+
+------------------------------------------------------------------------
+r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-60
+ * fixed another bug in maqcns when there is a nearby deletion
+
+------------------------------------------------------------------------
+r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-59
+ * pileup: outputting consensus is now optional
+ * fixed a bug in glfgen. This bug also exists in maq's glfgen. However,
+ I am not quite sure why the previous version may have problem.
+
+------------------------------------------------------------------------
+r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-58
+ * add maq consensus to pileup. However, I will move this part to a new
+ command as strictly speaking, consensus calling is not part of pileup,
+ and imposing it would make it harder to generate for other language
+ bindings.
+
+------------------------------------------------------------------------
+r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bgzf.c
+
+Fix bug in tell() after reads that consume to the exact end of a block.
+
+------------------------------------------------------------------------
+r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-57
+ * fixed a bug in parser when there are auxiliary fields
+ * made the parser a bit more robust
+
+------------------------------------------------------------------------
+r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-56
+ * fixed a bug in bgzf (only reading is affected)
+ * fixed a typo in bam_index.c
+ * in bam_index.c, check potential bugs in the underlying I/O library
+
+------------------------------------------------------------------------
+r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-55
+ * tried to make pileup work with clipping (previously not), though NOT tested
+ * removed -v from pileup
+ * made pileup take the reference sequence
+
+------------------------------------------------------------------------
+r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-54
+ * in parser, recognize "=", rather than ",", as a match
+ * in parser, correctly parse "=" at the MRNM field.
+
+------------------------------------------------------------------------
+r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/maq2sam.c
+
+fixed a bug in handling maq flag 64 and 192
+
+------------------------------------------------------------------------
+r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/md5fa.c
+
+also calculate unordered md5sum check
+
+------------------------------------------------------------------------
+r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/md5fa.c
+
+fixed a minor bug when there are spaces in the sequence
+
+------------------------------------------------------------------------
+r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/md5fa.c
+
+fixed a potential memory leak
+
+------------------------------------------------------------------------
+r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+
+ * fixed a bug in import: bin is wrongly calculated
+
+------------------------------------------------------------------------
+r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/all2sam.pl
+
+nothing, really
+
+------------------------------------------------------------------------
+r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/kseq.h
+ M /branches/dev/samtools/misc/Makefile
+ A /branches/dev/samtools/misc/md5.c
+ A /branches/dev/samtools/misc/md5.h
+ A /branches/dev/samtools/misc/md5fa.c
+
+ * fixed two warnings in kseq.h
+ * added md5sum utilities
+
+------------------------------------------------------------------------
+r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/kseq.h
+ D /branches/dev/samtools/kstream.h
+
+ * samtools-0.1.0-52
+ * replace kstream with kseq. kseq is a superset of kstream. I need the
+ extra functions in kseq.h.
+ * also compile stand-alone faidx
+
+------------------------------------------------------------------------
+r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_sort.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-51
+ * sorting by read names is available
+
+------------------------------------------------------------------------
+r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bam_sort.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/misc/maq2sam.c
+
+ * samtools-0.1.0-50
+ * format change to meet the latest specification
+
+------------------------------------------------------------------------
+r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/misc/maq2sam.c
+
+ * minor change in maqcns: special care when n==0
+ * change maq2sam to meet the latest specification
+
+------------------------------------------------------------------------
+r27 | lh3lh3 | 2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+ M /branches/dev/samtools/razf.h
+
+considerable code clean up in razf
+
+------------------------------------------------------------------------
+r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/faidx.c
+
+make RAZF optional in faidx.c
+
+------------------------------------------------------------------------
+r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_aux.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-49
+ * added routines for retrieving aux data, NOT TESTED YET!
+
+------------------------------------------------------------------------
+r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-48
+ * bgzf: fixed a potential integer overflow on 32-bit machines
+ * maqcns: set the minimum combined quality as 0
+ * supporting hex strings
+
+------------------------------------------------------------------------
+r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-47
+ * fixed the bug in maqcns
+
+------------------------------------------------------------------------
+r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ A /branches/dev/samtools/bam_maqcns.c
+ A /branches/dev/samtools/bam_maqcns.h
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/glf.h
+
+ * samtools-0.1.0-46
+ * add MAQ consensus caller, currently BUGGY!
+
+------------------------------------------------------------------------
+r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-45
+ * tview: display padded alignment (but not P operation)
+ * better coordinates and reference sequence
+
+------------------------------------------------------------------------
+r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/ChangeLog
+
+new ChangeLog
+
+------------------------------------------------------------------------
+r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+ D /branches/dev/samtools/ChangeLog
+ A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6)
+
+Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from
+the log of my personal SVN repository.
+
+------------------------------------------------------------------------
+r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-44
+ * declare fseeko and ftello as some Linux may not do this by default and
+ missing these declarations will make bgzf buggy
+ * get rid of some harmless warnings
+ * use BGZF by default, now
+
+------------------------------------------------------------------------
+r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/razf.c
+
+ * samtools-0.1.0-43
+ * fixed a bug in razf_read()
+ * give more warnings when the file is truncated (or due to bugs in I/O library)
+
+------------------------------------------------------------------------
+r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bgzf.c
+
+fixed a bug in bgzf.c at the end of the file
+
+------------------------------------------------------------------------
+r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-42
+ * a lot happened to RAZF, although samtools itself is untouched. Better
+ also update the version number anyway to avoid confusion
+
+------------------------------------------------------------------------
+r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+
+a change from Jue, but I think it should not matter
+
+------------------------------------------------------------------------
+r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+
+fixed a potential bug in razf. However, it seems still buggy, just
+rarely happens, very rarely.
+
+------------------------------------------------------------------------
+r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+
+fixed a bug in razf, with the help of Jue
+
+------------------------------------------------------------------------
+r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+
+remove a comment
+
+------------------------------------------------------------------------
+r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/razf.c
+ M /branches/dev/samtools/razf.h
+
+ * Jue has updated razf to realize Bob's scheme
+
+------------------------------------------------------------------------
+r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/samtools.1
+
+the manual page
+
+------------------------------------------------------------------------
+r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines
+Changed paths:
+ A /branches/dev/samtools/ChangeLog
+ A /branches/dev/samtools/Makefile
+ A /branches/dev/samtools/bam.c
+ A /branches/dev/samtools/bam.h
+ A /branches/dev/samtools/bam_aux.c
+ A /branches/dev/samtools/bam_endian.h
+ A /branches/dev/samtools/bam_import.c
+ A /branches/dev/samtools/bam_index.c
+ A /branches/dev/samtools/bam_lpileup.c
+ A /branches/dev/samtools/bam_pileup.c
+ A /branches/dev/samtools/bam_sort.c
+ A /branches/dev/samtools/bam_tview.c
+ A /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/bgzf.c
+ A /branches/dev/samtools/bgzf.h
+ A /branches/dev/samtools/bgzip.c
+ A /branches/dev/samtools/faidx.c
+ A /branches/dev/samtools/faidx.h
+ A /branches/dev/samtools/khash.h
+ A /branches/dev/samtools/ksort.h
+ A /branches/dev/samtools/kstream.h
+ A /branches/dev/samtools/misc
+ A /branches/dev/samtools/misc/Makefile
+ A /branches/dev/samtools/misc/all2sam.pl
+ A /branches/dev/samtools/misc/maq2sam.c
+ A /branches/dev/samtools/razf.c
+ A /branches/dev/samtools/razf.h
+ A /branches/dev/samtools/razip.c
+ A /branches/dev/samtools/zutil.h
+
+The initial version of samtools, replicated from my local SVN repository.
+The current version is: 0.1.0-42. All future development will happen here.
+
+------------------------------------------------------------------------
+r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools
+
+samtools (C version)
+
+------------------------------------------------------------------------
--- /dev/null
+System Requirements
+===================
+
+SAMtools depends on the zlib library <http://www.zlib.net>. The latest
+version 1.2.3 is preferred and with the latest version you can compile
+razip and use it to compress a FASTA file. SAMtools' faidx is able to
+index a razip-compressed FASTA file to save disk space. Older zlib also
+works with SAMtools, but razip cannot be compiled.
+
+The text-based viewer (tview) requires the GNU ncurses library
+<http://www.gnu.org/software/ncurses/>, which comes with Mac OS X and
+most of the modern Linux/Unix distributions. If you do not have this
+library installed, you can still compile the rest of SAMtools by
+manually modifying one line in Makefile.
+
+
+Compilation
+===========
+
+Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can
+compile razip with `make razip'.
+
+
+Installation
+============
+
+Simply copy `samtools' and other executables/scripts in `misc' to a
+location you want (e.g. a directory in your $PATH). No further
+configurations are required.
--- /dev/null
+# Makefile for SAMtools: builds the static library libbam.a and the
+# `samtools' executable, and recurses into the directories in SUBDIRS.
+
+CC=		gcc
+CXX=		g++
+CFLAGS=		-g -Wall -O2 #-m64 #-arch ppc
+CXXFLAGS=	$(CFLAGS)
+DFLAGS=		-D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE #-D_NO_CURSES
+# Objects archived into libbam.a
+LOBJS=		bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
+		bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \
+		bam_sort.o
+# Objects that are only part of the command-line program
+AOBJS=		bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
+		bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
+		bamtk.o
+PROG=		samtools
+INCLUDES=
+SUBDIRS=	. misc
+LIBPATH=
+
+# None of these names corresponds to a file that a rule creates.
+.PHONY:all lib clean cleanlocal \
+	all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
+.SUFFIXES:.c .o
+
+.c.o:
+	$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+# First ordinary target in the file, hence the default goal: strip the
+# `-recur' suffix and run the resulting target in every directory of SUBDIRS.
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+	@target=`echo $@ | sed s/-recur//`; \
+	wdir=`pwd`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+		cd $$subdir; \
+		$(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+			INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+		cd $$wdir; \
+	done;
+
+all:$(PROG)
+
+lib:libbam.a
+
+libbam.a:$(LOBJS)
+	$(AR) -cru $@ $(LOBJS)
+
+### For the curses library: comment out `-lcurses' if you do not have curses installed
+# Depend on the archive itself (not the phony `lib') so that samtools is
+# relinked only when something changed.  libbam is listed before the system
+# libraries it uses, because linkers resolve archive references left to right.
+samtools:libbam.a $(AOBJS)
+	$(CC) $(CFLAGS) -o $@ $(AOBJS) -L. -lbam $(LIBPATH) -lcurses -lm -lz
+
+# NOTE(review): if razf.o/bgzf.o reference knetfile symbols when built with
+# -D_USE_KNETFILE, knetfile.o has to be added to the two rules below -- confirm.
+razip:razip.o razf.o
+	$(CC) $(CFLAGS) -o $@ razf.o razip.o -lz
+
+bgzip:bgzip.o bgzf.o
+	$(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz
+
+# Header dependencies
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h kstring.h
+sam.o:sam.h bam.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h bam_maqcns.h
+bam_maqcns.o:bam.h ksort.h bam_maqcns.h
+bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
+glf.o:glf.h
+
+faidx.o:faidx.h razf.h khash.h
+faidx_main.o:faidx.h razf.h
+
+cleanlocal:
+	rm -fr gmon.out *.o a.out *.dSYM razip bgzip $(PROG) *~ *.a
+
+clean:cleanlocal-recur
--- /dev/null
+Beta Release 0.1.5 (7 July, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Support opening a BAM alignment on FTP. Users can now use "tview" to
+ view alignments at the NCBI ftp site. Please read manual for more
+ information.
+
+ * In library, propagate errors rather than exiting or failing an
+ assertion.
+
+ * Simplified the building system and fixed compiling errors caused by
+ zlib<1.2.2.1.
+
+ * Fixed an issue about lost header information when a SAM is imported
+ with "view -t".
+
+ * Implemented "samtools.pl varFilter" which filters both SNPs and short
+ indels. This command replaces "indelFilter".
+
+ * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from
+ pileup output.
+
+ * In pileup, cap mapping quality at 60. This helps filtering when
+ different aligners are in use.
+
+ * In pileup, allow to output variant sites only.
+
+ * Made pileup generate correct calls in repetitive region. At the same
+ time, I am considering implementing a simplified model in SOAPsnp,
+ although this has not happened yet.
+
+ * In view, added '-u' option to output BAM without compression. This
+ option is preferred when the output is piped to other commands.
+
+ * In view, added '-l' and '-r' to get the alignments for one library or
+ read group. The "@RG" header lines are now partially parsed.
+
+ * Do not include command line utilities to libbam.a.
+
+ * Fixed memory leaks in pileup and bam_view1().
+
+ * Made faidx more tolerant to empty lines right before or after FASTA >
+ lines.
+
+
+Changes in other utilities:
+
+ * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign.
+
+
+This release involves several modifications to the key code base which
+may potentially introduce new bugs even though we have tried to minimize
+this by testing on several examples. Please let us know if you catch
+bugs.
+
+(0.1.5: 7 July 2009, r373)
+
+
+
+Beta Release 0.1.4 (21 May, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Added the 'rmdupse' command: removing duplicates for SE reads.
+
+ * Fixed a critical bug in the indel caller: clipped alignments are not
+ processed correctly.
+
+ * Fixed a bug in the tview: gapped alignment may be incorrectly
+ displayed.
+
+ * Unified the interface to BAM and SAM I/O. This is done by
+ implementing a wrapper on top of the old APIs and therefore old APIs
+ are still valid. The new I/O APIs also recognize the @SQ header
+ lines.
+
+ * Generate the MD tag.
+
+ * Generate "=" bases. However, the indel caller will not work when "="
+ bases are present.
+
+ * Enhanced support of color-read display (by Nils Homer).
+
+ * Implemented the GNU building system. However, currently the building
+ system does not generate libbam.a. We will improve this later. For
+ the time being, `make -f Makefile.generic' is preferred.
+
+ * Fixed a minor bug in pileup: the first read in a chromosome may be
+ skipped.
+
+ * Fixed bugs in bam_aux.c. These bugs do not affect other components as
+ they were not used previously.
+
+ * Output the 'SM' tag from maq2sam.
+
+(0.1.4: 21 May 2009, r297)
+
+
+
+Beta Release 0.1.3 (15 April, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in SAMtools:
+
+ * SAMtools is more consistent with the specification: a) '*' in the
+ QUAL field is allowed; b) the field separator is TAB only and SPACE
+ is treated as a character in a field; c) empty header is allowed.
+
+ * Implemented GLFv3 support in pileup.
+
+ * Fixed a severe bug in fixmate: strand information is wrongly
+ overwritten.
+
+ * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are
+ not correctly retrieved sometimes.
+
+ * Fixed a bug in rmdup: segfault if unmapped reads are present.
+
+ * Move indel_filter.pl to samtools.pl and improved the filtering by
+ checking the actual number of alignments containing indels. The indel
+ pileup line is also changed a little to make this filtration easier.
+
+ * Fixed a minor bug in indexing: the bin number of an unmapped read is
+ wrongly calculated.
+
+ * Added `flagstat' command to show statistics on the FLAG field.
+
+ * Improved indel caller by setting the maximum window size in local
+ realignment.
+
+Changes in other utilities:
+
+ * Fixed a bug in maq2sam: a tag name is obsolete.
+
+ * Improvement to wgsim: a) added support for SOLiD read simulation; b)
+ show the number of substitutions/indels/errors in read name; c)
+ considerable code clean up.
+
+ * Various converters: improved functionality in general.
+
+ * Updated the example SAM due to the previous bug in fixmate.
+
+(0.1.3: 15 April 2009, r227)
+
+
+
+Beta Release 0.1.2 (28 January, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in SAMtools:
+
+ * Implemented a Bayesian indel caller. The new caller generates scores
+ and genotype and is potentially more accurate than Maq's indel
+ caller. The pileup format is also changed accordingly.
+
+ * Implemented rmdup command: remove potential PCR duplicates. Note that
+ this command ONLY works for FR orientation and requires ISIZE is
+ correctly set.
+
+ * Added fixmate command: fill in mate coordinates, ISIZE and mate
+ related flags from a name-sorted alignment.
+
+ * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved.
+
+ * Allow to select reads shown in the pileup output with a mask.
+
+ * Generate GLFv2 from pileup.
+
+ * Added two more flags for flagging PCR/optical duplicates and for QC
+ failure.
+
+ * Fixed a bug in sort command: name sorting for large alignment did not
+ work.
+
+ * Allow to completely disable RAZF (using Makefile.lite) as some people
+ have problems compiling it.
+
+ * Fixed a bug in import command when there are reads without
+ coordinates.
+
+ * Fixed a bug in tview: clipping broke the alignment viewer.
+
+ * Fixed a compiling error when _NO_CURSES is applied.
+
+ * Fixed a bug in merge command.
+
+Changes in other utilities:
+
+ * Added wgsim, a paired-end reads simulator. Wgsim was adapted from
+ maq's reads simulator. Colin Hercus further improved it to allow
+ longer indels.
+
+ * Added wgsim_eval.pl, a script that evaluates the accuracy of
+ alignment on reads generated by wgsim.
+
+ * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not
+ work properly when multiple hits are output.
+
+ * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will
+ be retained when multiple hits are present.
+
+ * Fixed a bug in export2sam.pl for QC reads.
+
+ * Support RG tag at MAQ->SAM converter.
+
+ * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and
+ indel are not properly handled, though.
+
+ * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the
+ default Illumina output.
+
+(0.1.2: 28 January 2009; r116)
+
+
+
+Beta Release 0.1.1 (22 December, 2008)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is the first public release of samtools. For more information,
+please check the manual page `samtools.1' and the samtools website
+http://samtools.sourceforge.net
\ No newline at end of file
--- /dev/null
+#include <stdio.h>
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "bam_endian.h"
+#include "kstring.h"
+
+/* Endianness flag: assigned from bam_is_big_endian() in bam_header_init()
+ * and tested by the I/O routines in this file to decide whether values
+ * must be byte-swapped when they are read or written. */
+int bam_is_be = 0;
+
+/**************************
+ * CIGAR related routines *
+ **************************/
+
+/*
+ * Scan the CIGAR and record in `reg' the boundaries of a segment relative to
+ * `pos'.  The segment starts at the first M/D/I operation whose reference
+ * end (running reference coordinate + length) exceeds `pos'; tbeg/qbeg/cbeg
+ * receive the reference coordinate, query coordinate and CIGAR index there.
+ * After the start has been found, tend/qend/cend are overwritten at every
+ * soft clip, hard clip, reference skip and at the last CIGAR operation.
+ *
+ * Returns 0 when a start was found, -1 otherwise.
+ */
+int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg)
+{
+	int32_t ref_pos = c->pos, qry_pos = 0;
+	int found = 0;
+	unsigned i;
+
+	for (i = 0; i < c->n_cigar; ++i) {
+		const int op = cigar[i] & BAM_CIGAR_MASK;
+		const int len = cigar[i] >> BAM_CIGAR_SHIFT;
+
+		/* Open the segment (only once). */
+		if (!found && (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CINS) && ref_pos + len > pos) {
+			reg->tbeg = ref_pos;
+			reg->qbeg = qry_pos;
+			reg->cbeg = i;
+			found = 1;
+		}
+
+		/* Advance the reference and/or query coordinate. */
+		if (op == BAM_CMATCH) {
+			ref_pos += len;
+			qry_pos += len;
+		} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+			ref_pos += len;
+		} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+			qry_pos += len;
+		}
+
+		/* Record the end at clips, skips and the final operation. */
+		if (found) {
+			if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CREF_SKIP || i == c->n_cigar - 1) {
+				reg->tend = ref_pos;
+				reg->qend = qry_pos;
+				reg->cend = i;
+			}
+		}
+	}
+	return found ? 0 : -1;
+}
+
+/*
+ * Return c->pos plus the summed lengths of all CIGAR operations that
+ * advance along the reference (match, deletion, reference skip).
+ */
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
+{
+	uint32_t end = c->pos;
+	uint32_t i;
+
+	for (i = 0; i < c->n_cigar; ++i) {
+		const int op = cigar[i] & BAM_CIGAR_MASK;
+		if (op != BAM_CMATCH && op != BAM_CDEL && op != BAM_CREF_SKIP)
+			continue; /* does not consume reference bases */
+		end += cigar[i] >> BAM_CIGAR_SHIFT;
+	}
+	return end;
+}
+
+/*
+ * Return the summed lengths of all CIGAR operations that consume query
+ * bases (match, insertion, soft clip).
+ */
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
+{
+	int32_t qlen = 0;
+	uint32_t i;
+
+	for (i = 0; i < c->n_cigar; ++i) {
+		const int op = cigar[i] & BAM_CIGAR_MASK;
+		if (op != BAM_CMATCH && op != BAM_CINS && op != BAM_CSOFT_CLIP)
+			continue; /* does not consume query bases */
+		qlen += cigar[i] >> BAM_CIGAR_SHIFT;
+	}
+	return qlen;
+}
+
+/********************
+ * BAM I/O routines *
+ ********************/
+
+/*
+ * Allocate a zero-filled header.  As a side effect the global bam_is_be is
+ * refreshed from bam_is_big_endian().  Returns the calloc() result, which
+ * the caller releases with bam_header_destroy().
+ */
+bam_header_t *bam_header_init()
+{
+	bam_header_t *h;
+
+	bam_is_be = bam_is_big_endian();
+	h = (bam_header_t*)calloc(1, sizeof(bam_header_t));
+	return h;
+}
+
+/*
+ * Release a header and everything it owns: the target names and lengths,
+ * the text and, unless BAM_NO_HASH is defined, the rg2lib map and the
+ * header hash.  A null `header' is accepted and ignored.
+ */
+void bam_header_destroy(bam_header_t *header)
+{
+	extern void bam_destroy_header_hash(bam_header_t *header);
+
+	if (header == 0) return;
+
+	if (header->target_name != 0) {
+		int32_t k = 0;
+		while (k < header->n_targets)
+			free(header->target_name[k++]);
+		free(header->target_name);
+		free(header->target_len);
+	}
+	free(header->text);
+#ifndef BAM_NO_HASH
+	if (header->rg2lib != 0)
+		bam_strmap_destroy(header->rg2lib);
+	bam_destroy_header_hash(header);
+#endif
+	free(header);
+}
+
+/*
+ * Read a BAM header from `fp'.
+ *
+ * Layout read: 4-byte magic "BAM\001", 32-bit text length, the text, 32-bit
+ * number of targets, then for each target a 32-bit name length, the name
+ * and a 32-bit target length.  Integers are byte-swapped when bam_is_be is
+ * set.
+ *
+ * Returns a header to be released with bam_header_destroy(), or 0 when the
+ * magic is wrong, the stream is truncated, a length field is invalid or
+ * memory cannot be allocated.  Nothing is leaked on failure.
+ */
+bam_header_t *bam_header_read(bamFile fp)
+{
+	bam_header_t *header;
+	char buf[4];
+	int32_t i, name_len;
+	// read "BAM1"
+	if (bam_read(fp, buf, 4) != 4) return 0;
+	if (strncmp(buf, "BAM\001", 4)) {
+		fprintf(stderr, "[bam_header_read] wrong header\n");
+		return 0;
+	}
+	header = bam_header_init();
+	if (header == 0) return 0;
+	// read plain text and the number of reference sequences
+	if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
+	if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+	if (header->l_text < 0) goto fail; // corrupted length
+	header->text = (char*)calloc((size_t)header->l_text + 1, 1);
+	if (header->text == 0) goto fail;
+	if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail;
+	if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
+	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+	if (header->n_targets < 0) goto fail; // corrupted count
+	// read reference sequence names and lengths
+	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+	// calloc(0, ...) may legitimately return a null pointer
+	if (header->n_targets > 0 && (header->target_name == 0 || header->target_len == 0)) goto fail;
+	for (i = 0; i != header->n_targets; ++i) {
+		if (bam_read(fp, &name_len, 4) != 4) goto fail;
+		if (bam_is_be) bam_swap_endian_4p(&name_len);
+		if (name_len < 1) goto fail; // bam_header_write() always stores the trailing NUL
+		// one spare zero byte keeps the name terminated even if the file omits the NUL
+		header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
+		if (header->target_name[i] == 0) goto fail;
+		if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
+		if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
+		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+	}
+	return header;
+
+fail:
+	fprintf(stderr, "[bam_header_read] truncated or corrupted header\n");
+	// bam_header_destroy() frees target_len only together with target_name
+	if (header->target_name == 0) {
+		free(header->target_len);
+		header->target_len = 0;
+	}
+	bam_header_destroy(header);
+	return 0;
+}
+
+/* Write one 32-bit value to `fp', byte-swapped first when bam_is_be is set. */
+static void bam_header_put32(bamFile fp, uint32_t v)
+{
+	if (bam_is_be) v = bam_swap_endian_4(v);
+	bam_write(fp, &v, 4);
+}
+
+/*
+ * Write `header' to `fp': the magic "BAM\001", the text length and text,
+ * the number of targets, then for every target its name length (including
+ * the trailing NUL), its name and its length.  Always returns 0.
+ */
+int bam_header_write(bamFile fp, const bam_header_t *header)
+{
+	char magic[4];
+	int32_t t;
+
+	// write "BAM1"
+	memcpy(magic, "BAM\001", 4);
+	bam_write(fp, magic, 4);
+
+	// write plain text and the number of reference sequences
+	bam_header_put32(fp, header->l_text);
+	if (header->l_text)
+		bam_write(fp, header->text, header->l_text);
+	bam_header_put32(fp, header->n_targets);
+
+	// write sequence names and lengths
+	for (t = 0; t != header->n_targets; ++t) {
+		char *name = header->target_name[t];
+		int32_t name_len = strlen(name) + 1;
+		bam_header_put32(fp, name_len);
+		bam_write(fp, name, name_len);
+		bam_header_put32(fp, header->target_len[t]);
+	}
+	return 0;
+}
+
+/*
+ * Byte-swap, in place, the multi-byte fields of a record's data block: every
+ * CIGAR word (starting l_qname bytes into `data') and every 2-, 4- or 8-byte
+ * auxiliary value found after the query name, CIGAR, sequence and qualities.
+ * Auxiliary fields are walked as: 2-byte key, 1-byte type, value.
+ */
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
+{
+	uint32_t *cigar = (uint32_t*)(data + c->l_qname);
+	uint8_t *p = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+	uint32_t k;
+
+	for (k = 0; k < c->n_cigar; ++k)
+		bam_swap_endian_4p(&cigar[k]);
+
+	while (p < data + data_len) {
+		uint8_t type = toupper(p[2]); /* type byte follows the 2-byte key */
+		p += 3;                       /* skip key and type */
+		switch (type) {
+		case 'C': case 'A':
+			++p;
+			break;
+		case 'S':
+			bam_swap_endian_2p(p); p += 2;
+			break;
+		case 'I': case 'F':
+			bam_swap_endian_4p(p); p += 4;
+			break;
+		case 'D':
+			bam_swap_endian_8p(p); p += 8;
+			break;
+		case 'Z': case 'H':
+			while (*p) ++p; /* NUL-terminated value */
+			++p;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/*
+ * Read one alignment record from `fp' into `b', growing b->data as needed.
+ *
+ * Returns the number of bytes consumed (4 + block length) on success, or
+ *   -1  end of file before a new record starts,
+ *   -2  truncated block-length field,
+ *   -3  truncated fixed-size core,
+ *   -4  invalid block length, out of memory, truncated data block, or a
+ *       data block too short for the lengths stated in the core.
+ */
+int bam_read1(bamFile fp, bam1_t *b)
+{
+	bam1_core_t *c = &b->core;
+	int32_t block_len, ret, i;
+	uint32_t x[8];
+
+	assert(BAM_CORE_SIZE == 32);
+	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+		if (ret == 0) return -1; // normal end-of-file
+		else return -2; // truncated
+	}
+	if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
+	if (bam_is_be) {
+		bam_swap_endian_4p(&block_len);
+		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+	}
+	// a block shorter than the core would make data_len negative
+	if (block_len < BAM_CORE_SIZE) return -4;
+	c->tid = x[0]; c->pos = x[1];
+	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+	c->l_qseq = x[4];
+	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+	b->data_len = block_len - BAM_CORE_SIZE;
+	if (b->m_data < b->data_len) {
+		// grow through a temporary so the old buffer survives a failed realloc
+		int new_cap = b->data_len;
+		uint8_t *grown;
+		kroundup32(new_cap);
+		grown = (uint8_t*)realloc(b->data, new_cap);
+		if (grown == 0) return -4;
+		b->data = grown;
+		b->m_data = new_cap;
+	}
+	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+	// the fixed part claims more bytes than the block holds: corrupted record
+	if (b->l_aux < 0) return -4;
+	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
+	return 4 + block_len;
+}
+
+/*
+ * Write one record to `fp': the 32-bit block length, the 32-byte packed
+ * core and `data_len' bytes of `data'.  When bam_is_be is set, all three
+ * parts are byte-swapped for output; `data' is swapped in place and swapped
+ * back after the write, so the caller's buffer ends up unchanged.
+ * Returns 4 + block length.
+ */
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
+{
+	uint32_t core[8];
+	uint32_t block_len = data_len + BAM_CORE_SIZE;
+
+	assert(BAM_CORE_SIZE == 32);
+	/* pack the core into eight 32-bit words */
+	core[0] = c->tid;
+	core[1] = c->pos;
+	core[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
+	core[3] = (uint32_t)c->flag<<16 | c->n_cigar;
+	core[4] = c->l_qseq;
+	core[5] = c->mtid;
+	core[6] = c->mpos;
+	core[7] = c->isize;
+
+	if (!bam_is_be) {
+		bam_write(fp, &block_len, 4);
+		bam_write(fp, core, BAM_CORE_SIZE);
+		bam_write(fp, data, data_len);
+	} else {
+		uint32_t swapped_len = block_len;
+		int k;
+		for (k = 0; k < 8; ++k) bam_swap_endian_4p(core + k);
+		bam_write(fp, bam_swap_endian_4p(&swapped_len), 4);
+		swap_endian_data(c, data_len, data); /* to file byte order */
+		bam_write(fp, core, BAM_CORE_SIZE);
+		bam_write(fp, data, data_len);
+		swap_endian_data(c, data_len, data); /* restore caller's buffer */
+	}
+	return 4 + block_len;
+}
+
+/* Write the record `b' to `fp' via bam_write1_core(); returns its result. */
+int bam_write1(bamFile fp, const bam1_t *b)
+{
+	const bam1_core_t *core = &b->core;
+	return bam_write1_core(fp, core, b->data_len, b->data);
+}
+
+/*
+ * Format the record `b' as one line of TAB-separated text: query name,
+ * flag, target name (or "*"), 1-based position, mapping quality, CIGAR (or
+ * "*"), mate target ("*", "=" or a name), 1-based mate position, insert
+ * size, sequence, qualities (or "*" when the first quality byte is 0xff),
+ * followed by the auxiliary fields as TAG:TYPE:VALUE.
+ *
+ * `header' supplies the target names.  The returned string is heap-allocated
+ * by the kstring routines and has no trailing newline; the caller frees it
+ * with free().
+ */
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
+{
+	static const char cigar_ops[] = "MIDNSHP";
+	uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
+	int i;
+	const bam1_core_t *c = &b->core;
+	kstring_t str;
+	str.l = str.m = 0; str.s = 0;
+
+	ksprintf(&str, "%s\t%d\t", bam1_qname(b), c->flag);
+	if (c->tid < 0) kputs("*\t", &str);
+	else ksprintf(&str, "%s\t", header->target_name[c->tid]);
+	ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual);
+	if (c->n_cigar == 0) kputc('*', &str);
+	else {
+		for (i = 0; i < c->n_cigar; ++i) {
+			uint32_t op = bam1_cigar(b)[i]&BAM_CIGAR_MASK;
+			// the mask admits codes beyond the table: print '?' instead of reading past it
+			ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, op < sizeof(cigar_ops) - 1? cigar_ops[op] : '?');
+		}
+	}
+	kputc('\t', &str);
+	if (c->mtid < 0) kputs("*\t", &str);
+	else if (c->mtid == c->tid) kputs("=\t", &str);
+	else ksprintf(&str, "%s\t", header->target_name[c->mtid]);
+	ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize);
+	for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
+	kputc('\t', &str);
+	if (t[0] == 0xff) kputc('*', &str);
+	else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+	s = bam1_aux(b);
+	while (s < b->data + b->data_len) {
+		uint8_t type, key[2];
+		key[0] = s[0]; key[1] = s[1];
+		s += 2; type = *s; ++s;
+		ksprintf(&str, "\t%c%c:", key[0], key[1]);
+		if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; }
+		else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; }
+		// 'c' is a signed byte: read it as int8_t so negative values keep their sign
+		else if (type == 'c') { ksprintf(&str, "i:%d", *(int8_t*)s); ++s; }
+		else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; }
+		else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; }
+		else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; }
+		else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; }
+		else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
+		else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
+		else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; }
+	}
+	return str.s;
+}
+
+/*
+ * Print the record `b' to standard output as one text line, using
+ * bam_format1() for the formatting and releasing the string afterwards.
+ */
+void bam_view1(const bam_header_t *header, const bam1_t *b)
+{
+	char *line;
+
+	line = bam_format1(header, b);
+	printf("%s\n", line);
+	free(line);
+}
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef BAM_BAM_H
+#define BAM_BAM_H
+
+/*!
+ @header
+
+ BAM library provides I/O and various operations on manipulating files
+ in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
+ format. It now supports importing from or exporting to TAM, sorting,
+ merging, generating pileup, and quick retrieval of reads overlapping
+ a specified region.
+
+ @copyright Genome Research Ltd.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#define _IOLIB 2
+
+#if _IOLIB == 1 && !defined(_NO_RAZF)
+#define BAM_TRUE_OFFSET
+#include "razf.h"
+/*! @abstract BAM file handler */
+typedef RAZF *bamFile;
+#define bam_open(fn, mode) razf_open(fn, mode)
+#define bam_dopen(fd, mode) razf_dopen(fd, mode)
+#define bam_close(fp) razf_close(fp)
+#define bam_read(fp, buf, size) razf_read(fp, buf, size)
+#define bam_write(fp, buf, size) razf_write(fp, buf, size)
+#define bam_tell(fp) razf_tell(fp)
+#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir)
+#elif _IOLIB == 2
+#define BAM_VIRTUAL_OFFSET16
+#include "bgzf.h"
+/*! @abstract BAM file handler */
+typedef BGZF *bamFile;
+#define bam_open(fn, mode) bgzf_open(fn, mode)
+#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
+#define bam_close(fp) bgzf_close(fp)
+#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
+#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
+#define bam_tell(fp) bgzf_tell(fp)
+#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
+#elif _IOLIB == 3
+#define BAM_VIRTUAL_OFFSET16
+#include "razf.h"
+/*! @abstract BAM file handler */
+typedef RAZF *bamFile;
+#define bam_open(fn, mode) razf_open2(fn, mode)
+#define bam_dopen(fd, mode) razf_dopen2(fd, mode)
+#define bam_close(fp) razf_close(fp)
+#define bam_read(fp, buf, size) razf_read(fp, buf, size)
+#define bam_write(fp, buf, size) razf_write(fp, buf, size)
+#define bam_tell(fp) razf_tell2(fp)
+#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir)
+#endif
+
+/*! @typedef
+ @abstract Structure for the alignment header.
+ @field n_targets number of reference sequences
+ @field target_name names of the reference sequences
+ @field target_len lengths of the reference sequences
+ @field hash hash table for fast name lookup
+ @field rg2lib hash table for @RG-ID -> LB lookup
+ @field l_text length of the plain text in the header
+ @field text plain text
+
+ @discussion Field hash is null by default and is a private member;
+ it is stored as an opaque pointer so that users of this header do
+ not need the hash table implementation.
+ */
+typedef struct {
+ int32_t n_targets;
+ char **target_name;
+ uint32_t *target_len;
+ void *hash, *rg2lib;
+ int l_text;
+ char *text;
+} bam_header_t;
+
+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
+#define BAM_FPAIRED 1
+/*! @abstract the read is mapped in a proper pair */
+#define BAM_FPROPER_PAIR 2
+/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
+#define BAM_FUNMAP 4
+/*! @abstract the mate is unmapped */
+#define BAM_FMUNMAP 8
+/*! @abstract the read is mapped to the reverse strand */
+#define BAM_FREVERSE 16
+/*! @abstract the mate is mapped to the reverse strand */
+#define BAM_FMREVERSE 32
+/*! @abstract this is read1 */
+#define BAM_FREAD1 64
+/*! @abstract this is read2 */
+#define BAM_FREAD2 128
+/*! @abstract not primary alignment */
+#define BAM_FSECONDARY 256
+/*! @abstract QC failure */
+#define BAM_FQCFAIL 512
+/*! @abstract optical or PCR duplicate */
+#define BAM_FDUP 1024
+
+/*! @abstract default mask for pileup */
+#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
+
+#define BAM_CORE_SIZE sizeof(bam1_core_t)
+
+/**
+ * Describing how CIGAR operation/length is packed in a 32-bit integer.
+ */
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+
+/*
+ CIGAR operations.
+ */
+/*! @abstract CIGAR: match */
+#define BAM_CMATCH 0
+/*! @abstract CIGAR: insertion to the reference */
+#define BAM_CINS 1
+/*! @abstract CIGAR: deletion from the reference */
+#define BAM_CDEL 2
+/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
+#define BAM_CREF_SKIP 3
+/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
+#define BAM_CSOFT_CLIP 4
+/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
+#define BAM_CHARD_CLIP 5
+/*! @abstract CIGAR: padding */
+#define BAM_CPAD 6
+
+/*! @typedef
+ @abstract Structure for core alignment information.
+ @field tid chromosome ID, defined by bam_header_t
+ @field pos 0-based leftmost coordinate
+ @field bin bin calculated by bam_reg2bin()
+ @field qual mapping quality
+ @field l_qname length of the query name
+ @field flag bitwise flag
+ @field n_cigar number of CIGAR operations
+ @field l_qseq length of the query sequence (read)
+ @field mtid presumably the chromosome ID of the mate -- TODO confirm
+ @field mpos presumably the 0-based leftmost coordinate of the mate -- TODO confirm
+ @field isize presumably the inferred insert size -- TODO confirm
+
+ @discussion There is no strand member: the strand is derived from
+ the BAM_FREVERSE bit of flag, see bam1_strand().
+ */
+typedef struct {
+ int32_t tid;
+ int32_t pos;
+ uint32_t bin:16, qual:8, l_qname:8;
+ uint32_t flag:16, n_cigar:16;
+ int32_t l_qseq;
+ int32_t mtid;
+ int32_t mpos;
+ int32_t isize;
+} bam1_core_t;
+
+/*! @typedef
+ @abstract Structure for one alignment.
+ @field core core information about the alignment
+ @field l_aux length of auxiliary data
+ @field data_len current length of bam1_t::data
+ @field m_data maximum length of bam1_t::data
+ @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+
+ @discussion Notes:
+
+ 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
+ 2. l_qseq is calculated from the total length of an alignment block
+ on reading or from CIGAR.
+ 3. The layout of data is the one decoded by the bam1_qname(),
+ bam1_cigar(), bam1_seq(), bam1_qual() and bam1_aux() macros: the
+ query name sits at offset 0 and the CIGAR array follows it.
+ */
+typedef struct {
+ bam1_core_t core;
+ int l_aux, data_len, m_data;
+ uint8_t *data;
+} bam1_t;
+
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
+#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
+
+/*! @function
+ @abstract Get the CIGAR array
+ @param b pointer to an alignment
+ @return pointer to the CIGAR array
+
+ @discussion In the CIGAR array, each element is a 32-bit integer. The
+ lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
+ length of a CIGAR.
+ */
+#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
+
+/*! @function
+ @abstract Get the name of the query
+ @param b pointer to an alignment
+ @return pointer to the name string, null terminated
+ */
+#define bam1_qname(b) ((char*)((b)->data))
+
+/*! @function
+ @abstract Get query sequence
+ @param b pointer to an alignment
+ @return pointer to sequence
+
+ @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+ 8 for T and 15 for N. Two bases are packed in one byte with the base
+ at the higher 4 bits having smaller coordinate on the read. It is
+ recommended to use bam1_seqi() macro to get the base.
+ */
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
+
+/*! @function
+ @abstract Get query quality
+ @param b pointer to an alignment
+ @return pointer to quality string
+ */
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
+
+/*! @function
+ @abstract Get a base on read
+ @param s Query sequence returned by bam1_seq()
+ @param i The i-th position, 0-based
+ @return 4-bit integer representing the base.
+ */
+#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+
+/*! @function
+ @abstract Get auxiliary data
+ @param b pointer to an alignment
+ @return pointer to the concatenated auxiliary data
+ */
+#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
+
+#ifndef kroundup32
+/*! @function
+ @abstract Round an integer to the next closest power-2 integer.
+ @param x integer to be rounded (in place)
+ @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*!
+ @abstract Whether the machine is big-endian; modified only in
+ bam_header_init().
+ */
+extern int bam_is_be;
+
+/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
+extern unsigned char bam_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
+extern char *bam_nt16_rev_table;
+
+extern char bam_nt16_nt4_table[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*! @abstract TAM file handler */
+ typedef struct __tamFile_t *tamFile;
+
+ /*!
+ @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
+ @param fn SAM file name
+ @return SAM file handler
+ */
+ tamFile sam_open(const char *fn);
+
+ /*!
+ @abstract Close a SAM file handler
+ @param fp SAM file handler
+ */
+ void sam_close(tamFile fp);
+
+ /*!
+ @abstract Read one alignment from a SAM file handler
+ @param fp SAM file handler
+ @param header header information (ordered names of chromosomes)
+ @param b read alignment; all members in b will be updated
+ @return 0 if successful; otherwise negative
+ */
+ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
+
+ /*!
+ @abstract Read header information from a TAB-delimited list file.
+ @param fn_list file name for the list
+ @return a pointer to the header structure
+
+ @discussion Each line in this file consists of chromosome name and
+ the length of chromosome.
+ */
+ bam_header_t *sam_header_read2(const char *fn_list);
+
+ /*!
+ @abstract Read header from a SAM file (if present)
+ @param fp SAM file handler
+ @return pointer to header struct; 0 if no @SQ lines available
+ */
+ bam_header_t *sam_header_read(tamFile fp);
+
+ /*!
+ @abstract Parse @SQ lines and update a header struct
+ @param h pointer to the header struct to be updated
+ @return number of target sequences
+
+ @discussion bam_header_t::{n_targets,target_len,target_name} will
+ be destroyed in the first place.
+ */
+ int sam_header_parse(bam_header_t *h);
+
+ /*!
+ @abstract Parse @RG lines and update a header struct
+ @param h pointer to the header struct to be updated
+ @return number of @RG lines
+
+ @discussion bam_header_t::rg2lib will be destroyed in the first
+ place.
+ */
+ int sam_header_parse_rg(bam_header_t *h);
+
+#define sam_write1(header, b) bam_view1(header, b)
+
+ int bam_strmap_put(void *strmap, const char *rg, const char *lib);
+ const char *bam_strmap_get(const void *strmap, const char *rg);
+ void *bam_strmap_dup(const void*);
+ void *bam_strmap_init();
+ void bam_strmap_destroy(void *strmap);
+
+ /*!
+ @abstract Initialize a header structure.
+ @return the pointer to the header structure
+
+ @discussion This function also modifies the global variable
+ bam_is_be.
+ */
+ bam_header_t *bam_header_init();
+
+ /*!
+ @abstract Destroy a header structure.
+ @param header pointer to the header
+ */
+ void bam_header_destroy(bam_header_t *header);
+
+ /*!
+ @abstract Read a header structure from BAM.
+ @param fp BAM file handler, opened by bam_open()
+ @return pointer to the header structure
+
+ @discussion The file position indicator must be placed at the
+ beginning of the file. Upon success, the position indicator will
+ be set at the start of the first alignment.
+ */
+ bam_header_t *bam_header_read(bamFile fp);
+
+ /*!
+ @abstract Write a header structure to BAM.
+ @param fp BAM file handler
+ @param header pointer to the header structure
+ @return always 0 currently
+ */
+ int bam_header_write(bamFile fp, const bam_header_t *header);
+
+ /*!
+ @abstract Read an alignment from BAM.
+ @param fp BAM file handler
+ @param b read alignment; all members are updated.
+ @return number of bytes read from the file
+
+ @discussion The file position indicator must be
+ placed right before an alignment. Upon success, this function
+ will set the position indicator to the start of the next
+ alignment. This function is not affected by the machine
+ endianness.
+ */
+ int bam_read1(bamFile fp, bam1_t *b);
+
+ /*!
+ @abstract Write an alignment to BAM.
+ @param fp BAM file handler
+ @param c pointer to the bam1_core_t structure
+ @param data_len total length of variable size data related to
+ the alignment
+ @param data pointer to the concatenated data
+ @return number of bytes written to the file
+
+ @discussion This function is not affected by the machine
+ endianness.
+ */
+ int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
+
+ /*!
+ @abstract Write an alignment to BAM.
+ @param fp BAM file handler
+ @param b alignment to write
+ @return number of bytes written to the file
+
+ @discussion It is equivalent to:
+ bam_write1_core(fp, &b->core, b->data_len, b->data)
+ */
+ int bam_write1(bamFile fp, const bam1_t *b);
+
+ /*! @function
+ @abstract Initiate a pointer to bam1_t struct
+ */
+#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
+
+ /*! @function
+ @abstract Free the memory allocated for an alignment.
+ @param b pointer to an alignment
+ */
+#define bam_destroy1(b) do { \
+ free((b)->data); free(b); \
+ } while (0)
+
+ /*!
+ @abstract Format a BAM record in the SAM format
+ @param header pointer to the header structure
+ @param b alignment to print
+ @return a pointer to the SAM string
+ */
+ char *bam_format1(const bam_header_t *header, const bam1_t *b);
+
+ /*! @typedef
+ @abstract Structure for one alignment covering the pileup position.
+ @field b pointer to the alignment
+ @field qpos position of the read base at the pileup site, 0-based
+ @field indel indel length; 0 for no indel, positive for ins and negative for del
+ @field is_del 1 iff the base on the padded read is a deletion
+ @field is_head presumably 1 iff this is the first base of the read -- TODO confirm
+ @field is_tail presumably 1 iff this is the last base of the read -- TODO confirm
+ @field level the level of the read in the "viewer" mode
+
+ @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+ difference between the two functions is that the former does not
+ set bam_pileup1_t::level, while the latter does. Level helps the
+ implementation of alignment viewers, but calculating this has some
+ overhead.
+ */
+ typedef struct {
+ bam1_t *b;
+ int32_t qpos;
+ int indel, level;
+ uint32_t is_del:1, is_head:1, is_tail:1;
+ } bam_pileup1_t;
+
+ struct __bam_plbuf_t;
+ /*! @abstract pileup buffer */
+ typedef struct __bam_plbuf_t bam_plbuf_t;
+
+ void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+
+ /*! @typedef
+ @abstract Type of function to be called by bam_plbuf_push().
+ @param tid chromosome ID as is defined in the header
+ @param pos start coordinate of the alignment, 0-based
+ @param n number of elements in pl array
+ @param pl array of alignments
+ @param data user provided data
+ @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
+ */
+ typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
+
+ /*!
+ @abstract Reset a pileup buffer for another pileup process
+ @param buf the pileup buffer to be reset
+ */
+ void bam_plbuf_reset(bam_plbuf_t *buf);
+
+ /*!
+ @abstract Initialize a buffer for pileup.
+ @param func function to be called by bam_plbuf_push()
+ @param data user provided data
+ @return pointer to the pileup buffer
+ */
+ bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
+
+ /*!
+ @abstract Destroy a pileup buffer.
+ @param buf pointer to the pileup buffer
+ */
+ void bam_plbuf_destroy(bam_plbuf_t *buf);
+
+ /*!
+ @abstract Push an alignment to the pileup buffer.
+ @param b alignment to be pushed
+ @param buf pileup buffer
+ @see bam_plbuf_init()
+ @return always 0 currently
+
+ @discussion If all the alignments covering a particular site have
+ been collected, this function will call the user defined function
+ as is provided to bam_plbuf_init(). The coordinate of the site and
+ all the alignments will be transferred to the user defined
+ function as function parameters.
+
+ When all the alignments are pushed to the buffer, this function
+ needs to be called with b equal to NULL. This will flush the
+ buffer. A pileup buffer can only be reused when bam_plbuf_reset()
+ is called.
+ */
+ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
+
+ struct __bam_lplbuf_t;
+ typedef struct __bam_lplbuf_t bam_lplbuf_t;
+
+ void bam_lplbuf_reset(bam_lplbuf_t *buf);
+
+ /*! @abstract bam_plbuf_init() equivalent with level calculated. */
+ bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
+
+ /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */
+ void bam_lplbuf_destroy(bam_lplbuf_t *tv);
+
+ /*! @abstract bam_plbuf_push() equivalent with level calculated. */
+ int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
+
+ /*! @abstract bam_plbuf_file() equivalent with level calculated. */
+ int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
+
+ struct __bam_index_t;
+ typedef struct __bam_index_t bam_index_t;
+
+ /*!
+ @abstract Build index for a BAM file.
+ @discussion Index file "fn.bai" will be created.
+ @param fn name of the BAM file
+ @return always 0 currently
+ */
+ int bam_index_build(const char *fn);
+
+ /*!
+ @abstract Load index from file "fn.bai".
+ @param fn name of the BAM file (NOT the index file)
+ @return pointer to the index structure
+ */
+ bam_index_t *bam_index_load(const char *fn);
+
+ /*!
+ @abstract Destroy an index structure.
+ @param idx pointer to the index structure
+ */
+ void bam_index_destroy(bam_index_t *idx);
+
+ /*! @typedef
+ @abstract Type of function to be called by bam_fetch().
+ @param b the alignment
+ @param data user provided data
+ */
+ typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
+
+ /*!
+ @abstract Retrieve the alignments that are overlapped with the
+ specified region.
+
+ @discussion A user defined function will be called for each
+ retrieved alignment ordered by its start position.
+
+ @param fp BAM file handler
+ @param idx pointer to the alignment index
+ @param tid chromosome ID as is defined in the header
+ @param beg start coordinate, 0-based
+ @param end end coordinate, 0-based
+ @param data user provided data (will be transferred to func)
+ @param func user defined function
+ */
+ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
+
+ /*!
+ @abstract Parse a region in the format: "chr2:100,000-200,000".
+ @discussion bam_header_t::hash will be initialized if empty.
+ @param header pointer to the header structure
+ @param str string to be parsed
+ @param ref_id the returned chromosome ID
+ @param begin the returned start coordinate
+ @param end the returned end coordinate
+ @return 0 on success; -1 on failure
+ */
+ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+
+ /*!
+ @abstract Retrieve data of a tag
+ @param b pointer to an alignment struct
+ @param tag two-character tag to be retrieved
+
+ @return pointer to the type and data. The first character is the
+ type that can be 'iIsScCdfAZH'.
+
+ @discussion Use bam_aux2?() series to convert the returned data to
+ the corresponding type.
+ */
+ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
+
+ int32_t bam_aux2i(const uint8_t *s);
+ float bam_aux2f(const uint8_t *s);
+ double bam_aux2d(const uint8_t *s);
+ char bam_aux2A(const uint8_t *s);
+ char *bam_aux2Z(const uint8_t *s);
+
+ void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
+
+ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
+
+ /*!
+ @abstract Calculate the rightmost coordinate of an alignment on the
+ reference genome.
+
+ @param c pointer to the bam1_core_t structure
+ @param cigar the corresponding CIGAR array (from bam1_t::cigar)
+ @return the rightmost coordinate, 0-based
+ */
+ uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
+
+ /*!
+ @abstract Calculate the length of the query sequence from CIGAR.
+ @param c pointer to the bam1_core_t structure
+ @param cigar the corresponding CIGAR array (from bam1_t::cigar)
+ @return length of the query sequence
+ */
+ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
+
+ typedef struct {
+ int32_t qbeg, qend;
+ int32_t tbeg, tend;
+ int32_t cbeg, cend;
+ } bam_segreg_t;
+
+ int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*!
+ @abstract Calculate the minimum bin that contains a region [beg,end).
+ @param beg start of the region, 0-based
+ @param end end of the region, 0-based
+ @return bin
+
+ @discussion Levels are tried from the finest (16kb windows, shift 14)
+ to the coarsest (64Mb windows, shift 26); the first level at which
+ both ends fall into the same window wins. Bin 0 is returned when no
+ such level exists.
+ */
+static inline int bam_reg2bin(uint32_t beg, uint32_t end)
+{
+	/* first bin number of each level, finest level first */
+	static const int first_bin[5] = { 4681, 585, 73, 9, 1 };
+	int level, shift = 14;
+	--end; /* make the end coordinate inclusive */
+	for (level = 0; level < 5; ++level, shift += 3) {
+		if (beg >> shift == end >> shift)
+			return first_bin[level] + (beg >> shift);
+	}
+	return 0;
+}
+
+/*!
+ @abstract Copy an alignment
+ @param bdst destination alignment struct
+ @param bsrc source alignment struct
+ @return pointer to the destination alignment struct; NULL if the
+ destination buffer could not be enlarged, in which case bdst is left
+ unmodified
+
+ @discussion The destination keeps its own data buffer, which is
+ enlarged when it is smaller than the capacity of the source.
+ */
+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+{
+	uint8_t *data = bdst->data;
+	int m_data = bdst->m_data; // backup data and m_data
+	if (bdst == bsrc) return bdst; // self-copy: nothing to do, and memcpy() must not overlap
+	if (m_data < bsrc->m_data) { // grow to the power of two covering the source capacity
+		uint8_t *grown;
+		m_data = bsrc->m_data; kroundup32(m_data);
+		// never assign realloc() straight to the only copy of the pointer
+		grown = (uint8_t*)realloc(data, m_data);
+		if (grown == 0) return 0; // bdst still owns its old, valid buffer
+		data = grown;
+	}
+	if (bsrc->data_len > 0)
+		memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
+	*bdst = *bsrc; // copy the rest
+	// restore the backup
+	bdst->m_data = m_data;
+	bdst->data = data;
+	return bdst;
+}
+
+/*!
+ @abstract Duplicate an alignment
+ @param src source alignment struct
+ @return pointer to the newly allocated alignment struct, to be
+ released with bam_destroy1(); NULL if memory could not be allocated
+
+ @discussion The copy gets a buffer of exactly data_len bytes, so
+ m_data of the copy equals its data_len. An alignment without
+ variable-length data is duplicated with a NULL data pointer.
+ */
+static inline bam1_t *bam_dup1(const bam1_t *src)
+{
+	bam1_t *b;
+	b = bam_init1();
+	if (b == 0) return 0;
+	*b = *src;
+	b->m_data = b->data_len;
+	b->data = 0; // do not keep an alias of the buffer owned by src
+	if (b->data_len > 0) {
+		b->data = (uint8_t*)malloc(b->data_len);
+		if (b->data == 0) { free(b); return 0; }
+		memcpy(b->data, src->data, b->data_len);
+	}
+	return b;
+}
+
+#endif
--- /dev/null
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+typedef char *str_p;
+KHASH_MAP_INIT_STR(s, int)
+KHASH_MAP_INIT_STR(r2l, str_p)
+
+/*
+ Append one auxiliary field to the end of an alignment's data buffer.
+
+ b     alignment to extend
+ tag   two-character tag name
+ type  one-character value type, stored right after the tag
+ len   number of bytes in data
+ data  raw value bytes, copied verbatim
+
+ The buffer is enlarged when needed. If it cannot be enlarged the
+ alignment is left untouched and a message is written to stderr.
+ */
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+	int ori_len = b->data_len;
+	int new_len = ori_len + 3 + len; /* 2 tag bytes + 1 type byte + value */
+	if (b->m_data < new_len) {
+		int capacity = new_len;
+		uint8_t *grown;
+		kroundup32(capacity);
+		/* keep the old pointer until the new block is known to be valid */
+		grown = (uint8_t*)realloc(b->data, capacity);
+		if (grown == 0) {
+			fprintf(stderr, "[bam_aux_append] out of memory; tag %c%c is not appended\n", tag[0], tag[1]);
+			return;
+		}
+		b->data = grown;
+		b->m_data = capacity;
+	}
+	/* lengths are committed only once the space is guaranteed */
+	b->data_len = new_len;
+	b->l_aux += 3 + len;
+	b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
+	b->data[ori_len + 2] = type;
+	if (len > 0) memcpy(b->data + ori_len + 3, data, len);
+}
+
+/* Variant of bam_aux_get() taking a non-const alignment; it simply
+   forwards both arguments and returns whatever bam_aux_get() returns. */
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
+{
+ return bam_aux_get(b, tag);
+}
+
+/*
+ Look up an auxiliary field by its two-character tag.
+
+ b    alignment to search
+ tag  two-character tag name
+
+ Returns a pointer to the type byte of the matching field (the value
+ follows it), or 0 when the tag is absent.
+
+ Every non-matching field is skipped according to its type. Type 'A'
+ carries a one-byte value just like 'c'/'C'; failing to skip it would
+ desynchronize the scan for all the fields behind an 'A' field.
+ */
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+{
+	const uint8_t *end = b->data + b->data_len;
+	uint8_t *s;
+	int y = (int)(uint8_t)tag[0]<<8 | (uint8_t)tag[1];
+	s = bam1_aux(b);
+	while (end - s >= 3) { // a field needs at least 2 tag bytes and 1 type byte
+		int type, x = (int)s[0]<<8 | s[1];
+		s += 2;
+		if (x == y) return s;
+		type = toupper(*s); ++s;
+		if (type == 'A' || type == 'C') ++s;
+		else if (type == 'S') s += 2;
+		else if (type == 'I' || type == 'F') s += 4;
+		else if (type == 'D') s += 8;
+		else if (type == 'Z' || type == 'H') { while (s < end && *s) ++s; ++s; } // skip the string and its '\0'
+	}
+	return 0;
+}
+
+/*
+ Build the name -> index hash table of a header, unless it already
+ exists. Each target name is mapped to its position in target_name[].
+ The table stores the name pointers themselves, not copies.
+ */
+void bam_init_header_hash(bam_header_t *header)
+{
+	khash_t(s) *name2tid;
+	int tid;
+	if (header->hash != 0) return; /* already built */
+	name2tid = kh_init(s);
+	header->hash = name2tid;
+	for (tid = 0; tid < header->n_targets; ++tid) {
+		int absent;
+		khiter_t slot = kh_put(s, name2tid, header->target_name[tid], &absent);
+		kh_value(name2tid, slot) = tid;
+	}
+}
+
+/*
+ Release the name -> index hash table of a header, if there is one.
+ The pointer is cleared afterwards: bam_init_header_hash() decides
+ whether to build the table by testing hash against 0, so a stale
+ pointer would be taken for a live table.
+ */
+void bam_destroy_header_hash(bam_header_t *header)
+{
+	if (header->hash) {
+		kh_destroy(s, (khash_t(s)*)header->hash);
+		header->hash = 0;
+	}
+}
+
+/*
+ Translate a sequence name into its index using the header's hash
+ table. Returns -1 when the name is not in the table. The table is
+ used as is; it is not built here.
+ */
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
+{
+	khash_t(s) *name2tid = (khash_t(s)*)header->hash;
+	khint_t slot = kh_get(s, name2tid, seq_name);
+	if (slot == kh_end(name2tid)) return -1;
+	return kh_value(name2tid, slot);
+}
+
+/*
+ Parse a region given as "name", "name:begin" or "name:begin-end".
+ Commas and white space are ignored, so "chr2:100,000-200,000" works.
+
+ header  header holding the sequence names; its hash table is built
+         on demand
+ str     region string
+ ref_id  receives the sequence index, or -1 when the name is unknown
+ begin   receives the start; a positive input is decremented by one
+ end     receives the end; 1<<29 when no end is given
+
+ Returns 0 on success and -1 on failure (unknown name, begin > end or
+ out of memory). A bare sequence name selects the whole sequence and
+ is a success.
+ */
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end)
+{
+	char *s, *p;
+	int i, l, k;
+	khiter_t iter;
+	khash_t(s) *h;
+
+	bam_init_header_hash(header);
+	h = (khash_t(s)*)header->hash;
+
+	l = strlen(str);
+	s = (char*)malloc(l+1);
+	if (s == 0) { // out of memory
+		*ref_id = -1;
+		return -1;
+	}
+	/* squeeze out "," and white space */
+	for (i = k = 0; i != l; ++i)
+		if (str[i] != ',' && !isspace((unsigned char)str[i])) s[k++] = str[i];
+	s[k] = 0;
+	for (i = 0; i != k; ++i) if (s[i] == ':') break;
+	s[i] = 0; /* s now holds the sequence name only */
+	iter = kh_get(s, h, s); /* get the ref_id */
+	if (iter == kh_end(h)) { // name not found
+		*ref_id = -1; free(s);
+		return -1;
+	}
+	*ref_id = kh_value(h, iter);
+	if (i == k) { /* no ':' at all: dump the whole sequence */
+		*begin = 0; *end = 1<<29; free(s);
+		return 0;
+	}
+	for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+	*begin = atoi(p);
+	if (i < k) {
+		p = s + i + 1;
+		*end = atoi(p);
+	} else *end = 1<<29;
+	if (*begin > 0) --*begin;
+	free(s);
+	if (*begin > *end) {
+		fprintf(stderr, "[bam_parse_region] invalid region.\n");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ Convert an auxiliary field to an integer.
+
+ s  pointer to the type byte of the field, as returned by
+    bam_aux_get(); may be 0
+
+ Returns the value for the types 'c', 'C', 's', 'S', 'i' and 'I', and
+ 0 for a null pointer or any other type. Multi-byte values are read in
+ the byte order of the host. They are fetched with memcpy() because
+ the field may start at any byte offset.
+ */
+int32_t bam_aux2i(const uint8_t *s)
+{
+	int type;
+	if (s == 0) return 0;
+	type = *s++;
+	if (type == 'c') return (int32_t)(int8_t)*s;
+	else if (type == 'C') return (int32_t)*s;
+	else if (type == 's') { int16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+	else if (type == 'S') { uint16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+	else if (type == 'i' || type == 'I') { int32_t v; memcpy(&v, s, sizeof v); return v; }
+	else return 0;
+}
+
+/*
+ Convert an auxiliary field of type 'f' to a float.
+
+ s  pointer to the type byte of the field; may be 0
+
+ Returns 0.0 for a null pointer or any other type. The null test must
+ come before the type byte is read. The value is fetched with memcpy()
+ because the field may start at any byte offset.
+ */
+float bam_aux2f(const uint8_t *s)
+{
+	int type;
+	float v;
+	if (s == 0) return 0.0;
+	type = *s++;
+	if (type != 'f') return 0.0;
+	memcpy(&v, s, sizeof v);
+	return v;
+}
+
+/*
+ Convert an auxiliary field of type 'd' to a double.
+
+ s  pointer to the type byte of the field; may be 0
+
+ Returns 0.0 for a null pointer or any other type. The null test must
+ come before the type byte is read. The value is fetched with memcpy()
+ because the field may start at any byte offset.
+ */
+double bam_aux2d(const uint8_t *s)
+{
+	int type;
+	double v;
+	if (s == 0) return 0.0;
+	type = *s++;
+	if (type != 'd') return 0.0;
+	memcpy(&v, s, sizeof v);
+	return v;
+}
+
+/*
+ Convert an auxiliary field of type 'A' to a character.
+
+ s  pointer to the type byte of the field; may be 0
+
+ Returns 0 for a null pointer or any other type. The null test must
+ come before the type byte is read.
+ */
+char bam_aux2A(const uint8_t *s)
+{
+	int type;
+	if (s == 0) return 0;
+	type = *s++;
+	if (type == 'A') return (char)*s;
+	else return 0;
+}
+
+/*
+ Get the string of an auxiliary field of type 'Z' or 'H'.
+
+ s  pointer to the type byte of the field; may be 0
+
+ Returns a pointer into the field itself (nothing is copied), or 0
+ for a null pointer or any other type. The null test must come before
+ the type byte is read.
+ */
+char *bam_aux2Z(const uint8_t *s)
+{
+	int type;
+	if (s == 0) return 0;
+	type = *s++;
+	if (type == 'Z' || type == 'H') return (char*)s;
+	else return 0;
+}
+
+/******************
+ * rg2lib related *
+ ******************/
+
+/*
+ Insert an rg -> lib pair into a string map.
+
+ Both strings are duplicated, so the map owns its keys and values.
+ When the key is already present the stored value is kept, a message
+ is written to stderr and the duplicated key is released again.
+
+ Returns 1 when the map pointer is null, 0 otherwise.
+ */
+int bam_strmap_put(void *rg2lib, const char *rg, const char *lib)
+{
+	khash_t(r2l) *map = (khash_t(r2l)*)rg2lib;
+	char *owned_key;
+	khint_t slot;
+	int is_new;
+	if (map == 0) return 1;
+	owned_key = strdup(rg);
+	slot = kh_put(r2l, map, owned_key, &is_new);
+	if (!is_new) {
+		fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg);
+		free(owned_key);
+		return 0;
+	}
+	kh_val(map, slot) = strdup(lib);
+	return 0;
+}
+
+/*
+ Look up the value stored for rg in a string map.
+ Returns the stored string, or 0 when the map pointer is null or the
+ key is absent. The string stays owned by the map.
+ */
+const char *bam_strmap_get(const void *rg2lib, const char *rg)
+{
+	const khash_t(r2l) *map = (const khash_t(r2l)*)rg2lib;
+	khint_t slot;
+	if (map == 0) return 0;
+	slot = kh_get(r2l, map, rg);
+	if (slot == kh_end(map)) return 0;
+	return (const char*)kh_val(map, slot);
+}
+
+/*
+ Make a deep copy of a string map: every key and every value is
+ duplicated with strdup(). Returns 0 when the input pointer is null.
+ */
+void *bam_strmap_dup(const void *rg2lib)
+{
+	const khash_t(r2l) *src = (const khash_t(r2l)*)rg2lib;
+	khash_t(r2l) *copy;
+	khint_t it;
+	if (src == 0) return 0;
+	copy = kh_init(r2l);
+	for (it = kh_begin(src); it < kh_end(src); ++it) {
+		char *key_copy;
+		khint_t slot;
+		int absent;
+		if (!kh_exist(src, it)) continue; /* empty or deleted bucket */
+		key_copy = strdup(kh_key(src, it));
+		slot = kh_put(r2l, copy, key_copy, &absent);
+		kh_val(copy, slot) = strdup(kh_val(src, it));
+	}
+	return copy;
+}
+
+/* Create an empty string map; the khash table of type r2l is handed
+   out as an opaque pointer. */
+void *bam_strmap_init()
+{
+ return (void*)kh_init(r2l);
+}
+
+/*
+ Release a string map together with all the keys and values it holds.
+ A null pointer is accepted and ignored.
+ */
+void bam_strmap_destroy(void *rg2lib)
+{
+	khash_t(r2l) *map = (khash_t(r2l)*)rg2lib;
+	khint_t it;
+	if (map == 0) return;
+	for (it = kh_begin(map); it < kh_end(map); ++it) {
+		if (!kh_exist(map, it)) continue; /* empty or deleted bucket */
+		free((char*)kh_key(map, it));
+		free(kh_val(map, it));
+	}
+	kh_destroy(r2l, map);
+}
--- /dev/null
+#include <ctype.h>
+#include "bam.h"
+
+/*!
+ @abstract Get the color encoding the previous and current base
+ @param b pointer to an alignment
+ @param i The i-th position, 0-based
+ @return color
+
+ @discussion Returns 0 if no color information is found, that is if
+ the CS tag is absent, is not a string, or has no entry for i.
+ */
+char bam_aux_getCSi(bam1_t *b, int i)
+{
+	uint8_t *c = bam_aux_get(b, "CS");
+	char *cs = NULL;
+	int len;
+
+	// return 0 if the tag was not found
+	if(0 == c) return 0;
+
+	cs = bam_aux2Z(c);
+	// the tag exists but is not of type 'Z' or 'H'
+	if(0 == cs) return 0;
+	len = strlen(cs);
+	// adjust for strandedness and leading adaptor
+	if(bam1_strand(b)) i = len - 1 - i;
+	else i++;
+	// never index outside the color string
+	if(i < 0 || i >= len) return 0;
+	return cs[i];
+}
+
+/*!
+ @abstract Get the color quality of the color encoding the previous and current base
+ @param b pointer to an alignment
+ @param i The i-th position, 0-based
+ @return color quality
+
+ @discussion Returns 0 if no color information is found, that is if
+ the CQ tag is absent, is not a string, or has no entry for i.
+ */
+char bam_aux_getCQi(bam1_t *b, int i)
+{
+	uint8_t *c = bam_aux_get(b, "CQ");
+	char *cq = NULL;
+	int len;
+
+	// return 0 if the tag was not found
+	if(0 == c) return 0;
+
+	cq = bam_aux2Z(c);
+	// the tag exists but is not of type 'Z' or 'H'
+	if(0 == cq) return 0;
+	len = strlen(cq);
+	// adjust for strandedness
+	if(bam1_strand(b)) i = len - 1 - i;
+	// never index outside the quality string
+	if(i < 0 || i >= len) return 0;
+	return cq[i];
+}
+
+/*
+ Map a nucleotide letter to a small integer: A -> 0, C -> 1, G -> 2,
+ T -> 3, in either case. Any other character gives 4.
+ */
+char bam_aux_nt2int(char a)
+{
+	// <ctype.h> functions require a value representable as unsigned char
+	switch(toupper((unsigned char)a)) {
+	case 'A': return 0;
+	case 'C': return 1;
+	case 'G': return 2;
+	case 'T': return 3;
+	default: return 4;
+	}
+}
+
+/*
+ Get the color of a pair of adjacent bases: the two base codes given
+ by bam_aux_nt2int() are XORed and returned as a digit '0'..'3'.
+ '4' is returned when either base is not one of A, C, G, T.
+ */
+char bam_aux_ntnt2cs(char a, char b)
+{
+	static const char color_of[] = "0123";
+	char code_a = bam_aux_nt2int(a);
+	char code_b = bam_aux_nt2int(b);
+	if(4 == code_a || 4 == code_b) return '4';
+	return color_of[(int)(code_a ^ code_b)];
+}
+
+/*!
+ @abstract Get the color error profile at the given position
+ @param b pointer to an alignment
+ @param i The i-th position, 0-based
+ @return the original color if the color was an error, '-' (dash) otherwise
+
+ @discussion Returns 0 if no color information is found, that is if
+ the CS tag is absent, is not a string, or does not cover position i.
+
+ The color read at a position is compared with the color implied by
+ the base at that position and its predecessor in sequencing order.
+ For the first sequenced base the predecessor is the adaptor base
+ cs[0]. A reverse-strand read is stored reverse complemented, so its
+ first sequenced base is the last stored one, reached when cs_i is 1,
+ and the adaptor has to be complemented to match the stored bases.
+ */
+char bam_aux_getCEi(bam1_t *b, int i)
+{
+	int cs_i, cs_len;
+	uint8_t *c = bam_aux_get(b, "CS");
+	char *cs = NULL;
+	char prev_b, cur_b;
+	char cur_color, cor_color;
+
+	// return 0 if the tag was not found
+	if(0 == c) return 0;
+
+	cs = bam_aux2Z(c);
+	// the tag exists but is not of type 'Z' or 'H'
+	if(0 == cs) return 0;
+	// the base itself must exist
+	if(i < 0 || i >= b->core.l_qseq) return 0;
+	cs_len = strlen(cs);
+
+	// adjust for strandedness and leading adaptor
+	if(bam1_strand(b)) { //reverse strand
+		cs_i = cs_len - 1 - i;
+		if(cs_i < 1 || cs_i >= cs_len) return 0;
+		// get current color
+		cur_color = cs[cs_i];
+		// get previous base; the adaptor must be complemented
+		if(1 == cs_i) prev_b = "TGCAN"[(int)bam_aux_nt2int(cs[0])];
+		else if(i + 1 < b->core.l_qseq) prev_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+		else return 0; // color string and read length disagree
+		// get current base
+		cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+	}
+	else {
+		cs_i = i + 1;
+		if(cs_i >= cs_len) return 0;
+		// get current color
+		cur_color = cs[cs_i];
+		// get previous base
+		prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+		// get current base
+		cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+	}
+
+	// corrected color
+	cor_color = bam_aux_ntnt2cs(prev_b, cur_b);
+
+	if(cur_color == cor_color) {
+		return '-';
+	}
+	else {
+		return cur_color;
+	}
+}
--- /dev/null
+#ifndef BAM_ENDIAN_H
+#define BAM_ENDIAN_H
+
+#include <stdint.h>
+
+/* Return 1 on a big-endian host and 0 otherwise: the byte at the
+   lowest address of the value 1 is zero only when the most significant
+   byte is stored first. */
+static inline int bam_is_big_endian()
+{
+	const long probe = 1;
+	const char *lowest_byte = (const char *)&probe;
+	return lowest_byte[0] == 0;
+}
+/* Exchange the two bytes of a 16-bit value. */
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+	/* the cast drops the bits that were shifted above bit 15 */
+	return (uint16_t)((v << 8) | (v >> 8));
+}
+/* Byte-swap, in place, the 16-bit value x points to; returns x. */
+static inline void *bam_swap_endian_2p(void *x)
+{
+	uint16_t *word = (uint16_t*)x;
+	*word = bam_swap_endian_2(*word);
+	return x;
+}
+/* Reverse the order of the four bytes of a 32-bit value. */
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+	uint32_t byte0 = v & 0xFFU;         /* least significant byte */
+	uint32_t byte1 = (v >> 8) & 0xFFU;
+	uint32_t byte2 = (v >> 16) & 0xFFU;
+	uint32_t byte3 = v >> 24;           /* most significant byte */
+	return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
+}
+/* Byte-swap, in place, the 32-bit value x points to; returns x. */
+static inline void *bam_swap_endian_4p(void *x)
+{
+	uint32_t *word = (uint32_t*)x;
+	*word = bam_swap_endian_4(*word);
+	return x;
+}
+/* Reverse the order of the eight bytes of a 64-bit value. */
+static inline uint64_t bam_swap_endian_8(uint64_t v)
+{
+	uint64_t swapped = 0;
+	int n;
+	/* peel bytes off the low end of v and push them onto the result */
+	for (n = 0; n < 8; ++n) {
+		swapped = (swapped << 8) | (v & 0xFFU);
+		v >>= 8;
+	}
+	return swapped;
+}
+/* Byte-swap, in place, the 64-bit value x points to; returns x. */
+static inline void *bam_swap_endian_8p(void *x)
+{
+	uint64_t *word = (uint64_t*)x;
+	*word = bam_swap_endian_8(*word);
+	return x;
+}
+
+#endif
--- /dev/null
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#include "kstring.h"
+#include "bam.h"
+#include "kseq.h"
+#include "khash.h"
+
+KSTREAM_INIT(gzFile, gzread, 8192)
+KHASH_MAP_INIT_STR(ref, uint64_t)
+
+void bam_init_header_hash(bam_header_t *header);
+void bam_destroy_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+/*
+ Character -> 4-bit nucleotide code, indexed by character value.
+ Each row below covers 16 consecutive character codes.
+ - letters are handled in both cases; A, C, G and T give the single
+   bits 1, 2, 4 and 8, the other coded letters give combinations
+   (e.g. M = 3, R = 5, B = 14);
+ - the digits '0'..'3' also give 1, 2, 4 and 8;
+ - '=' gives 0;
+ - every other character, 'N' included, gives 15.
+ */
+unsigned char bam_nt16_table[256] = {
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+/* 4-bit nucleotide code -> letter: entry n is the letter for code n,
+   so code 0 is '=' and code 15 is 'N'. */
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
+
+/*
+ * State of an opened SAM text stream (see sam_open / sam_close).
+ *   fp       - zlib handle the text is read from
+ *   ks       - kstream reader layered on fp
+ *   str      - scratch buffer that receives each token read from ks
+ *   n_lines  - incremented for every header line and every record read
+ *   is_first - set by sam_header_read(); tells sam_read1() that str already
+ *              holds the first field of the first record
+ */
+struct __tamFile_t {
+ gzFile fp;
+ kstream_t *ks;
+ kstring_t *str;
+ uint64_t n_lines;
+ int is_first;
+};
+
+/*
+ * Read a (possibly gzip-compressed) text file into an array of lines.
+ *
+ * fn  file name, or "-" to read from stdin
+ * _n  receives the number of lines stored in the returned array
+ *
+ * Returns a malloc'ed array of malloc'ed NUL-terminated strings (a trailing
+ * '\r' is stripped from each line); the caller frees every string and the
+ * array. Returns 0 with *_n == 0 when the file cannot be opened or holds no
+ * lines. Reading stops as soon as ks_getuntil() returns a non-positive value.
+ */
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
+{
+    char **list = 0, *s;
+    int n = 0, dret, m = 0;
+    gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+    kstream_t *ks;
+    kstring_t *str;
+    if (fp == 0) { // never hand a null stream to the reader
+        fprintf(stderr, "[__bam_get_lines] fail to open the input file.\n");
+        *_n = 0;
+        return 0;
+    }
+    str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    ks = ks_init(fp);
+    while (ks_getuntil(ks, '\n', str, &dret) > 0) {
+        if (n == m) { // grow the array geometrically
+            m = m? m << 1 : 16;
+            list = (char**)realloc(list, m * sizeof(char*));
+        }
+        if (str->s[str->l-1] == '\r') // tolerate DOS line endings
+            str->s[--str->l] = '\0';
+        s = list[n++] = (char*)calloc(str->l + 1, 1);
+        strcpy(s, str->s);
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    *_n = n;
+    return list;
+}
+
+/*
+ * Build a header from a name -> packed-value hash. Each value carries the
+ * target length in its upper 32 bits and the target's rank in its lower
+ * 32 bits. The key strings are not copied: the header's target_name[]
+ * entries point at the same memory as the hash keys.
+ */
+static bam_header_t *hash2header(const kh_ref_t *hash)
+{
+    khiter_t it;
+    bam_header_t *h = bam_header_init();
+    h->n_targets = kh_size(hash);
+    h->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
+    h->target_len = (uint32_t*)calloc(kh_size(hash), 4);
+    for (it = kh_begin(hash); it != kh_end(hash); ++it) {
+        int rank;
+        if (!kh_exist(hash, it)) continue; // empty or deleted bucket
+        rank = (int)kh_value(hash, it);
+        h->target_name[rank] = (char*)kh_key(hash, it);
+        h->target_len[rank] = kh_value(hash, it)>>32;
+    }
+    bam_init_header_hash(h);
+    return h;
+}
+/*
+ * Build a header from a whitespace-separated list file whose first two
+ * columns on each line are a sequence name and its length; any further
+ * columns are skipped. "-" reads from stdin. Asserts that the file can be
+ * opened. A name that appears more than once keeps its first entry; later
+ * occurrences are reported and ignored so that every stored rank stays
+ * below the number of distinct names.
+ */
+bam_header_t *sam_header_read2(const char *fn)
+{
+    bam_header_t *header;
+    int c, dret, ret;
+    gzFile fp;
+    kstream_t *ks;
+    kstring_t *str;
+    kh_ref_t *hash;
+    khiter_t k;
+    hash = kh_init(ref);
+    fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+    assert(fp);
+    ks = ks_init(fp);
+    str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    while (ks_getuntil(ks, 0, str, &dret) > 0) {
+        char *s = strdup(str->s);
+        int len, i;
+        i = kh_size(hash); // rank of this sequence if it turns out to be new
+        ks_getuntil(ks, 0, str, &dret);
+        len = atoi(str->s);
+        k = kh_put(ref, hash, s, &ret);
+        if (ret == 0) { // key already present: do not overwrite its rank
+            fprintf(stderr, "[sam_header_read2] duplicated sequence name '%s' is ignored.\n", s);
+            free(s);
+        } else kh_value(hash, k) = (uint64_t)len<<32 | i; // length in the high word, rank in the low word
+        if (dret != '\n') // skip the rest of the line
+            while ((c = ks_getc(ks)) != '\n' && c != -1);
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+    header = hash2header(hash);
+    kh_destroy(ref, hash);
+    return header;
+}
+/*
+ * Make sure b->data can hold at least `size` bytes, growing the buffer to
+ * kroundup32(size) when it is too small. Returns b->data, which may have
+ * moved.
+ */
+static inline uint8_t *alloc_data(bam1_t *b, int size)
+{
+    if (!(b->m_data < size)) return b->data; // already large enough
+    b->m_data = size;
+    kroundup32(b->m_data);
+    b->data = (uint8_t*)realloc(b->data, b->m_data);
+    return b->data;
+}
+/* Report a fatal parse error for the given input line on stderr, then abort(). */
+static inline void parse_error(int64_t n_lines, const char * __restrict msg)
+{
+    long long line_no = (long long)n_lines;
+    fprintf(stderr, "Parse error at line %lld: %s\n", line_no, msg);
+    abort();
+}
+/*
+ * Append str->l + 1 bytes of str->s to header->text and NUL-terminate the
+ * result. The extra byte is the delimiter the caller stored at
+ * str->s[str->l]. The text buffer is grown whenever the kroundup32-rounded
+ * size needed exceeds the rounded current length.
+ */
+static inline void append_text(bam_header_t *header, kstring_t *str)
+{
+    int cur_cap = header->l_text;
+    int new_cap = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
+    kroundup32(cur_cap);
+    kroundup32(new_cap);
+    if (cur_cap < new_cap)
+        header->text = (char*)realloc(header->text, new_cap);
+    // str->s is not NUL-terminated at this point, so strcpy() cannot be used
+    strncpy(header->text + header->l_text, str->s, str->l+1);
+    header->l_text += str->l + 1;
+    header->text[header->l_text] = 0;
+}
+
+/*
+ * Rebuild h->rg2lib, the map from read-group ID to library name, from the
+ * @RG entries found in h->text.
+ *
+ * For each "@RG" occurrence the next "ID:" and "LB:" values (terminated by
+ * tab, CR, LF or NUL) are paired. The search for ID:/LB: starts right after
+ * the "@RG" tag and is not bounded by the end of that line. Parsing stops
+ * at the first @RG entry for which no ID: or LB: can be found.
+ *
+ * Returns the number of pairs stored; when it is 0, h->rg2lib is left null.
+ * A null h returns 0.
+ */
+int sam_header_parse_rg(bam_header_t *h)
+{
+    kstring_t *rgid, *rglib;
+    char *p, *q, *s, *r;
+    int n = 0;
+
+    // free
+    if (h == 0) return 0;
+    bam_strmap_destroy(h->rg2lib); h->rg2lib = 0;
+    if (h->l_text < 3) return 0;
+    // parse @RG lines
+    h->rg2lib = bam_strmap_init();
+    rgid = calloc(1, sizeof(kstring_t));
+    rglib = calloc(1, sizeof(kstring_t));
+    s = h->text;
+    while ((s = strstr(s, "@RG")) != 0) {
+        if (rgid->l && rglib->l) { // flush the pair collected for the previous @RG
+            bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+            ++n;
+        }
+        rgid->l = rglib->l = 0;
+        s += 3;
+        r = s;
+        if ((p = strstr(s, "ID:")) != 0) {
+            q = p + 3;
+            for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+            kputsn(q, p - q, rgid);
+        } else {
+            fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n");
+            break;
+        }
+        if (r < p) r = p;
+        if ((p = strstr(s, "LB:")) != 0) {
+            q = p + 3;
+            for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+            kputsn(q, p - q, rglib);
+        } else {
+            fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n");
+            break;
+        }
+        if (r < p) r = p;
+        // r sits on the delimiter (or NUL) that ends the furthest value.
+        // Resume exactly there: stepping further could run past the end of
+        // the text or jump over an "@RG" that starts on the very next line.
+        s = r;
+    }
+    if (rgid->l && rglib->l) { // flush the last pair
+        bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+        ++n;
+    }
+    free(rgid->s); free(rgid);
+    free(rglib->s); free(rglib);
+    if (n == 0) {
+        bam_strmap_destroy(h->rg2lib);
+        h->rg2lib = 0;
+    }
+    return n;
+}
+
+/*
+ * Rebuild the target tables of h (n_targets, target_name[], target_len[])
+ * from the "@SQ" entries in h->text, then refresh the read-group map with
+ * sam_header_parse_rg().
+ *
+ * For each "@SQ" the next "SN:" value (terminated by tab, CR, LF or NUL)
+ * becomes the name and the number following the next "LN:" becomes the
+ * length. The search starts right after the "@SQ" tag and is not bounded
+ * by the end of that line.
+ *
+ * Returns the number of targets, or 0 when the text has no @SQ entry or an
+ * entry lacks SN:/LN: (in which case the tables are released and reset).
+ */
+int sam_header_parse(bam_header_t *h)
+{
+    int i;
+    char *s, *p, *q, *r;
+
+    // free
+    free(h->target_len); free(h->target_name);
+    h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+    if (h->l_text < 3) return 0;
+    // count number of @SQ
+    s = h->text;
+    while ((s = strstr(s, "@SQ")) != 0) {
+        ++h->n_targets;
+        s += 3;
+    }
+    if (h->n_targets == 0) return 0;
+    h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+    h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
+    // parse @SQ lines
+    i = 0;
+    s = h->text;
+    while ((s = strstr(s, "@SQ")) != 0) {
+        s += 3;
+        r = s;
+        if ((p = strstr(s, "SN:")) != 0) {
+            q = p + 3;
+            for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+            h->target_name[i] = (char*)calloc(p - q + 1, 1);
+            strncpy(h->target_name[i], q, p - q);
+        } else goto header_err_ret;
+        if (r < p) r = p;
+        if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
+        else goto header_err_ret;
+        if (r < p) r = p;
+        // r is either the delimiter ending the SN value or the start of
+        // "LN:". Resume there: stepping 3 bytes further could run past the
+        // end of the text or jump over an "@SQ" on the very next line.
+        s = r;
+        ++i;
+    }
+    sam_header_parse_rg(h);
+    return h->n_targets;
+
+header_err_ret:
+    fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n");
+    // the name array was calloc'ed, so unused slots are null and safe to free
+    for (i = 0; i < h->n_targets; ++i) free(h->target_name[i]);
+    free(h->target_len); free(h->target_name);
+    h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+    return 0;
+}
+
+/*
+ * Consume the leading '@' lines of a SAM stream and return them as a header.
+ *
+ * Every header line is appended verbatim to header->text and counted in
+ * fp->n_lines. The first token that does not start with '@' is left in
+ * fp->str and fp->is_first is set so that sam_read1() reuses it. The text
+ * is then parsed with sam_header_parse() and the name hash is built.
+ */
+bam_header_t *sam_header_read(tamFile fp)
+{
+    int ret, dret;
+    kstring_t *str = fp->str;
+    bam_header_t *header = bam_header_init();
+    for (;;) {
+        ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+        if (ret < 0) break;             // end of input
+        if (str->s[0] != '@') break;    // first alignment field reached
+        str->s[str->l] = dret; // note that str->s is NOT null terminated!!
+        append_text(header, str);
+        if (dret != '\n') { // copy the remainder of the header line
+            ret = ks_getuntil(fp->ks, '\n', str, &dret);
+            str->s[str->l] = '\n'; // NOT null terminated!!
+            append_text(header, str);
+        }
+        ++fp->n_lines;
+    }
+    sam_header_parse(header);
+    bam_init_header_hash(header);
+    fp->is_first = 1;
+    return header;
+}
+
+/*
+ * Parse one SAM text record from fp into the binary record b.
+ *
+ * fp      opened SAM stream; when fp->is_first is set, fp->str already
+ *         holds the query name left behind by sam_header_read()
+ * header  used to translate reference names into target ids
+ * b       receives the core fields and the packed data
+ *         (qname, cigar, 4-bit sequence, qualities, auxiliary fields)
+ *
+ * Returns the number of characters consumed (field lengths plus one
+ * delimiter each) on success, -1 at end of input, or -2..-6 when the
+ * stream ends in the middle of a record. Malformed fields call
+ * parse_error(), which aborts; a non-'*' reference name with an empty
+ * target list exits the program.
+ */
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+    int ret, doff, doff0, dret, z = 0;
+    bam1_core_t *c = &b->core;
+    kstring_t *str = fp->str;
+    kstream_t *ks = fp->ks;
+
+    if (fp->is_first) { // the query name was already read while scanning the header
+        fp->is_first = 0;
+        ret = str->l;
+    } else {
+        do { // special consideration for empty lines
+            ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+            if (ret >= 0) z += str->l + 1;
+        } while (ret == 0);
+    }
+    if (ret < 0) return -1;
+    ++fp->n_lines;
+    doff = 0; // write offset into b->data
+
+    { // name
+        c->l_qname = strlen(str->s) + 1;
+        memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
+        doff += c->l_qname;
+    }
+    { // flag, tid, pos, qual
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s);
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+        if (c->tid < 0 && strcmp(str->s, "*")) {
+            if (header->n_targets == 0) {
+                fprintf(stderr, "[sam_read1] missing header? Abort!\n");
+                exit(1);
+            } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
+        }
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
+        if (ret < 0) return -2;
+    }
+    { // cigar
+        char *s, *t;
+        int i, op;
+        long x;
+        c->n_cigar = 0;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
+        z += str->l + 1;
+        if (str->s[0] != '*') {
+            // first pass: count the operations and reject stray characters
+            for (s = str->s; *s; ++s) {
+                if (isalpha(*s)) ++c->n_cigar;
+                else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
+            }
+            b->data = alloc_data(b, doff + c->n_cigar * 4);
+            // second pass: pack each <length><op> pair into one 32-bit word
+            for (i = 0, s = str->s; i != c->n_cigar; ++i) {
+                x = strtol(s, &t, 10);
+                op = toupper(*t);
+                if (op == 'M') op = BAM_CMATCH;
+                else if (op == 'I') op = BAM_CINS;
+                else if (op == 'D') op = BAM_CDEL;
+                else if (op == 'N') op = BAM_CREF_SKIP;
+                else if (op == 'S') op = BAM_CSOFT_CLIP;
+                else if (op == 'H') op = BAM_CHARD_CLIP;
+                else if (op == 'P') op = BAM_CPAD;
+                else parse_error(fp->n_lines, "invalid CIGAR operation");
+                s = t + 1;
+                bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
+            }
+            if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
+            c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b)));
+            doff += c->n_cigar * 4;
+        } else c->bin = bam_reg2bin(c->pos, c->pos + 1);
+    }
+    { // mtid, mpos, isize
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
+        if (ret < 0) return -4;
+    }
+    { // seq and qual
+        int i;
+        uint8_t *p;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
+        z += str->l + 1;
+        c->l_qseq = strlen(str->s);
+        if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
+            parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+        p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+        memset(p, 0, (c->l_qseq+1)/2);
+        // two bases per byte, first base in the high nibble; index the table
+        // with an unsigned value so that bytes >= 0x80 cannot go negative
+        for (i = 0; i < c->l_qseq; ++i)
+            p[i/2] |= bam_nt16_table[(unsigned char)str->s[i]] << 4*(1-i%2);
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
+        z += str->l + 1;
+        if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
+            parse_error(fp->n_lines, "sequence and quality are inconsistent");
+        p += (c->l_qseq+1)/2;
+        if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
+        else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
+        doff += c->l_qseq + (c->l_qseq+1)/2;
+    }
+    doff0 = doff; // start of the auxiliary section
+    if (dret != '\n' && dret != '\r') { // aux
+        while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
+            uint8_t *s, type, key[2];
+            z += str->l + 1;
+            // every field has the form KK:T:value, so the value starts at offset 5
+            if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
+                parse_error(fp->n_lines, "missing colon in auxiliary data");
+            key[0] = str->s[0]; key[1] = str->s[1];
+            type = str->s[3];
+            s = alloc_data(b, doff + 3) + doff;
+            s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
+            if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
+                s = alloc_data(b, doff + 2) + doff;
+                *s++ = 'A'; *s = str->s[5];
+                doff += 2;
+            } else if (type == 'I' || type == 'i') {
+                // integers are stored in the smallest type that holds the value
+                long long x;
+                s = alloc_data(b, doff + 5) + doff;
+                x = (long long)atoll(str->s + 5);
+                if (x < 0) {
+                    if (x >= -127) {
+                        *s++ = 'c'; *(int8_t*)s = (int8_t)x;
+                        s += 1; doff += 2;
+                    } else if (x >= -32767) {
+                        *s++ = 's'; *(int16_t*)s = (int16_t)x;
+                        s += 2; doff += 3;
+                    } else {
+                        *s++ = 'i'; *(int32_t*)s = (int32_t)x;
+                        s += 4; doff += 5;
+                        if (x < -2147483648ll)
+                            fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                    (long long)fp->n_lines, x);
+                    }
+                } else {
+                    if (x <= 255) {
+                        *s++ = 'C'; *s++ = (uint8_t)x;
+                        doff += 2;
+                    } else if (x <= 65535) {
+                        *s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+                        s += 2; doff += 3;
+                    } else {
+                        *s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+                        s += 4; doff += 5;
+                        if (x > 4294967295ll)
+                            fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                    (long long)fp->n_lines, x);
+                    }
+                }
+            } else if (type == 'f') {
+                s = alloc_data(b, doff + 5) + doff;
+                *s++ = 'f';
+                *(float*)s = (float)atof(str->s + 5);
+                s += 4; doff += 5;
+            } else if (type == 'd') {
+                // store a full 8-byte double parsed from the value at offset 5
+                double dval = atof(str->s + 5);
+                s = alloc_data(b, doff + 9) + doff;
+                *s++ = 'd';
+                memcpy(s, &dval, 8);
+                s += 8; doff += 9;
+            } else if (type == 'Z' || type == 'H') {
+                int size = 1 + (str->l - 5) + 1; // type byte + text + NUL
+                if (type == 'H') { // check whether the hex string is valid
+                    int i;
+                    if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+                    for (i = 0; i < str->l - 5; ++i) {
+                        int c = toupper(str->s[5 + i]);
+                        if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
+                            parse_error(fp->n_lines, "invalid hex character");
+                    }
+                }
+                s = alloc_data(b, doff + size) + doff;
+                *s++ = type;
+                memcpy(s, str->s + 5, str->l - 5);
+                s[str->l - 5] = 0;
+                doff += size;
+            } else parse_error(fp->n_lines, "unrecognized type");
+            if (dret == '\n' || dret == '\r') break;
+        }
+    }
+    b->l_aux = doff - doff0;
+    b->data_len = doff;
+    return z;
+}
+
+/*
+ * Open a SAM text file (plain or gzip-compressed) for reading; "-" reads
+ * from stdin. Returns a handle to be released with sam_close(), or 0 when
+ * the file cannot be opened.
+ */
+tamFile sam_open(const char *fn)
+{
+    tamFile fp;
+    gzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+    if (gzfp == 0) return 0; // do not build a reader on top of a null stream
+    fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+    fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    fp->fp = gzfp;
+    fp->ks = ks_init(fp->fp);
+    return fp;
+}
+
+/* Release everything owned by a handle returned by sam_open(); a null handle is ignored. */
+void sam_close(tamFile fp)
+{
+    if (fp == 0) return;
+    ks_destroy(fp->ks);
+    gzclose(fp->fp);
+    free(fp->str->s);
+    free(fp->str);
+    free(fp);
+}
--- /dev/null
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+#include "knetfile.h"
+
+/*!
+ @header
+
+ Alignment indexing. Before indexing, BAM must be sorted based on the
+ leftmost coordinate of alignments. In indexing, BAM uses two indices:
+ a UCSC binning index and a simple linear index. The binning index is
+ efficient for alignments spanning long distance, while the auxiliary
+ linear index helps to reduce unnecessary seek calls especially for
+ short alignments.
+
+ The UCSC binning scheme was suggested by Richard Durbin and Lincoln
+ Stein and is explained by Kent et al. (2002). In this scheme, each bin
+ represents a contiguous genomic region which can be fully contained in
+ another bin; each alignment is associated with a bin which represents
+ the smallest region containing the entire alignment. The binning
+ scheme is essentially another representation of an R-tree. A distinct bin
+ uniquely corresponds to a distinct internal node in an R-tree. Bin A is
+ a child of Bin B if region A is contained in B.
+
+ In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
+ 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
+ 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
+ find the alignments overlapping a region [rbeg,rend), we need to
+ calculate the list of bins that may overlap the region and test
+ the alignments in the bins to confirm the overlaps. If the specified
+ region is short, typically only a few alignments in six bins need to
+ be retrieved. The overlapping alignments can be quickly fetched.
+
+ */
+
+#define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
+#define BAM_LIDX_SHIFT 14
+
+// A chunk of the BAM file: u is the file offset where it begins and v the
+// offset where it ends (see insert_offset, which stores beg in u and end in v).
+typedef struct {
+ uint64_t u, v;
+} pair64_t;
+
+// Chunks are ordered by their start offset only.
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(off, pair64_t, pair64_lt)
+
+// Growable array of chunks belonging to one bin: n entries are in use out
+// of m allocated in list.
+typedef struct {
+ uint32_t m, n;
+ pair64_t *list;
+} bam_binlist_t;
+
+// Linear index of one reference: offset[i] is a file offset recorded for the
+// i-th window of 1<<BAM_LIDX_SHIFT bases (0 when nothing was recorded);
+// n entries are meaningful out of m allocated.
+typedef struct {
+ int32_t n, m;
+ uint64_t *offset;
+} bam_lidx_t;
+
+// Hash map type "i": bin number -> list of chunks in that bin.
+KHASH_MAP_INIT_INT(i, bam_binlist_t)
+
+// Complete index of a BAM file. Both arrays have n entries, one per
+// reference sequence: index[] holds the binning index and index2[] the
+// linear index.
+struct __bam_index_t {
+ int32_t n;
+ khash_t(i) **index;
+ bam_lidx_t *index2;
+};
+
+// Append the chunk [beg,end) to the chunk list of `bin` in hash `h`,
+// creating the list on first use and doubling its capacity when it is full.
+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
+{
+    int is_new;
+    khint_t slot;
+    bam_binlist_t *chunks;
+    pair64_t *dst;
+    slot = kh_put(i, h, bin, &is_new);
+    chunks = &kh_value(h, slot);
+    if (is_new) { // not present
+        chunks->m = 1;
+        chunks->n = 0;
+        chunks->list = (pair64_t*)calloc(chunks->m, 16);
+    }
+    if (chunks->n == chunks->m) { // full: double the capacity
+        chunks->m <<= 1;
+        chunks->list = (pair64_t*)realloc(chunks->list, chunks->m * 16);
+    }
+    dst = &chunks->list[chunks->n++];
+    dst->u = beg;
+    dst->v = end;
+}
+
+// Record `offset` in the linear index for every window, after the first one,
+// that alignment b overlaps and that has no offset yet. Windows are
+// 1<<BAM_LIDX_SHIFT bases wide. The offset array grows (zero-filled) as
+// needed, and index2->n only ever increases, so a later alignment that ends
+// earlier cannot hide entries that were already recorded.
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
+{
+    int i, beg, end;
+    beg = b->core.pos >> BAM_LIDX_SHIFT;
+    end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+    if (index2->m < end + 1) {
+        int old_m = index2->m;
+        index2->m = end + 1;
+        kroundup32(index2->m);
+        index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+        memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // new slots start unset
+    }
+    for (i = beg + 1; i <= end; ++i)
+        if (index2->offset[i] == 0) index2->offset[i] = offset;
+    if (index2->n < end + 1) index2->n = end + 1;
+}
+
+// Coalesce neighbouring chunks inside every bin of every reference.
+// The body is only compiled when BAM_TRUE_OFFSET or BAM_VIRTUAL_OFFSET16 is
+// defined; otherwise this function does nothing.
+//  - BAM_TRUE_OFFSET: a chunk is merged into the previous kept chunk when it
+//    starts less than BAM_MIN_CHUNK_GAP after that chunk's end.
+//  - otherwise: it is merged when the previous end and its start are equal
+//    after dropping the low 16 bits (presumably the same compressed block;
+//    confirm against the virtual-offset definition).
+static void merge_chunks(bam_index_t *idx)
+{
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+ khash_t(i) *index;
+ int i, l, m;
+ khint_t k;
+ for (i = 0; i < idx->n; ++i) {
+ index = idx->index[i];
+ for (k = kh_begin(index); k != kh_end(index); ++k) {
+ bam_binlist_t *p;
+ if (!kh_exist(index, k)) continue;
+ p = &kh_value(index, k);
+ // m indexes the last kept chunk, l the chunk being examined
+ m = 0;
+ for (l = 1; l < p->n; ++l) {
+#ifdef BAM_TRUE_OFFSET
+ if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
+#else
+ if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
+#endif
+ else p->list[++m] = p->list[l];
+ } // ~for(l)
+ p->n = m + 1;
+ } // ~for(k)
+ } // ~for(i)
+#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+}
+
+// Build an in-memory index by scanning an opened BAM stream from its start.
+// The header is read to learn the number of reference sequences, then every
+// alignment is read in order. Consecutive alignments that share a bin form
+// one chunk, which is recorded when the bin changes. The program exits if
+// two consecutive alignments on the same reference are out of coordinate
+// order, or if the file position does not advance after a read. Scanning
+// stops at the first alignment with a negative tid that starts a new bin.
+// Returns the new index; release it with bam_index_destroy().
+bam_index_t *bam_index_core(bamFile fp)
+{
+ bam1_t *b;
+ bam_header_t *h;
+ int i, ret;
+ bam_index_t *idx;
+ uint32_t last_bin, save_bin;
+ int32_t last_coor, last_tid, save_tid;
+ bam1_core_t *c;
+ uint64_t save_off, last_off;
+
+ idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+ b = (bam1_t*)calloc(1, sizeof(bam1_t));
+ h = bam_header_read(fp);
+ c = &b->core;
+
+ idx->n = h->n_targets;
+ bam_header_destroy(h);
+ idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+ for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
+ idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+
+ // 0xffffffff marks "nothing seen yet" for the bin and tid trackers.
+ // save_* describe the chunk currently being accumulated, last_* the
+ // previous record; last_off is the file position before the current record.
+ save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
+ save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+ while ((ret = bam_read1(fp, b)) >= 0) {
+ if (last_tid != c->tid) { // change of chromosomes
+ last_tid = c->tid;
+ last_bin = 0xffffffffu;
+ } else if (last_coor > c->pos) {
+ fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
+ bam1_qname(b), last_coor, c->pos, c->tid+1);
+ exit(1);
+ }
+ // bins below 4681 are larger than the smallest (16Kbp) bins, so only
+ // alignments that do not fit in one smallest bin feed the linear index
+ if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+ if (c->bin != last_bin) { // then possibly write the binning index
+ if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+ insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
+ save_off = last_off;
+ save_bin = last_bin = c->bin;
+ save_tid = c->tid;
+ if (save_tid < 0) break;
+ }
+ if (bam_tell(fp) <= last_off) {
+ fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
+ (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
+ exit(1);
+ }
+ last_off = bam_tell(fp);
+ last_coor = b->core.pos;
+ }
+ // flush the chunk that was still open when the loop ended
+ if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+ merge_chunks(idx);
+ if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
+ free(b->data); free(b);
+ return idx;
+}
+
+// Free an index and everything it owns; a null pointer is ignored.
+void bam_index_destroy(bam_index_t *idx)
+{
+    int tid;
+    if (idx == 0) return;
+    for (tid = 0; tid < idx->n; ++tid) {
+        khash_t(i) *bins = idx->index[tid];
+        khint_t it = kh_begin(bins);
+        // release the chunk list stored in every occupied bucket
+        while (it != kh_end(bins)) {
+            if (kh_exist(bins, it))
+                free(kh_value(bins, it).list);
+            ++it;
+        }
+        kh_destroy(i, bins);
+        free(idx->index2[tid].offset);
+    }
+    free(idx->index);
+    free(idx->index2);
+    free(idx);
+}
+
+// Serialize an index to fp and flush the stream.
+// Layout written: the 4-byte magic "BAI\1"; the 32-bit number of references;
+// then for each reference: the number of bins, and for each bin its 32-bit
+// key, its 32-bit chunk count and the chunks (16 bytes each); followed by
+// the linear index as a 32-bit count and that many 64-bit offsets.
+// When bam_is_be is set, every value is byte-swapped before it is written;
+// arrays are swapped in place, written, and swapped back so the in-memory
+// index is unchanged on return. Return values of fwrite are not checked.
+void bam_index_save(const bam_index_t *idx, FILE *fp)
+{
+ int32_t i, size;
+ khint_t k;
+ fwrite("BAI\1", 1, 4, fp);
+ if (bam_is_be) {
+ uint32_t x = idx->n;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&idx->n, 4, 1, fp);
+ for (i = 0; i < idx->n; ++i) {
+ khash_t(i) *index = idx->index[i];
+ bam_lidx_t *index2 = idx->index2 + i;
+ // write binning index
+ size = kh_size(index);
+ if (bam_is_be) { // big endian
+ uint32_t x = size;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&size, 4, 1, fp);
+ for (k = kh_begin(index); k != kh_end(index); ++k) {
+ if (kh_exist(index, k)) {
+ bam_binlist_t *p = &kh_value(index, k);
+ if (bam_is_be) { // big endian
+ uint32_t x;
+ x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ // swap in place for writing ...
+ for (x = 0; (int)x < p->n; ++x) {
+ bam_swap_endian_8p(&p->list[x].u);
+ bam_swap_endian_8p(&p->list[x].v);
+ }
+ fwrite(p->list, 16, p->n, fp);
+ // ... then swap back to restore the in-memory values
+ for (x = 0; (int)x < p->n; ++x) {
+ bam_swap_endian_8p(&p->list[x].u);
+ bam_swap_endian_8p(&p->list[x].v);
+ }
+ } else {
+ fwrite(&kh_key(index, k), 4, 1, fp);
+ fwrite(&p->n, 4, 1, fp);
+ fwrite(p->list, 16, p->n, fp);
+ }
+ }
+ }
+ // write linear index (index2)
+ if (bam_is_be) {
+ int x = index2->n;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&index2->n, 4, 1, fp);
+ if (bam_is_be) { // big endian
+ int x;
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ fwrite(index2->offset, 8, index2->n, fp);
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ } else fwrite(index2->offset, 8, index2->n, fp);
+ }
+ fflush(fp);
+}
+
+// Read an index in the layout produced by bam_index_save() from fp.
+// Values are byte-swapped after reading when bam_is_be is set.
+// Returns the new index, or 0 when fp is null, the magic cannot be read, or
+// the magic is not "BAI\1". The stream is never closed here: it belongs to
+// the caller, which closes it after this function returns.
+static bam_index_t *bam_index_load_core(FILE *fp)
+{
+    int i;
+    char magic[4];
+    bam_index_t *idx;
+    if (fp == 0) {
+        fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+        return 0;
+    }
+    // a short read leaves magic unset, so treat it like a bad magic
+    if (fread(magic, 1, 4, fp) != 4 || strncmp(magic, "BAI\1", 4)) {
+        fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+        return 0;
+    }
+    idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+    fread(&idx->n, 4, 1, fp);
+    if (bam_is_be) bam_swap_endian_4p(&idx->n);
+    idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+    idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+    for (i = 0; i < idx->n; ++i) {
+        khash_t(i) *index;
+        bam_lidx_t *index2 = idx->index2 + i;
+        uint32_t key, size;
+        khint_t k;
+        int j, ret;
+        bam_binlist_t *p;
+        index = idx->index[i] = kh_init(i);
+        // load binning index
+        fread(&size, 4, 1, fp);
+        if (bam_is_be) bam_swap_endian_4p(&size);
+        for (j = 0; j < (int)size; ++j) {
+            fread(&key, 4, 1, fp);
+            if (bam_is_be) bam_swap_endian_4p(&key);
+            k = kh_put(i, index, key, &ret);
+            p = &kh_value(index, k);
+            fread(&p->n, 4, 1, fp);
+            if (bam_is_be) bam_swap_endian_4p(&p->n);
+            p->m = p->n;
+            p->list = (pair64_t*)malloc(p->m * 16);
+            fread(p->list, 16, p->n, fp);
+            if (bam_is_be) {
+                int x;
+                for (x = 0; x < p->n; ++x) {
+                    bam_swap_endian_8p(&p->list[x].u);
+                    bam_swap_endian_8p(&p->list[x].v);
+                }
+            }
+        }
+        // load linear index
+        fread(&index2->n, 4, 1, fp);
+        if (bam_is_be) bam_swap_endian_4p(&index2->n);
+        index2->m = index2->n;
+        index2->offset = (uint64_t*)calloc(index2->m, 8);
+        fread(index2->offset, index2->n, 8, fp);
+        if (bam_is_be)
+            for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+    }
+    return idx;
+}
+
+// Load the index that sits next to a BAM file on the local disk.
+// For an "ftp://" URL only the part after the last '/' is used, i.e. the
+// index is looked up in the working directory. "<name>.bai" is tried first;
+// if that cannot be opened and the name ends in "bam", the name with its
+// last character replaced by 'i' ("<base>.bai") is tried.
+// Returns the loaded index, or 0 when no index file could be opened or read.
+bam_index_t *bam_index_load_local(const char *_fn)
+{
+    FILE *fp;
+    char *fnidx, *fn;
+    size_t len;
+
+    if (strstr(_fn, "ftp://") == _fn) {
+        const char *p;
+        int l = strlen(_fn);
+        for (p = _fn + l - 1; p >= _fn; --p)
+            if (*p == '/') break;
+        fn = strdup(p + 1);
+    } else fn = strdup(_fn);
+    len = strlen(fn);
+    fnidx = (char*)calloc(len + 5, 1);
+    strcpy(fnidx, fn); strcat(fnidx, ".bai");
+    fp = fopen(fnidx, "rb");
+    // try "{base}.bai"; compare the real suffix so that an earlier "bam"
+    // inside the name (or a name shorter than 3 characters) cannot mislead us
+    if (fp == 0 && len >= 3 && strcmp(fn + len - 3, "bam") == 0) {
+        strcpy(fnidx, fn);
+        fnidx[len-1] = 'i';
+        fp = fopen(fnidx, "rb");
+    }
+    free(fnidx); free(fn);
+    if (fp) {
+        bam_index_t *idx = bam_index_load_core(fp);
+        fclose(fp);
+        return idx;
+    } else return 0;
+}
+
+// Copy the remote file at `url` into the working directory under the name
+// that follows the last '/' of the URL. Only "ftp://" URLs are handled;
+// anything else returns immediately. Failures to open either side are
+// reported on stderr. Copying stops at end of file or at the first read
+// that returns a non-positive count.
+static void download_from_remote(const char *url)
+{
+    const int buf_size = 1 * 1024 * 1024;
+    const char *fn;
+    FILE *fp;
+    uint8_t *buf;
+    knetFile *fp_remote;
+    int l;
+    if (strstr(url, "ftp://") != url) return;
+    l = strlen(url);
+    for (fn = url + l - 1; fn >= url; --fn)
+        if (*fn == '/') break;
+    ++fn; // fn now points to the file name
+    fp_remote = knet_open(url, "r");
+    if (fp_remote == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+        return;
+    }
+    if ((fp = fopen(fn, "wb")) == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+        knet_close(fp_remote);
+        return;
+    }
+    buf = (uint8_t*)calloc(buf_size, 1);
+    // "> 0" rather than "!= 0": a negative count must never reach fwrite()
+    while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+        fwrite(buf, 1, l, fp);
+    free(buf);
+    fclose(fp);
+    knet_close(fp_remote);
+}
+
+// Load the index for BAM file `fn`. A local copy is tried first; for an
+// "ftp://" URL with no local copy, "<fn>.bai" is downloaded into the working
+// directory and the local lookup is repeated.
+// Returns the index, or 0 (after a message on stderr) when none could be loaded.
+bam_index_t *bam_index_load(const char *fn)
+{
+    bam_index_t *idx;
+    idx = bam_index_load_local(fn);
+    if (idx == 0 && strstr(fn, "ftp://") == fn) {
+        char *fnidx = calloc(strlen(fn) + 5, 1);
+        strcat(strcpy(fnidx, fn), ".bai");
+        fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
+        download_from_remote(fnidx);
+        free(fnidx); // only needed for the download
+        idx = bam_index_load_local(fn);
+    }
+    if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
+    return idx;
+}
+
+// Index the BAM file `fn` and write the result to `_fnidx`, or to
+// "<fn>.bai" when `_fnidx` is null.
+// Returns 0 on success and -1 when the BAM file cannot be opened or the
+// index file cannot be created.
+int bam_index_build2(const char *fn, const char *_fnidx)
+{
+    char *fnidx;
+    FILE *fpidx;
+    bamFile fp;
+    bam_index_t *idx;
+    if ((fp = bam_open(fn, "r")) == 0) {
+        fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
+        return -1;
+    }
+    idx = bam_index_core(fp);
+    bam_close(fp);
+    if (_fnidx == 0) {
+        fnidx = (char*)calloc(strlen(fn) + 5, 1);
+        strcpy(fnidx, fn); strcat(fnidx, ".bai");
+    } else fnidx = strdup(_fnidx);
+    fpidx = fopen(fnidx, "wb");
+    if (fpidx == 0) {
+        fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
+        bam_index_destroy(idx); // do not leak the index that was just built
+        free(fnidx);
+        return -1;
+    }
+    bam_index_save(idx, fpidx);
+    bam_index_destroy(idx);
+    fclose(fpidx);
+    free(fnidx);
+    return 0;
+}
+
+// Index `fn` and write the index to the default location "<fn>.bai".
+// Returns whatever bam_index_build2() returns.
+int bam_index_build(const char *fn)
+{
+    int status = bam_index_build2(fn, 0);
+    return status;
+}
+
+// Entry point of "samtools index <in.bam> [<out.index>]".
+// Returns 0 on success and 1 on a usage error or when building the index
+// fails, so that a failure is visible in the exit status.
+int bam_index(int argc, char *argv[])
+{
+    int ret;
+    if (argc < 2) {
+        fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
+        return 1;
+    }
+    if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
+    else ret = bam_index_build(argv[1]);
+    return ret < 0? 1 : 0;
+}
+
+#define MAX_BIN 37450 // =(8^6-1)/7+1
+
+// Fill `list` with the number of every bin that may hold an alignment
+// overlapping [beg,end): bin 0 first, then the candidate bins of each finer
+// level in turn. Returns how many bin numbers were written.
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
+{
+    // number of the first bin of each level below the root, and the width
+    // of a bin on that level expressed as a shift
+    static const int level_first[5] = { 1, 9, 73, 585, 4681 };
+    static const int level_shift[5] = { 26, 23, 20, 17, 14 };
+    int n = 0, lvl, bin, last;
+    --end; // work with the last base covered by the region
+    list[n++] = 0;
+    for (lvl = 0; lvl < 5; ++lvl) {
+        last = level_first[lvl] + (end >> level_shift[lvl]);
+        for (bin = level_first[lvl] + (beg >> level_shift[lvl]); bin <= last; ++bin)
+            list[n++] = bin;
+    }
+    return n;
+}
+
+// Return 1 when alignment b overlaps the region [beg,end), else 0.
+// An alignment without CIGAR is treated as covering one base at its position.
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+    uint32_t aln_beg = b->core.pos;
+    uint32_t aln_end;
+    if (b->core.n_cigar) aln_end = bam_calend(&b->core, bam1_cigar(b));
+    else aln_end = b->core.pos + 1;
+    return (aln_beg < end && aln_end > beg);
+}
+
+// Call func(b, data) for every alignment on reference `tid` that overlaps
+// the region [beg,end).
+//
+// The candidate bins of the region are looked up in the binning index;
+// chunks that end at or before the linear-index offset of the window
+// holding `beg` are dropped; the remaining chunks are sorted, de-duplicated
+// and read in order, seeking only between chunks that are not adjacent.
+// Reading stops at the first alignment on another reference or starting at
+// or beyond `end`.
+//
+// Always returns 0. Nothing is read when `tid` is outside the index or when
+// no chunk can contain a matching alignment.
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+    uint16_t *bins;
+    int i, n_bins, n_off;
+    pair64_t *off;
+    khint_t k;
+    khash_t(i) *index;
+    uint64_t min_off;
+
+    if (tid < 0 || tid >= idx->n) return 0; // reference not covered by this index
+    bins = (uint16_t*)calloc(MAX_BIN, 2);
+    n_bins = reg2bins(beg, end, bins);
+    index = idx->index[tid];
+    min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+    // first pass: upper bound on the number of chunks
+    for (i = n_off = 0; i < n_bins; ++i) {
+        if ((k = kh_get(i, index, bins[i])) != kh_end(index))
+            n_off += kh_value(index, k).n;
+    }
+    if (n_off == 0) {
+        free(bins); return 0;
+    }
+    off = (pair64_t*)calloc(n_off, 16);
+    // second pass: keep the chunks that end after min_off
+    for (i = n_off = 0; i < n_bins; ++i) {
+        if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
+            int j;
+            bam_binlist_t *p = &kh_value(index, k);
+            for (j = 0; j < p->n; ++j)
+                if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+        }
+    }
+    free(bins);
+    if (n_off == 0) { // every chunk was filtered out: nothing to read
+        free(off);
+        return 0;
+    }
+    {
+        bam1_t *b;
+        int l, ret, n_seeks;
+        uint64_t curr_off;
+        b = (bam1_t*)calloc(1, sizeof(bam1_t));
+        ks_introsort(off, n_off, off);
+        // resolve completely contained adjacent blocks
+        for (i = 1, l = 0; i < n_off; ++i)
+            if (off[l].v < off[i].v)
+                off[++l] = off[i];
+        n_off = l + 1;
+        // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+        for (i = 1; i < n_off; ++i)
+            if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+        { // merge adjacent blocks
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+            for (i = 1, l = 0; i < n_off; ++i) {
+#ifdef BAM_TRUE_OFFSET
+                if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
+#else
+                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+#endif
+                else off[++l] = off[i];
+            }
+            n_off = l + 1;
+#endif
+        }
+        // retrieve alignments
+        n_seeks = 0; i = -1; curr_off = 0;
+        for (;;) {
+            if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
+                if (i == n_off - 1) break; // no more chunks
+                if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+                if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+                    bam_seek(fp, off[i+1].u, SEEK_SET);
+                    curr_off = bam_tell(fp);
+                    ++n_seeks;
+                }
+                ++i;
+            }
+            if ((ret = bam_read1(fp, b)) > 0) {
+                curr_off = bam_tell(fp);
+                if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
+                else if (is_overlap(beg, end, b)) func(b, data);
+            } else break; // end of file
+        }
+//      fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
+        bam_destroy1(b);
+    }
+    free(off);
+    return 0;
+}
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "bam.h"
+#include "ksort.h"
+
+#define TV_GAP 2
+
+/* Node of a singly linked list of display levels. `level` (28 bits) is the
+ * level number and `cnt` (4 bits) a small countdown; a node handed back to
+ * the pool has its cnt set to TV_GAP (see mp_free). */
+typedef struct __freenode_t {
+ uint32_t level:28, cnt:4;
+ struct __freenode_t *next;
+} freenode_t, *freenode_p;
+
+/* Sort order for nodes: smaller cnt first, ties broken by smaller level. */
+#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level))
+KSORT_INIT(node, freenode_p, freenode_lt)
+
+/* Memory pool, similar to the one in bam_pileup.c.
+ * cnt counts the nodes currently handed out (mp_alloc increments it,
+ * mp_free decrements it); buf is a stack of recycled nodes with n entries
+ * in use out of max allocated. */
+typedef struct {
+ int cnt, n, max;
+ freenode_t **buf;
+} mempool_t;
+
+/* Create an empty, zero-initialised pool. */
+static mempool_t *mp_init()
+{
+    mempool_t *pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+    return pool;
+}
+/* Free every recycled node held by the pool, then the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+    int idx = 0;
+    while (idx < mp->n) {
+        free(mp->buf[idx]);
+        ++idx;
+    }
+    free(mp->buf);
+    free(mp);
+}
+/* Hand out a node: the most recently recycled one when the pool has any,
+ * otherwise a freshly zeroed allocation. Counts the node as in use. */
+static inline freenode_t *mp_alloc(mempool_t *mp)
+{
+    freenode_t *node;
+    ++mp->cnt;
+    if (mp->n != 0) node = mp->buf[--mp->n];
+    else node = (freenode_t*)calloc(1, sizeof(freenode_t));
+    return node;
+}
+/* Return node p to the pool for reuse: unlink it, reset its countdown to
+ * TV_GAP and push it on the recycle stack, growing the stack when full. */
+static inline void mp_free(mempool_t *mp, freenode_t *p)
+{
+    --mp->cnt;
+    p->cnt = TV_GAP;
+    p->next = 0;
+    if (mp->n == mp->max) {
+        if (mp->max) mp->max = mp->max<<1;
+        else mp->max = 256;
+        mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
+    }
+    mp->buf[mp->n] = p;
+    ++mp->n;
+}
+
+/* core part */
+/* State of the level-assigning pileup buffer.
+ *   max                  - allocated length of cur_level[] and pre_level[]
+ *   n_cur, n_pre         - number of entries used in cur_level[] / pre_level[]
+ *   max_level            - highest level assigned at the previous position
+ *   cur_level, pre_level - level of each read at the current position, and
+ *                          of each read carried over from the previous one
+ *   mp                   - pool the list nodes come from
+ *   head, tail           - linked list of reusable levels; tail is an empty
+ *                          terminating node (loops stop at the node whose
+ *                          next is null)
+ *   aux, m_aux           - scratch array used to sort the list, and its capacity
+ *   n_nodes              - number of list nodes excluding the tail
+ *   func, user_data      - client callback and its argument
+ *   plbuf                - underlying pileup buffer that drives tview_func
+ */
+struct __bam_lplbuf_t {
+ int max, n_cur, n_pre;
+ int max_level, *cur_level, *pre_level;
+ mempool_t *mp;
+ freenode_t **aux, *head, *tail;
+ int n_nodes, m_aux;
+ bam_pileup_f func;
+ void *user_data;
+ bam_plbuf_t *plbuf;
+};
+
+/* Reset the buffer to its initial state: the underlying pileup buffer is
+ * reset, every list node except the terminating tail goes back to the pool,
+ * and all level bookkeeping is cleared. */
+void bam_lplbuf_reset(bam_lplbuf_t *buf)
+{
+    freenode_t *node, *following;
+    bam_plbuf_reset(buf->plbuf);
+    node = buf->head;
+    while (node->next) { // stop at the tail, which is kept
+        following = node->next;
+        mp_free(buf->mp, node);
+        node = following;
+    }
+    buf->head = buf->tail;
+    buf->max_level = 0;
+    buf->n_cur = 0;
+    buf->n_pre = 0;
+    buf->n_nodes = 0;
+}
+
+/* Pileup callback installed by bam_lplbuf_init(): gives every read in the
+ * column a `level`, then forwards the column to the client callback.
+ *
+ * A read that starts here (is_head) takes the level of the list head when
+ * that node's cnt has reached 0, otherwise a brand-new level. Any other
+ * read keeps the level it had at the previous position, taken in order from
+ * pre_level[]. A non-head read that ends here (is_tail) gives its level
+ * back by appending it to the list. After the client callback the list is
+ * pruned of levels above the current maximum and sorted with freenode_lt.
+ * `data` is the bam_lplbuf_t passed to bam_plbuf_init(). Always returns 0. */
+static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+ bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
+ freenode_t *p;
+ int i, l, max_level;
+ // allocate memory if necessary
+ if (tv->max < n) { // enlarge
+ tv->max = n;
+ kroundup32(tv->max);
+ tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
+ tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
+ }
+ tv->n_cur = n;
+ // update cnt
+ for (p = tv->head; p->next; p = p->next)
+ if (p->cnt > 0) --p->cnt;
+ // calculate cur_level[]
+ max_level = 0;
+ for (i = l = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->is_head) {
+ if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
+ freenode_t *p = tv->head->next;
+ tv->cur_level[i] = tv->head->level;
+ mp_free(tv->mp, tv->head);
+ tv->head = p;
+ --tv->n_nodes;
+ } else tv->cur_level[i] = ++tv->max_level;
+ } else {
+ // l walks pre_level[], which holds the surviving reads in order
+ tv->cur_level[i] = tv->pre_level[l++];
+ if (p->is_tail) { // then return a free slot
+ // the old tail receives the level; a fresh empty node becomes the tail
+ tv->tail->level = tv->cur_level[i];
+ tv->tail->next = mp_alloc(tv->mp);
+ tv->tail = tv->tail->next;
+ ++tv->n_nodes;
+ }
+ }
+ if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
+ // the const is cast away to publish the level to the client callback
+ ((bam_pileup1_t*)p)->level = tv->cur_level[i];
+ }
+ assert(l == tv->n_pre);
+ tv->func(tid, pos, n, pl, tv->user_data);
+ // sort the linked list
+ if (tv->n_nodes) {
+ freenode_t *q;
+ if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
+ tv->m_aux = tv->n_nodes + 1;
+ kroundup32(tv->m_aux);
+ tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
+ }
+ for (p = tv->head, i = l = 0; p->next;) {
+ if (p->level > max_level) { // then discard this entry
+ q = p->next;
+ mp_free(tv->mp, p);
+ p = q;
+ } else {
+ tv->aux[i++] = p;
+ p = p->next;
+ }
+ }
+ tv->aux[i] = tv->tail; // add a proper tail for the loop below
+ tv->n_nodes = i;
+ if (tv->n_nodes) {
+ ks_introsort(node, tv->n_nodes, tv->aux);
+ // relink the nodes in sorted order, ending at the tail
+ for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1];
+ tv->head = tv->aux[0];
+ } else tv->head = tv->tail;
+ }
+ // clean up
+ tv->max_level = max_level;
+ memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4);
+ // squeeze out terminated levels
+ for (i = l = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (!p->is_tail)
+ tv->pre_level[l++] = tv->pre_level[i];
+ }
+ tv->n_pre = l;
+/*
+ fprintf(stderr, "%d\t", pos+1);
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->is_head) fprintf(stderr, "^");
+ if (p->is_tail) fprintf(stderr, "$");
+ fprintf(stderr, "%d,", p->level);
+ }
+ fprintf(stderr, "\n");
+*/
+ return 0;
+}
+
+/* Create a level-assigning pileup buffer. `func` and `data` are the user
+ * callback and its argument. The wrapped bam_plbuf_t is wired to
+ * tview_func() so that levels are filled in before `func` runs. The list
+ * of released levels starts as one node that is both head and tail. */
+bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
+{
+    bam_lplbuf_t *lp = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
+    lp->mp = mp_init();
+    lp->tail = mp_alloc(lp->mp);
+    lp->head = lp->tail;
+    lp->func = func;
+    lp->user_data = data;
+    lp->plbuf = bam_plbuf_init(tview_func, lp);
+    return lp;
+}
+
+/* Release everything owned by `tv`: the level arrays, the wrapped pileup
+ * buffer, the sort scratch array, every list node (end marker included)
+ * and finally the pool and the structure itself. */
+void bam_lplbuf_destroy(bam_lplbuf_t *tv)
+{
+    freenode_t *node, *succ;
+    free(tv->cur_level);
+    free(tv->pre_level);
+    bam_plbuf_destroy(tv->plbuf);
+    free(tv->aux);
+    /* walk the whole list; the last node is the one whose next is null */
+    node = tv->head;
+    do {
+        succ = node->next;
+        mp_free(tv->mp, node);
+        node = succ;
+    } while (node);
+    assert(tv->mp->cnt == 0);
+    mp_destroy(tv->mp);
+    free(tv);
+}
+
+/* Feed one alignment (or a null pointer) to the wrapped pileup buffer and
+ * pass its return value through. */
+int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
+{
+    bam_plbuf_t *inner = tv->plbuf;
+    return bam_plbuf_push(b, inner);
+}
+
+/* Run a level pileup over every record readable from `fp`, calling `func`
+ * with `func_data`. `mask` is forwarded to bam_plbuf_set_mask().
+ * Always returns 0. */
+int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
+{
+    bam1_t *rec = (bam1_t*)calloc(1, sizeof(bam1_t));
+    bam_lplbuf_t *lp = bam_lplbuf_init(func, func_data);
+    bam_plbuf_set_mask(lp->plbuf, mask);
+    for (;;) {
+        if (bam_read1(fp, rec) < 0) break;
+        bam_lplbuf_push(rec, lp);
+    }
+    bam_lplbuf_push(0, lp); /* a null record is pushed once the input is exhausted */
+    bam_lplbuf_destroy(lp);
+    free(rec->data);
+    free(rec);
+    return 0;
+}
--- /dev/null
+#include <math.h>
+#include "bam.h"
+#include "bam_maqcns.h"
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint32_t)
+
+#define MAX_WINDOW 33
+
+typedef struct __bmc_aux_t { /* scratch buffer reused across bam_maqcns_glfgen() calls */
+ int max; // number of elements allocated in info
+ uint32_t *info; // one packed word per read, filled and sorted in bam_maqcns_glfgen()
+} bmc_aux_t;
+
+typedef struct { /* per-site accumulators used by bam_maqcns_glfgen(); the arrays are indexed by base 0..3 */
+ float esum[4], fsum[4]; // esum: sum of fk-weighted qualities; fsum: sum of the fk weights themselves
+ uint32_t c[4]; // number of reads counted for each base
+ uint32_t rms_mapQ; // root mean square of the capped mapping qualities
+} glf_call_aux_t;
+
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; // maps a 4-bit base code to 0..3 for the codes 1, 2, 4, 8 and to 4 for every other code
+
+/*
+ P(<b1,b2>) = \theta \sum_{i=1}^{N-1} 1/i
+ P(D|<b1,b2>) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2]
+ p_k = 1/k / \sum_{i=1}^{N-1} 1/i
+ */
+/*
+ * Rebuild aa->lhet and aa->q_r from aa->n_hap and aa->het_rate.
+ *
+ * aa->lhet is a 256x256 table indexed as n1<<8|n2. Each entry is the log
+ * of the binomial coefficient C(n1+n2, n1) plus the log of the mixture
+ * sum over k = 1..n_hap-1 described in the formula above. Any previous
+ * table is freed first. aa->q_r is -4.343*log(2r/(1-r)) with
+ * r = het_rate * (harmonic sum over 1..n_hap-1).
+ */
+static void cal_het(bam_maqcns_t *aa)
+{
+    int k, n1, n2;
+    double sum_harmo; // harmonic sum
+    double poly_rate;
+
+    free(aa->lhet);
+    aa->lhet = (double*)calloc(256 * 256, sizeof(double));
+    sum_harmo = 0.0;
+    for (k = 1; k <= aa->n_hap - 1; ++k)
+        sum_harmo += 1.0 / k;
+    for (n1 = 0; n1 < 256; ++n1) {
+        for (n2 = 0; n2 < 256; ++n2) {
+            long double sum = 0.0;
+            double lC = lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1}
+            for (k = 1; k <= aa->n_hap - 1; ++k) {
+                double pk = 1.0 / k / sum_harmo;
+                double log1 = log((double)k/aa->n_hap);
+                double log2 = log(1.0 - (double)k/aa->n_hap);
+                sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2));
+            }
+            aa->lhet[n1<<8|n2] = lC + logl(sum);
+        }
+    }
+    poly_rate = aa->het_rate * sum_harmo;
+    aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate));
+}
+
+/** initialize the helper structure */
+static void cal_coef(bam_maqcns_t *aa)
+{ /* Rebuild aa->fk (256 entries) and aa->coef (256*256*64 entries, indexed
+   * as q<<16 | n<<8 | k) from aa->theta and aa->eta. Any previous tables
+   * are freed first. Only q in 1..63 and n in 1..255 are filled; all other
+   * entries keep the zero written by calloc(). */
+ int k, n, q;
+ long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256];
+ double *lC;
+
+ lC = (double*)calloc(256 * 256, sizeof(double)); // log binomial coefficients, indexed as n<<8|k
+ // aa->lhet is not touched here; it is allocated and initialized in cal_het()
+ free(aa->fk); free(aa->coef);
+ aa->fk = (double*)calloc(256, sizeof(double));
+ aa->coef = (double*)calloc(256*256*64, sizeof(double));
+ aa->fk[0] = fk2[0] = 1.0;
+ for (n = 1; n != 256; ++n) {
+ aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta;
+ fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands
+ }
+ for (n = 1; n != 256; ++n)
+ for (k = 1; k <= n; ++k) // k == 0 keeps the calloc() zero, which is log C(n,0)
+ lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+ for (q = 1; q != 64; ++q) {
+ double e = pow(10.0, -q/10.0); // error probability for phred-scaled quality q
+ double le = log(e);
+ double le1 = log(1.0-e);
+ for (n = 1; n != 256; ++n) {
+ double *coef = aa->coef + (q<<16|n<<8);
+ sum_a[n+1] = 0.0;
+ for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k}
+ sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1);
+ b[k] = sum_a[k+1] / sum_a[k];
+ if (b[k] > 0.99) b[k] = 0.99;
+ }
+ for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k})
+ q_c[k] = -4.343 * fk2[k] * logl(b[k] / e);
+ for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i
+ for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9
+ tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k])));
+ coef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk}
+ }
+ }
+ }
+ free(lC);
+}
+
+/* Allocate a consensus-calling model with its default parameters and an
+ * empty scratch buffer. The lookup tables are not built here; that is done
+ * by bam_maqcns_prepare(). */
+bam_maqcns_t *bam_maqcns_init()
+{
+    bam_maqcns_t *cns = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t));
+    cns->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t));
+    cns->n_hap = 2;
+    cns->cap_mapQ = 60;
+    cns->theta = 0.85;
+    cns->eta = 0.03;
+    cns->het_rate = 0.001;
+    return cns;
+}
+
+/* Build the lookup tables of `bm` from its current parameters:
+ * first fk/coef, then lhet and q_r. */
+void bam_maqcns_prepare(bam_maqcns_t *bm)
+{
+    cal_coef(bm);
+    cal_het(bm);
+}
+
+/* Free a model created by bam_maqcns_init(), including its tables and
+ * scratch buffer. A null pointer is accepted and ignored. */
+void bam_maqcns_destroy(bam_maqcns_t *bm)
+{
+    if (bm != 0) {
+        free(bm->lhet);
+        free(bm->fk);
+        free(bm->coef);
+        free(bm->aux->info);
+        free(bm->aux);
+        free(bm);
+    }
+}
+
+/*
+ * Compute the genotype likelihoods of one site from its pileup.
+ *
+ * _n, pl:   the pileup entries; deleted bases and unmapped reads are skipped.
+ * ref_base: 4-bit reference base. It is stored in the result and used in
+ *           place of a read base whose 4-bit code is 0.
+ * bm:       model whose tables were built by bam_maqcns_prepare(); bm->aux
+ *           is used (and grown) as scratch space.
+ *
+ * Returns a calloc()ed glf1_t that the caller must free(). When _n is 0
+ * only ref_base is set in it.
+ *
+ * Each retained read is packed into one 32-bit word before sorting:
+ *   bits 24-31  min(base quality, mapping quality)
+ *   bit  21     set when the base is one of the four plain nucleotides
+ *   bit  18     strand
+ *   bits 16-17  base index 0..3
+ *   bits 8-15   base quality
+ *   bits 0-7    mapping quality
+ */
+glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm)
+{
+    glf_call_aux_t *b;
+    int i, j, k, w[8], c, n;
+    glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t));
+    float p[16], min_p = 1e30;
+    uint64_t rms;
+
+    g->ref_base = ref_base;
+    if (_n == 0) return g;
+
+    // construct aux array
+    if (bm->aux->max < _n) {
+        bm->aux->max = _n;
+        kroundup32(bm->aux->max);
+        bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max);
+    }
+    for (i = n = 0; i < _n; ++i) {
+        const bam_pileup1_t *p = pl + i;
+        uint32_t q, x = 0, qq;
+        if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue;
+        q = (uint32_t)bam1_qual(p->b)[p->qpos];
+        x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
+        if (p->b->core.qual < q) q = p->b->core.qual;
+        x |= q << 24;
+        qq = bam1_seqi(bam1_seq(p->b), p->qpos);
+        q = bam_nt16_nt4_table[qq? qq : ref_base];
+        if (!p->is_del && q < 4) x |= 1 << 21 | q << 16;
+        bm->aux->info[n++] = x;
+    }
+    ks_introsort(uint32_t, n, bm->aux->info);
+    // generate esum and fsum
+    b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t));
+    for (k = 0; k != 8; ++k) w[k] = 0;
+    rms = 0;
+    for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
+        uint32_t info = bm->aux->info[j];
+        int tmp;
+        if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
+        k = info>>16&7;
+        if (info>>24 > 0) {
+            b->esum[k&3] += bm->fk[w[k]] * (info>>24);
+            b->fsum[k&3] += bm->fk[w[k]];
+            if (w[k] < 0xff) ++w[k];
+            ++b->c[k&3];
+        }
+        tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ;
+        rms += tmp * tmp;
+    }
+    // n is 0 when every entry was skipped above; rms / n would then be NaN
+    // and converting NaN to an integer is undefined, so report 0 instead
+    b->rms_mapQ = n > 0? (uint8_t)(sqrt((double)rms / n) + .499) : 0;
+    // rescale ->c[]
+    for (j = c = 0; j != 4; ++j) c += b->c[j];
+    if (c > 255) {
+        for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5);
+        for (j = c = 0; j != 4; ++j) c += b->c[j];
+    }
+    // generate likelihood
+    for (j = 0; j != 4; ++j) {
+        // homozygous
+        float tmp1, tmp3;
+        int tmp2, bar_e;
+        for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) {
+            if (j == k) continue;
+            tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k];
+        }
+        if (tmp2) {
+            bar_e = (int)(tmp1 / tmp3 + 0.5);
+            if (bar_e < 4) bar_e = 4; // should not happen
+            if (bar_e > 63) bar_e = 63;
+            p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+        } else p[j<<2|j] = 0.0; // all the bases are j
+        // heterozygous
+        for (k = j + 1; k < 4; ++k) {
+            for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) {
+                if (i == j || i == k) continue;
+                tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i];
+            }
+            if (tmp2) {
+                bar_e = (int)(tmp1 / tmp3 + 0.5);
+                if (bar_e < 4) bar_e = 4;
+                if (bar_e > 63) bar_e = 63;
+                p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+            } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k
+        }
+        //
+        for (k = 0; k != 4; ++k)
+            if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0;
+    }
+
+    { // fix p[k<<2|k]
+        float max1, max2, min1, min2;
+        int max_k, min_k;
+        max_k = min_k = -1;
+        max1 = max2 = -1.0; min1 = min2 = 1e30;
+        for (k = 0; k < 4; ++k) {
+            if (b->esum[k] > max1) {
+                max2 = max1; max1 = b->esum[k]; max_k = k;
+            } else if (b->esum[k] > max2) max2 = b->esum[k];
+        }
+        for (k = 0; k < 4; ++k) {
+            if (p[k<<2|k] < min1) {
+                min2 = min1; min1 = p[k<<2|k]; min_k = k;
+            } else if (p[k<<2|k] < min2) min2 = p[k<<2|k];
+        }
+        if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2))
+            p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0;
+    }
+
+    // convert necessary information to glf1_t
+    g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ;
+    g->depth = n > 16777215? 16777215 : n;
+    for (j = 0; j != 4; ++j)
+        for (k = j; k < 4; ++k)
+            if (p[j<<2|k] < min_p) min_p = p[j<<2|k];
+    g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5);
+    for (j = c = 0; j != 4; ++j)
+        for (k = j; k < 4; ++k)
+            g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5);
+
+    free(b);
+    return g;
+}
+
+/*
+ * Turn the ten genotype likelihoods in g->lk into a packed consensus call.
+ *
+ * q_r is added to every heterozygous genotype before ranking. The result is
+ *   best genotype<<28 | second best<<24 | g->max_mapQ<<16
+ *   | (second - best, capped at 255)<<8 | (third - second, capped at 255)
+ * where a genotype is the OR of one bit per allele (1<<base). A field is
+ * all ones when the corresponding genotype or difference does not exist.
+ */
+uint32_t glf2cns(const glf1_t *g, int q_r)
+{
+    int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1;
+    uint32_t x = 0;
+    // expand lk[] (upper triangle, row by row) into a 4x4 table; -1 marks the unused lower triangle
+    for (i = k = 0; i < 4; ++i)
+        for (j = i; j < 4; ++j) {
+            tmp[j<<2|i] = -1;
+            tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r);
+        }
+    // find the three smallest values and the genotypes of the best two
+    for (i = 0; i < 16; ++i) {
+        if (tmp[i] < 0) continue;
+        if (tmp[i] < min) {
+            min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i;
+        } else if (tmp[i] < min2) {
+            min3 = min2; min2 = tmp[i]; min_g2 = i;
+        } else if (tmp[i] < min3) min3 = tmp[i];
+    }
+    // the fallback constants are unsigned: 0xf << 28 does not fit in a signed int
+    x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xfU << 28;
+    x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xfU << 24;
+    x |= (uint32_t)g->max_mapQ << 16;
+    x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8;
+    x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? min3 - min2 : 255) : 0xff;
+    return x;
+}
+
+/* Call the consensus at one site. With an empty pileup both genotype
+ * fields are set to 0xf and everything else is zero. Otherwise the
+ * likelihoods from bam_maqcns_glfgen() (reference base 0xf) are packed by
+ * glf2cns() using bm->q_r rounded to the nearest integer. */
+uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm)
+{
+    glf1_t *lk;
+    uint32_t cns;
+    if (n == 0) return 0xfU<<28 | 0xfU<<24;
+    lk = bam_maqcns_glfgen(n, pl, 0xf, bm);
+    cns = glf2cns(lk, (int)(bm->q_r + 0.5));
+    free(lk);
+    return cns;
+}
+
+/************** *****************/
+
+/* Allocate an indel-calling option set filled with the default values. */
+bam_maqindel_opt_t *bam_maqindel_opt_init()
+{
+    bam_maqindel_opt_t *opt = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
+    opt->r_indel = 0.00015;
+    opt->q_indel = 40;
+    /* the remaining three are the hidden parameters */
+    opt->ambi_thres = 10;
+    opt->indel_err = 4;
+    opt->mm_penalty = 3;
+    return opt;
+}
+
+/* Free an indel call together with its two sequence strings.
+ * A null pointer is accepted and ignored. */
+void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
+{
+    int i;
+    if (!mir) return;
+    for (i = 0; i < 2; ++i) free(mir->s[i]);
+    free(mir);
+}
+
+#define MINUS_CONST 0x10000000
+
+bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+ int _n_types, int *_types)
+{ /* Call an indel right after position pos from the n pileup entries in pl[].
+   * ref is the reference sequence of the current target and is indexed by
+   * absolute position. _types[0.._n_types-1] are extra candidate indel
+   * lengths to consider (positive: insertion, negative: deletion).
+   * Returns 0 when _n_types is 0 and no mapped read carries an indel here;
+   * otherwise a newly allocated result to be released with
+   * bam_maqindel_ret_destroy(). */
+ int i, j, n_types, *types, left, right;
+ bam_maqindel_ret_t *ret = 0;
+ // if there is no proposed indel, check if there is an indel from the alignment
+ if (_n_types == 0) {
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+ }
+ if (i == n) return 0; // no indel
+ }
+ { // calculate how many types of indels are available (set n_types and types)
+ int m;
+ uint32_t *aux;
+ aux = (uint32_t*)calloc(n + _n_types + 1, 4);
+ m = 0;
+ aux[m++] = MINUS_CONST; // zero indel is always a type
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
+ aux[m++] = MINUS_CONST + p->indel; // the offset keeps negative lengths ordered correctly as unsigned values
+ }
+ if (_n_types) // then also add this to aux[]
+ for (i = 0; i < _n_types; ++i)
+ if (_types[i]) aux[m++] = MINUS_CONST + _types[i];
+ ks_introsort(uint32_t, m, aux);
+ // squeeze out identical types
+ for (i = 1, n_types = 1; i < m; ++i)
+ if (aux[i] != aux[i-1]) ++n_types;
+ types = (int*)calloc(n_types, sizeof(int));
+ j = 0;
+ types[j++] = aux[0] - MINUS_CONST;
+ for (i = 1; i < m; ++i) {
+ if (aux[i] != aux[i-1])
+ types[j++] = aux[i] - MINUS_CONST;
+ }
+ free(aux);
+ }
+ { // calculate left and right boundary
+ bam_segreg_t seg;
+ left = 0x7fffffff; right = 0;
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (!(p->b->core.flag&BAM_FUNMAP)) {
+ bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg);
+ if (seg.tbeg < left) left = seg.tbeg;
+ if (seg.tend > right) right = seg.tend;
+ }
+ }
+ if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW; // clamp the window to MAX_WINDOW bases on either side of pos
+ if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW;
+ }
+ { // the core part
+ char *ref2, *inscns = 0;
+ int k, l, *score, *pscore, max_ins = types[n_types-1]; // types[] is sorted, so the last entry is the longest insertion (or 0)
+ ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
+ if (max_ins > 0) { // get the consensus of inserted sequences
+ int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
+ // count occurrences
+ for (i = 0; i < n_types; ++i) {
+ if (types[i] <= 0) continue; // not insertion
+ for (j = 0; j < n; ++j) {
+ const bam_pileup1_t *p = pl + j;
+ if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) {
+ for (k = 1; k <= p->indel; ++k) {
+ int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)];
+ if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c];
+ }
+ }
+ }
+ }
+ // construct the consensus of inserted sequence
+ inscns = (char*)calloc(n_types * max_ins, sizeof(char));
+ for (i = 0; i < n_types; ++i) {
+ for (j = 0; j < types[i]; ++j) {
+ int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4;
+ for (k = 0; k < 4; ++k) {
+ if (ia[k] > max) {
+ max = ia[k];
+ max_k = k;
+ }
+ }
+ inscns[i*max_ins + j] = max? 1<<max_k : 15; // 4-bit code of the most frequent base, or 15 when no base was counted
+ }
+ }
+ free(inscns_aux);
+ }
+ // calculate score
+ score = (int*)calloc(n_types * n, sizeof(int));
+ pscore = (int*)calloc(n_types * n, sizeof(int));
+ for (i = 0; i < n_types; ++i) {
+ // write ref2
+ for (k = 0, j = left; j <= pos; ++j)
+ ref2[k++] = bam_nt16_table[(int)ref[j]];
+ if (types[i] <= 0) j += -types[i];
+ else for (l = 0; l < types[i]; ++l)
+ ref2[k++] = inscns[i*max_ins + l];
+ for (; j < right && ref[j]; ++j)
+ ref2[k++] = bam_nt16_table[(int)ref[j]];
+ // calculate score for each read
+ for (j = 0; j < n; ++j) {
+ const bam_pileup1_t *p = pl + j;
+ uint32_t *cigar;
+ bam1_core_t *c = &p->b->core;
+ int s, ps;
+ bam_segreg_t seg;
+ if (c->flag&BAM_FUNMAP) continue;
+ cigar = bam1_cigar(p->b);
+ bam_segreg(pos, c, cigar, &seg);
+ for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
+ int cq = bam1_seqi(bam1_seq(p->b), l), ct;
+ // in the following line, "<" will happen if reads are too long
+ ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15;
+ if (cq < 15 && ct < 15) {
+ s += cq == ct? 1 : -mi->mm_penalty;
+ if (cq != ct) ps += bam1_qual(p->b)[l];
+ }
+ }
+ score[i*n + j] = s; pscore[i*n + j] = ps;
+ if (types[i] != 0) { // then try the other way to calculate the score
+ for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
+ int cq = bam1_seqi(bam1_seq(p->b), l), ct;
+ ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15;
+ if (cq < 15 && ct < 15) {
+ s += cq == ct? 1 : -mi->mm_penalty;
+ if (cq != ct) ps += bam1_qual(p->b)[l];
+ }
+ }
+ }
+ if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
+ if (pscore[i*n+j] > ps) pscore[i*n+j] = ps;
+ if (types[i] != 0) score[i*n+j] -= mi->indel_err;
+ //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j,
+ // score[i*n+j], pscore[i*n+j]);
+ }
+ }
+ { // get final result
+ int *sum, max1, max2, max1_i, max2_i;
+ // pick up the best two score
+ sum = (int*)calloc(n_types, sizeof(int));
+ for (i = 0; i < n_types; ++i)
+ for (j = 0; j < n; ++j)
+ sum[i] += -pscore[i*n+j];
+ max1 = max2 = -0x7fffffff; max1_i = max2_i = -1;
+ for (i = 0; i < n_types; ++i) {
+ if (sum[i] > max1) {
+ max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i;
+ } else if (sum[i] > max2) {
+ max2 = sum[i]; max2_i = i;
+ }
+ }
+ free(sum);
+ // write ret
+ ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));
+ ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; // NOTE(review): max2_i stays -1 when n_types is 1, so types[max2_i] would read out of bounds; confirm callers never reach that case
+ ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1);
+ ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1);
+ // write indel sequence
+ if (ret->indel1 > 0) {
+ ret->s[0][0] = '+';
+ for (k = 0; k < ret->indel1; ++k)
+ ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
+ } else if (ret->indel1 < 0) {
+ ret->s[0][0] = '-';
+ for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k)
+ ret->s[0][k+1] = ref[pos + k + 1];
+ } else ret->s[0][0] = '*';
+ if (ret->indel2 > 0) {
+ ret->s[1][0] = '+';
+ for (k = 0; k < ret->indel2; ++k)
+ ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
+ } else if (ret->indel2 < 0) {
+ ret->s[1][0] = '-';
+ for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)
+ ret->s[1][k+1] = ref[pos + k + 1];
+ } else ret->s[1][0] = '*';
+ // write count
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->indel == ret->indel1) ++ret->cnt1;
+ else if (p->indel == ret->indel2) ++ret->cnt2;
+ else ++ret->cnt_anti;
+ }
+ // write gl[]
+ ret->gl[0] = ret->gl[1] = 0;
+ for (j = 0; j < n; ++j) {
+ int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
+ //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2);
+ if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel; // each read contributes at most q_indel
+ else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel;
+ }
+ }
+ free(score); free(pscore); free(ref2); free(inscns);
+ }
+ { // call genotype
+ int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);
+ int min1, min2, min1_i;
+ q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; // both arms of the ternary are 0, so the qr_indel term is currently disabled
+ q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel;
+ q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel; // both arms are 1, so qr_indel is always added
+ min1 = min2 = 0x7fffffff; min1_i = -1;
+ for (i = 0; i < 3; ++i) {
+ if (q[i] < min1) {
+ min2 = min1; min1 = q[i]; min1_i = i;
+ } else if (q[i] < min2) min2 = q[i];
+ }
+ ret->gt = min1_i;
+ ret->q_cns = min2 - min1;
+ // set q_ref
+ if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3;
+ else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2];
+ if (ret->q_ref < 0) ret->q_ref = 0;
+ }
+ free(types);
+ return ret;
+}
--- /dev/null
+#ifndef BAM_MAQCNS_H
+#define BAM_MAQCNS_H
+
+#include "glf.h"
+
+struct __bmc_aux_t;
+
+typedef struct { /* parameters and lookup tables of the consensus caller */
+ float het_rate, theta; // het_rate: scales the harmonic sum to give the polymorphism rate behind q_r; theta: base of the fk[] decay
+ int n_hap, cap_mapQ; // n_hap: number of haplotypes used when building lhet; cap_mapQ: upper bound applied to mapping qualities in the RMS
+
+ float eta, q_r; // eta: floor of the fk[] weights; q_r: quality added to heterozygous genotypes by callers of glf2cns()
+ double *fk, *coef; // tables built by bam_maqcns_prepare(): fk has 256 entries, coef 256*256*64
+ double *lhet; // 256*256 table built by bam_maqcns_prepare(), indexed as n1<<8|n2
+ struct __bmc_aux_t *aux; // scratch buffer owned by this structure
+} bam_maqcns_t;
+
+typedef struct { /* options for bam_maqindel() */
+ int q_indel; // largest contribution a single read may add to gl[]
+ float r_indel; // rate converted to a phred-scaled penalty via -4.343*log(r_indel)
+ // hidden parameters, unchangeable from command line
+ int mm_penalty, indel_err, ambi_thres; // mm_penalty: score deducted per mismatch; indel_err: score deducted for every non-zero indel type; ambi_thres: not read by bam_maqindel() as visible here
+} bam_maqindel_opt_t;
+
+typedef struct { /* result of bam_maqindel(); release with bam_maqindel_ret_destroy() */
+ int indel1, indel2; // lengths of the best and second-best indel types: >0 insertion, <0 deletion, 0 no indel
+ int cnt1, cnt2, cnt_ambi, cnt_anti; // reads whose indel equals indel1 / indel2 / (cnt_ambi: not set by bam_maqindel() as visible here) / neither
+ char *s[2]; // strings for indel1 and indel2: '+' or '-' followed by the bases, or "*" for no indel
+ //
+ int gt, gl[2]; // gt: index (0..2) of the smallest of the three genotype scores; gl: scores accumulated for the two types
+ int q_cns, q_ref; // q_cns: gap between the best and second-best genotype score; q_ref: never negative
+} bam_maqindel_ret_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bam_maqcns_t *bam_maqcns_init(); // allocate a model with default parameters; tables are not built yet
+ void bam_maqcns_prepare(bam_maqcns_t *bm); // (re)build the lookup tables from the current parameters
+ void bam_maqcns_destroy(bam_maqcns_t *bm); // free the model; a null pointer is accepted
+ glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); // genotype likelihoods for one site; the caller frees the result
+ uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); // packed consensus call for one site, same layout as glf2cns()
+ // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2
+ uint32_t glf2cns(const glf1_t *g, int q_r);
+
+ bam_maqindel_opt_t *bam_maqindel_opt_init(); // allocate indel-calling options with default values
+ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+ int _n_types, int *_types); // returns 0 when no indel is proposed or observed at pos
+ void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); // free a result of bam_maqindel(); a null pointer is accepted
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+
+// currently, this function ONLY works if each read has one hit
+/* Mark `b` as a record without a mate: clear the mate coordinates and the
+ * insert size and, for paired reads, flag the mate as unmapped. */
+static void mating_set_singleton(bam1_t *b)
+{
+    b->core.mtid = -1; b->core.mpos = -1; b->core.isize = 0;
+    if (b->core.flag & BAM_FPAIRED) {
+        b->core.flag |= BAM_FMUNMAP;
+        b->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
+    }
+}
+
+/*
+ * Copy a name-sorted BAM stream from `in` to `out`, filling in the mate
+ * fields (mtid, mpos, isize and the mate-related flag bits) of every two
+ * consecutive records that share a query name. A record whose neighbour
+ * has a different name is written as a singleton. This also applies to a
+ * record left over at the end of the input, which previously was written
+ * without any fix-up.
+ */
+void bam_mating_core(bamFile in, bamFile out)
+{
+    bam_header_t *header;
+    bam1_t *b[2];
+    int curr, has_prev;
+
+    header = bam_header_read(in);
+    bam_header_write(out, header);
+
+    b[0] = bam_init1();
+    b[1] = bam_init1();
+    curr = 0; has_prev = 0;
+    while (bam_read1(in, b[curr]) >= 0) {
+        bam1_t *cur = b[curr], *pre = b[1-curr];
+        if (has_prev) {
+            if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
+                cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
+                pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
+                if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
+                    && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)))
+                {
+                    // insert size is measured between the 5' ends of the two reads
+                    uint32_t cur5, pre5;
+                    cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos;
+                    pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos;
+                    cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
+                } else cur->core.isize = pre->core.isize = 0;
+                if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
+                else cur->core.flag &= ~BAM_FMREVERSE;
+                if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
+                else pre->core.flag &= ~BAM_FMREVERSE;
+                if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
+                if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
+                bam_write1(out, pre);
+                bam_write1(out, cur);
+                has_prev = 0;
+            } else { // unpaired or singleton
+                mating_set_singleton(pre);
+                bam_write1(out, pre);
+                // has_prev stays set: cur becomes the pending record of the next round
+            }
+        } else has_prev = 1;
+        curr = 1 - curr;
+    }
+    if (has_prev) { // the last record never met a mate: treat it like any other singleton
+        mating_set_singleton(b[1-curr]);
+        bam_write1(out, b[1-curr]);
+    }
+    bam_header_destroy(header);
+    bam_destroy1(b[0]);
+    bam_destroy1(b[1]);
+}
+
+/*
+ * Entry point of `samtools fixmate`. argv[1] is the name-sorted input BAM
+ * and argv[2] the output BAM; "-" selects stdin/stdout respectively.
+ * Returns 0 on success and 1 on a usage error or when either file cannot
+ * be opened.
+ */
+int bam_mating(int argc, char *argv[])
+{
+    bamFile in, out;
+    if (argc < 3) {
+        fprintf(stderr, "samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n");
+        return 1;
+    }
+    in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+    if (in == 0) { // do not hand a null handle to bam_mating_core()
+        fprintf(stderr, "[bam_mating] fail to open '%s' for reading\n", argv[1]);
+        return 1;
+    }
+    out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+    if (out == 0) {
+        fprintf(stderr, "[bam_mating] fail to open '%s' for writing\n", argv[2]);
+        bam_close(in);
+        return 1;
+    }
+    bam_mating_core(in, out);
+    bam_close(in); bam_close(out);
+    return 0;
+}
--- /dev/null
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+#include "faidx.h"
+#include "bam.h"
+#include "kstring.h"
+
+void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+{ /* Compute the MD string of alignment b against ref, the NUL-terminated
+   * sequence of b's target indexed by 0-based position. Unmapped reads are
+   * left untouched. Without an existing MD tag the new one is appended;
+   * with one, the two are compared case-insensitively and a warning is
+   * printed when they differ. When is_equal is non-zero, read bases that
+   * match the reference are additionally overwritten with the code 0 in
+   * the packed sequence; when it is zero and an MD tag already exists,
+   * nothing is done. */
+ uint8_t *seq = bam1_seq(b);
+ uint32_t *cigar = bam1_cigar(b);
+ bam1_core_t *c = &b->core;
+ int i, x, y, u = 0; // x: position on the reference; y: position on the read; u: length of the current run of matches
+ kstring_t *str;
+ uint8_t *old_md;
+
+ old_md = bam_aux_get(b, "MD");
+ if (c->flag & BAM_FUNMAP) return;
+ if (old_md && !is_equal) return; // no need to add MD
+ str = (kstring_t*)calloc(1, sizeof(kstring_t));
+ for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+ int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ if (op == BAM_CMATCH) {
+ for (j = 0; j < l; ++j) {
+ int z = y + j;
+ int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+ if (ref[x+j] == 0) break; // out of boundary
+ if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) {
+ if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f; // clear the 4-bit slot of base z
+ ++u;
+ } else {
+ ksprintf(str, "%d", u);
+ kputc(ref[x+j], str);
+ u = 0;
+ }
+ }
+ if (j < l) break; // the reference ended inside this operation
+ x += l; y += l;
+ } else if (op == BAM_CDEL) {
+ ksprintf(str, "%d", u);
+ kputc('^', str);
+ for (j = 0; j < l; ++j) {
+ if (ref[x+j] == 0) break;
+ kputc(ref[x+j], str);
+ }
+ u = 0;
+ if (j < l) break;
+ x += l;
+ } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+ y += l;
+ } else if (op == BAM_CREF_SKIP) {
+ x += l;
+ }
+ }
+ ksprintf(str, "%d", u); // trailing match count
+ if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ else {
+ int is_diff = 0;
+ if (strlen((char*)old_md+1) == str->l) { // old_md+1 skips the first byte of the tag data
+ for (i = 0; i < str->l; ++i)
+ if (toupper(old_md[i+1]) != toupper(str->s[i]))
+ break;
+ if (i < str->l) is_diff = 1;
+ } else is_diff = 1;
+ if (is_diff)
+ fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' != '%s'\n", bam1_qname(b), old_md+1, str->s);
+ }
+ free(str->s); free(str);
+}
+
+/*
+ * Entry point of `fillmd`: read alignments from <aln.bam> ("-" for stdin),
+ * add or verify their MD tag against <ref.fasta> and write BAM to stdout.
+ * With -e, bases identical to the reference are replaced as done by
+ * bam_fillmd1(). Returns 0 on success and 1 on a usage error or when the
+ * reference index cannot be loaded.
+ */
+int bam_fillmd(int argc, char *argv[])
+{
+    int c, is_equal = 0, tid = -2, ret, len;
+    bamFile fp, fpout = 0;
+    bam_header_t *header;
+    faidx_t *fai;
+    char *ref = 0;
+    bam1_t *b;
+
+    while ((c = getopt(argc, argv, "e")) >= 0) {
+        switch (c) {
+        case 'e': is_equal = 1; break;
+        default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+        }
+    }
+    if (optind + 1 >= argc) {
+        fprintf(stderr, "Usage: bam fillmd [-e] <aln.bam> <ref.fasta>\n");
+        return 1;
+    }
+    // load the reference index first so that a bad FASTA is reported before any output is written
+    fai = fai_load(argv[optind+1]);
+    if (fai == 0) {
+        fprintf(stderr, "[bam_fillmd] fail to load the index of '%s'\n", argv[optind+1]);
+        return 1;
+    }
+    fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+    assert(fp);
+    header = bam_header_read(fp);
+    fpout = bam_dopen(fileno(stdout), "w");
+    bam_header_write(fpout, header);
+
+    b = bam_init1();
+    while ((ret = bam_read1(fp, b)) >= 0) {
+        if (b->core.tid >= 0) {
+            if (tid != b->core.tid) { // a new target: swap in its reference sequence
+                free(ref);
+                ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
+                tid = b->core.tid;
+                if (ref == 0)
+                    fprintf(stderr, "[bam_fillmd] fail to fetch sequence '%s'; its reads are written unchanged\n", header->target_name[tid]);
+            }
+            if (ref) bam_fillmd1(b, ref, is_equal); // bam_fillmd1() dereferences ref unconditionally
+        }
+        bam_write1(fpout, b);
+    }
+    bam_destroy1(b);
+
+    free(ref);
+    fai_destroy(fai);
+    bam_header_destroy(header);
+    bam_close(fp); bam_close(fpout);
+    return 0;
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include "sam.h"
+
+typedef struct __linkbuf_t { /* one buffered alignment in the pileup's linked list */
+ bam1_t b; // private copy of the alignment, filled by bam_copy1() in bam_plbuf_push()
+ uint32_t beg, end; // b's start position and the value of bam_calend() for it
+ struct __linkbuf_t *next; // next node; null on the last node of the list
+} lbnode_t;
+
+/* --- BEGIN: Memory pool */
+
+typedef struct { /* pool that recycles lbnode_t objects */
+ int cnt, n, max; // cnt: nodes currently handed out; n: free nodes stored in buf; max: capacity of buf
+ lbnode_t **buf; // stack of free nodes
+} mempool_t;
+
+/* Create an empty, zero-initialized pool. */
+static mempool_t *mp_init()
+{
+    return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+/* Free every node parked in the pool (with the data buffer of its
+ * alignment), then the free stack and the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+    int i = 0;
+    while (i < mp->n) {
+        lbnode_t *node = mp->buf[i];
+        free(node->b.data);
+        free(node);
+        ++i;
+    }
+    free(mp->buf);
+    free(mp);
+}
+/* Hand out a node: reuse the most recently freed one when the free stack
+ * is not empty, otherwise allocate a zeroed node. */
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+    mp->cnt += 1;
+    if (mp->n != 0) {
+        mp->n -= 1;
+        return mp->buf[mp->n];
+    }
+    return (lbnode_t*)calloc(1, sizeof(lbnode_t));
+}
+/* Give node `p` back to the pool. Its next link is cleared here. The free
+ * stack starts at 256 slots and doubles whenever it is full. */
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+    p->next = 0;
+    mp->cnt -= 1;
+    if (mp->n == mp->max) {
+        if (mp->max == 0) mp->max = 256;
+        else mp->max <<= 1;
+        mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
+    }
+    mp->buf[mp->n] = p;
+    mp->n += 1;
+}
+
+/* --- END: Memory pool */
+
+/* --- BEGIN: Auxiliary functions */
+
+static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
+{ /* Walk the CIGAR of p->b and fill in p's per-position fields (qpos,
+   * indel, is_del, is_head, is_tail) for reference position pos.
+   * Returns 0 for an unmapped read, or when pos falls inside a
+   * reference-skip operation, meaning the read should not enter the
+   * pileup; returns 1 otherwise. */
+ unsigned k;
+ bam1_t *b = p->b;
+ bam1_core_t *c = &b->core;
+ uint32_t x = c->pos, y = 0; // x: reference coordinate, y: query coordinate at the start of the current operation
+ int ret = 1, is_restart = 1; // is_restart: the previous operation was a skip or a clip (or there was none)
+
+ if (c->flag&BAM_FUNMAP) return 0; // unmapped read
+ assert(x <= pos); // otherwise a bug
+ p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
+ int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
+ if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip
+ if (x + l > pos) { // overlap with pos
+ p->indel = p->is_del = 0;
+ p->qpos = y + (pos - x);
+ if (x == pos && is_restart) p->is_head = 1;
+ if (x + l - 1 == pos) { // come to the end of a match
+ if (k < c->n_cigar - 1) { // there are additional operation(s)
+ uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
+ int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
+ if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
+ else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
+ if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
+ p->is_tail = 1; // tail
+ } else p->is_tail = 1; // this is the last operation; set tail
+ }
+ }
+ x += l; y += l;
+ } else if (op == BAM_CDEL) { // then set ->is_del
+ if (x + l > pos) {
+ p->indel = 0; p->is_del = 1;
+ p->qpos = y + (pos - x);
+ }
+ x += l;
+ } else if (op == BAM_CREF_SKIP) x += l;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
+ if (x > pos) { // the operation just processed covers pos: stop here
+ if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
+ break;
+ }
+ }
+ assert(x > pos); // otherwise a bug
+ return ret;
+}
+
+/* --- END: Auxiliary functions */
+
+struct __bam_plbuf_t { /* state of the streaming pileup buffer */
+ mempool_t *mp; // pool that recycles list nodes
+ lbnode_t *head, *tail, *dummy; // head/tail: list of buffered reads, tail being the empty slot the next push fills; dummy: placeholder linked in front of head while bam_plbuf_push() scans
+ bam_pileup_f func; // user callback, called once per position that has at least one read
+ void *func_data; // opaque argument passed to func
+ int32_t tid, pos, max_tid, max_pos; // tid/pos: position currently being piled up; max_tid/max_pos: start of the most recently pushed read
+ int max_pu, is_eof; // max_pu: capacity of pu; is_eof: set once a null record has been pushed
+ bam_pileup1_t *pu; // pileup array handed to func
+ int flag_mask; // reads with any of these flag bits set are ignored by bam_plbuf_push()
+};
+
+/* Forget all buffered reads and positions: every node in front of the tail
+ * goes back to the pool and the coordinates return to their initial
+ * values. */
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{
+    lbnode_t *node, *succ;
+    buf->max_tid = -1;
+    buf->max_pos = -1;
+    buf->tid = 0;
+    buf->pos = 0;
+    buf->is_eof = 0;
+    node = buf->head;
+    while (node->next) {
+        succ = node->next;
+        mp_free(buf->mp, node);
+        node = succ;
+    }
+    buf->head = buf->tail;
+}
+
+/* Choose which reads the pileup ignores. A negative `mask` selects
+ * BAM_DEF_MASK; any other value is combined with BAM_FUNMAP, so unmapped
+ * reads are always excluded. */
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+    buf->flag_mask = (mask < 0)? BAM_DEF_MASK : (BAM_FUNMAP | mask);
+}
+
+/* Create a pileup buffer that reports each covered position to `func`
+ * with `data` as its last argument. The read list starts as one empty
+ * node that is both head and tail, the placeholder node is allocated
+ * separately, and the flag mask defaults to BAM_DEF_MASK. */
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+    bam_plbuf_t *pb = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t));
+    pb->func = func;
+    pb->func_data = data;
+    pb->mp = mp_init();
+    pb->tail = mp_alloc(pb->mp);
+    pb->head = pb->tail;
+    pb->dummy = mp_alloc(pb->mp);
+    pb->max_tid = -1;
+    pb->max_pos = -1;
+    pb->flag_mask = BAM_DEF_MASK;
+    return pb;
+}
+
+/* Release the buffer. The placeholder and head nodes are returned to the
+ * pool first. If any node is still outstanding afterwards, a warning is
+ * printed and destruction continues. */
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+    mempool_t *pool = buf->mp;
+    mp_free(pool, buf->dummy);
+    mp_free(pool, buf->head);
+    if (pool->cnt != 0) {
+        fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", pool->cnt);
+    }
+    mp_destroy(pool);
+    free(buf->pu);
+    free(buf);
+}
+
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{ /* Feed one coordinate-sorted alignment to the pileup, or a null pointer
+   * to signal the end of the input. Every position that can no longer
+   * receive reads is then piled up and reported through buf->func.
+   * Returns 0 normally and 1 when the buffered reads turn out to be
+   * unsorted; aborts the process when b starts before the previously
+   * pushed read. */
+ if (b) { // fill buffer
+ if (b->core.tid < 0) return 0; // a read without a target is ignored
+ if (b->core.flag & buf->flag_mask) return 0; // filtered out by the flag mask
+ bam_copy1(&buf->tail->b, b);
+ buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
+ if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) {
+ fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n");
+ abort();
+ }
+ buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
+ if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { // keep the read only if it can still cover the current or a later position
+ buf->tail->next = mp_alloc(buf->mp);
+ buf->tail = buf->tail->next;
+ }
+ } else buf->is_eof = 1;
+ while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) {
+ int n_pu = 0;
+ lbnode_t *p, *q;
+ buf->dummy->next = buf->head; // the placeholder lets the head be unlinked like any other node
+ for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) {
+ if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
+ q->next = p->next; mp_free(buf->mp, p); p = q;
+ } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
+ if (n_pu == buf->max_pu) { // then double the capacity
+ buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
+ buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
+ }
+ buf->pu[n_pu].b = &p->b;
+ if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
+ }
+ }
+ buf->head = buf->dummy->next; // dummy->next may be changed
+ if (n_pu) { // then call user defined function
+ buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data);
+ }
+ // update tid and pos
+ if (buf->head->next) {
+ if (buf->tid > buf->head->b.core.tid) {
+ fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
+ return 1;
+ }
+ }
+ if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
+ buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
+ } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
+ buf->pos = buf->head->beg; // jump to the next position
+ } else ++buf->pos; // scan contiguously
+ if (buf->is_eof && buf->head->next == 0) break; // end of input and nothing left in the list
+ }
+ return 0;
+}
--- /dev/null
+#include <math.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "sam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+#include "khash.h"
+#include "glf.h"
+#include "kstring.h"
+
+typedef int *indel_list_t; // as filled by load_pos(): element 0 is the number of entries, the entries follow
+KHASH_MAP_INIT_INT64(64, indel_list_t) // instantiates the khash type named "64": 64-bit key -> indel_list_t
+
+#define BAM_PLF_SIMPLE 0x01 // set by -s: append a mapping-quality column to each pileup line
+#define BAM_PLF_CNS 0x02 // set by -c: call and print the consensus
+#define BAM_PLF_INDEL_ONLY 0x04 // set by -i: skip columns where no read carries an indel
+#define BAM_PLF_GLF 0x08 // set by -g: pileup_func() hands the column to glt3_func()
+#define BAM_PLF_VAR_ONLY 0x10 // set by -v: together with -c, print only variant sites
+#define BAM_PLF_2ND 0x20 // set by -2: print the columns taken from the E2 and U2 tags
+
+typedef struct { // state shared by bam_pileup() and its callbacks pileup_func()/glt3_func()
+ bam_header_t *h; // header of the input; borrowed from the samfile_t and destroyed by samclose()
+ bam_maqcns_t *c; // consensus-caller settings (theta, n_hap, het_rate, cap_mapQ are set from options)
+ bam_maqindel_opt_t *ido; // indel-caller options (q_indel, r_indel are set from options)
+ faidx_t *fai; // FASTA index loaded from -f, or 0 when no reference was given
+ khash_t(64) *hash; // sites loaded from -l, or 0 to report every column
+ uint32_t format; // OR of BAM_PLF_* bits
+ int tid, len, last_pos; // reference currently held in ref, its length, and the last position written by glt3_func()
+ int mask; // flag mask passed to sampileup()
+ char *ref; // sequence of reference `tid` as returned by fai_fetch(), or 0
+ glfFile fp_glf; // for glf output only
+} pu_data_t;
+
+char **__bam_get_lines(const char *fn, int *_n); // defined elsewhere; load_pos() treats the result as *_n heap strings plus the array, and frees both
+void bam_init_header_hash(bam_header_t *header); // defined elsewhere; called by load_pos() before bam_get_tid() (presumably builds the name lookup - confirm)
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); // defined elsewhere; load_pos() treats a negative result as "unknown name"
+
+/* Parse one line of a site list and record it in `hash`.
+ *
+ * The line is split by ksplit_core(); field 0 is a reference name, field 1 a
+ * 1-based position, and any following fields of the form +N or -N are
+ * proposed indels. Lines with fewer than two fields, an unknown reference
+ * name, or a site that is already present are skipped (the last two with a
+ * message on stderr). `max_fields`/`fields` are the reusable split buffers.
+ */
+static void load_pos_line(khash_t(64) *hash, bam_header_t *h, char *str, int *max_fields, int **fields)
+{
+    int chr, n_fields, ret, j;
+    int *f;
+    khint_t k;
+    uint64_t x;
+
+    n_fields = ksplit_core(str, 0, max_fields, fields);
+    if (n_fields < 2) return;
+    f = *fields; // read after the split, which may have reallocated the array
+    chr = bam_get_tid(h, str + f[0]);
+    if (chr < 0) {
+        fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + f[0]);
+        return;
+    }
+    // Same layout as the lookup key in pileup_func(): tid in the high word and
+    // the 0-based position in the low word. The position is narrowed to 32
+    // bits first so that a negative value cannot sign-extend over the tid.
+    x = (uint64_t)chr << 32 | (uint32_t)(atoi(str + f[1]) - 1);
+    k = kh_put(64, hash, x, &ret);
+    if (ret == 0) {
+        fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+f[0], str+f[1]);
+        return;
+    }
+    kh_val(hash, k) = 0;
+    if (n_fields > 2) {
+        // count the leading run of fields that look like +N / -N
+        for (j = 2; j < n_fields; ++j) {
+            char *s = str + f[j];
+            if ((*s != '+' && *s != '-') || !isdigit(s[1])) break;
+        }
+        if (j > 2) { // update kh_val()
+            int *q, y, z;
+            q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));
+            q[0] = j - 2; z = j; y = 1; // q[0] holds the number of indels that follow
+            for (j = 2; j < z; ++j)
+                q[y++] = atoi(str + f[j]);
+        }
+    }
+}
+
+/* Load the list of sites in file `fn` into a hash keyed by
+ * (tid << 32 | 0-based position). The value of each entry is 0 or a
+ * calloc'ed int array whose first element is the number of proposed indels
+ * that follow. Reference names are resolved against header `h`.
+ * The caller owns the returned hash and the arrays stored in it.
+ */
+static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
+{
+    char **list;
+    int i, n, *fields = 0, max_fields = 0;
+    khash_t(64) *hash;
+
+    bam_init_header_hash(h);
+    list = __bam_get_lines(fn, &n);
+    hash = kh_init(64);
+    for (i = 0; i < n; ++i) {
+        load_pos_line(hash, h, list[i], &max_fields, &fields);
+        free(list[i]); // released for every line, including the skipped ones
+    }
+    free(list); free(fields);
+    return hash;
+}
+
+// GLF counterpart of pileup_func() below: for each column it writes one
+// substitution record to d->fp_glf and, when bam_maqindel() returns a call,
+// an indel record right after it. Exits if no reference (-f) was loaded.
+static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{
+    pu_data_t *d = (pu_data_t*)data;
+    bam_maqindel_ret_t *indel;
+    int ref_base, n_proposed = 0, *proposed = 0;
+    glf1_t *site;
+    glf3_t *rec;
+
+    if (d->fai == 0) {
+        fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n");
+        exit(1);
+    }
+    if (d->hash) { // restricted to the listed sites
+        int *list;
+        khint_t it = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+        if (it == kh_end(d->hash)) return 0;
+        list = kh_val(d->hash, it);
+        if (list) { // first element is the count, the indels follow
+            n_proposed = list[0];
+            proposed = list + 1;
+        }
+    }
+    rec = glf3_init1();
+    if ((int)tid != d->tid) { // new reference; d->fai is known to be set here
+        if (d->ref) { // close the previous reference with an end mark
+            rec->rtype = GLF3_RTYPE_END;
+            glf3_write1(d->fp_glf, rec);
+        }
+        glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference
+        free(d->ref);
+        d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+        d->tid = tid;
+        d->last_pos = 0;
+    }
+    if (d->ref && (int)pos < d->len) ref_base = d->ref[pos];
+    else ref_base = 'N';
+    // substitution record: offsets are relative to the previous record
+    site = bam_maqcns_glfgen(n, pu, bam_nt16_table[ref_base], d->c);
+    memcpy(rec, site, sizeof(glf1_t));
+    rec->rtype = GLF3_RTYPE_SUB;
+    rec->offset = pos - d->last_pos;
+    d->last_pos = pos;
+    glf3_write1(d->fp_glf, rec);
+    indel = bam_maqindel(n, pos, d->ido, pu, d->ref, n_proposed, proposed);
+    if (indel) { // then write indel line
+        int het = 3 * n, min = het;
+        int len1 = abs(indel->indel1), len2 = abs(indel->indel2);
+        if (indel->gl[0] < min) min = indel->gl[0];
+        if (indel->gl[1] < min) min = indel->gl[1];
+        rec->ref_base = 0;
+        rec->rtype = GLF3_RTYPE_INDEL;
+        rec->offset = 0;
+        // likelihoods are stored relative to the smallest one and capped at 255
+        memset(rec->lk, 0, 10);
+        rec->lk[0] = indel->gl[0] - min < 255? indel->gl[0] - min : 255;
+        rec->lk[1] = indel->gl[1] - min < 255? indel->gl[1] - min : 255;
+        rec->lk[2] = het - min < 255? het - min : 255;
+        rec->min_lk = min < 255? min : 255;
+        rec->indel_len[0] = indel->indel1;
+        rec->indel_len[1] = indel->indel2;
+        rec->max_len = (len1 > len2? len1 : len2) + 1;
+        rec->indel_seq[0] = strdup(indel->s[0]+1);
+        rec->indel_seq[1] = strdup(indel->s[1]+1);
+        glf3_write1(d->fp_glf, rec);
+        bam_maqindel_ret_destroy(indel);
+    }
+    free(site);
+    glf3_destroy1(rec);
+    return 0;
+}
+
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) // pileup callback: prints the text line(s) for column (tid, pos) covered by the n reads in pu; data is the pu_data_t; always returns 0
+{
+ pu_data_t *d = (pu_data_t*)data;
+ bam_maqindel_ret_t *r = 0;
+ int i, j, rb, rms_mapq = -1, *proposed_indels = 0; // rms_mapq stays -1 unless -c supplies it
+ uint64_t rms_aux;
+ uint32_t cns = 0;
+
+ // if GLF is required, suppress -c completely
+ if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data);
+ // if d->hash is initialized, only output the sites in the hash table
+ if (d->hash) {
+ khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); // same key layout as built in load_pos()
+ if (k == kh_end(d->hash)) return 0;
+ proposed_indels = kh_val(d->hash, k); // may be 0 when the site lists no indels
+ }
+ // update d->ref if necessary
+ if (d->fai && (int)tid != d->tid) {
+ free(d->ref);
+ d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+ d->tid = tid;
+ }
+ rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; // reference base, 'N' without a reference or past its end
+ // when the indel-only mode is asked for, return if no reads mapped with indels
+ if (d->format & BAM_PLF_INDEL_ONLY) {
+ for (i = 0; i < n; ++i)
+ if (pu[i].indel != 0) break;
+ if (i == n) return 0;
+ }
+ // call the consensus and indel
+ if (d->format & BAM_PLF_CNS) // call consensus
+ cns = bam_maqcns_call(n, pu, d->c);
+ if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref) { // call indels
+ if (proposed_indels) // the first element gives the size of the array
+ r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+ else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
+ }
+ // when only variant sites are asked for, test if the site is a variant
+ if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) {
+ if (!(bam_nt16_table[rb] != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP
+ if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel
+ if (r) bam_maqindel_ret_destroy(r);
+ return 0;
+ }
+ }
+ }
+ // print the first 3 columns
+ printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb); // position is printed 1-based
+ // print consensus information if required
+ if (d->format & BAM_PLF_CNS) {
+ int ref_q, rb4 = bam_nt16_table[rb];
+ ref_q = 0;
+ if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP
+ ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff);
+ if (ref_q > 255) ref_q = 255;
+ }
+ rms_mapq = cns>>16&0xff;
+ printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq); // consensus base, bits 8-15 of cns, ref_q, rms_mapq
+ }
+ // print pileup sequences
+ printf("%d\t", n);
+ rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pu + i;
+ int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; // mapping quality capped at cap_mapQ
+ rms_aux += tmp * tmp;
+ if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); // '^' plus the mapping quality as a printable character
+ if (!p->is_del) {
+ int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+ if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; // base agrees with the reference: ',' when bam1_strand() is set, '.' otherwise
+ else c = bam1_strand(p->b)? tolower(c) : toupper(c); // mismatch: lower case when bam1_strand() is set
+ putchar(c);
+ if (p->indel > 0) {
+ printf("+%d", p->indel);
+ for (j = 1; j <= p->indel; ++j) { // inserted bases come from the read
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+ putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+ }
+ } else if (p->indel < 0) {
+ printf("%d", p->indel);
+ for (j = 1; j <= -p->indel; ++j) { // deleted bases come from the reference
+ c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
+ putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+ }
+ }
+ } else putchar('*');
+ if (p->is_tail) putchar('$');
+ }
+ // finalize rms_mapq
+ rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499);
+ if (rms_mapq < 0) rms_mapq = rms_aux;
+ putchar('\t');
+ // print quality
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pu + i;
+ int c = bam1_qual(p->b)[p->qpos] + 33;
+ if (c > 126) c = 126;
+ putchar(c);
+ }
+ if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities
+ const unsigned char *q;
+ putchar('\t');
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pu + i;
+ q = bam_aux_get(p->b, "E2");
+ putchar(q? q[p->qpos + 1] : 'N'); // the +1 skips the first byte of what bam_aux_get() returns (presumably the type byte - confirm)
+ }
+ putchar('\t');
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pu + i;
+ q = bam_aux_get(p->b, "U2");
+ putchar(q? q[p->qpos + 1] : '!');
+ }
+ }
+ // print mapping quality if -s is flagged on the command line
+ if (d->format & BAM_PLF_SIMPLE) {
+ putchar('\t');
+ for (i = 0; i < n; ++i) {
+ int c = pu[i].b->core.qual + 33;
+ if (c > 126) c = 126;
+ putchar(c);
+ }
+ }
+ putchar('\n');
+ // print the indel line if r has been calculated. This only happens if:
+ // a) -c or -i are flagged, AND b) the reference sequence is available
+ if (r) {
+ printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1);
+ if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]); // gt 0 or 1: the same allele twice
+ else printf("%s/%s\t", r->s[0], r->s[1]);
+ printf("%d\t%d\t", r->q_cns, r->q_ref);
+ printf("%d\t%d\t", rms_mapq, n);
+ printf("%s\t%s\t", r->s[0], r->s[1]);
+ //printf("%d\t%d\t", r->gl[0], r->gl[1]);
+ printf("%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_anti);
+ bam_maqindel_ret_destroy(r);
+ }
+ return 0;
+}
+
+/* Entry point of "samtools pileup".
+ *
+ * Parses the options, opens the SAM/BAM input named by the first non-option
+ * argument and runs sampileup() with pileup_func() as the callback.
+ * Returns 0 on success and 1 on a usage error, an unknown option or an
+ * unreadable input. Memory owned by this function is released on every one
+ * of those paths.
+ */
+int bam_pileup(int argc, char *argv[])
+{
+    int c, is_SAM = 0, ret = 1;
+    char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
+    pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
+    d->tid = -1; d->mask = BAM_DEF_MASK;
+    d->c = bam_maqcns_init();
+    d->ido = bam_maqindel_opt_init();
+    while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2")) >= 0) {
+        switch (c) {
+        case 's': d->format |= BAM_PLF_SIMPLE; break;
+        case 't': fn_list = strdup(optarg); break;
+        case 'l': fn_pos = strdup(optarg); break;
+        case 'f': fn_fa = strdup(optarg); break;
+        case 'T': d->c->theta = atof(optarg); break;
+        case 'N': d->c->n_hap = atoi(optarg); break;
+        case 'r': d->c->het_rate = atof(optarg); break;
+        case 'M': d->c->cap_mapQ = atoi(optarg); break;
+        case 'c': d->format |= BAM_PLF_CNS; break;
+        case 'i': d->format |= BAM_PLF_INDEL_ONLY; break;
+        case 'v': d->format |= BAM_PLF_VAR_ONLY; break;
+        case 'm': d->mask = strtol(optarg, 0, 0); break;
+        case 'g': d->format |= BAM_PLF_GLF; break;
+        case '2': d->format |= BAM_PLF_2ND; break;
+        case 'I': d->ido->q_indel = atoi(optarg); break;
+        case 'G': d->ido->r_indel = atof(optarg); break;
+        case 'S': is_SAM = 1; break;
+        default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); goto cleanup;
+        }
+    }
+    if (fn_list) is_SAM = 1; // a reference list only makes sense for SAM input
+    if (optind == argc) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools pileup [options] <in.bam>|<in.sam>\n\n");
+        fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n");
+        fprintf(stderr, " -S the input is in SAM\n");
+        fprintf(stderr, " -2 output the 2nd best call and quality\n");
+        fprintf(stderr, " -i only show lines/consensus with indels\n");
+        fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask);
+        fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ);
+        fprintf(stderr, " -t FILE list of reference sequences (assume the input is in SAM)\n");
+        fprintf(stderr, " -l FILE list of sites at which pileup is output\n");
+        fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n");
+        fprintf(stderr, " -c output the maq consensus sequence\n");
+        fprintf(stderr, " -v print variants only (for -c)\n");
+        fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n");
+        fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
+        fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap);
+        fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate);
+        fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel);
+        fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel);
+        fprintf(stderr, "\n");
+        goto cleanup;
+    }
+    if (fn_fa) d->fai = fai_load(fn_fa);
+    if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling
+    if (d->format & BAM_PLF_GLF) { // for glf output
+        glf3_header_t *h;
+        h = glf3_header_init();
+        d->fp_glf = bgzf_fdopen(fileno(stdout), "w");
+        glf3_header_write(d->fp_glf, h);
+        glf3_header_destroy(h);
+    }
+    if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)))
+        fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n");
+    {
+        samfile_t *fp;
+        fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0);
+        if (fp == 0 || fp->header == 0) {
+            fprintf(stderr, "[bam_pileup] fail to read the header: non-exisiting file or wrong format.\n");
+            // fp and a GLF stream that may already be open are left alone here,
+            // exactly as before; only memory owned by this function is released
+            goto cleanup;
+        }
+        d->h = fp->header;
+        if (fn_pos) d->hash = load_pos(fn_pos, d->h);
+        sampileup(fp, d->mask, pileup_func, d);
+        samclose(fp); // d->h will be destroyed here
+    }
+    if (d->format & BAM_PLF_GLF) bgzf_close(d->fp_glf);
+    ret = 0;
+
+cleanup:
+    if (d->hash) { // free the hash table and the indel lists stored in it
+        khint_t k;
+        for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k)
+            if (kh_exist(d->hash, k)) free(kh_val(d->hash, k));
+        kh_destroy(64, d->hash);
+    }
+    free(fn_pos); free(fn_list); free(fn_fa);
+    if (d->fai) fai_destroy(d->fai);
+    bam_maqcns_destroy(d->c);
+    free(d->ido); free(d->ref); free(d);
+    return ret;
+}
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+#include "bam.h"
+
+typedef bam1_t *bam1_p; // value type of the "pos" hash below
+#include "khash.h"
+KHASH_SET_INIT_STR(name) // set of read names; bam_rmdup_core() uses it for reads whose mate must be dropped
+KHASH_MAP_INIT_INT64(pos, bam1_p) // map from (pos << 32 | isize) to the best read seen with that key
+
+#define BUFFER_SIZE 0x40000 // dump_best() clears the "pos" hash once it holds more entries than this
+
+typedef struct { // growable array of reads waiting to be written
+ int n, max; // used and allocated number of slots in a
+ bam1_t **a; // grown with realloc() by stack_insert()
+} tmp_stack_t;
+
+/* Append `b` to the stack. The array is grown when full: 0x10000 slots on
+ * first use, twice the previous size afterwards. */
+static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
+{
+    if (stack->n == stack->max) {
+        if (stack->max == 0) stack->max = 0x10000;
+        else stack->max <<= 1;
+        stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
+    }
+    stack->a[stack->n] = b;
+    stack->n += 1;
+}
+
+/* Write every read held on the stack to `out`, destroy it and empty the
+ * stack. The best-read hash is cleared as well once it has grown beyond
+ * BUFFER_SIZE entries. */
+static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
+{
+    int idx = 0;
+    while (idx != stack->n) {
+        bam_write1(out, stack->a[idx]);
+        bam_destroy1(stack->a[idx]);
+        ++idx;
+    }
+    stack->n = 0;
+    if (kh_size(best_hash) > BUFFER_SIZE) {
+        kh_clear(pos, best_hash);
+    }
+}
+
+/* Free every name stored in the set (the keys are heap strings owned by the
+ * set) and then empty it. */
+static void clear_del_set(khash_t(name) *del_set)
+{
+    khint_t it = kh_begin(del_set);
+    for (; it < kh_end(del_set); ++it) {
+        if (!kh_exist(del_set, it)) continue;
+        free((char*)kh_key(del_set, it));
+    }
+    kh_clear(name, del_set);
+}
+
+/* Copy the BAM stream `in` to `out`, dropping duplicate read pairs.
+ *
+ * The header is read from `in` and written to `out` first. Reads that are
+ * unpaired, unmapped, have an unmapped mate, or have a mate on another
+ * reference are copied unchanged. For the remaining pairs the read with
+ * isize > 0 is keyed by (pos << 32 | isize); among reads sharing a key only
+ * the one with the highest mapping quality is kept, and the names of the
+ * others are remembered in del_set so that their mates are dropped when they
+ * turn up later. Once a read with tid == -1 is seen, it and everything after
+ * it is copied verbatim. A summary line is printed to stderr at the end.
+ */
+void bam_rmdup_core(bamFile in, bamFile out)
+{
+    bam_header_t *header;
+    bam1_t *b;
+    int last_tid = -1, last_pos = -1;
+    uint64_t n_checked = 0, n_removed = 0;
+    tmp_stack_t stack;
+    khint_t k;
+    khash_t(pos) *best_hash;
+    khash_t(name) *del_set;
+
+    best_hash = kh_init(pos);
+    del_set = kh_init(name);
+    b = bam_init1();
+    memset(&stack, 0, sizeof(tmp_stack_t));
+    header = bam_header_read(in);
+    bam_header_write(out, header);
+
+    kh_resize(name, del_set, 4 * BUFFER_SIZE);
+    kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
+    while (bam_read1(in, b) >= 0) {
+        bam1_core_t *c = &b->core;
+        if (c->tid != last_tid || last_pos != c->pos) {
+            dump_best(&stack, best_hash, out); // write the result
+            if (c->tid != last_tid) {
+                kh_clear(pos, best_hash);
+                if (kh_size(del_set)) { // check
+                    fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (unsigned long long)kh_size(del_set));
+                    clear_del_set(del_set);
+                }
+                if ((int)c->tid == -1) { // append unmapped reads
+                    bam_write1(out, b);
+                    while (bam_read1(in, b) >= 0) bam_write1(out, b);
+                    break;
+                }
+                last_tid = c->tid;
+                fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
+            }
+        }
+        if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
+            bam_write1(out, b);
+        } else if (c->isize > 0) { // paired, head
+            uint64_t key = (uint64_t)c->pos<<32 | c->isize;
+            int ret;
+            ++n_checked;
+            k = kh_put(pos, best_hash, key, &ret);
+            if (ret == 0) { // found in best_hash
+                bam1_t *p = kh_val(best_hash, k);
+                char *loser; // name of the read that loses and whose mate must go too
+                ++n_removed;
+                if (p->core.qual < c->qual) { // the current alignment is better
+                    loser = strdup(bam1_qname(p)); // p will be removed
+                    bam_copy1(p, b); // replaced as b
+                } else loser = strdup(bam1_qname(b)); // b will be removed
+                kh_put(name, del_set, loser, &ret);
+                if (ret == 0) {
+                    // the name was already in the set, so the set did not take
+                    // ownership of this copy
+                    free(loser);
+                    fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+                }
+            } else { // not found in best_hash
+                kh_val(best_hash, k) = bam_dup1(b);
+                stack_insert(&stack, kh_val(best_hash, k));
+            }
+        } else { // paired, tail
+            k = kh_get(name, del_set, bam1_qname(b));
+            if (k != kh_end(del_set)) {
+                free((char*)kh_key(del_set, k));
+                kh_del(name, del_set, k);
+            } else bam_write1(out, b);
+        }
+        last_pos = c->pos;
+    }
+    dump_best(&stack, best_hash, out);
+
+    bam_header_destroy(header);
+    clear_del_set(del_set);
+    kh_destroy(name, del_set);
+    kh_destroy(pos, best_hash);
+    free(stack.a);
+    bam_destroy1(b);
+    // report 0 rather than dividing by zero when no pair was examined
+    fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
+            n_checked? (double)n_removed/n_checked : 0.0);
+}
+/* Entry point of "samtools rmdup <input.srt.bam> <output.bam>".
+ * "-" selects stdin for the input and stdout for the output.
+ * Returns 0 on success, 1 on a usage error or when a file cannot be opened. */
+int bam_rmdup(int argc, char *argv[])
+{
+    bamFile in, out;
+
+    if (argc < 3) {
+        fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
+        return 1;
+    }
+    if (strcmp(argv[1], "-") == 0) in = bam_dopen(fileno(stdin), "r");
+    else in = bam_open(argv[1], "r");
+    if (strcmp(argv[2], "-") == 0) out = bam_dopen(fileno(stdout), "w");
+    else out = bam_open(argv[2], "w");
+    if (in == 0 || out == 0) {
+        fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+        return 1;
+    }
+    bam_rmdup_core(in, out);
+    bam_close(in);
+    bam_close(out);
+    return 0;
+}
--- /dev/null
+#include <math.h>
+#include "sam.h"
+#include "khash.h"
+
+typedef struct { // list of buffer indices that share one hash key in rmdupse_buf()
+ int n, m; // used and allocated length of a
+ int *a; // indices into buffer_t.buf
+} listelem_t;
+
+KHASH_MAP_INIT_INT(32, listelem_t) // hash type "32": 32-bit key -> listelem_t
+
+#define BLOCK_SIZE 65536 // number of reads by which fill_buf() raises its read-ahead limit
+
+typedef struct { // one buffered read
+ bam1_t *b; // the read; set to 0 once it has been written or discarded
+ int rpos, score; // rpos: -1 unless set for a mapped reverse read in fill_buf(); score: -1 once the read is unmapped or resolved
+} elem_t;
+
+typedef struct { // the read buffer shared by fill_buf(), rmdupse_buf() and dump_buf()
+ int n, max, x; // n: used slots; max: allocated slots; x: boundary index set by fill_buf(), or -1
+ elem_t *buf;
+} buffer_t;
+
+/* Top up the buffer with reads from `in`.
+ *
+ * Slots emptied at the front by dump_buf() are squeezed out first. Reads are
+ * then appended, BLOCK_SIZE at a time, until the current read starts beyond
+ * the smallest end coordinate of any buffered reverse-strand read, or the
+ * input ends. buf->x is set to the index of the first read on a different
+ * reference than the buffer's first read, to buf->n when reading stopped
+ * before the end of the input without such a change, and stays -1 at the
+ * end of the input. Returns the number of reads now in the buffer.
+ */
+static int fill_buf(samfile_t *in, buffer_t *buf)
+{
+    int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
+    bam1_t *b = bam_init1();
+    bam1_core_t *c = &b->core;
+    // squeeze out the empty cells at the beginning
+    for (i = 0; i < buf->n; ++i)
+        if (buf->buf[i].b) break;
+    if (i < buf->n) { // squeeze
+        if (i > 0) {
+            memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
+            buf->n = buf->n - i;
+        }
+    } else buf->n = 0;
+    // calculate min_rpos
+    for (i = 0; i < buf->n; ++i) {
+        elem_t *e = buf->buf + i;
+        if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
+            min_rpos = buf->buf[i].rpos;
+    }
+    // fill the buffer
+    buf->x = -1;
+    last_tid = buf->n? buf->buf[0].b->core.tid : -1;
+    capacity = buf->n + BLOCK_SIZE;
+    while ((ret = samread(in, b)) >= 0) {
+        elem_t *e;
+        uint8_t *qual = bam1_qual(b);
+        int is_mapped;
+        if (last_tid < 0) last_tid = c->tid;
+        if (c->tid != last_tid) {
+            if (buf->x < 0) buf->x = buf->n;
+        }
+        if (buf->n >= buf->max) { // enlarge
+            buf->max = buf->max? buf->max<<1 : 8;
+            buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
+        }
+        e = &buf->buf[buf->n++];
+        e->b = bam_dup1(b);
+        e->rpos = -1; e->score = 0;
+        // score: sum of (base quality + 1), scaled by sqrt(read length + 1)
+        for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
+        e->score = (double)e->score / sqrt(c->l_qseq + 1);
+        is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
+        if (!is_mapped) e->score = -1;
+        if (is_mapped && (c->flag & BAM_FREVERSE)) {
+            // bam_calend() already yields the end coordinate on the reference,
+            // so the start position must not be added on top of it
+            e->rpos = bam_calend(&b->core, bam1_cigar(b));
+            if (min_rpos > e->rpos) min_rpos = e->rpos;
+        }
+        if (buf->n >= capacity) {
+            // keep reading while the earliest reverse read could still gain
+            // duplicates from reads that start at or before its end
+            if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
+            else break;
+        }
+    }
+    if (ret >= 0 && buf->x < 0) buf->x = buf->n;
+    bam_destroy1(b);
+    return buf->n;
+}
+
+/* Resolve duplicates among the buffered reads before index buf->x (all of
+ * them when buf->x < 0).
+ *
+ * Reads are grouped by end coordinate (reverse strand, e->rpos) or start
+ * coordinate (otherwise); only groups that can no longer gain members from
+ * reads still to come are processed. In each group the read with the
+ * highest score is kept and the others are destroyed; every processed read
+ * gets score -1 so that dump_buf() can write it.
+ */
+static void rmdupse_buf(buffer_t *buf)
+{
+    khash_t(32) *h;
+    uint32_t key;
+    khint_t k;
+    int mpos, i, upper;
+    listelem_t *p;
+    // position of the last buffered read when more input on this reference
+    // may follow, otherwise "no limit"
+    mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
+    upper = (buf->x < 0)? buf->n : buf->x;
+    // fill the hash table
+    h = kh_init(32);
+    for (i = 0; i < upper; ++i) {
+        elem_t *e = buf->buf + i;
+        int ret;
+        if (e->score < 0) continue;
+        // the lowest key bit separates reverse-strand keys from the others
+        if (e->rpos >= 0) {
+            if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
+            else continue;
+        } else {
+            if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
+            else continue;
+        }
+        k = kh_put(32, h, key, &ret);
+        p = &kh_val(h, k);
+        if (ret == 0) { // present in the hash table
+            if (p->n == p->m) {
+                p->m <<= 1;
+                p->a = (int*)realloc(p->a, p->m * sizeof(int));
+            }
+            p->a[p->n++] = i;
+        } else {
+            p->m = p->n = 1;
+            p->a = (int*)calloc(p->m, sizeof(int));
+            p->a[0] = i;
+        }
+    }
+    // rmdup
+    for (k = kh_begin(h); k < kh_end(h); ++k) {
+        if (kh_exist(h, k)) {
+            int max, maxi;
+            p = &kh_val(h, k);
+            // get the max; start below every possible score (all members have
+            // score >= 0) so that a group whose scores are all 0 still keeps
+            // its first read
+            for (i = 0, max = -1, maxi = -1; i < p->n; ++i) {
+                if (buf->buf[p->a[i]].score > max) {
+                    max = buf->buf[p->a[i]].score;
+                    maxi = i;
+                }
+            }
+            // mark the elements
+            for (i = 0; i < p->n; ++i) {
+                buf->buf[p->a[i]].score = -1;
+                if (i != maxi) {
+                    bam_destroy1(buf->buf[p->a[i]].b);
+                    buf->buf[p->a[i]].b = 0;
+                }
+            }
+            // free
+            free(p->a);
+        }
+    }
+    kh_destroy(32, h);
+}
+
+/* Write the leading run of resolved reads (score == -1) to `out` and release
+ * them. Slots whose read was already discarded are skipped. Stops at the
+ * first read that is still unresolved. */
+static void dump_buf(buffer_t *buf, samfile_t *out)
+{
+    int i = 0;
+    for (; i < buf->n && buf->buf[i].score == -1; ++i) {
+        elem_t *slot = &buf->buf[i];
+        if (slot->b == 0) continue;
+        samwrite(out, slot->b);
+        bam_destroy1(slot->b);
+        slot->b = 0;
+    }
+}
+
+/* Entry point of "samtools rmdupse <in.bam> <out.bam>".
+ * Alternates fill_buf(), rmdupse_buf() and dump_buf() until the input is
+ * exhausted. Returns 0 on success and 1 on a usage error or when either
+ * file cannot be opened. */
+int bam_rmdupse(int argc, char *argv[])
+{
+    samfile_t *in, *out;
+    buffer_t *buf;
+    if (argc < 3) {
+        fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
+        return 1;
+    }
+    in = samopen(argv[1], "rb", 0);
+    if (in == 0 || in->header == 0) { // same test as bam_pileup() applies to its input
+        fprintf(stderr, "[bam_rmdupse] fail to read the header of '%s'.\n", argv[1]);
+        return 1;
+    }
+    out = samopen(argv[2], "wb", in->header);
+    if (out == 0) {
+        fprintf(stderr, "[bam_rmdupse] fail to open '%s' for writing.\n", argv[2]);
+        samclose(in);
+        return 1;
+    }
+    buf = calloc(1, sizeof(buffer_t));
+    while (fill_buf(in, buf)) {
+        rmdupse_buf(buf);
+        dump_buf(buf, out);
+    }
+    samclose(in); samclose(out);
+    free(buf->buf); free(buf);
+    return 0;
+}
--- /dev/null
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "bam.h"
+#include "ksort.h"
+
+static int g_is_by_qname = 0; // non-zero: heap_lt()/bam1_lt() compare read names first; set by bam_merge_core() and bam_sort_core()
+
+/* Compare two strings so that runs of digits are ordered by numeric value
+ * ("a2" sorts before "a10"). Returns -1, 0 or 1.
+ * When both strings end together after numerically equal parts, the one that
+ * is shorter in bytes sorts first (so "r1" comes before "r01"). */
+static inline int strnum_cmp(const char *a, const char *b)
+{
+    char *pa, *pb;
+    pa = (char*)a; pb = (char*)b;
+    while (*pa && *pb) {
+        // isdigit() is only defined for unsigned char values and EOF
+        if (isdigit((unsigned char)*pa) && isdigit((unsigned char)*pb)) {
+            long ai, bi;
+            ai = strtol(pa, &pa, 10);
+            bi = strtol(pb, &pb, 10);
+            if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0;
+        } else {
+            if (*pa != *pb) break;
+            ++pa; ++pb;
+        }
+    }
+    if (*pa == *pb) // both ended: decide by how many bytes each side consumed
+        return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0;
+    return *pa<*pb? -1 : *pa>*pb? 1 : 0;
+}
+
+#define HEAP_EMPTY 0xffffffffffffffffull // pos value marking an input that has no more records
+
+typedef struct { // one input file's slot in the merge heap
+ int i; // index of the input file in fp[] / fn[]
+ uint64_t pos; // sort key built from tid, pos and strand, or HEAP_EMPTY
+ bam1_t *b; // the record most recently read from input i
+} heap1_t;
+
+// Ordering handed to KSORT_INIT(heap). Returns 1 when `a` sorts after `b`
+// (by read name first in by-qname mode, then by key). The comparison is
+// deliberately reversed - presumably so that ksort's heap keeps the smallest
+// key at heap[0], which is the record bam_merge_core() writes next.
+static inline int heap_lt(const heap1_t a, const heap1_t b)
+{
+    if (!g_is_by_qname) return (a.pos > b.pos);
+    {
+        int t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
+        if (t != 0) return t > 0;
+        return a.pos > b.pos;
+    }
+}
+
+KSORT_INIT(heap, heap1_t, heap_lt)
+
+/* Merge key of a record: reference id, then position, then strand.
+ * The position is shifted up by one so that an unmapped reverse-strand
+ * record (tid == -1, pos == -1) cannot produce HEAP_EMPTY. */
+static inline uint64_t merge_heap_key(const bam1_t *b)
+{
+    return ((uint64_t)b->core.tid<<32) | (uint32_t)((int32_t)b->core.pos+1)<<1 | bam1_strand(b);
+}
+
+/*!
+  @abstract    Merge multiple sorted BAM.
+  @param  by_qname    whether to sort by query name
+  @param  out  output BAM file name ("-" for stdout)
+  @param  n    number of files to be merged
+  @param  fn   names of files to be merged
+
+  @discussion The header of the first file is written to the output; the
+  other files must list the same target sequences in the same order. The
+  process exits if a file cannot be opened or the targets differ. Padding
+  information may NOT be correctly maintained. This function is NOT thread
+  safe.
+ */
+void bam_merge_core(int by_qname, const char *out, int n, char * const *fn)
+{
+    bamFile fpout, *fp;
+    heap1_t *heap;
+    bam_header_t *hout = 0;
+    int i, j;
+
+    g_is_by_qname = by_qname;
+    fp = (bamFile*)calloc(n, sizeof(bamFile));
+    heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+    for (i = 0; i != n; ++i) {
+        heap1_t *h;
+        bam_header_t *hin;
+        // opened outside assert(): an assert argument is not evaluated at all
+        // when NDEBUG is defined
+        fp[i] = bam_open(fn[i], "r");
+        if (fp[i] == 0) {
+            fprintf(stderr, "[bam_merge_core] fail to open file '%s'. Abort!\n", fn[i]);
+            exit(1);
+        }
+        hin = bam_header_read(fp[i]);
+        if (i == 0) hout = hin;
+        else { // validate multiple baf
+            if (hout->n_targets != hin->n_targets) {
+                fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]);
+                exit(1);
+            }
+            for (j = 0; j < hout->n_targets; ++j) {
+                if (strcmp(hout->target_name[j], hin->target_name[j])) {
+                    fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n",
+                            hout->target_name[j], hin->target_name[j], fn[i]);
+                    exit(1);
+                }
+                if (hout->target_len[j] != hin->target_len[j])
+                    fprintf(stderr, "[bam_merge_core] different target sequence length: %d != %d in file '%s'. Continue.\n",
+                            hout->target_len[j], hin->target_len[j], fn[i]);
+            }
+            bam_header_destroy(hin);
+        }
+        h = heap + i;
+        h->i = i;
+        h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
+        if (bam_read1(fp[i], h->b) >= 0)
+            h->pos = merge_heap_key(h->b);
+        else h->pos = HEAP_EMPTY;
+    }
+    fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
+    if (fpout == 0) {
+        fprintf(stderr, "[bam_merge_core] fail to create file '%s'. Abort!\n", out);
+        exit(1);
+    }
+    bam_header_write(fpout, hout);
+    bam_header_destroy(hout);
+
+    ks_heapmake(heap, n, heap);
+    while (heap->pos != HEAP_EMPTY) { // heap[0] is the next record to write
+        bam1_t *b = heap->b;
+        bam_write1_core(fpout, &b->core, b->data_len, b->data);
+        j = bam_read1(fp[heap->i], b);
+        if (j >= 0) heap->pos = merge_heap_key(b);
+        else {
+            if (j != -1)
+                fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+            // retire this input on a clean end and on a read error alike;
+            // otherwise its stale record would stay on top of the heap
+            heap->pos = HEAP_EMPTY;
+        }
+        ks_heapadjust(heap, 0, n, heap);
+    }
+
+    for (i = 0; i != n; ++i) {
+        bam_close(fp[i]);
+        free(heap[i].b->data);
+        free(heap[i].b);
+    }
+    bam_close(fpout);
+    free(fp); free(heap);
+}
+/* Entry point of "samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]".
+ * Needs the output name and at least two inputs; -n merges by read name.
+ * Returns 0 after merging, 1 on a usage error. */
+int bam_merge(int argc, char *argv[])
+{
+    int opt, by_qname = 0;
+
+    while ((opt = getopt(argc, argv, "n")) >= 0) {
+        if (opt == 'n') by_qname = 1;
+    }
+    if (argc - optind < 3) {
+        fprintf(stderr, "Usage: samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]\n");
+        return 1;
+    }
+    bam_merge_core(by_qname, argv[optind], argc - optind - 1, argv + optind + 1);
+    return 0;
+}
+
+
+typedef bam1_t *bam1_p;
+
+// Ordering handed to KSORT_INIT(sort): returns 1 when `a` sorts before `b`.
+// Records are ordered by (tid << 32 | pos); in by-qname mode the read name
+// is compared first and the coordinate key only breaks ties.
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{
+    uint64_t key_a = (uint64_t)a->core.tid<<32|a->core.pos;
+    uint64_t key_b = (uint64_t)b->core.tid<<32|b->core.pos;
+    if (g_is_by_qname) {
+        int t = strnum_cmp(bam1_qname(a), bam1_qname(b));
+        if (t != 0) return t < 0;
+    }
+    return key_a < key_b;
+}
+KSORT_INIT(sort, bam1_p, bam1_lt)
+
+/* Sort the first `k` records of `buf` and write them, preceded by header
+ * `h`, to "<prefix>.NNNN.bam" (n >= 0, NNNN = n zero-padded to four digits)
+ * or to "<prefix>.bam" (n < 0). Exits if the file cannot be created. */
+static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h)
+{
+    char *name;
+    int i;
+    bamFile fp;
+    ks_mergesort(sort, k, buf, 0);
+    name = (char*)calloc(strlen(prefix) + 20, 1);
+    if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n);
+    else sprintf(name, "%s.bam", prefix);
+    // opened outside assert(): an assert argument is not evaluated at all
+    // when NDEBUG is defined
+    fp = bam_open(name, "w");
+    if (fp == 0) {
+        fprintf(stderr, "[sort_blocks] fail to create file '%s'. Abort!\n", name);
+        exit(1);
+    }
+    free(name);
+    bam_header_write(fp, h);
+    for (i = 0; i < k; ++i)
+        bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data);
+    bam_close(fp);
+}
+
+/*!
+  @abstract Sort an unsorted BAM file based on the chromosome order
+  and the leftmost position of an alignment
+
+  @param  is_by_qname whether to sort by query name
+  @param  fn       name of the file to be sorted ("-" for stdin)
+  @param  prefix   prefix of the output and the temporary files; upon
+                   success, prefix.bam will be written.
+  @param  max_mem  approximate maximum memory (very inaccurate)
+
+  @discussion It may create multiple temporary subalignment files
+  and then merge them by calling bam_merge_core(). This function is
+  NOT thread safe.
+ */
+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
+{
+    int n, ret, k, i;
+    size_t mem, max_k;
+    bam_header_t *header;
+    bamFile fp;
+    bam1_t *b, **buf;
+
+    g_is_by_qname = is_by_qname;
+    n = k = 0; mem = 0;
+    fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+    assert(fp);
+    header = bam_header_read(fp);
+    // number of record slots; at least one so that buf[0] always exists
+    max_k = max_mem / BAM_CORE_SIZE;
+    if (max_k == 0) max_k = 1;
+    buf = (bam1_t**)calloc(max_k, sizeof(bam1_t*));
+    // write sub files
+    for (;;) {
+        if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
+        b = buf[k];
+        if ((ret = bam_read1(fp, b)) < 0) break;
+        mem += ret;
+        ++k;
+        // flush when the byte budget is used up, and also when every slot is
+        // taken, so that buf[k] can never be indexed past the array
+        if (mem >= max_mem || (size_t)k == max_k) {
+            sort_blocks(n++, k, buf, prefix, header);
+            mem = 0; k = 0;
+        }
+    }
+    if (ret != -1)
+        fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+    if (n == 0) sort_blocks(-1, k, buf, prefix, header); // everything fit in memory: write prefix.bam directly
+    else { // then merge
+        char **fns, *fnout;
+        fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1);
+        sort_blocks(n++, k, buf, prefix, header);
+        fnout = (char*)calloc(strlen(prefix) + 20, 1);
+        sprintf(fnout, "%s.bam", prefix);
+        fns = (char**)calloc(n, sizeof(char*));
+        for (i = 0; i < n; ++i) {
+            fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+            sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+        }
+        bam_merge_core(is_by_qname, fnout, n, fns);
+        free(fnout);
+        for (i = 0; i < n; ++i) { // remove the temporary files
+            unlink(fns[i]);
+            free(fns[i]);
+        }
+        free(fns);
+    }
+    for (k = 0; (size_t)k < max_k; ++k) {
+        if (buf[k]) {
+            free(buf[k]->data);
+            free(buf[k]);
+        }
+    }
+    free(buf);
+    bam_header_destroy(header);
+    bam_close(fp);
+}
+
+/* Entry point of "samtools sort [-n] [-m <maxMem>] <in.bam> <out.prefix>".
+ * -n sorts by read name, -m sets the memory budget (default 500000000).
+ * Returns 0 after sorting, 1 on a usage error. */
+int bam_sort(int argc, char *argv[])
+{
+    size_t max_mem = 500000000;
+    int opt, by_qname = 0;
+
+    while ((opt = getopt(argc, argv, "nm:")) >= 0) {
+        if (opt == 'n') by_qname = 1;
+        else if (opt == 'm') max_mem = atol(optarg);
+    }
+    if (argc - optind < 2) {
+        fprintf(stderr, "Usage: samtools sort [-n] [-m <maxMem>] <in.bam> <out.prefix>\n");
+        return 1;
+    }
+    bam_sort_core(by_qname, argv[optind], argv[optind+1], max_mem);
+    return 0;
+}
--- /dev/null
+#include <unistd.h>
+#include <assert.h>
+#include "bam.h"
+
+typedef struct { // counters filled by flagstat_loop() and printed by bam_flagstat()
+ long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; // all records; without BAM_FUNMAP; with BAM_FPAIRED; paired with read and mate mapped; paired with BAM_FPROPER_PAIR
+ long long n_sgltn, n_read1, n_read2; // paired records with BAM_FMUNMAP, BAM_FREAD1, BAM_FREAD2
+ long long n_qcfail, n_dup; // records with BAM_FQCFAIL, BAM_FDUP
+ long long n_diffchr, n_diffhigh; // both mapped but mtid != tid; of those, the ones with qual >= 5
+} bam_flagstat_t;
+
+#define flagstat_loop(s, c) do { /* add the record core (c) to the counters (s) */ \
+ ++(s)->n_reads; \
+ if ((c)->flag & BAM_FPAIRED) { /* the counters in this branch cover paired records only */ \
+ ++(s)->n_pair_all; \
+ if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \
+ if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \
+ if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \
+ if ((c)->flag & BAM_FMUNMAP) ++(s)->n_sgltn; /* counted on the mate-unmapped bit alone, whether or not this read is mapped */ \
+ if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
+ ++(s)->n_pair_map; \
+ if ((c)->mtid != (c)->tid) { \
+ ++(s)->n_diffchr; \
+ if ((c)->qual >= 5) ++(s)->n_diffhigh; \
+ } \
+ } \
+ } \
+ if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \
+ if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \
+ if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \
+ } while (0)
+
+/* Read every record left in `fp` and tally its flags.
+ * Returns a calloc'ed bam_flagstat_t that the caller frees. A warning is
+ * printed when reading stops with a code other than -1. */
+bam_flagstat_t *bam_flagstat_core(bamFile fp)
+{
+    bam_flagstat_t *stat = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+    bam1_t *rec = bam_init1();
+    bam1_core_t *core = &rec->core;
+    int ret;
+
+    for (;;) {
+        ret = bam_read1(fp, rec);
+        if (ret < 0) break;
+        flagstat_loop(stat, core);
+    }
+    bam_destroy1(rec);
+    if (ret != -1) {
+        fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+    }
+    return stat;
+}
+/* Entry point of "samtools flagstat <in.bam>" ("-" reads stdin).
+ * Prints one line per counter of bam_flagstat_t to stdout.
+ * Returns 0 on success, 1 when no input was named. */
+int bam_flagstat(int argc, char *argv[])
+{
+    bamFile fp;
+    bam_header_t *header;
+    bam_flagstat_t *s;
+
+    if (argc == optind) {
+        fprintf(stderr, "Usage: samtools flagstat <in.bam>\n");
+        return 1;
+    }
+    if (strcmp(argv[optind], "-")) fp = bam_open(argv[optind], "r");
+    else fp = bam_dopen(fileno(stdin), "r");
+    assert(fp);
+    header = bam_header_read(fp); // the header has to be consumed before the records
+    s = bam_flagstat_core(fp);
+
+    printf("%lld in total\n", s->n_reads);
+    printf("%lld QC failure\n", s->n_qcfail);
+    printf("%lld duplicates\n", s->n_dup);
+    printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0);
+    printf("%lld paired in sequencing\n", s->n_pair_all);
+    printf("%lld read1\n", s->n_read1);
+    printf("%lld read2\n", s->n_read2);
+    printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0);
+    printf("%lld with itself and mate mapped\n", s->n_pair_map);
+    printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0);
+    printf("%lld with mate mapped to a different chr\n", s->n_diffchr);
+    printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh);
+
+    free(s);
+    bam_header_destroy(header);
+    bam_close(fp);
+    return 0;
+}
--- /dev/null
+#ifndef _NO_CURSES
+#include <curses.h>
+#ifdef NCURSES_VERSION
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include "bam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+
+char bam_aux_getCEi(bam1_t *b, int i);
+char bam_aux_getCSi(bam1_t *b, int i);
+char bam_aux_getCQi(bam1_t *b, int i);
+
+/* Base screen row used by tv_pl_func when placing reads (row = TV_MIN_ALNROW + level - row_shift). */
+#define TV_MIN_ALNROW 2
+/* Maximum number of characters accepted in the "Goto" input window. */
+#define TV_MAX_GOTO 40
+/* NOTE(review): not referenced in the visible part of this file. */
+#define TV_LOW_MAPQ 10
+
+/* Values of tview_t.color_for: which quantity selects the colour pair of a read base. */
+#define TV_COLOR_MAPQ 0
+#define TV_COLOR_BASEQ 1
+#define TV_COLOR_NUCL 2
+#define TV_COLOR_COL 3
+#define TV_COLOR_COLQ 4
+
+/* Values of tview_t.base_for: show nucleotides or colour-space calls. */
+#define TV_BASE_NUCL 0
+#define TV_BASE_COLOR_SPACE 1
+
+/* State of the text alignment viewer, shared by all tv_* functions. */
+typedef struct {
+    int mrow;               /* screen rows, from getmaxyx() */
+    int mcol;               /* screen columns, from getmaxyx() */
+    WINDOW *wgoto;          /* "Goto" input window */
+    WINDOW *whelp;          /* help window */
+
+    bam_index_t *idx;       /* index loaded by bam_index_load() */
+    bam_lplbuf_t *lplbuf;   /* pileup buffer that calls tv_pl_func */
+    bam_header_t *header;   /* BAM header of fp */
+    bamFile fp;             /* alignment file */
+    int curr_tid;           /* target currently displayed */
+    int left_pos;           /* 0-based position of the leftmost column */
+    faidx_t *fai;           /* reference index, 0 when no FASTA was given */
+    bam_maqcns_t *bmc;      /* consensus caller state */
+
+    int ccol;               /* next screen column to draw */
+    int last_pos;           /* last reference position drawn */
+    int row_shift;          /* vertical scroll offset */
+    int base_for;           /* one of TV_BASE_* */
+    int color_for;          /* one of TV_COLOR_* */
+    int is_dot;             /* non-zero: show matches as '.' / ',' */
+    int l_ref;              /* length reported by fai_fetch() for ref */
+    int ins;                /* non-zero: expand insertions */
+    char *ref;              /* reference sequence of the visible window */
+} tview_t;
+
+/*
+ * Pileup callback (registered in tv_init via bam_lplbuf_init) that draws one
+ * reference position on the curses screen.
+ *
+ * tid   target id of the position (unused here)
+ * pos   0-based reference position
+ * n     number of pileup entries in pl
+ * pl    the reads covering pos
+ * data  the tview_t being drawn
+ *
+ * Row 0 receives the coordinate ruler, row 1 the reference, row 2 the
+ * consensus, and each read is placed at TV_MIN_ALNROW + level - row_shift.
+ * tv->ccol is advanced by one column per drawn position, plus one per
+ * inserted base when tv->ins is set. Always returns 0.
+ */
+int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+ tview_t *tv = (tview_t*)data;
+ int i, j, c, rb, attr, max_ins = 0;
+ uint32_t call = 0;
+ if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
+ // print reference
+ rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
+ // fill in the positions skipped since the last call (no read covers them)
+ // NOTE(review): unlike the line above, this index is not checked against l_ref - confirm it cannot run past tv->ref
+ for (i = tv->last_pos + 1; i < pos; ++i) {
+ if (i%10 == 0) mvprintw(0, tv->ccol, "%-d", i+1);
+ c = tv->ref? tv->ref[i - tv->left_pos] : 'N';
+ mvaddch(1, tv->ccol++, c);
+ }
+ if (pos%10 == 0) mvprintw(0, tv->ccol, "%-d", pos+1);
+ // print consensus
+ call = bam_maqcns_call(n, pl, tv->bmc);
+ attr = A_UNDERLINE;
+ // bits 28-31 of call index the IUPAC table; bits 8-15 pick colour pair 1..4
+ // (presumably the consensus quality - confirm against bam_maqcns_call)
+ c = ",ACMGRSVTWYHKDBN"[call>>28&0xf];
+ i = (call>>8&0xff)/10+1;
+ if (i > 4) i = 4;
+ attr |= COLOR_PAIR(i);
+ if (c == toupper(rb)) c = '.';
+ attron(attr);
+ mvaddch(2, tv->ccol, c);
+ attroff(attr);
+ if(tv->ins) {
+ // calculate maximum insert
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
+ }
+ }
+ // core loop: j == 0 draws the aligned base, j > 0 the j-th inserted column
+ for (j = 0; j <= max_ins; ++j) {
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ int row = TV_MIN_ALNROW + p->level - tv->row_shift;
+ if (j == 0) {
+ if (!p->is_del) {
+ if (tv->base_for == TV_BASE_COLOR_SPACE &&
+ (c = bam_aux_getCSi(p->b, p->qpos))) {
+ c = bam_aux_getCSi(p->b, p->qpos);
+ // assume that if we found one color, we will be able to get the color error
+ if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.';
+ }
+ else {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+ if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+ }
+ } else c = '*';
+ } else { // padding
+ if (j > p->indel) c = '*';
+ else { // insertion
+ if (tv->base_for == TV_BASE_NUCL) {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+ // j is never 0 in this branch, so the dot substitution below cannot trigger
+ if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+ }
+ else {
+ c = bam_aux_getCSi(p->b, p->qpos + j);
+ if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.';
+ }
+ }
+ }
+ // only rows strictly below TV_MIN_ALNROW and above the bottom of the screen are drawn
+ if (row > TV_MIN_ALNROW && row < tv->mrow) {
+ int x;
+ attr = 0;
+ // underline reads that are paired but not properly paired, or secondary
+ if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
+ || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE;
+ // choose the colour pair according to tv->color_for
+ if (tv->color_for == TV_COLOR_BASEQ) {
+ x = bam1_qual(p->b)[p->qpos]/10 + 1;
+ if (x > 4) x = 4;
+ attr |= COLOR_PAIR(x);
+ } else if (tv->color_for == TV_COLOR_MAPQ) {
+ x = p->b->core.qual/10 + 1;
+ if (x > 4) x = 4;
+ attr |= COLOR_PAIR(x);
+ } else if (tv->color_for == TV_COLOR_NUCL) {
+ x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5;
+ attr |= COLOR_PAIR(x);
+ } else if(tv->color_for == TV_COLOR_COL) {
+ x = 0;
+ switch(bam_aux_getCSi(p->b, p->qpos)) {
+ case '0': x = 0; break;
+ case '1': x = 1; break;
+ case '2': x = 2; break;
+ case '3': x = 3; break;
+ case '4': x = 4; break;
+ default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break;
+ }
+ x+=5;
+ attr |= COLOR_PAIR(x);
+ } else if(tv->color_for == TV_COLOR_COLQ) {
+ x = bam_aux_getCQi(p->b, p->qpos);
+ if(0 == x) x = bam1_qual(p->b)[p->qpos];
+ x = x/10 + 1;
+ if (x > 4) x = 4;
+ attr |= COLOR_PAIR(x);
+ }
+ attron(attr);
+ // reverse-strand reads are shown in lower case
+ mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c));
+ attroff(attr);
+ }
+ }
+ // reference row: the base itself for j == 0, a '*' pad (colour pair 8) for inserted columns
+ c = j? '*' : rb;
+ if (c == '*') {
+ attr = COLOR_PAIR(8);
+ attron(attr);
+ mvaddch(1, tv->ccol++, c);
+ attroff(attr);
+ } else mvaddch(1, tv->ccol++, c);
+ }
+ tv->last_pos = pos;
+ return 0;
+}
+
+/*
+ * Create the viewer state for BAM file fn and optional reference FASTA
+ * fn_fa (may be 0), then initialise curses, the two pop-up windows and
+ * the colour pairs.
+ *
+ * Exits the process with status 1 when memory, the index or the BAM file
+ * is unavailable; all of these checks run before curses is started so
+ * the messages reach a normal terminal.
+ */
+tview_t *tv_init(const char *fn, const char *fn_fa)
+{
+    tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t));
+    if (tv == 0) {
+        fprintf(stderr, "[tv_init] out of memory\n");
+        exit(1);
+    }
+    tv->is_dot = 1;
+    tv->idx = bam_index_load(fn);
+    if (tv->idx == 0) exit(1);
+    tv->fp = bam_open(fn, "r");
+    /* Previously an assert() placed after the handle had already been
+     * used, and compiled out entirely under NDEBUG. */
+    if (tv->fp == 0) {
+        fprintf(stderr, "[tv_init] fail to open \"%s\"\n", fn);
+        exit(1);
+    }
+    bgzf_set_cache_size(tv->fp, 8 * 1024 *1024);
+    tv->header = bam_header_read(tv->fp);
+    tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
+    if (fn_fa) tv->fai = fai_load(fn_fa);
+    tv->bmc = bam_maqcns_init();
+    tv->ins = 1;
+    bam_maqcns_prepare(tv->bmc);
+
+    initscr();
+    keypad(stdscr, TRUE);
+    clear();
+    noecho();
+    cbreak();
+#ifdef NCURSES_VERSION
+    getmaxyx(stdscr, tv->mrow, tv->mcol);
+#else
+    tv->mrow = 80; tv->mcol = 40;
+#endif
+    tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
+    tv->whelp = newwin(27, 40, 5, 5);
+    tv->color_for = TV_COLOR_MAPQ;
+    start_color();
+    /* pairs 1-4: quality bins; pairs 5-9: per-base / colour-space colours */
+    init_pair(1, COLOR_BLUE, COLOR_BLACK);
+    init_pair(2, COLOR_GREEN, COLOR_BLACK);
+    init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+    init_pair(4, COLOR_WHITE, COLOR_BLACK);
+    init_pair(5, COLOR_GREEN, COLOR_BLACK);
+    init_pair(6, COLOR_CYAN, COLOR_BLACK);
+    init_pair(7, COLOR_YELLOW, COLOR_BLACK);
+    init_pair(8, COLOR_RED, COLOR_BLACK);
+    init_pair(9, COLOR_BLUE, COLOR_BLACK);
+    return tv;
+}
+
+/* Shut down curses and release everything owned by tv, including tv itself. */
+void tv_destroy(tview_t *tv)
+{
+    /* curses side */
+    delwin(tv->wgoto);
+    delwin(tv->whelp);
+    endwin();
+
+    /* alignment side */
+    bam_lplbuf_destroy(tv->lplbuf);
+    bam_maqcns_destroy(tv->bmc);
+    bam_index_destroy(tv->idx);
+    if (tv->fai != 0) {
+        fai_destroy(tv->fai);
+    }
+    free(tv->ref);
+    bam_header_destroy(tv->header);
+    bam_close(tv->fp);
+    free(tv);
+}
+
+/* bam_fetch() callback: push alignment b into the viewer's pileup buffer. Always returns 0. */
+int tv_fetch_func(const bam1_t *b, void *data)
+{
+    bam_lplbuf_push(b, ((tview_t*)data)->lplbuf);
+    return 0;
+}
+
+/*
+ * Redraw the screen with target tid starting at 0-based position pos.
+ * Reloads the reference window when a FASTA index is available, then
+ * replays the overlapping alignments through the pileup buffer, which
+ * draws them via tv_pl_func. Always returns 0.
+ */
+int tv_draw_aln(tview_t *tv, int tid, int pos)
+{
+    /* start from a blank screen at column 0 */
+    clear();
+    tv->curr_tid = tid;
+    tv->left_pos = pos;
+    tv->last_pos = pos - 1;
+    tv->ccol = 0;
+
+    /* fetch the reference for the visible window as "name:beg-end" */
+    if (tv->fai) {
+        const char *name = tv->header->target_name[tv->curr_tid];
+        char *region;
+        free(tv->ref);
+        region = (char*)calloc(strlen(name) + 30, 1);
+        sprintf(region, "%s:%d-%d", name, tv->left_pos + 1, tv->left_pos + tv->mcol);
+        tv->ref = fai_fetch(tv->fai, region, &tv->l_ref);
+        free(region);
+    }
+
+    /* feed the alignments; the final push of 0 flushes the buffer */
+    bam_lplbuf_reset(tv->lplbuf);
+    bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
+    bam_lplbuf_push(0, tv->lplbuf);
+    return 0;
+}
+
+/*
+ * Run the "Goto" prompt. Typed characters are collected (at most
+ * TV_MAX_GOTO); Enter parses them with bam_parse_region() and, when a
+ * target is found, stores it in *tid / *pos and returns. Escape returns
+ * without touching *tid / *pos; ctrl-W clears the input.
+ */
+static void tv_win_goto(tview_t *tv, int *tid, int *pos)
+{
+    char str[256];
+    int i, l = 0;
+    str[0] = '\0'; /* Enter may be the very first key */
+    wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
+    mvwprintw(tv->wgoto, 1, 2, "Goto: ");
+    for (;;) {
+        int c = wgetch(tv->wgoto);
+        wrefresh(tv->wgoto);
+        if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
+            /* Backspace on an empty buffer used to make l negative and
+             * write the terminator to str[-1]. */
+            if (l > 0) --l;
+        } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
+            int _tid = -1, _beg, _end;
+            bam_parse_region(tv->header, str, &_tid, &_beg, &_end);
+            if (_tid >= 0) {
+                *tid = _tid; *pos = _beg;
+                return;
+            }
+        } else if (c >= 0 && c <= 0xff && isgraph(c)) {
+            /* isgraph() is only defined for unsigned char values and EOF */
+            if (l < TV_MAX_GOTO) str[l++] = c;
+        } else if (c == '\027') l = 0;
+        else if (c == '\033') return;
+        str[l] = '\0';
+        for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
+        mvwprintw(tv->wgoto, 1, 8, "%s", str);
+    }
+}
+
+/* Show the key-binding help in tv->whelp and wait for one key press. */
+static void tv_win_help(tview_t *tv) {
+    /* One entry per window row starting at row 1; 0 leaves the row blank. */
+    static const char *const rows[] = {
+        " -=- Help -=- ",
+        0,
+        "? This window",
+        "Arrows Small scroll movement",
+        "h,j,k,l Small scroll movement",
+        "H,J,K,L Large scroll movement",
+        "ctrl-H Scroll 1k left",
+        "ctrl-L Scroll 1k right",
+        "space Scroll one screen",
+        "backspace Scroll back one screen",
+        "g Go to specific location",
+        "m Color for mapping qual",
+        "n Color for nucleotide",
+        "b Color for base quality",
+        "c Color for cs color",
+        "z Color for cs qual",
+        ". Toggle on/off dot view",
+        "N Turn on nt view",
+        "C Turn on cs view",
+        "i Toggle on/off ins",
+        "q Exit",
+        0,
+        "Underline: Secondary or orphan",
+        "Blue: 0-9 Green: 10-19",
+        "Yellow: 20-29 White: >=30",
+    };
+    const int n_rows = (int)(sizeof(rows) / sizeof(rows[0]));
+    WINDOW *help = tv->whelp;
+    int idx;
+    wborder(help, '|', '|', '-', '-', '+', '+', '+', '+');
+    for (idx = 0; idx < n_rows; ++idx) {
+        if (rows[idx] == 0) continue;
+        mvwprintw(help, idx + 1, 2, "%s", rows[idx]);
+    }
+    wrefresh(help);
+    wgetch(help);
+}
+
+/*
+ * Main key loop of the viewer. Each recognised key updates the view
+ * state or the target/position, after which the screen is redrawn;
+ * unrecognised keys are ignored. Returns on 'q' or Escape.
+ */
+void tv_loop(tview_t *tv)
+{
+    int tid = tv->curr_tid;
+    int pos = tv->left_pos;
+    for (;;) {
+        int key = getch();
+        /* Note kept from the original author: over ssh from Mac OS X 10.5
+         * the terminal reported ctrl-H as 263. */
+        switch (key) {
+        case '?':
+            tv_win_help(tv);
+            break;
+        case '\033':
+        case 'q':
+            return;
+        case 'g':
+            tv_win_goto(tv, &tid, &pos);
+            break;
+        /* colouring scheme */
+        case 'm':
+            tv->color_for = TV_COLOR_MAPQ;
+            break;
+        case 'b':
+            tv->color_for = TV_COLOR_BASEQ;
+            break;
+        case 'n':
+            tv->color_for = TV_COLOR_NUCL;
+            break;
+        case 'c':
+            tv->color_for = TV_COLOR_COL;
+            break;
+        case 'z':
+            tv->color_for = TV_COLOR_COLQ;
+            break;
+        /* horizontal movement */
+        case KEY_LEFT:
+        case 'h':
+            --pos;
+            break;
+        case KEY_RIGHT:
+        case 'l':
+            ++pos;
+            break;
+        case KEY_SLEFT:
+        case 'H':
+            pos -= 20;
+            break;
+        case KEY_SRIGHT:
+        case 'L':
+            pos += 20;
+            break;
+        /* display toggles */
+        case '.':
+            tv->is_dot = !tv->is_dot;
+            break;
+        case 'N':
+            tv->base_for = TV_BASE_NUCL;
+            break;
+        case 'C':
+            tv->base_for = TV_BASE_COLOR_SPACE;
+            break;
+        case 'i':
+            tv->ins = !tv->ins;
+            break;
+        /* large jumps */
+        case '\010':
+            pos -= 1000;
+            break;
+        case '\014':
+            pos += 1000;
+            break;
+        case ' ':
+            pos += tv->mcol;
+            break;
+        /* vertical scrolling */
+        case KEY_UP:
+        case 'j':
+            --tv->row_shift;
+            break;
+        case KEY_DOWN:
+        case 'k':
+            ++tv->row_shift;
+            break;
+        case KEY_BACKSPACE:
+        case '\177':
+            pos -= tv->mcol;
+            break;
+#ifdef KEY_RESIZE
+        case KEY_RESIZE:
+            getmaxyx(stdscr, tv->mrow, tv->mcol);
+            break;
+#endif
+        default:
+            continue;
+        }
+        /* clamp, then repaint */
+        if (pos < 0) pos = 0;
+        if (tv->row_shift < 0) tv->row_shift = 0;
+        tv_draw_aln(tv, tid, pos);
+    }
+}
+
+/*
+ * Entry point of `samtools tview <aln.bam> [ref.fasta]`.
+ * Opens the viewer at target 0, position 0 and runs the key loop until
+ * the user quits. Returns 0 on normal exit, 1 on usage error.
+ */
+int bam_tview_main(int argc, char *argv[])
+{
+    tview_t *tv;
+    if (argc == 1) {
+        /* The program is invoked as "samtools", like every other usage line. */
+        fprintf(stderr, "Usage: samtools tview <aln.bam> [ref.fasta]\n");
+        return 1;
+    }
+    tv = tv_init(argv[1], (argc == 2)? 0 : argv[2]);
+    tv_draw_aln(tv, 0, 0);
+    tv_loop(tv);
+    tv_destroy(tv);
+    return 0;
+}
+#else // #ifdef NCURSES_VERSION
+#warning "The ncurses library is unavailable; tview is disabled."
+/* Stub used when ncurses is unavailable: report that tview is missing and return 1. */
+int bam_tview_main(int argc, char *argv[])
+{
+    fputs("[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n", stderr);
+    return 1;
+}
+#endif
+#endif // #ifndef _NO_CURSES
--- /dev/null
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include "bam.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "0.1.5c (r385)"
+#endif
+
+int bam_taf2baf(int argc, char *argv[]);
+int bam_pileup(int argc, char *argv[]);
+int bam_merge(int argc, char *argv[]);
+int bam_index(int argc, char *argv[]);
+int bam_sort(int argc, char *argv[]);
+int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+int bam_rmdupse(int argc, char *argv[]);
+int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
+
+int main_samview(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
+
+int faidx_main(int argc, char *argv[]);
+int glf3_view_main(int argc, char *argv[]);
+
+/*
+ * Entry point of `samtools tagview <in.bam> <tag>` ("-" reads stdin).
+ * For every record carrying the two-character auxiliary tag, prints
+ * "qname<TAB>flag<TAB>value" to stdout; the value format depends on the
+ * tag's type byte. Returns 0 on success, 1 on usage error or when the
+ * file or its header cannot be read.
+ */
+int bam_tagview(int argc, char *argv[])
+{
+    bamFile fp;
+    bam_header_t *header;
+    bam1_t *b;
+    char tag[2];
+    int ret;
+    if (argc < 3) {
+        fprintf(stderr, "Usage: samtools tagview <in.bam> <tag>\n");
+        return 1;
+    }
+    /* argv[2][1] is read below, so the tag needs at least two characters. */
+    if (strlen(argv[2]) < 2) {
+        fprintf(stderr, "[bam_tagview] the tag must have two characters.\n");
+        return 1;
+    }
+    fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r");
+    /* An assert() here disappears under NDEBUG; report the failure instead. */
+    if (fp == 0) {
+        fprintf(stderr, "[bam_tagview] fail to open \"%s\". Abort!\n", argv[1]);
+        return 1;
+    }
+    header = bam_header_read(fp);
+    if (header == 0) {
+        fprintf(stderr, "[bam_view] fail to read the BAM header. Abort!\n");
+        bam_close(fp); /* was leaked on this path */
+        return 1;
+    }
+    tag[0] = argv[2][0]; tag[1] = argv[2][1];
+    b = (bam1_t*)calloc(1, sizeof(bam1_t));
+    while ((ret = bam_read1(fp, b)) >= 0) {
+        uint8_t *d = bam_aux_get(b, tag);
+        if (d) {
+            printf("%s\t%d\t", bam1_qname(b), b->core.flag);
+            if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d));
+            else if (d[0] == 'f') printf("%f\n", bam_aux2f(d));
+            else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d));
+            else if (d[0] == 'A') printf("%c\n", bam_aux2A(d));
+            else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d));
+            /* %u needs an unsigned argument */
+            else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", (unsigned)bam_aux2i(d));
+            else printf("\n");
+        }
+    }
+    if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. (%d)\n", ret);
+    free(b->data); free(b);
+    bam_header_destroy(header);
+    bam_close(fp);
+    return 0;
+}
+
+/* Print the top-level samtools help to stderr. Always returns 1. */
+static int usage()
+{
+    fputs("\n"
+          "Program: samtools (Tools for alignments in the SAM format)\n",
+          stderr);
+    fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
+    fputs("Usage: samtools <command> [options]\n\n"
+          "Command: import import from SAM (obsolete; use `view')\n"
+          " view export to the text format\n"
+          " sort sort alignment file\n"
+          " merge merge multiple sorted alignment files\n"
+          " pileup generate pileup output\n"
+          " faidx index/extract FASTA\n",
+          stderr);
+#ifndef _NO_CURSES
+    fputs(" tview text alignment viewer\n", stderr);
+#endif
+    fputs(" index index alignment\n"
+          " fixmate fix mate information\n"
+          " rmdup remove PCR duplicates\n"
+          " glfview print GLFv3 file\n"
+          " flagstat simple stats\n"
+          " fillmd fill the MD tag and change identical base to =\n"
+          "\n",
+          stderr);
+    return 1;
+}
+
+/*
+ * Dispatch `samtools <command> ...` to the matching sub-command, passing
+ * it argc-1 / argv+1 so the command name becomes its argv[0].
+ * Returns the sub-command's result, or 1 when no command is given or it
+ * is not recognised.
+ */
+int main(int argc, char *argv[])
+{
+    static const struct {
+        const char *name;
+        int (*run)(int, char *[]);
+    } commands[] = {
+        { "view", main_samview },
+        { "import", main_import },
+        { "pileup", bam_pileup },
+        { "merge", bam_merge },
+        { "sort", bam_sort },
+        { "index", bam_index },
+        { "faidx", faidx_main },
+        { "fixmate", bam_mating },
+        { "rmdup", bam_rmdup },
+        { "rmdupse", bam_rmdupse },
+        { "glfview", glf3_view_main },
+        { "flagstat", bam_flagstat },
+        { "tagview", bam_tagview },
+        { "fillmd", bam_fillmd },
+#ifndef _NO_CURSES
+        { "tview", bam_tview_main },
+#endif
+    };
+    const int n_commands = (int)(sizeof(commands) / sizeof(commands[0]));
+    int i;
+    if (argc < 2) return usage();
+    for (i = 0; i < n_commands; ++i) {
+        if (strcmp(argv[1], commands[i].name) == 0)
+            return commands[i].run(argc-1, argv+1);
+    }
+    fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
+    return 1;
+}
--- /dev/null
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+/*
+ 2009-06-29 by lh3: cache recent uncompressed blocks.
+ 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
+ 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+#include "khash.h"
+/* One cached uncompressed block; entries are keyed by the block's file offset. */
+typedef struct {
+ int size; /* copy of block_length for the cached block */
+ uint8_t *block; /* MAX_BLOCK_SIZE-byte copy of the uncompressed data */
+ int64_t end_offset; /* file offset just past the compressed block */
+} cache_t;
+/* Instantiate the khash map type "cache": int64 key -> cache_t. */
+KHASH_MAP_INIT_INT64(cache, cache_t)
+
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+
+/* Signed 8-bit alias used for the raw block buffers in this file. */
+typedef int8_t byte;
+
+/* Size of the uncompressed buffer used when writing, and the upper bound for any block. */
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
+static const int MAX_BLOCK_SIZE = 64 * 1024;
+
+/* Bytes before the deflate data (gzip header plus extra field) and after it (CRC32 + input size). */
+static const int BLOCK_HEADER_LENGTH = 18;
+static const int BLOCK_FOOTER_LENGTH = 8;
+
+/* Values written to / expected in the block header; see deflate_block() and check_header(). */
+static const int GZIP_ID1 = 31;
+static const int GZIP_ID2 = 139;
+static const int CM_DEFLATE = 8;
+static const int FLG_FEXTRA = 4;
+static const int OS_UNKNOWN = 255;
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2;
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+/* Parameters passed to deflateInit2() / inflateInit2(). */
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8;
+
+
+/*
+ * Store the low 16 bits of value into buffer[0..1], least significant
+ * byte first.
+ *
+ * static: a plain `inline` definition at file scope provides no external
+ * definition in C99/C11, so calls that are not inlined fail to link.
+ */
+static inline
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{
+    buffer[0] = (uint8_t)(value & 0xff);
+    buffer[1] = (uint8_t)(value >> 8);
+}
+
+/*
+ * Read an unsigned 16-bit value stored least significant byte first in
+ * buffer[0..1]. The result is in 0..65535.
+ *
+ * static: a plain `inline` definition at file scope provides no external
+ * definition in C99/C11, so calls that are not inlined fail to link.
+ */
+static inline
+int
+unpackInt16(const uint8_t* buffer)
+{
+    return (buffer[0] | (buffer[1] << 8));
+}
+
+/*
+ * Store value into buffer[0..3], least significant byte first.
+ *
+ * static: a plain `inline` definition at file scope provides no external
+ * definition in C99/C11, so calls that are not inlined fail to link.
+ */
+static inline
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{
+    buffer[0] = (uint8_t)(value & 0xff);
+    buffer[1] = (uint8_t)((value >> 8) & 0xff);
+    buffer[2] = (uint8_t)((value >> 16) & 0xff);
+    buffer[3] = (uint8_t)(value >> 24);
+}
+
+/*
+ * Return the smaller of x and y.
+ *
+ * static: a plain `inline` definition at file scope provides no external
+ * definition in C99/C11, so calls that are not inlined fail to link.
+ */
+static inline
+int
+min(int x, int y)
+{
+    return (x < y) ? x : y;
+}
+
+/* Record message as the current error of fp. Only the pointer is stored, not a copy. */
+static
+void
+report_error(BGZF* fp, const char* message) {
+ fp->error = message;
+}
+
+/*
+ * Allocate a zeroed BGZF prepared for reading: both block buffers are
+ * MAX_BLOCK_SIZE bytes, the cache is empty and disabled (cache_size 0).
+ * The caller fills in the file handle and open mode.
+ */
+static BGZF *bgzf_read_init()
+{
+    BGZF *reader = calloc(1, sizeof(BGZF));
+    reader->uncompressed_block_size = MAX_BLOCK_SIZE;
+    reader->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+    reader->compressed_block_size = MAX_BLOCK_SIZE;
+    reader->compressed_block = malloc(MAX_BLOCK_SIZE);
+    reader->cache_size = 0;
+    reader->cache = kh_init(cache);
+    return reader;
+}
+
+/*
+ * Wrap file descriptor fd in a BGZF opened for reading.
+ * Returns 0 when the descriptor cannot be turned into a stream.
+ */
+static
+BGZF*
+open_read(int fd)
+{
+    BGZF* reader;
+#ifdef _USE_KNETFILE
+    knetFile *stream = knet_dopen(fd, "r");
+#else
+    FILE* stream = fdopen(fd, "r");
+#endif
+    if (stream == 0) {
+        return 0;
+    }
+    reader = bgzf_read_init();
+    reader->file_descriptor = fd;
+    reader->open_mode = 'r';
+#ifdef _USE_KNETFILE
+    reader->x.fpr = stream;
+#else
+    reader->file = stream;
+#endif
+    return reader;
+}
+
+/*
+ * Wrap file descriptor fd in a BGZF opened for writing.
+ *
+ * is_uncompressed selects deflate level 0 in deflate_block(). The
+ * uncompressed buffer is allocated lazily by bgzf_write().
+ *
+ * Returns 0 when memory cannot be allocated or fdopen() fails; fd itself
+ * is left open in both cases.
+ */
+static
+BGZF*
+open_write(int fd, bool is_uncompressed)
+{
+    FILE* file;
+    /* calloc so that fields unused in write mode (cache, cache_size) are
+     * not left indeterminate, as they were with malloc. */
+    BGZF* fp = calloc(1, sizeof(BGZF));
+    void* compressed = malloc(MAX_BLOCK_SIZE);
+    if (fp == NULL || compressed == NULL) {
+        free(compressed);
+        free(fp);
+        return 0;
+    }
+    /* Open the stream last so nothing needs closing on the paths above. */
+    file = fdopen(fd, "w");
+    if (file == 0) {
+        free(compressed);
+        free(fp);
+        return 0;
+    }
+    fp->file_descriptor = fd;
+    fp->open_mode = 'w';
+    fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
+#ifdef _USE_KNETFILE
+    fp->x.fpw = file;
+#else
+    fp->file = file;
+#endif
+    fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+    fp->uncompressed_block = NULL;
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block = compressed;
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+/*
+ * Open path for reading (mode starts with 'r'/'R') or writing (mode
+ * starts with 'w'/'W'; a 'u' anywhere in mode requests uncompressed
+ * output). The returned BGZF owns the underlying file, so bgzf_close()
+ * closes it. Returns NULL on failure or for any other mode.
+ */
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{
+    BGZF* fp = NULL;
+    if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
+#ifdef _USE_KNETFILE
+        knetFile *file = knet_open(path, mode);
+        if (file == 0) return 0;
+        fp = bgzf_read_init();
+        fp->file_descriptor = -1;
+        fp->open_mode = 'r';
+        fp->x.fpr = file;
+#else
+        int oflag = O_RDONLY;
+        int fd = open(path, oflag);
+        if (fd == -1) return 0;
+        fp = open_read(fd);
+        /* The descriptor was opened here; do not leak it on failure. */
+        if (fp == NULL) close(fd);
+#endif
+    } else if (mode[0] == 'w' || mode[0] == 'W') {
+        int oflag = O_WRONLY | O_CREAT | O_TRUNC;
+        int fd = open(path, oflag, 0644);
+        if (fd == -1) return 0;
+        fp = open_write(fd, strstr(mode, "u")? 1 : 0);
+        /* The descriptor was opened here; do not leak it on failure. */
+        if (fp == NULL) close(fd);
+    }
+    if (fp != NULL) {
+        fp->owned_file = 1;
+    }
+    return fp;
+}
+
+/*
+ * Wrap an already open descriptor. mode starting with 'r'/'R' opens for
+ * reading, 'w'/'W' for writing (a 'u' anywhere in mode requests
+ * uncompressed output). Returns NULL for fd == -1, any other mode, or
+ * when the underlying open fails.
+ */
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{
+    if (fd == -1) return 0;
+    switch (mode[0]) {
+    case 'r':
+    case 'R':
+        return open_read(fd);
+    case 'w':
+    case 'W':
+        return open_write(fd, strstr(mode, "u")? 1 : 0);
+    default:
+        return NULL;
+    }
+}
+
+/*
+ * Compress the first block_length bytes of fp->uncompressed_block into
+ * fp->compressed_block as one complete block: 18-byte header, raw deflate
+ * data, then CRC32 and input length.
+ *
+ * If the output does not fit, the input is shortened in 1024-byte steps
+ * and compression is retried; the bytes that were left out are moved to
+ * the front of fp->uncompressed_block and fp->block_offset is set to
+ * their count, so the caller can flush them as the next block.
+ *
+ * Returns the total compressed block length, or -1 after recording an
+ * error message in fp->error.
+ */
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{
+ // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+ // Also adds an extra field that stores the compressed block length.
+
+ byte* buffer = fp->compressed_block;
+ int buffer_size = fp->compressed_block_size;
+
+ // Init gzip header
+ buffer[0] = GZIP_ID1;
+ buffer[1] = GZIP_ID2;
+ buffer[2] = CM_DEFLATE;
+ buffer[3] = FLG_FEXTRA;
+ buffer[4] = 0; // mtime
+ buffer[5] = 0;
+ buffer[6] = 0;
+ buffer[7] = 0;
+ buffer[8] = 0;
+ buffer[9] = OS_UNKNOWN;
+ buffer[10] = BGZF_XLEN;
+ buffer[11] = 0;
+ buffer[12] = BGZF_ID1;
+ buffer[13] = BGZF_ID2;
+ buffer[14] = BGZF_LEN;
+ buffer[15] = 0;
+ buffer[16] = 0; // placeholder for block length
+ buffer[17] = 0;
+
+ // loop to retry for blocks that do not compress enough
+ int input_length = block_length;
+ int compressed_length = 0;
+ while (1) {
+ // level 0 when the file was opened with 'u' in the mode string
+ int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = fp->uncompressed_block;
+ zs.avail_in = input_length;
+ zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+ // leave room for the footer behind the deflate data
+ zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+ int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
+ GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (status != Z_OK) {
+ report_error(fp, "deflate init failed");
+ return -1;
+ }
+ status = deflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) {
+ deflateEnd(&zs);
+ if (status == Z_OK) {
+ // Not enough space in buffer.
+ // Can happen in the rare case the input doesn't compress enough.
+ // Reduce the amount of input until it fits.
+ input_length -= 1024;
+ if (input_length <= 0) {
+ // should never happen
+ report_error(fp, "input reduction failed");
+ return -1;
+ }
+ continue;
+ }
+ report_error(fp, "deflate failed");
+ return -1;
+ }
+ status = deflateEnd(&zs);
+ if (status != Z_OK) {
+ report_error(fp, "deflate end failed");
+ return -1;
+ }
+ compressed_length = zs.total_out;
+ compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+ if (compressed_length > MAX_BLOCK_SIZE) {
+ // should never happen
+ report_error(fp, "deflate overflow");
+ return -1;
+ }
+ break;
+ }
+
+ // header stores (total block length - 1); footer stores CRC32 and the input length
+ packInt16((uint8_t*)&buffer[16], compressed_length-1);
+ uint32_t crc = crc32(0L, NULL, 0L);
+ crc = crc32(crc, fp->uncompressed_block, input_length);
+ packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+ packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
+
+ // keep any input that did not fit for the next block
+ int remaining = block_length - input_length;
+ if (remaining > 0) {
+ if (remaining > input_length) {
+ // should never happen (check so we can use memcpy)
+ report_error(fp, "remainder too large");
+ return -1;
+ }
+ memcpy(fp->uncompressed_block,
+ fp->uncompressed_block + input_length,
+ remaining);
+ }
+ fp->block_offset = remaining;
+ return compressed_length;
+}
+
+/*
+ * Inflate the block held in fp->compressed_block (block_length bytes,
+ * header included) into fp->uncompressed_block.
+ *
+ * Returns the number of uncompressed bytes, or -1 after recording an
+ * error message in fp->error.
+ */
+static
+int
+inflate_block(BGZF* fp, int block_length)
+{
+    z_stream zs;
+    int status;
+
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.opaque = NULL; /* zlib requires this to be set before inflateInit2 */
+    zs.next_in = (Bytef*)fp->compressed_block + BLOCK_HEADER_LENGTH;
+    /* Everything after the header. The previous "block_length - 16"
+     * advertised two bytes beyond the end of the block. The trailing
+     * footer is harmless: a raw inflate stops at the end of the stream. */
+    zs.avail_in = block_length - BLOCK_HEADER_LENGTH;
+    zs.next_out = fp->uncompressed_block;
+    zs.avail_out = fp->uncompressed_block_size;
+
+    status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if (status != Z_OK) {
+        report_error(fp, "inflate init failed");
+        return -1;
+    }
+    status = inflate(&zs, Z_FINISH);
+    if (status != Z_STREAM_END) {
+        inflateEnd(&zs);
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    status = inflateEnd(&zs);
+    if (status != Z_OK) {
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    return zs.total_out;
+}
+
+/*
+ * Return 1 when the 18 header bytes describe a block as written by
+ * deflate_block() (gzip magic, deflate method, FEXTRA set, XLEN 6, "BC"
+ * subfield of length 2), otherwise 0.
+ */
+static
+int
+check_header(const byte* header)
+{
+    if (header[0] != GZIP_ID1) return 0;
+    if (header[1] != (byte) GZIP_ID2) return 0;
+    if (header[2] != Z_DEFLATED) return 0;
+    if ((header[3] & FLG_FEXTRA) == 0) return 0;
+    if (unpackInt16((uint8_t*)&header[10]) != BGZF_XLEN) return 0;
+    if (header[12] != BGZF_ID1) return 0;
+    if (header[13] != BGZF_ID2) return 0;
+    return unpackInt16((uint8_t*)&header[14]) == BGZF_LEN;
+}
+
+/* Release every cached block and the hash table itself. Does nothing unless fp is in read mode. */
+static void free_cache(BGZF *fp)
+{
+    khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+    khint_t slot;
+    if (fp->open_mode != 'r') {
+        return;
+    }
+    for (slot = kh_begin(table); slot < kh_end(table); ++slot) {
+        if (!kh_exist(table, slot)) continue;
+        free(kh_val(table, slot).block);
+    }
+    kh_destroy(cache, table);
+}
+
+/*
+ * Try to satisfy a read of the block at block_address from the cache.
+ * On a hit the cached data becomes the current block and the file
+ * position is moved to the end of that block, as if it had been read.
+ * Returns the cached block size on a hit and 0 on a miss.
+ */
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{
+    khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+    khint_t slot = kh_get(cache, table, block_address);
+    cache_t *entry;
+    if (slot == kh_end(table)) {
+        return 0;
+    }
+    entry = &kh_val(table, slot);
+    /* keep the offset chosen by a preceding seek (block_length == 0) */
+    if (fp->block_length != 0) {
+        fp->block_offset = 0;
+    }
+    fp->block_address = block_address;
+    fp->block_length = entry->size;
+    memcpy(fp->uncompressed_block, entry->block, MAX_BLOCK_SIZE);
+#ifdef _USE_KNETFILE
+    knet_seek(fp->x.fpr, entry->end_offset, SEEK_SET);
+#else
+    fseeko(fp->file, entry->end_offset, SEEK_SET);
+#endif
+    return entry->size;
+}
+
+/*
+ * Store the current uncompressed block in the cache, keyed by
+ * fp->block_address. size is the number of compressed bytes the block
+ * occupies in the file. Nothing is cached unless fp->cache_size exceeds
+ * one block; when the cache is full, the first occupied slot found is
+ * evicted.
+ */
+static void cache_block(BGZF *fp, int size)
+{
+    khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+    khint_t slot;
+    cache_t *entry;
+    int absent;
+
+    if (MAX_BLOCK_SIZE >= fp->cache_size) {
+        return;
+    }
+    if ((kh_size(table) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
+        /* Evicting the oldest block would be better; dropping whichever
+         * entry comes first is simpler and should cost little. */
+        slot = kh_begin(table);
+        while (slot < kh_end(table) && !kh_exist(table, slot)) {
+            ++slot;
+        }
+        if (slot < kh_end(table)) {
+            free(kh_val(table, slot).block);
+            kh_del(cache, table, slot);
+        }
+    }
+    slot = kh_put(cache, table, fp->block_address, &absent);
+    if (absent == 0) {
+        return; // if this happens, a bug!
+    }
+    entry = &kh_val(table, slot);
+    entry->size = fp->block_length;
+    entry->end_offset = fp->block_address + size;
+    entry->block = malloc(MAX_BLOCK_SIZE);
+    memcpy(entry->block, fp->uncompressed_block, MAX_BLOCK_SIZE);
+}
+
+/*
+ * Load the block at the current file position into
+ * fp->uncompressed_block, from the cache when possible and otherwise by
+ * reading and inflating it. At end of file fp->block_length is set to 0.
+ *
+ * Returns 0 on success (including end of file), -1 after recording an
+ * error message in fp->error.
+ */
+static
+int
+read_block(BGZF* fp)
+{
+    byte header[BLOCK_HEADER_LENGTH];
+    int size = 0;
+#ifdef _USE_KNETFILE
+    int64_t block_address = knet_tell(fp->x.fpr);
+    if (load_block_from_cache(fp, block_address)) return 0;
+    int count = knet_read(fp->x.fpr, header, sizeof(header));
+#else
+    int64_t block_address = ftello(fp->file);
+    if (load_block_from_cache(fp, block_address)) return 0;
+    int count = fread(header, 1, sizeof(header), fp->file);
+#endif
+    if (count == 0) {
+        /* clean end of file */
+        fp->block_length = 0;
+        return 0;
+    }
+    size = count;
+    if (count != sizeof(header)) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    if (!check_header(header)) {
+        report_error(fp, "invalid block header");
+        return -1;
+    }
+    int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+    /* The length comes straight from the file. Anything shorter than
+     * header + footer would make `remaining` negative, which the read
+     * call below would treat as an enormous unsigned size. */
+    if (block_length < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) {
+        report_error(fp, "invalid block length");
+        return -1;
+    }
+    byte* compressed_block = (byte*) fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    int remaining = block_length - BLOCK_HEADER_LENGTH;
+#ifdef _USE_KNETFILE
+    count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+#else
+    count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
+#endif
+    if (count != remaining) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    size += count;
+    count = inflate_block(fp, block_length);
+    if (count < 0) {
+        return -1;
+    }
+    if (fp->block_length != 0) {
+        // Do not reset offset if this read follows a seek.
+        fp->block_offset = 0;
+    }
+    fp->block_address = block_address;
+    fp->block_length = count;
+    cache_block(fp, size);
+    return 0;
+}
+
+/*
+ * Copy up to length uncompressed bytes from fp into data, loading
+ * further blocks as needed.
+ *
+ * Returns the number of bytes copied (fewer than length at end of
+ * file, 0 when length <= 0), or -1 when fp is not open for reading or a
+ * block cannot be read.
+ */
+int
+bgzf_read(BGZF* fp, void* data, int length)
+{
+    byte* dst = data;
+    int done = 0;
+
+    if (length <= 0) return 0;
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for reading");
+        return -1;
+    }
+
+    while (done < length) {
+        int avail = fp->block_length - fp->block_offset;
+        int want, chunk;
+        if (avail <= 0) {
+            /* current block exhausted: fetch the next one */
+            if (read_block(fp) != 0) return -1;
+            avail = fp->block_length - fp->block_offset;
+            if (avail <= 0) break; /* end of file */
+        }
+        want = length - done;
+        chunk = (want < avail) ? want : avail;
+        memcpy(dst, (byte*)fp->uncompressed_block + fp->block_offset, chunk);
+        fp->block_offset += chunk;
+        dst += chunk;
+        done += chunk;
+    }
+    if (fp->block_offset == fp->block_length) {
+        /* block fully consumed: point at the start of the next one */
+#ifdef _USE_KNETFILE
+        fp->block_address = knet_tell(fp->x.fpr);
+#else
+        fp->block_address = ftello(fp->file);
+#endif
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+    return done;
+}
+
+/*
+ * Compress and write out everything buffered in fp->uncompressed_block.
+ * Loops because deflate_block() may leave a remainder in the buffer.
+ * Returns 0 on success, -1 on compression or write failure.
+ */
+static
+int
+flush_block(BGZF* fp)
+{
+#ifdef _USE_KNETFILE
+    FILE* out = fp->x.fpw;
+#else
+    FILE* out = fp->file;
+#endif
+    while (fp->block_offset > 0) {
+        int written;
+        int packed = deflate_block(fp, fp->block_offset);
+        if (packed < 0) {
+            return -1;
+        }
+        written = fwrite(fp->compressed_block, 1, packed, out);
+        if (written != packed) {
+            report_error(fp, "write failed");
+            return -1;
+        }
+        fp->block_address += packed;
+    }
+    return 0;
+}
+
+/*
+ * Append length bytes from data to fp, flushing a block each time the
+ * uncompressed buffer fills up.
+ *
+ * Returns the number of bytes accepted, which is less than length only
+ * when a flush fails, or -1 when fp is not open for writing.
+ */
+int
+bgzf_write(BGZF* fp, const void* data, int length)
+{
+    if (fp->open_mode != 'w') {
+        report_error(fp, "file not open for writing");
+        return -1;
+    }
+
+    /* the buffer is created on first use */
+    if (fp->uncompressed_block == NULL) {
+        fp->uncompressed_block = malloc(fp->uncompressed_block_size);
+    }
+
+    const byte* src = data;
+    const int capacity = fp->uncompressed_block_size;
+    int done = 0;
+    while (done < length) {
+        int room = capacity - fp->block_offset;
+        int left = length - done;
+        int chunk = (room < left) ? room : left;
+        memcpy((byte*)fp->uncompressed_block + fp->block_offset, src, chunk);
+        fp->block_offset += chunk;
+        src += chunk;
+        done += chunk;
+        if (fp->block_offset == capacity && flush_block(fp) != 0) {
+            break;
+        }
+    }
+    return done;
+}
+
+/*
+ * Flush pending output (write mode), close the underlying file when fp
+ * owns it, and free the block buffers, the cache and fp itself.
+ *
+ * Returns 0 on success, -1 when flushing or closing fails.
+ * NOTE(review): on the -1 paths the function returns before the buffers
+ * and fp are freed - confirm whether callers are expected to cope.
+ */
+int
+bgzf_close(BGZF* fp)
+{
+ if (fp->open_mode == 'w') {
+ if (flush_block(fp) != 0) {
+ return -1;
+ }
+ /* the two preprocessor arms below open the same if-block */
+#ifdef _USE_KNETFILE
+ if (fflush(fp->x.fpw) != 0) {
+#else
+ if (fflush(fp->file) != 0) {
+#endif
+ report_error(fp, "flush failed");
+ return -1;
+ }
+ }
+ /* owned_file is set by bgzf_open() only, not by bgzf_fdopen() */
+ if (fp->owned_file) {
+#ifdef _USE_KNETFILE
+ int ret;
+ if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
+ else ret = knet_close(fp->x.fpr);
+ if (ret != 0) return -1;
+#else
+ if (fclose(fp->file) != 0) {
+ return -1;
+ }
+#endif
+ }
+ free(fp->uncompressed_block);
+ free(fp->compressed_block);
+ free_cache(fp);
+ free(fp);
+ return 0;
+}
+
+/*
+ * Return the current position as a virtual offset: the block's file
+ * address in the upper bits and the offset inside the uncompressed block
+ * in the low 16 bits. bgzf_seek() accepts the same encoding.
+ */
+int64_t
+bgzf_tell(BGZF* fp)
+{
+    const int64_t block_part = fp->block_address << 16;
+    const int within_block = fp->block_offset & 0xFFFF;
+    return (block_part | within_block);
+}
+
+/* Set the block-cache limit of fp to cache_size bytes; a null fp is ignored. */
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+    if (!fp) {
+        return;
+    }
+    fp->cache_size = cache_size;
+}
+
+/*
+ * Move the read position to virtual offset pos (as returned by
+ * bgzf_tell()). Only SEEK_SET is supported. The target block is not
+ * loaded here; the next read does that.
+ *
+ * Returns 0 on success, -1 when fp is not open for reading, where is
+ * not SEEK_SET, or the underlying seek fails.
+ */
+int64_t
+bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for read");
+        return -1;
+    }
+    if (where != SEEK_SET) {
+        report_error(fp, "unimplemented seek option");
+        return -1;
+    }
+
+    /* split the virtual offset into its two components */
+    const int within_block = pos & 0xFFFF;
+    const int64_t file_offset = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+#ifdef _USE_KNETFILE
+    const int seek_failed = knet_seek(fp->x.fpr, file_offset, SEEK_SET) != 0;
+#else
+    const int seek_failed = fseeko(fp->file, file_offset, SEEK_SET) != 0;
+#endif
+    if (seek_failed) {
+        report_error(fp, "seek failed");
+        return -1;
+    }
+    fp->block_length = 0; // indicates current block is not loaded
+    fp->block_address = file_offset;
+    fp->block_offset = within_block;
+    return 0;
+}
+
--- /dev/null
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <zlib.h>
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+//typedef int8_t bool;
+
+typedef struct { // state of one open BGZF stream
+ int file_descriptor;
+ char open_mode; // 'r' or 'w'
+ bool owned_file, is_uncompressed; // owned_file: bgzf_close() closes the underlying file only when this is set
+#ifdef _USE_KNETFILE
+ union {
+ knetFile *fpr; // used when open_mode is 'r'
+ FILE *fpw; // used when open_mode is 'w'
+ } x;
+#else
+ FILE* file; // underlying stdio stream
+#endif
+ int uncompressed_block_size; // capacity in bytes of uncompressed_block
+ int compressed_block_size;
+ void* uncompressed_block; // staging buffer; allocated lazily by bgzf_write(), freed by bgzf_close()
+ void* compressed_block; // freed by bgzf_close()
+ int64_t block_address; // file offset of the current block (upper part of a virtual offset)
+ int block_length; // bgzf_seek() sets this to 0 to mark the current block as not loaded
+ int block_offset; // position inside the current block (low 16 bits of a virtual offset)
+ int cache_size; // set through bgzf_set_cache_size()
+ const char* error; // most recent error message
+ void *cache; // a pointer to a hash table
+} BGZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Open an existing file descriptor for reading or writing.
+ * Mode must be either "r" or "w".
+ * A subsequent bgzf_close will not close the file descriptor.
+ * Returns null on error.
+ */
+BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
+
+/*
+ * Open the specified file for reading or writing.
+ * Mode must be either "r" or "w".
+ * Returns null on error.
+ */
+BGZF* bgzf_open(const char* path, const char* __restrict mode);
+
+/*
+ * Close the BGZ file and free all associated resources.
+ * Does not close the underlying file descriptor if created with bgzf_fdopen.
+ * Returns zero on success, -1 on error.
+ */
+int bgzf_close(BGZF* fp);
+
+/*
+ * Read up to length bytes from the file storing into data.
+ * Returns the number of bytes actually read.
+ * Returns zero on end of file.
+ * Returns -1 on error.
+ */
+int bgzf_read(BGZF* fp, void* data, int length);
+
+/*
+ * Write length bytes from data to the file.
+ * Returns the number of bytes written.
+ * Returns -1 on error.
+ */
+int bgzf_write(BGZF* fp, const void* data, int length);
+
+/*
+ * Return a virtual file pointer to the current location in the file.
+ * No interpretation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ * Returns -1 on error.
+ */
+int64_t bgzf_tell(BGZF* fp);
+
+/*
+ * Set the file to read from the location specified by pos, which must
+ * be a value previously returned by bgzf_tell for this file (but not
+ * necessarily one returned by this file handle).
+ * The where argument must be SEEK_SET.
+ * Seeking on a file opened for write is not supported.
+ * Returns zero on success, -1 on error.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
+
+/*
+ * Set the cache size. Zero to disable. By default, caching is
+ * disabled. The recommended cache size for frequent random access is
+ * about 8M bytes.
+ */
+void bgzf_set_cache_size(BGZF *fp, int cache_size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include "bgzf.h"
+
+static const int WINDOW_SIZE = 64 * 1024; // size in bytes of the transfer buffer allocated by main()
+
+/* Print the command-line help text to standard output; always returns 0. */
+static int bgzip_main_usage()
+{
+    static const char *const help_lines[] = {
+        "\n",
+        "Usage: bgzip [options] [file] ...\n\n",
+        "Options: -c write on standard output, keep original files unchanged\n",
+        " -d decompress\n",
+        " -b INT decompress at virtual file pointer INT\n",
+        " -s INT decompress INT bytes in the uncompressed file\n",
+        " -h give this help\n",
+        "\n",
+    };
+    size_t i;
+    for (i = 0; i < sizeof(help_lines) / sizeof(help_lines[0]); ++i)
+        printf("%s", help_lines[i]);
+    return 0;
+}
+
+/*
+ * Open fn for writing and return its file descriptor.
+ *
+ * Unless is_forced is set, the file is first created exclusively; if it
+ * already exists the user is asked on standard input whether to
+ * overwrite it.  Any answer other than 'y'/'Y' (including end of input)
+ * terminates the program with status 1.  The program also exits with
+ * status 1 when the file cannot be opened for writing.
+ */
+static int write_open(const char *fn, int is_forced)
+{
+    int fd = -1;
+    if (!is_forced) {
+        fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644);
+        if (fd < 0 && errno == EEXIST) {
+            char c = 'n';
+            printf("bgzip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+            fflush(stdout);
+            // treat a failed read (EOF, closed stdin) as "no" instead of testing garbage
+            if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+                printf("bgzip: not overwritten\n");
+                exit(1);
+            }
+        }
+    }
+    if (fd < 0) {
+        // forced, confirmed, or the exclusive create failed for another reason
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) {
+            fprintf(stderr, "bgzip: %s: Fail to write\n", fn);
+            exit(1);
+        }
+    }
+    return fd;
+}
+
+static
+void
+fail(BGZF* fp)
+{
+ printf("Error: %s\n", fp->error);
+ exit(1);
+}
+
+/*
+ * bgzip entry point.
+ *
+ * Without -d the input (a named file, or stdin together with -c) is
+ * compressed into "<file>.gz" or, with -c, onto standard output.
+ * With -d the named file is decompressed, optionally starting at the
+ * virtual file pointer given by -b and limited to -s bytes.
+ *
+ * The input file is removed only after a fully successful run, and never
+ * when -c was given.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int main(int argc, char **argv)
+{
+    int c, compress, pstdout, is_forced;
+    BGZF *rz;
+    void *buffer;
+    long start, end, size;
+
+    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+    while ((c = getopt(argc, argv, "cdlhfb:s:")) >= 0) {
+        switch (c) {
+        case 'h': return bgzip_main_usage();
+        case 'd': compress = 0; break;
+        case 'c': pstdout = 1; break;
+        case 'b': start = atol(optarg); break;
+        case 's': size = atol(optarg); break;
+        case 'f': is_forced = 1; break;
+        }
+    }
+    if (size >= 0) end = start + size;
+    if (end >= 0 && end < start) {
+        fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+        return 1;
+    }
+    if (compress == 1) {
+        int f_src, f_dst = -1;
+        if (argc > optind) {
+            if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if (pstdout) {
+                f_dst = fileno(stdout);
+            } else {
+                // room for the source name, ".gz" and the terminating NUL
+                char *name = malloc(strlen(argv[optind]) + 5);
+                if (name == NULL) {
+                    fprintf(stderr, "bgzip: out of memory\n");
+                    close(f_src);
+                    return 1;
+                }
+                strcpy(name, argv[optind]);
+                strcat(name, ".gz");
+                f_dst = write_open(name, is_forced);
+                free(name);
+                if (f_dst < 0) {
+                    close(f_src);
+                    return 1;
+                }
+            }
+        } else if (pstdout) {
+            f_src = fileno(stdin);
+            f_dst = fileno(stdout);
+        } else return bgzip_main_usage();
+        rz = bgzf_fdopen(f_dst, "w");
+        buffer = malloc(WINDOW_SIZE);
+        if (rz == NULL || buffer == NULL) {
+            fprintf(stderr, "bgzip: cannot set up the output stream\n");
+            return 1;
+        }
+        while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) {
+            if (bgzf_write(rz, buffer, c) < 0) {
+                fail(rz);
+            }
+        }
+        if (c < 0) {
+            // the input was not fully consumed: keep the source file
+            fprintf(stderr, "bgzip: read failed\n");
+            return 1;
+        }
+        if (bgzf_close(rz) < 0) {
+            fail(rz);
+        }
+        // -c promises to keep the original file unchanged
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        free(buffer);
+        close(f_src);
+        return 0;
+    } else {
+        if (argc <= optind) return bgzip_main_usage();
+        int f_dst;
+        const char *src = argv[optind];
+        size_t src_len = strlen(src);
+        if (!pstdout) {
+            // the name must END in ".gz"; an earlier ".gz" inside the name is irrelevant
+            if (src_len < 3 || strcmp(src + src_len - 3, ".gz") != 0) {
+                printf("bgzip: %s: unknown suffix -- ignored\n", src);
+                return 1;
+            }
+        }
+        // open the input before creating/truncating the output file
+        rz = bgzf_open(src, "r");
+        if (rz == NULL) {
+            printf("Could not open file: %s\n", src);
+            return 1;
+        }
+        if (!pstdout) {
+            char *name = strdup(src);
+            if (name == NULL) {
+                fprintf(stderr, "bgzip: out of memory\n");
+                return 1;
+            }
+            name[src_len - 3] = '\0';
+            f_dst = write_open(name, is_forced);
+            free(name);
+        } else f_dst = fileno(stdout);
+        buffer = malloc(WINDOW_SIZE);
+        if (buffer == NULL) {
+            fprintf(stderr, "bgzip: out of memory\n");
+            return 1;
+        }
+        if (bgzf_seek(rz, start, SEEK_SET) < 0) {
+            fail(rz);
+        }
+        while (1) {
+            int done;
+            if (end < 0) c = bgzf_read(rz, buffer, WINDOW_SIZE);
+            else c = bgzf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+            if (c == 0) break;
+            if (c < 0) fail(rz);
+            start += c;
+            // write() may accept fewer bytes than requested; keep going until all are out
+            for (done = 0; done < c; ) {
+                int w = write(f_dst, (char*)buffer + done, c - done);
+                if (w <= 0) {
+                    fprintf(stderr, "bgzip: write failed\n");
+                    return 1;
+                }
+                done += w;
+            }
+            if (end >= 0 && start >= end) break;
+        }
+        free(buffer);
+        if (bgzf_close(rz) < 0) {
+            fail(rz);
+        }
+        if (!pstdout) {
+            if (close(f_dst) != 0) {
+                fprintf(stderr, "bgzip: write failed\n");
+                return 1;
+            }
+            unlink(argv[optind]);
+        }
+        return 0;
+    }
+}
+
--- /dev/null
+File ex1.fa contains two sequences cut from the human genome
+build36. They were extracted with the command:
+
+ samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550
+
+Sequence names were changed manually for simplicity. File ex1.sam.gz
+contains MAQ alignments extracted with:
+
+ (samtools view NA18507_maq.bam 2:2044001-2045500;
+ samtools view NA18507_maq.bam 20:68001-69500)
+
+and processed with `samtools fixmate' to make it self-consistent as a
+standalone alignment.
+
+To try samtools, you may run the following commands:
+
+ samtools faidx ex1.fa # index the reference FASTA
+ samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM
+ samtools index ex1.bam # index BAM
+ samtools tview ex1.bam ex1.fa # view alignment
+ samtools pileup -cf ex1.fa ex1.bam # pileup and consensus
+ samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz
+
--- /dev/null
+# Default target: build the library and samtools in the parent directory, then
+# derive every example output from ex1.fa and ex1.sam.gz and build calDepth.
+all:../libbam.a ../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz calDepth
+ @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo;
+
+# FASTA index of the reference.
+ex1.fa.fai:ex1.fa
+ ../samtools faidx ex1.fa
+# SAM -> BAM conversion (needs the FASTA index).
+ex1.bam:ex1.sam.gz ex1.fa.fai
+ ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam
+# BAM index.
+ex1.bam.bai:ex1.bam
+ ../samtools index ex1.bam
+# Gzipped output of 'samtools pileup -cf'.
+ex1.pileup.gz:ex1.bam ex1.fa
+ ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz
+# Output of 'samtools pileup -gf' (GLF file).
+ex1.glf:ex1.bam ex1.fa
+ ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf
+# Gzipped output of 'samtools glfview' on the GLF file.
+ex1.glfview.gz:ex1.glf
+ ../samtools glfview ex1.glf | gzip > ex1.glfview.gz
+
+# The two rules below delegate to the Makefile in the parent directory.
+../samtools:
+ (cd ..; make samtools)
+
+../libbam.a:
+ (cd ..; make libbam.a)
+
+# Example program linked against libbam.
+calDepth:../libbam.a calDepth.c
+ gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam
+
+# Remove everything generated by the rules above.
+clean:
+ rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM
\ No newline at end of file
--- /dev/null
+#include <stdio.h>
+#include "sam.h"
+
+typedef struct { // state handed to pileup_func() through its data pointer
+ int beg, end; // pileup_func() prints only positions with beg <= pos < end
+ samfile_t *in; // input opened by samopen(); its header supplies the target names
+} tmpstruct_t;
+
+// bam_fetch() callback: pushes each fetched alignment onto the pileup
+// buffer received through data; always returns 0
+static int fetch_func(const bam1_t *b, void *data)
+{
+    bam_plbuf_push(b, (bam_plbuf_t*)data);
+    return 0;
+}
+// pileup callback: prints "<target name> <1-based position> <n>" separated by
+// tabs for every position inside [beg, end) of the tmpstruct_t passed via data
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+    tmpstruct_t *region = (tmpstruct_t*)data;
+    if ((int)pos < region->beg || (int)pos >= region->end)
+        return 0;
+    printf("%s\t%d\t%d\n", region->in->header->target_name[tid], pos + 1, n);
+    return 0;
+}
+
+/*
+ * calDepth <in.bam> [region]
+ *
+ * Prints one line per position via pileup_func().  Without a region the
+ * whole file is piled up with sampileup(); with a region the BAM index is
+ * loaded and only alignments fetched for that region are piled up.
+ *
+ * Returns 0 on success and 1 on any error.  The BAM handle and the index
+ * are released on every path once they have been acquired.
+ */
+int main(int argc, char *argv[])
+{
+    tmpstruct_t tmp;
+    int status = 0;
+    if (argc == 1) {
+        fprintf(stderr, "Usage: calDepth <in.bam> [region]\n");
+        return 1;
+    }
+    tmp.beg = 0; tmp.end = 0x7fffffff;
+    tmp.in = samopen(argv[1], "rb", 0);
+    if (tmp.in == 0) {
+        fprintf(stderr, "Fail to open BAM file %s\n", argv[1]);
+        return 1;
+    }
+    if (argc == 2) { // if a region is not specified
+        sampileup(tmp.in, -1, pileup_func, &tmp);
+    } else {
+        int ref = -1; // stays negative if the parser does not set it
+        bam_index_t *idx = bam_index_load(argv[1]); // load BAM index
+        if (idx == 0) {
+            fprintf(stderr, "BAM indexing file is not available.\n");
+            status = 1;
+        } else {
+            bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region
+            if (ref < 0) {
+                fprintf(stderr, "Invalid region %s\n", argv[2]);
+                status = 1;
+            } else {
+                bam_plbuf_t *buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup
+                bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func);
+                bam_plbuf_push(0, buf); // finalize pileup
+                bam_plbuf_destroy(buf);
+            }
+            bam_index_destroy(idx);
+        }
+    }
+    samclose(tmp.in);
+    return status;
+}
--- /dev/null
+>seq1
+CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT
+GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC
+GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG
+TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC
+AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA
+CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC
+AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT
+CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA
+ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC
+AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC
+AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC
+ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC
+CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT
+TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT
+TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT
+GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT
+ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA
+ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG
+TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA
+CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG
+TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC
+TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC
+TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG
+TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG
+AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA
+TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC
+TCCCTCGTCTTCTTA
+>seq2
+TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG
+CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT
+TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT
+CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA
+AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT
+AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC
+ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG
+GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT
+CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT
+TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA
+AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA
+ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT
+TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA
+AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC
+TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA
+GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT
+AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA
+AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT
+AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT
+AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT
+ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT
+GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG
+CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA
+GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA
+AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA
+TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC
+CAGAAAAAAATATTTACAGTAACT
--- /dev/null
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "faidx.h"
+#include "khash.h"
+
+typedef struct { // index record of one sequence
+ uint64_t len:32, line_len:16, line_blen:16; // len: printable characters in the sequence; line_len: bytes per line including the newline; line_blen: printable characters per line
+ uint64_t offset; // file position just after the header line, as reported by razf_tell()
+} faidx1_t;
+KHASH_MAP_INIT_STR(s, faidx1_t) // hash table type "s": sequence name -> faidx1_t
+
+#ifndef _NO_RAZF
+#include "razf.h"
+#else
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#define RAZF FILE
+#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
+#define razf_open(fn, mode) fopen(fn, mode)
+#define razf_close(fp) fclose(fp)
+#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define razf_tell(fp) ftello(fp)
+#endif
+
+struct __faidx_t { // in-memory FASTA index
+ RAZF *rz; // sequence file opened by fai_load(); closed by fai_destroy() when non-null
+ int n, m; // n: number of entries in name[]; m: allocated capacity of name[]
+ char **name; // sequence names in insertion order; each one is strdup()ed
+ khash_t(s) *hash; // name -> faidx1_t record
+};
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Append one sequence record to the index: a private copy of name is
+ * stored in idx->name[] and used as the hash key for the record built
+ * from len, line_len, line_blen and offset.  The name array starts at 16
+ * slots and doubles whenever it is full.
+ */
+static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
+{
+    faidx1_t rec;
+    khint_t slot;
+    int absent;
+    char *key;
+
+    if (idx->n == idx->m) {
+        if (idx->m) idx->m <<= 1;
+        else idx->m = 16;
+        idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
+    }
+    key = strdup(name);
+    idx->name[idx->n] = key;
+
+    rec.len = len;
+    rec.line_len = line_len;
+    rec.line_blen = line_blen;
+    rec.offset = offset;
+
+    slot = kh_put(s, idx->hash, key, &absent);
+    kh_value(idx->hash, slot) = rec;
+    idx->n++;
+}
+
+/*
+ * Scan a FASTA stream byte by byte and build its index.
+ *
+ * For every '>' header the name (up to the first whitespace), the number
+ * of printable characters, the line geometry and the file offset of the
+ * first sequence line are recorded.
+ *
+ * The variable `state` tracks the line layout of the current sequence:
+ *   1  header just read, the first sequence line is expected next
+ *   0  lines seen so far all match line_len/line_blen (also the value
+ *      before any header)
+ *   2  one line with a different length has been seen
+ *   3  another non-empty line followed that odd line
+ *
+ * Returns the new index, or 0 after printing a message when the layout
+ * is rejected.  Input without any header yields an empty index.
+ */
+faidx_t *fai_build_core(RAZF *rz)
+{
+    char c, *name;
+    int l_name, m_name, ret;
+    int len, line_len, line_blen, state;
+    int l1, l2;
+    faidx_t *idx;
+    uint64_t offset;
+
+    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
+    idx->hash = kh_init(s);
+    name = 0; l_name = m_name = 0;
+    len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
+    while (razf_read(rz, &c, 1)) {
+        if (c == '\n') { // an empty line
+            if (state == 1) {
+                offset = razf_tell(rz);
+                continue;
+            } else if ((state == 0 && len < 0) || state == 2) continue;
+        }
+        if (c == '>') { // fasta header
+            if (len >= 0)
+                fai_insert_index(idx, name, len, line_len, line_blen, offset);
+            l_name = 0;
+            if (name == 0) { // guarantees a buffer even when the name is empty
+                m_name = 16;
+                name = (char*)malloc(m_name);
+            }
+            while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace((unsigned char)c)) {
+                if (m_name < l_name + 2) {
+                    m_name = l_name + 2;
+                    kroundup32(m_name);
+                    name = (char*)realloc(name, m_name);
+                }
+                name[l_name++] = c;
+            }
+            name[l_name] = '\0';
+            if (ret == 0) {
+                fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            // skip the rest of the header line (the description)
+            if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
+            state = 1; len = 0;
+            offset = razf_tell(rz);
+        } else {
+            if (state == 3) {
+                fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            if (state == 2) state = 3;
+            // l1 counts every byte of the line, l2 only the printable ones
+            l1 = l2 = 0;
+            do {
+                ++l1;
+                if (isgraph((unsigned char)c)) ++l2;
+            } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
+            if (len < 0 && l2) {
+                // printable data before any '>' header: there is no name to index it under
+                fprintf(stderr, "[fai_build_core] sequence data found before the first header.\n");
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            if (state == 3 && l2) {
+                fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            ++l1; len += l2;
+            if (l2 >= 0x10000) {
+                fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name);
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            if (state == 1) line_len = l1, line_blen = l2, state = 0;
+            else if (state == 0) {
+                if (l1 != line_len || l2 != line_blen) state = 2;
+            }
+        }
+    }
+    // len stays negative when no header was ever seen; nothing to record then
+    if (len >= 0)
+        fai_insert_index(idx, name, len, line_len, line_blen, offset);
+    free(name);
+    return idx;
+}
+
+/*
+ * Write the index to fp, one tab-separated line per sequence in insertion
+ * order: name, length, offset, printable characters per line, bytes per
+ * line.
+ */
+void fai_save(const faidx_t *fai, FILE *fp)
+{
+    int i = 0;
+    while (i < fai->n) {
+        char *seq_name = fai->name[i++];
+        khint_t it = kh_get(s, fai->hash, seq_name);
+        faidx1_t rec = kh_value(fai->hash, it);
+        fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", seq_name, (int)rec.len, (long long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
+    }
+}
+
+/*
+ * Load an index previously written by fai_save() from fp.
+ *
+ * Each line is "name<ws>len<ws>offset<ws>line_blen<ws>line_len".  Lines
+ * that do not carry all four numeric fields are skipped (with a warning
+ * unless the line is blank) instead of being inserted with
+ * indeterminate values.
+ *
+ * Returns a newly allocated index whose rz member is still null.
+ */
+faidx_t *fai_read(FILE *fp)
+{
+    faidx_t *fai;
+    char *buf, *p;
+    int len, line_len, line_blen;
+    long long offset;
+    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
+    fai->hash = kh_init(s);
+    buf = (char*)calloc(0x10000, 1);
+    while (fgets(buf, 0x10000, fp)) {
+        // the name is the leading run of printable characters
+        for (p = buf; *p && isgraph((unsigned char)*p); ++p);
+        if (*p == 0) { // no separator: stepping past the terminator would read stale bytes
+            if (p != buf) fprintf(stderr, "[fai_read] skip malformed line for '%s'.\n", buf);
+            continue;
+        }
+        *p = 0; ++p;
+        if (sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len) != 4) {
+            if (buf[0]) fprintf(stderr, "[fai_read] skip malformed line for '%s'.\n", buf);
+            continue;
+        }
+        fai_insert_index(fai, buf, len, line_len, line_blen, offset);
+    }
+    free(buf);
+    return fai;
+}
+
+/*
+ * Release an index: every stored name, the name array, the hash table,
+ * the sequence file handle (when one is attached) and the struct itself.
+ */
+void fai_destroy(faidx_t *fai)
+{
+    int k = 0;
+    while (k < fai->n) {
+        free(fai->name[k]);
+        ++k;
+    }
+    free(fai->name);
+    kh_destroy(s, fai->hash);
+    if (fai->rz != 0) razf_close(fai->rz);
+    free(fai);
+}
+
+/*
+ * Build the index of the FASTA file fn and write it to "<fn>.fai".
+ *
+ * Returns 0 on success, -1 when the FASTA file cannot be opened, when
+ * fai_build_core() rejects its contents, or when the index file cannot
+ * be created.
+ */
+int fai_build(const char *fn)
+{
+    char *str;
+    RAZF *rz;
+    FILE *fp;
+    faidx_t *fai;
+    str = (char*)calloc(strlen(fn) + 5, 1);
+    sprintf(str, "%s.fai", fn);
+    rz = razf_open(fn, "r");
+    if (rz == 0) {
+        fprintf(stderr, "[fai_build] fail to open the FASTA file.\n");
+        free(str);
+        return -1;
+    }
+    fai = fai_build_core(rz);
+    razf_close(rz);
+    if (fai == 0) { // fai_build_core() has already reported the reason
+        free(str);
+        return -1;
+    }
+    fp = fopen(str, "w");
+    if (fp == 0) {
+        fprintf(stderr, "[fai_build] fail to write FASTA index.\n");
+        fai_destroy(fai); free(str);
+        return -1;
+    }
+    fai_save(fai, fp);
+    fclose(fp);
+    free(str);
+    fai_destroy(fai);
+    return 0;
+}
+
+/*
+ * Load the index "<fn>.fai", building it first when it cannot be opened,
+ * and attach an open handle on the FASTA file fn.
+ *
+ * Returns the index, or 0 when the index file or the FASTA file cannot
+ * be opened.  Nothing is leaked on the failure paths.
+ */
+faidx_t *fai_load(const char *fn)
+{
+    char *str;
+    FILE *fp;
+    faidx_t *fai;
+    str = (char*)calloc(strlen(fn) + 5, 1);
+    sprintf(str, "%s.fai", fn);
+    fp = fopen(str, "r");
+    if (fp == 0) {
+        fprintf(stderr, "[fai_load] build FASTA index.\n");
+        fai_build(fn);
+        fp = fopen(str, "r");
+        if (fp == 0) {
+            fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
+            free(str);
+            return 0;
+        }
+    }
+    fai = fai_read(fp);
+    fclose(fp);
+    free(str);
+    fai->rz = razf_open(fn, "r");
+    if (fai->rz == 0) {
+        fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
+        fai_destroy(fai); // rz is null here, so only the index itself is released
+        return 0;
+    }
+    return fai;
+}
+
+/*
+ * Fetch the sequence of a region given as "name", "name:beg" or
+ * "name:beg-end"; commas and whitespace in the region string are ignored.
+ *
+ * On success a malloc()ed, NUL-terminated string is returned and *len is
+ * set to its length.  When the name is not in the index, 0 is returned
+ * and *len is set to 0.
+ */
+char *fai_fetch(const faidx_t *fai, const char *str, int *len)
+{
+    const char *src;
+    char *reg, *seq, *num, c;
+    int i, k, l;
+    int beg = -1, end = -1;
+    khiter_t iter;
+    faidx1_t val;
+    khash_t(s) *h = fai->hash;
+
+    /* copy the region, leaving out "," and whitespace */
+    l = strlen(str);
+    reg = (char*)malloc(l+1);
+    k = 0;
+    for (src = str; *src; ++src) {
+        if (*src == ',' || isspace(*src)) continue;
+        reg[k++] = *src;
+    }
+    reg[k] = 0;
+
+    /* terminate the name at the first ':'; i == k when there is none */
+    i = 0;
+    while (i != k && reg[i] != ':') ++i;
+    reg[i] = 0;
+    iter = kh_get(s, h, reg); /* look the name up */
+    if (iter == kh_end(h)) {
+        *len = 0;
+        free(reg);
+        return 0;
+    }
+    val = kh_value(h, iter);
+
+    if (i == k) { /* no coordinates: the whole sequence */
+        beg = 0; end = val.len;
+    } else {
+        num = reg + i + 1;
+        while (i != k && reg[i] != '-') ++i;
+        beg = atoi(num);
+        if (i < k) end = atoi(reg + i + 1);
+        else end = val.len;
+    }
+    /* the start becomes 0-based; both ends are clamped to the sequence length */
+    if (beg > 0) --beg;
+    if (beg >= val.len) beg = val.len;
+    if (end >= val.len) end = val.len;
+    if (beg > end) beg = end;
+    free(reg);
+
+    /* seek to the first requested base and collect printable characters */
+    l = 0;
+    seq = (char*)malloc(end - beg + 2);
+    razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
+    for (;;) {
+        if (razf_read(fai->rz, &c, 1) != 1) break;
+        if (l >= end - beg) break;
+        if (isgraph(c)) seq[l++] = c;
+    }
+    seq[l] = '\0';
+    *len = l;
+    return seq;
+}
+
+/*
+ * "faidx" command: with only a FASTA file, build its index; with
+ * additional region arguments, print each region in FASTA format,
+ * 60 bases per line.
+ *
+ * Returns 1 on a usage error or when the index cannot be loaded,
+ * 0 otherwise.
+ */
+int faidx_main(int argc, char *argv[])
+{
+    faidx_t *fai;
+    int arg;
+
+    if (argc == 1) {
+        fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
+        return 1;
+    }
+    if (argc == 2) {
+        fai_build(argv[1]);
+        return 0;
+    }
+    fai = fai_load(argv[1]);
+    if (fai == 0) return 1;
+    for (arg = 2; arg != argc; ++arg) {
+        int n_bases, col;
+        char *seq;
+        printf(">%s\n", argv[arg]);
+        seq = fai_fetch(fai, argv[arg], &n_bases);
+        for (col = 0; col < n_bases; ++col) {
+            putchar(seq[col]);
+            // break the line after every 60th base and after the last one
+            if (col % 60 == 59 || col == n_bases - 1) putchar('\n');
+        }
+        free(seq);
+    }
+    fai_destroy(fai);
+    return 0;
+}
+
+#ifdef FAIDX_MAIN
+int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
+#endif
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef FAIDX_H
+#define FAIDX_H
+
+/*!
+ @header
+
+ Index FASTA files and extract subsequence.
+
+ @copyright The Wellcome Trust Sanger Institute.
+ */
+
+struct __faidx_t;
+typedef struct __faidx_t faidx_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*!
+ @abstract Build index for a FASTA or razip compressed FASTA file.
+ @param fn FASTA file name
+ @return 0 on success; or -1 on failure
+ @discussion File "fn.fai" will be generated.
+ */
+ int fai_build(const char *fn);
+
+ /*!
+ @abstract Destroy a faidx_t struct.
+ @param fai Pointer to the struct to be destroyed
+ */
+ void fai_destroy(faidx_t *fai);
+
+ /*!
+ @abstract Load index from "fn.fai".
+ @param fn File name of the FASTA file
+ */
+ faidx_t *fai_load(const char *fn);
+
+ /*!
+ @abstract Fetch the sequence in a region.
+ @param fai Pointer to the faidx_t struct
+ @param reg Region in the format "chr2:20,000-30,000"
+ @param len Length of the region
+ @return Pointer to the sequence; null on failure
+
+ @discussion The returned sequence is allocated by malloc family
+ and should be destroyed by end users by calling free() on it.
+ */
+ char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <string.h>
+#include <stdlib.h>
+#include "glf.h"
+
+#ifdef _NO_BGZF
+// then alias bgzf_*() functions
+#endif
+
+static int glf3_is_BE = 0;
+
+/* Reverse the byte order of a 32-bit value. */
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+    return (v >> 24)
+         | ((v >> 8) & 0x0000FF00U)
+         | ((v << 8) & 0x00FF0000U)
+         | (v << 24);
+}
+
+/* Exchange the two bytes of a 16-bit value. */
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+    unsigned lo = v & 0xFFU;
+    unsigned hi = (v >> 8) & 0xFFU;
+    return (uint16_t)((lo << 8) | hi);
+}
+
+/* Return non-zero when the lowest-addressed byte of the long value 1 is zero. */
+static inline int bam_is_big_endian()
+{
+    long probe = 1;
+    char *first_byte = (char *)&probe;
+    return *first_byte == 0;
+}
+
+/*
+ * Allocate a zero-filled GLF header.  As a side effect the file-level
+ * byte-order flag glf3_is_BE is refreshed.
+ */
+glf3_header_t *glf3_header_init()
+{
+    glf3_header_t *h;
+    glf3_is_BE = bam_is_big_endian();
+    h = (glf3_header_t*)calloc(1, sizeof(glf3_header_t));
+    return h;
+}
+
+/*
+ * Read a GLF header from fp: the 4-byte magic "GLF\3", a 4-byte text
+ * length and that many bytes of text.
+ *
+ * Returns a newly allocated header, or 0 (after printing a message) when
+ * the magic does not match, the stream ends early, the text length is
+ * negative, or memory cannot be allocated.
+ */
+glf3_header_t *glf3_header_read(glfFile fp)
+{
+    glf3_header_t *h;
+    char magic[4];
+    h = glf3_header_init();
+    if (h == 0) return 0;
+    // a short read leaves magic partly unset, so test the count first
+    if (bgzf_read(fp, magic, 4) != 4 || strncmp(magic, "GLF\3", 4)) {
+        fprintf(stderr, "[glf3_header_read] invalid magic.\n");
+        glf3_header_destroy(h);
+        return 0;
+    }
+    if (bgzf_read(fp, &h->l_text, 4) != 4) {
+        fprintf(stderr, "[glf3_header_read] truncated header.\n");
+        glf3_header_destroy(h);
+        return 0;
+    }
+    if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text);
+    if (h->l_text < 0) {
+        fprintf(stderr, "[glf3_header_read] invalid text length: %d.\n", h->l_text);
+        glf3_header_destroy(h);
+        return 0;
+    }
+    if (h->l_text) {
+        // one extra zero byte keeps the text NUL-terminated
+        h->text = (uint8_t*)calloc(h->l_text + 1, 1);
+        if (h->text == 0 || bgzf_read(fp, h->text, h->l_text) != h->l_text) {
+            fprintf(stderr, "[glf3_header_read] truncated header.\n");
+            glf3_header_destroy(h);
+            return 0;
+        }
+    }
+    return h;
+}
+
+/*
+ * Write a GLF header to fp: the magic "GLF\3", the text length (byte
+ * swapped when glf3_is_BE is set) and, when non-empty, the text itself.
+ */
+void glf3_header_write(glfFile fp, const glf3_header_t *h)
+{
+    int32_t l_text_out = h->l_text;
+    if (glf3_is_BE) l_text_out = bam_swap_endian_4(h->l_text);
+    bgzf_write(fp, "GLF\3", 4);
+    bgzf_write(fp, &l_text_out, 4);
+    if (h->l_text != 0)
+        bgzf_write(fp, h->text, h->l_text);
+}
+
+/* Free a GLF header together with its text buffer. */
+void glf3_header_destroy(glf3_header_t *h)
+{
+    uint8_t *text = h->text;
+    free(text);
+    free(h);
+}
+
+/*
+ * Read one reference record header from fp: a 4-byte name length, the
+ * name bytes and a 4-byte value stored into *len.
+ *
+ * Returns the malloc-family allocated name.  Returns 0 with *len == 0
+ * when the name length cannot be read or is negative, and 0 with
+ * *len == -1 when the record is truncated.
+ */
+char *glf3_ref_read(glfFile fp, int *len)
+{
+    int32_t name_len, got;
+    char *name;
+
+    *len = 0;
+    if (bgzf_read(fp, &name_len, 4) != 4) return 0;
+    if (glf3_is_BE) name_len = bam_swap_endian_4(name_len);
+    if (name_len < 0) {
+        fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", name_len);
+        return 0;
+    }
+    name = (char*)calloc(name_len + 1, 1); // one spare zero byte after the name
+    got = bgzf_read(fp, name, name_len);
+    got += bgzf_read(fp, len, 4);
+    if (got == name_len + 4) {
+        if (glf3_is_BE) *len = bam_swap_endian_4(*len);
+        return name;
+    }
+    // fewer bytes than expected: truncated
+    free(name);
+    *len = -1;
+    return 0;
+}
+
+/*
+ * Write one reference record header to fp: the name length including the
+ * terminating NUL, the name with its NUL, and len.  Both integers are
+ * byte swapped when glf3_is_BE is set.
+ */
+void glf3_ref_write(glfFile fp, const char *str, int len)
+{
+    int32_t n_bytes = strlen(str) + 1;
+    int32_t n_bytes_out = n_bytes;
+    if (glf3_is_BE) {
+        n_bytes_out = bam_swap_endian_4(n_bytes);
+        len = bam_swap_endian_4(len);
+    }
+    bgzf_write(fp, &n_bytes_out, 4);
+    bgzf_write(fp, str, n_bytes);
+    bgzf_write(fp, &len, 4);
+}
+
+/*
+ * Print one GLF record as a tab-separated text line on standard output.
+ * End-of-reference records print nothing.  Substitution records list the
+ * ten likelihood values; other records list three likelihoods, the two
+ * indel lengths and the two indel sequences ("*" for a zero length).
+ */
+void glf3_view1(const char *ref_name, const glf3_t *g3, int pos)
+{
+    char ref_char;
+
+    if (g3->rtype == GLF3_RTYPE_END) return;
+    if (g3->rtype == GLF3_RTYPE_INDEL) ref_char = '*';
+    else ref_char = "XACMGRSVTWYHKDBN"[g3->ref_base];
+    printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, ref_char,
+           g3->depth, g3->rms_mapQ, g3->min_lk);
+    if (g3->rtype != GLF3_RTYPE_SUB) {
+        const char *seq0 = g3->indel_len[0]? g3->indel_seq[0] : "*";
+        const char *seq1 = g3->indel_len[1]? g3->indel_seq[1] : "*";
+        printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2],
+               g3->indel_len[0], g3->indel_len[1], seq0, seq1);
+    } else {
+        int j;
+        for (j = 0; j < 10; ++j) printf("\t%d", g3->lk[j]);
+    }
+    printf("\n");
+}
+
+/*
+ * Write one GLF record to fp and return the sum of the bgzf_write()
+ * results.
+ *
+ * Layout: one byte holding rtype (high nibble) and ref_base (low nibble);
+ * then, unless the record is an end marker, the offset, a word packing
+ * min_lk and depth, and rms_mapQ; then ten likelihoods for a substitution
+ * record, or three likelihoods, the two indel lengths and the non-empty
+ * indel sequences otherwise.
+ */
+int glf3_write1(glfFile fp, const glf3_t *g3)
+{
+    int i, total;
+    uint8_t head;
+    uint32_t words[2];
+    int16_t lens[2];
+
+    head = g3->rtype<<4 | g3->ref_base;
+    total = bgzf_write(fp, &head, 1);
+    if (g3->rtype == GLF3_RTYPE_END) return total;
+
+    words[0] = g3->offset;
+    words[1] = g3->min_lk<<24 | g3->depth;
+    if (glf3_is_BE) {
+        for (i = 0; i < 2; ++i) words[i] = bam_swap_endian_4(words[i]);
+    }
+    total += bgzf_write(fp, words, 8);
+    total += bgzf_write(fp, &g3->rms_mapQ, 1);
+
+    if (g3->rtype == GLF3_RTYPE_SUB) {
+        total += bgzf_write(fp, g3->lk, 10);
+        return total;
+    }
+
+    total += bgzf_write(fp, g3->lk, 3);
+    for (i = 0; i < 2; ++i)
+        lens[i] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[i]) : g3->indel_len[i];
+    total += bgzf_write(fp, lens, 4);
+    for (i = 0; i < 2; ++i) {
+        // the sign of an indel length is kept in the file; only |len| bytes of sequence follow
+        if (g3->indel_len[i])
+            total += bgzf_write(fp, g3->indel_seq[i], abs(g3->indel_len[i]));
+    }
+    return total;
+}
+
+#ifndef kv_roundup32
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Read one GLF record from fp into g3 and return the sum of the
+ * bgzf_read() results; 0 means nothing could be read.
+ *
+ * For indel records the two sequence buffers of g3 are grown as needed
+ * (g3->max_len tracks their capacity) and NUL-terminated.
+ */
+int glf3_read1(glfFile fp, glf3_t *g3)
+{
+    uint8_t head;
+    uint32_t words[2];
+    int16_t alen[2], need;
+    int total;
+
+    total = bgzf_read(fp, &head, 1);
+    if (total == 0) return 0;
+    g3->rtype = head>>4;
+    g3->ref_base = head & 0xf;
+    if (g3->rtype == GLF3_RTYPE_END) return total;
+
+    total += bgzf_read(fp, words, 8);
+    if (glf3_is_BE) {
+        words[0] = bam_swap_endian_4(words[0]);
+        words[1] = bam_swap_endian_4(words[1]);
+    }
+    g3->offset = words[0];
+    g3->depth = words[1] & 0x00FFFFFFU; // low 24 bits
+    g3->min_lk = words[1]>>24;          // high 8 bits
+    total += bgzf_read(fp, &g3->rms_mapQ, 1);
+
+    if (g3->rtype == GLF3_RTYPE_SUB)
+        return total + bgzf_read(fp, g3->lk, 10);
+
+    total += bgzf_read(fp, g3->lk, 3);
+    total += bgzf_read(fp, alen, 4);
+    if (glf3_is_BE) {
+        alen[0] = bam_swap_endian_2(alen[0]);
+        alen[1] = bam_swap_endian_2(alen[1]);
+    }
+    g3->indel_len[0] = alen[0];
+    g3->indel_len[1] = alen[1];
+    // from here on only the magnitudes matter
+    alen[0] = abs(alen[0]);
+    alen[1] = abs(alen[1]);
+    need = (alen[0] > alen[1]? alen[0] : alen[1]) + 1;
+    if (g3->max_len < need) {
+        g3->max_len = need;
+        kv_roundup32(g3->max_len);
+        g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len);
+        g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len);
+    }
+    total += bgzf_read(fp, g3->indel_seq[0], alen[0]);
+    total += bgzf_read(fp, g3->indel_seq[1], alen[1]);
+    g3->indel_seq[0][alen[0]] = 0;
+    g3->indel_seq[1][alen[1]] = 0;
+    return total;
+}
+
+/*
+ * Dump a whole GLF stream as text: read the header, then for every
+ * reference print each record up to the end marker with glf3_view1().
+ * Record offsets are accumulated into the position passed on.
+ *
+ * Returns without printing anything when the header cannot be read.
+ */
+void glf3_view(glfFile fp)
+{
+    glf3_header_t *h;
+    char *name;
+    glf3_t *g3;
+    int len;
+    h = glf3_header_read(fp);
+    if (h == 0) return; // glf3_header_read() has reported the problem; h must not be destroyed
+    g3 = glf3_init1();
+    while ((name = glf3_ref_read(fp, &len)) != 0) {
+        int pos = 0;
+        while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) {
+            pos += g3->offset;
+            glf3_view1(name, g3, pos);
+        }
+        free(name);
+    }
+    glf3_header_destroy(h);
+    glf3_destroy1(g3);
+}
+
+/*
+ * "glfview" command: print the GLF file named by the first argument as
+ * text; "-" selects standard input.  Returns 1 on a usage error or when
+ * the input cannot be opened, 0 otherwise.
+ */
+int glf3_view_main(int argc, char *argv[])
+{
+    glfFile fp;
+
+    if (argc == 1) {
+        fprintf(stderr, "Usage: glfview <in.glf>\n");
+        return 1;
+    }
+    if (strcmp(argv[1], "-") == 0) fp = bgzf_fdopen(fileno(stdin), "r");
+    else fp = bgzf_open(argv[1], "r");
+    if (fp == 0) {
+        fprintf(stderr, "Fail to open file '%s'\n", argv[1]);
+        return 1;
+    }
+    glf3_view(fp);
+    bgzf_close(fp);
+    return 0;
+}
+
+#ifdef GLFVIEW_MAIN
+int main(int argc, char *argv[])
+{
+ return glf3_view_main(argc, argv);
+}
+#endif
--- /dev/null
+#ifndef GLF_H_
+#define GLF_H_
+
+typedef struct {
+ unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */
+ unsigned char max_mapQ; /** maximum mapping quality */
+ unsigned char lk[10]; /** log likelihood ratio, capped at 255 */
+ unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
+} glf1_t;
+
+#include <stdint.h>
+#include "bgzf.h"
+typedef BGZF *glfFile;
+
+#define GLF3_RTYPE_END 0
+#define GLF3_RTYPE_SUB 1
+#define GLF3_RTYPE_INDEL 2
+
+typedef struct {
+ uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */
+ uint8_t rms_mapQ; /** RMS mapping quality */
+ uint8_t lk[10]; /** log likelihood ratio, capped at 255 */
+ uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
+ int32_t offset; /** the first base in a chromosome has offset zero. */
+ // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10])
+ int16_t indel_len[2];
+ int32_t max_len; // maximum indel len; will be modified by glf3_read1()
+ char *indel_seq[2];
+} glf3_t;
+
+typedef struct { // GLF file header
+ int32_t l_text; // number of text bytes stored in the file after the magic
+ uint8_t *text; // header text; released by glf3_header_destroy()
+} glf3_header_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* glf3_init1() returns a zero-filled glf3_t allocated with calloc() (NULL on
+   allocation failure). glf3_destroy1(g3) frees both indel_seq buffers and
+   then the record itself; g3 must not be NULL because it is dereferenced.
+   NOTE(review): these macros use calloc()/free(); this header does not
+   include <stdlib.h> itself, so it presumably relies on bgzf.h or the
+   including file to provide it. */
+#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t)))
+#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0)
+
+ /* Header handling: allocate, read from / write to a GLF stream, release. */
+ glf3_header_t *glf3_header_init();
+ glf3_header_t *glf3_header_read(glfFile fp);
+ void glf3_header_write(glfFile fp, const glf3_header_t *h);
+ void glf3_header_destroy(glf3_header_t *h);
+ /* Read the next reference entry. The viewer treats a 0 return as "no more
+    references" and releases the returned name with free(); *len receives
+    the reference length (presumably in bases -- confirm in glf.c). */
+ char *glf3_ref_read(glfFile fp, int *len);
+ void glf3_ref_write(glfFile fp, const char *name, int len);
+ /* Write / read a single record. The viewer keeps calling glf3_read1()
+    while it returns non-zero and the record type is not GLF3_RTYPE_END. */
+ int glf3_write1(glfFile fp, const glf3_t *g3);
+ int glf3_read1(glfFile fp, glf3_t *g3);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ if (!ret) kh_del(32, h, k);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+
+ @copyright Heng Li
+ */
+
+#define AC_VERSION_KHASH_H "0.2.2"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef uint32_t khint_t;
+typedef khint_t khiter_t;
+
+/* Table of allowed bucket counts, roughly doubling at each step.
+   kh_resize_<name>() scans it from the top and picks the smallest entry that
+   is strictly greater than the requested size. Entry 0 is a sentinel that
+   stops that scan. */
+#define __ac_HASH_PRIME_SIZE 32
+static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
+{
+ 0ul, 3ul, 11ul, 23ul, 53ul,
+ 97ul, 193ul, 389ul, 769ul, 1543ul,
+ 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
+ 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
+ 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
+ 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
+ 3221225473ul, 4294967291ul
+};
+
+/* Per-bucket state is two bits, sixteen buckets per uint32_t word:
+   word index is i>>4, bit offset is (i&0xf)<<1. Within the pair, value 2 is
+   the "empty" bit and value 1 is the "deleted" bit; a live bucket has both
+   clear. A flag array filled with 0xaa bytes therefore marks every bucket
+   empty and not deleted. The arguments are not parenthesised, so pass simple
+   expressions only. */
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+/* Load factor: upper_bound is set to n_buckets * __ac_HASH_UPPER on resize. */
+static const double __ac_HASH_UPPER = 0.77;
+
+/*
+ KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+ expands to the table type kh_<name>_t plus the static inline functions
+ kh_init_<name>(), kh_destroy_<name>(), kh_clear_<name>(), kh_get_<name>(),
+ kh_resize_<name>(), kh_put_<name>() and kh_del_<name>().
+
+ Layout and probing:
+ * Open addressing with double hashing: the first probe is
+   hash % n_buckets and the step is 1 + hash % (n_buckets - 1).
+ * flags[] holds two bits per bucket (see the __ac_is*() macros).
+ * `size' counts live keys. `n_occupied' counts buckets that are not
+   empty (live or deleted); it is only reset by kh_resize/kh_clear.
+   kh_put resizes once n_occupied reaches upper_bound.
+ * vals[] is only (re)allocated when kh_is_map is non-zero.
+
+ Function notes:
+ * kh_get returns the bucket index of the key, or n_buckets (kh_end) when
+   the key is absent; on a table with no buckets it returns 0, which then
+   also equals kh_end.
+ * kh_resize rehashes in place: each live element is marked deleted in the
+   old flags and re-inserted using the new flags, evicting and carrying
+   along any not-yet-moved element found in the target bucket.
+ * kh_put stores only the key; *ret is 1 for a never-used bucket, 2 for a
+   reused deleted bucket and 0 when the key was already present. The
+   caller assigns the value through kh_value().
+ * kh_del only marks the bucket deleted; no memory is released.
+
+ NOTE(review): malloc()/realloc() results are not checked, and a requested
+ size >= the last prime would index one past __ac_prime_list in kh_resize.
+*/
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ uint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ static inline kh_##name##_t *kh_init_##name() { \
+ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
+ } \
+ static inline void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ free(h->keys); free(h->flags); \
+ free(h->vals); \
+ free(h); \
+ } \
+ } \
+ static inline void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t inc, k, i, last; \
+ k = __hash_func(key); i = k % h->n_buckets; \
+ inc = 1 + k % (h->n_buckets - 1); last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+ else i += inc; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { \
+ uint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ khint_t t = __ac_HASH_PRIME_SIZE - 1; \
+ while (__ac_prime_list[t] > new_n_buckets) --t; \
+ new_n_buckets = __ac_prime_list[t+1]; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
+ else { \
+ new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
+ memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
+ if (h->n_buckets < new_n_buckets) { \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) \
+ h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ } \
+ } \
+ if (j) { \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { \
+ khint_t inc, k, i; \
+ k = __hash_func(key); \
+ i = k % new_n_buckets; \
+ inc = 1 + k % (new_n_buckets - 1); \
+ while (!__ac_isempty(new_flags, i)) { \
+ if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
+ else i += inc; \
+ } \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); \
+ } else { \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) \
+ h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ free(h->flags); \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ } \
+ static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { \
+ if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
+ else kh_resize_##name(h, h->n_buckets + 1); \
+ } \
+ { \
+ khint_t inc, k, i, site, last; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
+ if (__ac_isempty(h->flags, i)) x = i; \
+ else { \
+ inc = 1 + k % (h->n_buckets - 1); last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+ else i += inc; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; \
+ return x; \
+ } \
+ static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [uint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (uint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [uint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t hash = *s; /* seed with the first character */
+	if (hash == 0) return hash; /* empty string hashes to 0 */
+	while (*++s) hash = hash * 31 + *s; /* X31: h = 31*h + c, modulo 2^32 */
+	return hash;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other necessary macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash set containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash set containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
--- /dev/null
+#include <time.h>
+#include <stdio.h>
+#include <netdb.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include "knetfile.h"
+
+/* Block for up to five seconds until `fd' becomes readable (is_read != 0)
+   or writable (is_read == 0). Returns the result of select(): >0 when the
+   descriptor is ready, 0 on timeout, -1 on error (after perror()). */
+static int socket_wait(int fd, int is_read)
+{
+	fd_set ready;
+	struct timeval timeout;
+	int rc;
+	timeout.tv_sec = 5; // 5 seconds time out
+	timeout.tv_usec = 0;
+	FD_ZERO(&ready);
+	FD_SET(fd, &ready);
+	if (is_read) rc = select(fd+1, &ready, 0, 0, &timeout);
+	else rc = select(fd+1, 0, &ready, 0, &timeout);
+	if (rc == -1) perror("select");
+	return rc;
+}
+
+/* Read one complete reply from the FTP control connection into
+   ftp->response and return its numeric code.
+   Lines are read byte by byte; continuation lines (those not starting with
+   three digits, or with '-' as the fourth character) are discarded.
+   Returns 0 if the socket does not become readable in time, -1 on a short
+   reply or allocation failure, otherwise the reply code. On success the
+   trailing "\r\n" is stripped from ftp->response. */
+static int kftp_get_response(knetFile *ftp)
+{
+	unsigned char c;
+	int n = 0;
+	char *p;
+	if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
+	// stop on EOF (0) *and* on error (-1); a bare truth test would spin on -1
+	while (read(ftp->ctrl_fd, &c, 1) == 1) { // FIXME: this is *VERY BAD* for unbuffered I/O
+		if (n >= ftp->max_response) {
+			int new_max = ftp->max_response? ftp->max_response<<1 : 256;
+			char *tmp = realloc(ftp->response, new_max);
+			if (tmp == 0) return -1; // keep the old buffer intact on failure
+			ftp->response = tmp;
+			ftp->max_response = new_max;
+		}
+		ftp->response[n++] = c;
+		if (c == '\n') {
+			if (n >= 4 && isdigit((unsigned char)ftp->response[0]) && isdigit((unsigned char)ftp->response[1])
+				&& isdigit((unsigned char)ftp->response[2]) && ftp->response[3] != '-') break;
+			n = 0; // not the final line of the reply: start over
+		}
+	}
+	if (n < 2) return -1;
+	ftp->response[n-2] = 0; // drop the trailing "\r\n"
+	return strtol(ftp->response, &p, 0);
+}
+
+/* Send `cmd' on the FTP control connection. When is_get is non-zero the
+   server reply is read and its code returned; otherwise 0 is returned.
+   Returns -1 if the socket is not writable in time or the write fails. */
+static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
+{
+	size_t len = strlen(cmd), sent = 0;
+	if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+	while (sent < len) { // write() may accept fewer bytes than requested
+		ssize_t w = write(ftp->ctrl_fd, cmd + sent, len - sent);
+		if (w <= 0) return -1;
+		sent += (size_t)w;
+	}
+	return is_get? kftp_get_response(ftp) : 0;
+}
+
+/* Send PASV and parse the "(h1,h2,h3,h4,p1,p2)" tuple of the reply into
+   ftp->pasv_ip[] and ftp->pasv_port. Returns 0 on success, -1 if the
+   command failed or the reply could not be parsed (in which case the
+   previous pasv_* values are left untouched). */
+static int kftp_pasv_prep(knetFile *ftp)
+{
+	char *p;
+	int v[6];
+	// <= 0 means time-out or I/O error; the response buffer would be stale or absent
+	if (kftp_send_cmd(ftp, "PASV\r\n", 1) <= 0 || ftp->response == 0) return -1;
+	for (p = ftp->response; *p && *p != '('; ++p);
+	if (*p != '(') return -1;
+	++p;
+	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) return -1;
+	memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
+	// port = p1*256 + p2; mask both bytes so malformed input cannot overflow or shift a negative
+	ftp->pasv_port = ((v[4]&0xff)<<8) | (v[5]&0xff);
+	return 0;
+}
+
+/* Open the FTP data connection to the address obtained by
+   kftp_pasv_prep() and store the socket in ftp->fd.
+   Returns 0 on success. On failure returns -1; any socket created here is
+   closed and ftp->fd is reset to -1 once socket() has been attempted. */
+static int kftp_pasv_connect(knetFile *ftp)
+{
+	struct addrinfo hints, *res = 0;
+	struct linger lng = { 0, 0 };
+	int on = 1, err;
+	char host[80], port[16];
+	const char *failed;
+
+	if (ftp->pasv_port == 0) {
+		fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
+		return -1;
+	}
+	memset(&hints, 0, sizeof(struct addrinfo));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	snprintf(host, sizeof(host), "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
+	snprintf(port, sizeof(port), "%d", ftp->pasv_port);
+	if ((err = getaddrinfo(host, port, &hints, &res)) != 0) {
+		// getaddrinfo() reports through its return value, not errno
+		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(err));
+		return -1;
+	}
+	failed = "socket";
+	if ((ftp->fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) goto fail;
+	failed = "setsockopt";
+	if (setsockopt(ftp->fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) goto fail;
+	if (setsockopt(ftp->fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) goto fail;
+	failed = "connect";
+	if (connect(ftp->fd, res->ai_addr, res->ai_addrlen) != 0) goto fail;
+	freeaddrinfo(res);
+	return 0;
+
+fail:
+	perror(failed); // report first, before cleanup can clobber errno
+	freeaddrinfo(res);
+	if (ftp->fd >= 0) {
+		close(ftp->fd);
+		ftp->fd = -1;
+	}
+	return -1;
+}
+
+/* Open the FTP control connection to ftp->host on port 21, read the
+   greeting, log in anonymously and switch to binary (TYPE I) mode.
+   Returns 0 once the socket is connected (the login replies are not
+   inspected) and -1 if the connection cannot be established; in that case
+   a socket created here is closed and ftp->ctrl_fd is reset to -1. */
+int kftp_connect(knetFile *ftp)
+{
+	struct addrinfo hints, *res = 0;
+	int on = 1, err;
+	const char *failed;
+
+	// open socket
+	memset(&hints, 0, sizeof(struct addrinfo));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	if ((err = getaddrinfo(ftp->host, "21", &hints, &res)) != 0) {
+		// getaddrinfo() reports through its return value, not errno
+		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(err));
+		return -1;
+	}
+	failed = "socket";
+	if ((ftp->ctrl_fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) goto fail;
+	failed = "setsockopt";
+	if (setsockopt(ftp->ctrl_fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) goto fail;
+	failed = "connect";
+	if (connect(ftp->ctrl_fd, res->ai_addr, res->ai_addrlen) != 0) goto fail;
+	freeaddrinfo(res);
+	kftp_get_response(ftp); // server greeting
+
+	// login
+	kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
+	kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
+	kftp_send_cmd(ftp, "TYPE I\r\n", 1);
+	return 0;
+
+fail:
+	perror(failed); // report first, before cleanup can clobber errno
+	freeaddrinfo(res); // previously leaked on every error path
+	if (ftp->ctrl_fd >= 0) {
+		close(ftp->ctrl_fd);
+		ftp->ctrl_fd = -1;
+	}
+	return -1;
+}
+
+/* Drop both the control and the data connection and log in again.
+   Both descriptors are reset to -1 after closing so that later code cannot
+   close a stale descriptor number a second time (it may have been reused
+   by the new control socket). Returns the result of kftp_connect(). */
+int kftp_reconnect(knetFile *ftp)
+{
+	if (ftp->ctrl_fd >= 0) {
+		close(ftp->ctrl_fd);
+		ftp->ctrl_fd = -1;
+	}
+	if (ftp->fd >= 0) {
+		close(ftp->fd);
+		ftp->fd = -1;
+	}
+	return kftp_connect(ftp);
+}
+
+// Parse "ftp://host/path" and return a new knetFile with ->type, ->host and
+// ->retr ("RETR /path\r\n") initialized; no connection is made. Both
+// descriptors start at -1 so that knet_close() never closes descriptor 0
+// by accident. A 'c' in `mode' sets ->no_reconnect.
+// Returns 0 if `fn' is not an FTP URL with a path, or on allocation failure.
+knetFile *kftp_parse_url(const char *fn, const char *mode)
+{
+	knetFile *fp;
+	const char *p;
+	int l;
+	if (strncmp(fn, "ftp://", 6) != 0) return 0;
+	for (p = fn + 6; *p && *p != '/'; ++p);
+	if (*p != '/') return 0;
+	l = p - fn - 6; // length of the host part
+	fp = calloc(1, sizeof(knetFile));
+	if (fp == 0) return 0;
+	fp->type = KNF_TYPE_FTP;
+	fp->fd = -1;
+	fp->ctrl_fd = -1; // calloc() left this at 0, which is a valid descriptor (stdin)
+	fp->host = calloc(l + 1, 1);
+	fp->retr = calloc(strlen(p) + 8, 1); // "RETR " + path + "\r\n" + NUL
+	if (fp->host == 0 || fp->retr == 0) {
+		free(fp->host); free(fp->retr);
+		free(fp);
+		return 0;
+	}
+	if (strchr(mode, 'c')) fp->no_reconnect = 1;
+	memcpy(fp->host, fn + 6, l); // buffer is zero-filled, so the result is NUL-terminated
+	sprintf(fp->retr, "RETR %s\r\n", p);
+	fp->seek_offset = -1;
+	return fp;
+}
+// Open a data connection positioned at fp->offset: close any previous data
+// socket, enter passive mode, optionally send REST <offset>, send RETR and
+// connect to the passive port. Success requires reply code 150.
+// Returns 0 on success (and sets ->is_ready); on failure returns -1 with
+// ->fd == -1.
+int kftp_connect_file(knetFile *fp)
+{
+	int ret, conn;
+	if (fp->fd >= 0) {
+		close(fp->fd);
+		fp->fd = -1; // do not leave a stale descriptor behind if we fail below
+		if (fp->no_reconnect) kftp_get_response(fp);
+	}
+	if (kftp_pasv_prep(fp) != 0) { // without a fresh address we would connect to a stale port
+		fprintf(stderr, "[kftp_connect_file] fail to enter passive mode\n");
+		return -1;
+	}
+	if (fp->offset) {
+		char tmp[32];
+		snprintf(tmp, sizeof(tmp), "REST %lld\r\n", (long long)fp->offset);
+		kftp_send_cmd(fp, tmp, 1);
+	}
+	kftp_send_cmd(fp, fp->retr, 0);
+	conn = kftp_pasv_connect(fp);
+	ret = kftp_get_response(fp); // always consume the reply to RETR
+	if (conn != 0 || ret != 150) {
+		fprintf(stderr, "[kftp_connect_file] %s\n", fp->response? fp->response : "");
+		if (fp->fd >= 0) close(fp->fd);
+		fp->fd = -1;
+		return -1;
+	}
+	fp->is_ready = 1;
+	return 0;
+}
+
+/* Open `fn' for reading. Names starting with "ftp://" are opened over FTP
+   (control connection, login and data connection); anything else is opened
+   as a local file with open(O_RDONLY). `mode' must start with 'r'.
+   Returns a handle to be released with knet_close(), or 0 on failure. */
+knetFile *knet_open(const char *fn, const char *mode)
+{
+	knetFile *fp = 0;
+	if (mode[0] != 'r') {
+		fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
+		return 0;
+	}
+	if (strstr(fn, "ftp://") == fn) {
+		fp = kftp_parse_url(fn, mode);
+		if (fp == 0) return 0;
+		// make sure knet_close() below cannot close descriptor 0 if kftp_connect() fails early
+		fp->ctrl_fd = -1;
+		if (kftp_connect(fp) == -1) {
+			knet_close(fp);
+			return 0;
+		}
+		kftp_connect_file(fp);
+		if (fp->fd < 0) {
+			knet_close(fp);
+			return 0;
+		}
+	} else {
+		int fd = open(fn, O_RDONLY);
+		if (fd == -1) {
+			perror("open");
+			return 0;
+		}
+		fp = (knetFile*)calloc(1, sizeof(knetFile));
+		if (fp == 0) {
+			close(fd);
+			return 0;
+		}
+		fp->type = KNF_TYPE_LOCAL;
+		fp->fd = fd;
+		fp->ctrl_fd = -1; // no control connection; 0 would make knet_close() close stdin
+	}
+	return fp;
+}
+
+/* Wrap an already open descriptor as a local knetFile. `mode' is ignored.
+   Returns 0 on allocation failure (the descriptor is left open). */
+knetFile *knet_dopen(int fd, const char *mode)
+{
+	knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
+	if (fp == 0) return 0;
+	fp->type = KNF_TYPE_LOCAL;
+	fp->fd = fd;
+	fp->ctrl_fd = -1; // no control connection; 0 would make knet_close() close stdin
+	return fp;
+}
+
+/* Read up to `len' bytes into `buf' and advance fp->offset by the number
+   of bytes actually read, which is also the return value. Reading stops
+   early at end of file, on a read error, or (for FTP) when the socket does
+   not become readable in time. For FTP handles with ->is_ready == 0 the
+   data connection is first re-established at fp->offset. */
+off_t knet_read(knetFile *fp, void *buf, off_t len)
+{
+	char *dst = buf; // arithmetic on void* is a GNU extension
+	off_t l = 0, rest = len;
+	int is_local = (fp->type == KNF_TYPE_LOCAL);
+	if (fp->fd < 0) return 0;
+	if (!is_local && fp->is_ready == 0) {
+		if (!fp->no_reconnect) kftp_reconnect(fp);
+		kftp_connect_file(fp);
+		fp->is_ready = 1;
+		if (fp->fd < 0) return 0; // reconnect failed; FD_SET(-1) would be undefined
+	}
+	while (rest > 0) {
+		ssize_t curr;
+		if (!is_local && socket_wait(fp->fd, 1) <= 0) break; // socket is not ready for reading
+		curr = read(fp->fd, dst + l, rest);
+		if (curr <= 0) break; // 0: end of file (or closed connection); -1: error
+		l += curr; rest -= curr;
+	}
+	fp->offset += l;
+	return l;
+}
+
+/* Reposition the stream. Local files are moved with lseek() and
+   fp->offset is set to the resulting absolute position. For FTP only
+   SEEK_SET is supported: the offset is recorded and ->is_ready cleared so
+   that the next knet_read() reconnects at that position.
+   Returns 0 on success and -1 on error. */
+int knet_seek(knetFile *fp, off_t off, int whence)
+{
+	if (fp->type == KNF_TYPE_LOCAL) {
+		off_t pos = lseek(fp->fd, off, whence);
+		if (pos == (off_t)-1) {
+			perror("lseek");
+			return -1;
+		}
+		fp->offset = pos; // `off' is only the absolute position for SEEK_SET
+		return 0;
+	}
+	if (fp->type == KNF_TYPE_FTP) {
+		if (whence != SEEK_SET) { // FIXME: we can surely allow SEEK_CUR and SEEK_END in future
+			fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP. Offset is unchanged.\n");
+			return -1;
+		}
+		fp->offset = off;
+		fp->is_ready = 0;
+		return 0;
+	}
+	return -1;
+}
+
+/* Close both descriptors (when non-negative) and release the handle and
+   its buffers. A null handle is accepted. Always returns 0. */
+int knet_close(knetFile *fp)
+{
+	if (fp != 0) {
+		if (fp->ctrl_fd >= 0) close(fp->ctrl_fd);
+		if (fp->fd >= 0) close(fp->fd);
+		free(fp->response);
+		free(fp->retr);
+		free(fp->host);
+		free(fp);
+	}
+	return 0;
+}
+
+#ifdef KNETFILE_MAIN
+/* Small manual test: open a remote file, seek to byte 2000, read up to 255
+   bytes and print them. Returns 1 if the file cannot be opened. */
+int main(void)
+{
+	char buf[256];
+	knetFile *fp;
+	off_t n;
+//	fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); knet_seek(fp, 2500000000ll, SEEK_SET);
+	fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
+//	fp = knet_open("knetfile.c", "r"); knet_seek(fp, 2000, SEEK_SET);
+	if (fp == 0) return 1; // knet_open() already reported the reason
+	knet_seek(fp, 2000, SEEK_SET);
+	n = knet_read(fp, buf, 255);
+	buf[n] = 0; // terminate at what was actually read, not at the buffer end
+	printf("%s\n", buf);
+	knet_close(fp);
+	return 0;
+}
+#endif
--- /dev/null
+#ifndef KNETFILE_H
+#define KNETFILE_H
+
+#include <stdint.h>
+#include <fcntl.h>
+
+// FIXME: currently I/O is unbuffered
+
+#define KNF_TYPE_LOCAL 1
+#define KNF_TYPE_FTP 2
+#define KNF_TYPE_HTTP 3
+
+/* Handle for a local file or an FTP download. */
+typedef struct knetFile_s {
+ // type: one of KNF_TYPE_*; fd: descriptor data is read from (-1 when no FTP data connection is open)
+ int type, fd;
+ // current read position: advanced by knet_read(), set by knet_seek()
+ int64_t offset;
+ // FTP host name extracted from the URL (unused for local files)
+ char *host;
+
+ // the following are for FTP only
+ // ctrl_fd: control connection; pasv_ip/pasv_port: data address parsed from the PASV reply;
+ // max_response: allocated size of response; no_reconnect: set when the open mode contains 'c';
+ // is_ready: cleared by knet_seek(), set once the data connection is positioned at offset
+ int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
+ // response: text of the last server reply; retr: the "RETR <path>\r\n" command for this file
+ char *response, *retr;
+ int64_t seek_offset; // for lazy seek
+} knetFile;
+
+#define knet_tell(fp) ((fp)->offset)
+#define knet_fileno(fp) ((fp)->fd)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*
+ Open a local file or an "ftp://host/path" URL for reading. mode must
+ start with 'r'. Returns 0 on failure.
+ */
+ knetFile *knet_open(const char *fn, const char *mode);
+
+ /*
+ Wrap an existing file descriptor. This only works with local files.
+ */
+ knetFile *knet_dopen(int fd, const char *mode);
+
+ /*
+ If ->is_ready==0, this routine updates ->fd; otherwise, it simply
+ reads from ->fd. Returns the number of bytes read, which may be
+ smaller than len.
+ */
+ off_t knet_read(knetFile *fp, void *buf, off_t len);
+
+ /*
+ For FTP this routine only sets ->offset and ->is_ready=0 (SEEK_SET
+ only). It does not communicate with the FTP server. For local files
+ the seek is performed immediately with lseek(). Returns 0 on
+ success and -1 on error.
+ */
+ int knet_seek(knetFile *fp, off_t off, int whence);
+ /* Close the descriptors and free the handle; a null fp is accepted. */
+ int knet_close(knetFile *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/* Last Modified: 12APR2009 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_MAX 1
+
+/* Declares kstream_t, a buffered reader over a handle of type type_t.
+   buf[begin..end) is the unread part of the buffer; is_eof is set once a
+   read returned fewer bytes than the buffer size. */
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
+
+/* ks_eof: true when the source is exhausted and the buffer is drained.
+   ks_rewind: reset the buffer state only; the underlying handle is not
+   repositioned by this macro. */
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+/* Defines ks_init() and ks_destroy().
+   ks_init(f) allocates a stream with a __bufsize-byte buffer reading from
+   f; it returns 0 if either allocation fails. ks_destroy() frees the
+   stream and its buffer (a null pointer is accepted); the handle f itself
+   is not closed. */
+#define __KS_BASIC(type_t, __bufsize) \
+	static inline kstream_t *ks_init(type_t f) \
+	{ \
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+		if (ks == 0) return 0; \
+		ks->buf = (char*)malloc(__bufsize); \
+		if (ks->buf == 0) { free(ks); return 0; } \
+		ks->f = f; \
+		return ks; \
+	} \
+	static inline void ks_destroy(kstream_t *ks) \
+	{ \
+		if (ks) { \
+			free(ks->buf); \
+			free(ks); \
+		} \
+	}
+
+/* Defines ks_getc(): return the next byte of the stream as a value in
+   0..255, or -1 at end of file or on a read error. Bytes are converted
+   through unsigned char so that 0xFF cannot be mistaken for the -1
+   end-of-file marker, and a failed read (negative return from __read) is
+   treated as end of file instead of yielding a stale buffer byte. */
+#define __KS_GETC(__read, __bufsize) \
+	static inline int ks_getc(kstream_t *ks) \
+	{ \
+		if (ks->is_eof && ks->begin >= ks->end) return -1; \
+		if (ks->begin >= ks->end) { \
+			ks->begin = 0; \
+			ks->end = __read(ks->f, ks->buf, __bufsize); \
+			if (ks->end < __bufsize) ks->is_eof = 1; \
+			if (ks->end <= 0) { ks->end = 0; return -1; } \
+		} \
+		return (int)(unsigned char)ks->buf[ks->begin++]; \
+	}
+
+/* Growable string: s points to a buffer of capacity m holding l bytes of
+   text. Guarded by KSTRING_T so that other headers may provide the same
+   definition. */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+/* Round x up, in place, to the next power of two (x is left unchanged if it
+   already is one). Only the low 32 bits are smeared, and x is evaluated
+   several times, so pass a plain lvalue. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/* Defines ks_getuntil(): read bytes into str up to, but not including, the
+   next delimiter and consume the delimiter.
+   delimiter is a character code greater than KS_SEP_MAX, or KS_SEP_SPACE
+   (any isspace() character) or KS_SEP_TAB (isspace() except ' ').
+   If dret is non-null it receives the delimiter that ended the token, or 0
+   when the token was ended by end of file.
+   Returns the token length (str is always NUL-terminated), or -1 if the
+   stream was already exhausted on entry.
+   The buffer owned by str is reused between calls: a fresh one-byte buffer
+   is only allocated when str->s is still null, so empty tokens no longer
+   leak the previous buffer. */
+#define __KS_GETUNTIL(__read, __bufsize) \
+	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	{ \
+		if (dret) *dret = 0; \
+		str->l = 0; \
+		if (ks->begin >= ks->end && ks->is_eof) return -1; \
+		for (;;) { \
+			int i; \
+			if (ks->begin >= ks->end) { /* buffer drained: refill */ \
+				if (ks->is_eof) break; \
+				ks->begin = 0; \
+				ks->end = __read(ks->f, ks->buf, __bufsize); \
+				if (ks->end < __bufsize) ks->is_eof = 1; \
+				if (ks->end <= 0) { ks->end = 0; break; } /* end of file or read error */ \
+			} \
+			if (delimiter > KS_SEP_MAX) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == delimiter) break; \
+			} else if (delimiter == KS_SEP_SPACE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (isspace((unsigned char)ks->buf[i])) break; \
+			} else if (delimiter == KS_SEP_TAB) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (isspace((unsigned char)ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */ \
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* grow, leaving room for the NUL */ \
+				str->m = str->l + (i - ks->begin) + 1; \
+				kroundup32(str->m); \
+				str->s = (char*)realloc(str->s, str->m); \
+			} \
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+			str->l = str->l + (i - ks->begin); \
+			ks->begin = i + 1; \
+			if (i < ks->end) { /* delimiter found inside the buffer */ \
+				if (dret) *dret = ks->buf[i]; \
+				break; \
+			} \
+		} \
+		if (str->s == 0) { /* nothing was ever stored: provide an empty string */ \
+			str->m = 1; \
+			str->s = (char*)calloc(1, 1); \
+		} \
+		str->s[str->l] = '\0'; \
+		return str->l; \
+	}
+
+/* Instantiate the whole stream layer (kstream_t, ks_init, ks_destroy,
+   ks_getc, ks_getuntil) for handle type type_t. __read is called as
+   __read(handle, buffer, __bufsize) and its result is taken as the number
+   of bytes placed in the buffer. */
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+/* Defines kseq_init(), kseq_rewind() and kseq_destroy().
+   kseq_init(fd) returns a new reader over fd, or 0 on allocation failure.
+   kseq_rewind() resets the parser and buffer state only; the caller is
+   responsible for repositioning the underlying handle.
+   kseq_destroy() frees the reader and its strings (null is accepted); the
+   handle itself is not closed. */
+#define __KSEQ_BASIC(type_t) \
+	static inline kseq_t *kseq_init(type_t fd) \
+	{ \
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+		if (s == 0) return 0; \
+		s->f = ks_init(fd); \
+		if (s->f == 0) { free(s); return 0; } \
+		return s; \
+	} \
+	static inline void kseq_rewind(kseq_t *ks) \
+	{ \
+		ks->last_char = 0; \
+		ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
+	} \
+	static inline void kseq_destroy(kseq_t *ks) \
+	{ \
+		if (!ks) return; \
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+		ks_destroy(ks->f); \
+		free(ks); \
+	}
+
+/* Defines kseq_read(): parse the next FASTA or FASTQ record into seq->name,
+   seq->comment, seq->seq and (FASTQ only) seq->qual.
+   Return value:
+   >=0 length of the sequence (normal)
+   -1 end-of-file
+   -2 truncated quality string
+   A record with an empty sequence is valid and yields 0 with seq->seq.s
+   pointing to an empty string.
+ */
+#define __KSEQ_READ \
+	static int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (seq->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* the first header char has been read */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
+		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+			if (isgraph(c)) { /* printable non-space character */ \
+				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
+					seq->seq.m = seq->seq.l + 2; \
+					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
+					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+				} \
+				seq->seq.s[seq->seq.l++] = (char)c; \
+			} \
+		} \
+		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq.s is still null if no base was ever stored */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* we should not stop here */ \
+		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
+			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
+		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
+		seq->last_char = 0; /* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
+		return seq->seq.l; \
+	}
+
+/* Declares kseq_t: the four strings of the current record, the header
+   character ('>' or '@') already consumed for the next record (0 if none),
+   and the underlying stream. */
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+/* Instantiate the FASTA/FASTQ reader for handle type type_t using __read
+   and a 4096-byte stream buffer. */
+#define KSEQ_INIT(type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 4096) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(type_t) \
+ __KSEQ_READ
+
+#endif
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+ 2008-11-16 (0.1.4):
+
+ * Fixed a bug in introsort() that happens in rare cases.
+
+ 2008-11-05 (0.1.3):
+
+ * Fixed a bug in introsort() for complex comparisons.
+
+ * Fixed a bug in mergesort(). The previous version is not stable.
+
+ 2008-09-15 (0.1.2):
+
+ * Accelerated introsort. On my Mac (not on another Linux machine),
+ my implementation is as fast as std::sort on random input.
+
+ * Added combsort and in introsort, switch to combsort if the
+ recursion is too deep.
+
+ 2008-09-13 (0.1.1):
+
+ * Added k-small algorithm
+
+ 2008-09-05 (0.1.0):
+
+ * Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+/* One entry of the explicit work stack used by ks_introsort_*(): the
+ * inclusive element range [left, right] still to be partitioned, stored as
+ * untyped pointers, plus the remaining depth budget for that range. */
+typedef struct {
+ void *left, *right;
+ int depth;
+} ks_isort_stack_t;
+
+/* Swaps two lvalues of type type_t through a temporary. Expands to a bare
+ * { } block, and a and b are each evaluated twice. */
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+/*
+ * KSORT_INIT(name, type_t, __sort_lt) expands to a family of functions that
+ * operate on arrays of type_t; every generated function is suffixed with
+ * `name`. __sort_lt(a, b) must be true when a orders strictly before b.
+ *
+ * ks_mergesort_<name>(n, array, temp): bottom-up merge sort of array[0..n).
+ * temp is scratch space for n elements; when it is NULL a buffer is
+ * malloc'ed and freed internally (the allocation result is not checked).
+ */
+#define KSORT_INIT(name, type_t, __sort_lt) \
+ void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
+ { \
+ type_t *a2[2], *a, *b; \
+ int curr, shift; \
+ \
+ /* a2[0] is the caller's array, a2[1] the scratch buffer; each pass reads */ \
+ /* from a2[curr] and writes to the other one */ \
+ a2[0] = array; \
+ a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
+ for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
+ a = a2[curr]; b = a2[1-curr]; \
+ if (shift == 0) { \
+ /* first pass: order each adjacent pair while copying it to b */ \
+ type_t *p = b, *i, *eb = a + n; \
+ for (i = a; i < eb; i += 2) { \
+ if (i == eb - 1) *p++ = *i; \
+ else { \
+ if (__sort_lt(*(i+1), *i)) { \
+ *p++ = *(i+1); *p++ = *i; \
+ } else { \
+ *p++ = *i; *p++ = *(i+1); \
+ } \
+ } \
+ } \
+ } else { \
+ /* later passes: merge neighbouring runs of length step */ \
+ size_t i, step = 1ul<<shift; \
+ for (i = 0; i < n; i += step<<1) { \
+ type_t *p, *j, *k, *ea, *eb; \
+ if (n < i + step) { \
+ /* only a (possibly short) left run remains; right run is empty */ \
+ ea = a + n; eb = a; \
+ } else { \
+ ea = a + i + step; \
+ eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+ } \
+ j = a + i; k = a + i + step; p = b + i; \
+ while (j < ea && k < eb) { \
+ /* on ties the element of the left run is taken first */ \
+ if (__sort_lt(*k, *j)) *p++ = *k++; \
+ else *p++ = *j++; \
+ } \
+ while (j < ea) *p++ = *j++; \
+ while (k < eb) *p++ = *k++; \
+ } \
+ } \
+ curr = 1 - curr; \
+ } \
+ /* the last pass wrote into the scratch buffer: copy the result back */ \
+ if (curr == 1) { \
+ type_t *p = a2[0], *i = a2[1], *eb = array + n; \
+ for (; p < eb; ++i) *p++ = *i; \
+ } \
+ if (temp == 0) free(a2[1]); \
+ } \
+ /* Sift-down: moves the element at index i towards the leaves of the */ \
+ /* binary heap l[0..n) until neither child orders after it; the child */ \
+ /* that is not less than its sibling is the one promoted. */ \
+ void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
+ { \
+     type_t moving = l[i]; \
+     size_t child; \
+     for (child = (i << 1) + 1; child < n; child = (child << 1) + 1) { \
+         if (child + 1 != n && __sort_lt(l[child], l[child+1])) ++child; \
+         if (__sort_lt(l[child], moving)) break; \
+         l[i] = l[child]; \
+         i = child; \
+     } \
+     l[i] = moving; \
+ } \
+ /* Turns l[0..lsize) into a heap by sifting down every node that can */ \
+ /* have a child, from the last such node back to the root. */ \
+ void ks_heapmake_##name(size_t lsize, type_t l[]) \
+ { \
+     size_t node = lsize >> 1; \
+     while (node-- > 0) \
+         ks_heapadjust_##name(node, lsize, l); \
+ } \
+ /* Sorts l[0..lsize) in place by repeatedly swapping the heap root with */ \
+ /* the last unsorted element and restoring the heap on the shrunken */ \
+ /* prefix. The function does not build the heap itself: call */ \
+ /* ks_heapmake_<name>() on the array first. */ \
+ void ks_heapsort_##name(size_t lsize, type_t l[]) \
+ { \
+     size_t i; \
+     /* nothing to do for 0 or 1 elements; this also keeps lsize - 1 from */ \
+     /* wrapping around when lsize == 0 */ \
+     if (lsize < 2) return; \
+     for (i = lsize - 1; i > 0; --i) { \
+         type_t tmp; \
+         tmp = *l; *l = l[i]; l[i] = tmp; \
+         ks_heapadjust_##name(0, i, l); \
+     } \
+ } \
+ /* Insertion sort of the half-open range [s, t). Internal helper used by */ \
+ /* the comb sort and introsort above/below as their finishing pass. */ \
+ /* Declared static inline: a plain `inline` definition has no external */ \
+ /* definition under C99/C11 rules, so a call the compiler chooses not to */ \
+ /* inline would fail to link. */ \
+ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
+ { \
+     type_t *i, *j, swap_tmp; \
+     for (i = s + 1; i < t; ++i) \
+         for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
+             swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
+         } \
+ } \
+ /* Comb sort of a[0..n): compare-and-swap elements `gap` apart, shrinking */ \
+ /* the gap by a fixed factor each sweep (gaps of 9 or 10 are bumped to */ \
+ /* 11), until the gap is at most 2 and a sweep makes no swap. Unless the */ \
+ /* final gap was exactly 1, an insertion sort finishes the job. */ \
+ void ks_combsort_##name(size_t n, type_t a[]) \
+ { \
+     const double shrink_factor = 1.2473309501039786540366528676643; \
+     size_t gap = n; \
+     int swapped; \
+     type_t held, *lo, *hi, *stop; \
+     do { \
+         if (gap > 2) { \
+             gap = (size_t)(gap / shrink_factor); \
+             if (gap == 9 || gap == 10) gap = 11; \
+         } \
+         swapped = 0; \
+         stop = a + n - gap; \
+         for (lo = a; lo < stop; ++lo) { \
+             hi = lo + gap; \
+             if (__sort_lt(*hi, *lo)) { \
+                 held = *lo; *lo = *hi; *hi = held; \
+                 swapped = 1; \
+             } \
+         } \
+     } while (swapped || gap > 2); \
+     if (gap != 1) __ks_insertsort_##name(a, a + n); \
+ } \
+ /* Sorts a[0..n) in place. Quicksort-style partitioning driven by an */ \
+ /* explicit stack of pending ranges; a range whose depth budget runs out */ \
+ /* is handed to ks_combsort_<name>(). Ranges of 16 elements or fewer are */ \
+ /* left unsorted by the partitioning loop and fixed by one insertion sort */ \
+ /* over the whole array at the end. The stack allocation is not checked. */ \
+ void ks_introsort_##name(size_t n, type_t a[]) \
+ { \
+ int d; \
+ ks_isort_stack_t *top, *stack; \
+ type_t rp, swap_tmp; \
+ type_t *s, *t, *i, *j, *k; \
+ \
+ if (n < 1) return; \
+ else if (n == 2) { \
+ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+ return; \
+ } \
+ /* d = smallest value >= 2 with 2^d >= n; doubled below to form the budget */ \
+ for (d = 2; 1ul<<d < n; ++d); \
+ stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+ top = stack; s = a; t = a + (n-1); d <<= 1; \
+ while (1) { \
+ if (s < t) { \
+ if (--d == 0) { \
+ /* depth budget exhausted: sort [s, t] with comb sort and mark it done */ \
+ ks_combsort_##name(t - s + 1, s); \
+ t = s; \
+ continue; \
+ } \
+ /* choose the pivot among the first, middle and last element of [s, t] */ \
+ i = s; j = t; k = i + ((j-i)>>1) + 1; \
+ if (__sort_lt(*k, *i)) { \
+ if (__sort_lt(*k, *j)) k = j; \
+ } else k = __sort_lt(*j, *i)? i : j; \
+ rp = *k; \
+ /* park the pivot at t while the rest of the range is partitioned */ \
+ if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
+ for (;;) { \
+ do ++i; while (__sort_lt(*i, rp)); \
+ do --j; while (i <= j && __sort_lt(rp, *j)); \
+ if (j <= i) break; \
+ swap_tmp = *i; *i = *j; *j = swap_tmp; \
+ } \
+ /* move the pivot to its final position i */ \
+ swap_tmp = *i; *i = *t; *t = swap_tmp; \
+ /* push the larger side (if it has more than 16 elements) and continue */ \
+ /* with the smaller one; a side of <= 16 elements is skipped */ \
+ if (i-s > t-i) { \
+ if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+ s = t-i > 16? i+1 : t; \
+ } else { \
+ if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+ t = i-s > 16? i-1 : s; \
+ } \
+ } else { \
+ if (top == stack) { \
+ /* no pending ranges left: finish with insertion sort over everything */ \
+ free(stack); \
+ __ks_insertsort_##name(a, a+n); \
+ return; \
+ } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+ } \
+ } \
+ } \
+ /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+ /* 0 <= kk < n */ \
+ /* Returns the value that ends up at arr[kk] after repeatedly partitioning */ \
+ /* the window [low, high] that contains index kk. arr is permuted in */ \
+ /* place; elements outside the final window are not fully sorted. */ \
+ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
+ { \
+ type_t *low, *high, *k, *ll, *hh, *mid; \
+ low = arr; high = arr + n - 1; k = arr + kk; \
+ for (;;) { \
+ /* window of one element (or empty): arr[kk] is settled */ \
+ if (high <= low) return *k; \
+ /* window of two elements: order them and finish */ \
+ if (high == low + 1) { \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ return *k; \
+ } \
+ mid = low + (high - low) / 2; \
+ /* arrange low/mid/high so that the partitioning value sits at *low */ \
+ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
+ KSORT_SWAP(type_t, *mid, *(low+1)); \
+ ll = low + 1; hh = high; \
+ for (;;) { \
+ do ++ll; while (__sort_lt(*ll, *low)); \
+ do --hh; while (__sort_lt(*low, *hh)); \
+ if (hh < ll) break; \
+ KSORT_SWAP(type_t, *ll, *hh); \
+ } \
+ KSORT_SWAP(type_t, *low, *hh); \
+ /* keep only the side of the partition that still contains index kk */ \
+ if (hh <= k) low = ll; \
+ if (hh >= k) high = hh - 1; \
+ } \
+ }
+
+/* Convenience wrappers: ks_xxx(name, ...) calls the ks_xxx_<name>() function
+ * generated by KSORT_INIT(name, ...). */
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+
+/* Ready-made "less than" predicates: operator < and strcmp() ordering. */
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+/* Element type used by the string instantiation below. */
+typedef const char *ksstr_t;
+
+/* KSORT_INIT_GENERIC(T) instantiates the functions for type T, using the type
+ * name itself as the suffix and operator < as the order; KSORT_INIT_STR
+ * instantiates them for C strings with the suffix `str`. */
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
--- /dev/null
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "kstring.h"
+
+/*
+ * Appends printf-style formatted text to the end of s, growing the buffer
+ * when the first attempt does not fit.
+ *
+ * Returns the number of characters appended (excluding the terminating NUL),
+ * or a negative value when formatting or the reallocation fails; in that
+ * case s->l is left unchanged.
+ */
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+    va_list ap;
+    int l;
+    va_start(ap, fmt);
+    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
+    va_end(ap);
+    if (l < 0) return l; // formatting error: never add a negative length to s->l
+    if ((size_t)l + 1 > s->m - s->l) {
+        size_t m = s->l + l + 2;
+        char *tmp;
+        kroundup32(m);
+        tmp = (char*)realloc(s->s, m);
+        if (tmp == 0) {
+            // keep the old buffer; drop the truncated text written by the first attempt
+            if (s->s && s->l < s->m) s->s[s->l] = 0;
+            return -1;
+        }
+        s->s = tmp; s->m = m;
+        // a va_list must be restarted before it is traversed a second time,
+        // and every va_start is paired with exactly one va_end
+        va_start(ap, fmt);
+        l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+        va_end(ap);
+        if (l < 0) return l;
+    }
+    s->l += l;
+    return l;
+}
+
+// Splits the NUL-terminated string s into fields.
+//
+// delimiter == 0 splits on runs of whitespace; otherwise fields are separated
+// by runs of the given character. Empty fields are never reported.
+//
+// When _offsets is non-NULL, s is modified in place (the character ending each
+// field is overwritten with NUL) and the start offset of every field is stored
+// in *_offsets, a realloc()-grown int array whose capacity is tracked in
+// *_max; the caller owns and frees *_offsets. When _offsets is NULL the
+// function only counts fields and leaves s untouched.
+//
+// Returns the number of fields.
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+    int i, n, max, last_char, last_start, *offsets, l;
+    n = 0;
+    // _max/_offsets are optional: only read them when they were supplied
+    max = _max? *_max : 0;
+    offsets = _offsets? *_offsets : 0;
+    l = strlen(s);
+
+#define __ksplit_aux do { \
+        if (_offsets) { \
+            s[i] = 0; \
+            if (n == max) { \
+                max = max? max<<1 : 2; \
+                offsets = (int*)realloc(offsets, sizeof(int) * max); \
+            } \
+            offsets[n++] = last_start; \
+        } else ++n; \
+    } while (0)
+
+    for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+        if (delimiter == 0) {
+            // <ctype.h> functions require a value representable as unsigned char
+            if (isspace((unsigned char)s[i]) || s[i] == 0) {
+                if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
+            } else {
+                if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+            }
+        } else {
+            if (s[i] == delimiter || s[i] == 0) {
+                if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+            } else {
+                if (last_char == delimiter || last_char == 0) last_start = i;
+            }
+        }
+        last_char = s[i];
+    }
+#undef __ksplit_aux
+    if (_max) *_max = max;
+    if (_offsets) *_offsets = offsets;
+    return n;
+}
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+// Self-test driver (built only with -DKSTRING_MAIN): formats a string with
+// ksprintf(), splits it on whitespace with ksplit() and prints every field.
+int main()
+{
+    kstring_t *s;
+    int *fields, n, i;
+    s = (kstring_t*)calloc(1, sizeof(kstring_t));
+    // test ksprintf()
+    ksprintf(s, " abcdefg: %d ", 100);
+    printf("'%s'\n", s->s);
+    // test ksplit()
+    fields = ksplit(s, 0, &n);
+    for (i = 0; i < n; ++i)
+        printf("field[%d] = '%s'\n", i, s->s + fields[i]);
+    // release the offset array and the string buffer, not just the struct
+    free(fields);
+    free(s->s);
+    free(s);
+    return 0;
+}
+#endif
--- /dev/null
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+
+/* Rounds the lvalue x up, in place, to the next power of two (a value that
+ * already is a power of two is kept). The bit smearing stops at a shift of
+ * 16, so only the low 32 bits are handled. x is evaluated several times. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/* Growable string: s is the heap buffer, l the number of characters
+ * currently stored and m the allocated capacity in bytes. A zero-filled
+ * struct is a valid empty string for the kput*() helpers below, which grow s
+ * with realloc(). */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+int ksprintf(kstring_t *s, const char *fmt, ...);
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+
+/* Appends up to l characters of p to s (copying stops early at a NUL in p,
+ * as strncpy() does), keeps s NUL-terminated and returns l. The buffer is
+ * grown to the next power of two when needed; realloc() is not checked. */
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{
+    size_t needed = s->l + l + 1;
+    char *tail;
+    if (needed >= s->m) {
+        s->m = needed + 1;
+        kroundup32(s->m);
+        s->s = (char*)realloc(s->s, s->m);
+    }
+    tail = s->s + s->l;
+    strncpy(tail, p, l);
+    s->l += l;
+    s->s[s->l] = 0;
+    return l;
+}
+
+/* Appends the NUL-terminated string p to s; returns the value kputsn() returns. */
+static inline int kputs(const char *p, kstring_t *s)
+{
+    size_t len = strlen(p);
+    return kputsn(p, len, s);
+}
+
+/* Appends the single character c to s, keeps s NUL-terminated and returns c.
+ * The buffer is grown to the next power of two when needed; realloc() is not
+ * checked. */
+static inline int kputc(int c, kstring_t *s)
+{
+    size_t at = s->l;
+    if (at + 1 >= s->m) {
+        s->m = at + 2;
+        kroundup32(s->m);
+        s->s = (char*)realloc(s->s, s->m);
+    }
+    s->s[at] = c;
+    s->l = at + 1;
+    s->s[s->l] = 0;
+    return c;
+}
+
+/* Splits s->s in place with ksplit_core(), starting from an empty offset
+ * array. Stores the number of fields in *n and returns the array of field
+ * start offsets that ksplit_core() produced. */
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+    int *field_starts = 0;
+    int capacity = 0;
+    *n = ksplit_core(s->s, delimiter, &capacity, &field_starts);
+    return field_starts;
+}
+
+#endif
--- /dev/null
+CC= gcc
+CXX= g++
+CFLAGS= -g -Wall -O2 -m64 #-arch ppc
+CXXFLAGS= $(CFLAGS)
+DFLAGS= -D_FILE_OFFSET_BITS=64
+OBJS=
+PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim
+INCLUDES= -I..
+SUBDIRS= .
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+# <target>-recur: strip the "-recur" suffix from the goal name and run
+# "make <target>" in every directory listed in SUBDIRS, passing down CC,
+# DFLAGS, CFLAGS and INCLUDES; the first failing sub-make aborts the loop.
+lib-recur all-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+lib:
+
+wgsim:wgsim.o
+ $(CC) $(CFLAGS) -o $@ wgsim.o -lm
+
+md5fa:md5.o md5fa.o md5.h ../kseq.h
+ $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
+
+md5sum-lite:md5sum-lite.o
+ $(CC) $(CFLAGS) -o $@ md5sum-lite.o
+
+md5sum-lite.o:md5.c md5.h
+ $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c
+
+maq2sam-short:maq2sam.c
+ $(CC) $(CFLAGS) -o $@ maq2sam.c -lz
+
+maq2sam-long:maq2sam.c
+ $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz
+
+md5fa.o:md5.h md5fa.c
+ $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c
+
+cleanlocal:
+ rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+
+clean:cleanlocal-recur
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&blast2sam;
+
+# Reads BLASTN text output from STDIN or the files on the command line and
+# prints one SAM line per alignment block. With -s the aligned query
+# sequence is written to the SEQ column, otherwise SEQ is '*'.
+#
+# An alignment is "pending" once at least one Query/Sbjct pair has been
+# consumed for it. That is tracked through both @cigar and the run counter in
+# $cmaux[1]: @cigar alone stays empty for an ungapped alignment starting at
+# query position 1, because aln2cm only pushes an operation when the
+# operation type changes.
+sub blast2sam {
+  my %opts = ();
+  getopts('s', \%opts);
+  die("Usage: blast2sam.pl <in.blastn>\n") if (-t STDIN && @ARGV == 0);
+  my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq);
+  $show_seq = defined($opts{s});
+  @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*');
+  # true when an alignment has been parsed but not printed yet
+  my $pending = sub { return (@cigar || (@cmaux && $cmaux[1])) ? 1 : 0; };
+  while (<>) {
+    if ($pending->() && (/^Query=/ || /Score =.*bits.*Expect/)) { # print
+      &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend);
+      # reset both, so that the same alignment cannot be printed twice
+      @cigar = (); @cmaux = (0, 0, 0, '');
+    }
+    if (/^Query= (\S+)/) {
+      $sam[0] = $1;
+    } elsif (/\((\S+)\s+letters\)/) {
+      $qlen = $1; $qlen =~ s/,//g;
+    } elsif (/^>(\S+)/) {
+      $sam[2] = $1;
+    } elsif (/Length = (\d+)/) {
+      $slen = $1;
+    } elsif (/Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/) { # the start of an alignment block
+      my ($as, $ev) = (int($1 + .499), $3);
+      $ev = "1$ev" if ($ev =~ /^e/);
+      @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev");
+      @cigar = (); $qbeg = 0;
+      @cmaux = (0, 0, 0, '');
+    } elsif (/Strand = (\S+) \/ (\S+)/) {
+      $sam[1] |= 0x10 if ($2 eq 'Minus');
+    } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) {
+      $q = $2;
+      unless ($qbeg) {
+        $qbeg = $1;
+        # query bases before the alignment start are hard-clipped
+        push(@cigar, ($1-1) . "H") if ($1 > 1);
+      }
+      $qend = $3;
+      if ($show_seq) {
+        my $x = $q;
+        $x =~ s/-//g; $sam[9] .= $x;
+      }
+    } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) {
+      $s = $2;
+      if ($sam[1] & 0x10) {
+        $sam[3] = $3;
+      } else {
+        $sam[3] = $1 unless ($sam[3]);
+      }
+      &aln2cm(\@cigar, \$q, \$s, \@cmaux);
+    }
+  }
+  # flush the last alignment, if there is one
+  &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend) if ($pending->());
+}
+
+# Finishes the CIGAR of the current alignment and prints it as one SAM line.
+#   $sam   - array ref with the SAM columns collected so far
+#   $cigar - array ref with the CIGAR operations collected so far
+#   $cmaux - array ref; [0] is the type of the still-open run (index into
+#            "MDI"), [1] its length
+#   $qrest - number of query bases after the alignment end (hard-clipped)
+# For reverse-strand hits the CIGAR is reversed and the sequence is
+# reverse-complemented.
+sub blast_print_sam {
+  my ($sam, $cigar, $cmaux, $qrest) = @_;
+  my $open_run = $cmaux->[1] . substr("MDI", $cmaux->[0], 1);
+  push(@$cigar, $open_run);
+  if ($qrest) {
+    push(@$cigar, $qrest . 'H');
+  }
+  if ($sam->[1] & 0x10) {
+    @$cigar = reverse(@$cigar);
+    my $revcomp = reverse($sam->[9]);
+    $revcomp =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/;
+    $sam->[9] = $revcomp;
+  }
+  unless ($sam->[9]) {
+    $sam->[9] = '*';
+  }
+  $sam->[5] = join('', @$cigar);
+  print join("\t", @$sam), "\n";
+}
+
+# Walks one pair of aligned query/subject rows column by column and extends
+# the run-length encoded CIGAR. $q and $s are references to the two rows.
+# $cmaux->[0] holds the operation of the open run (0 = M, 1 = D when the
+# subject row has a gap, 2 = I when the query row has a gap) and $cmaux->[1]
+# its length; a finished run is pushed onto @$cigar whenever the operation
+# changes.
+sub aln2cm {
+  my ($cigar, $q, $s, $cmaux) = @_;
+  my $n_col = length($$q);
+  foreach my $col (0 .. $n_col - 1) {
+    # classify this column
+    my $op = 0;
+    if (substr($$q, $col, 1) eq '-') {
+      $op = 2;
+    } elsif (substr($$s, $col, 1) eq '-') {
+      $op = 1;
+    }
+    # same operation: lengthen the open run; otherwise close it and start anew
+    if ($cmaux->[0] != $op) {
+      push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1));
+      $cmaux->[0] = $op;
+      $cmaux->[1] = 1;
+    } else {
+      ++$cmaux->[1];
+    }
+  }
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.1
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&bowtie2sam;
+exit;
+
+# Converts bowtie's default output (STDIN or files on the command line) to
+# SAM. Consecutive lines with the same read name are treated as multiple
+# hits of one read; only the hit with the fewest mismatches is printed. Its
+# mapping quality is set to 0 when the second-best hit has the same number
+# of mismatches and capped at 15 when it has exactly one more.
+sub bowtie2sam {
+  my %opts = ();
+  die("Usage: bowtie2sam.pl <aln.bowtie>\n") if (@ARGV == 0 && -t STDIN);
+  # core loop
+  my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k);
+  # Prints the best hit of the read currently held in @staging. Used both
+  # when a new read name starts and at end of input, so the last read gets
+  # the same mapQ adjustment as all others; does nothing on empty input.
+  my $flush = sub {
+    return unless (@staging);
+    if ($best_s == $subbest_s) {
+      $staging[$best_k][4] = 0;
+    } elsif ($subbest_s - $best_s == 1) {
+      $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15);
+    }
+    print join("\t", @{$staging[$best_k]}), "\n";
+  };
+  $last = '';
+  while (<>) {
+    my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches
+    if (@staging && $name eq $last) {
+      # I do not know whether the multiple hits are ordered on the
+      # number of mismatches. I assume they are not and so I have to
+      # keep all these multiple hits in memory.
+      @{$staging[$k]} = @s;
+      if ($best_s > $nm) {
+        $subbest_s = $best_s;
+        $best_s = $nm;
+        $best_k = $k;
+      } elsif ($subbest_s > $nm) {
+        $subbest_s = $nm;
+      }
+      ++$k;
+    } else {
+      $flush->();
+      # start collecting hits for the new read
+      $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0;
+      @staging = ([@s]);
+      $last = $name;
+    }
+  }
+  $flush->();
+}
+
+# Parses one line of bowtie output into SAM columns.
+#   $line - the raw input line
+#   $s    - array ref that receives the SAM columns (cleared first)
+# Returns (read name as it appears in the input, $nm) where $nm is the number
+# of tab-separated columns beyond the seventh.
+# NOTE(review): $nm is therefore 0 or 1 for well-formed input, not the number
+# of mismatch entries listed in the eighth column -- confirm this is intended.
+sub bowtie2sam_aux {
+  my ($line, $s) = @_;
+  chomp($line);
+  my @t = split("\t", $line);
+  my $ret = $t[0];
+  @$s = ();
+  # read name, without a trailing /1 or /2
+  $s->[0] = $ret;
+  $s->[0] =~ s/\/[12]$//g;
+  # initial flag (will be updated later)
+  $s->[1] = 0;
+  # read & quality
+  ($s->[9], $s->[10]) = ($t[4], $t[5]);
+  my $read_len = length($s->[9]);
+  # cigar
+  $s->[5] = $read_len . "M";
+  # coor (bowtie offsets are incremented by one)
+  $s->[2] = $t[2];
+  $s->[3] = $t[3] + 1;
+  if ($t[1] eq '-') {
+    $s->[1] |= 0x10;
+  }
+  # mapQ
+  $s->[4] = ($t[6] == 0)? 25 : 0;
+  # mate coordinate
+  $s->[6] = '*';
+  $s->[7] = $s->[8] = 0;
+  # aux
+  my $nm = @t - 7;
+  push(@$s, "NM:i:" . $nm);
+  push(@$s, "X$nm:i:" . ($t[6]+1));
+  my $md;
+  if ($t[7]) {
+    $md = '';
+    $_ = $t[7];
+    my $next_pos = 0; # first read offset not yet covered by the MD string
+    while (/(\d+):[ACGTN]>([ACGTN])/gi) {
+      my ($offset, $base) = ($1, $2);
+      $md .= (int($offset) - $next_pos) . $base;
+      $next_pos = $offset + 1;
+    }
+    $md .= $read_len - $next_pos;
+  } else {
+    $md = $read_len;
+  }
+  push(@$s, "MD:Z:$md");
+  return ($ret, $nm);
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.2 (03JAN2009)
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&export2sam;
+exit;
+
+# Converts one export file (single-end) or two export files read in lockstep
+# (paired-end; line N of the second file is the mate of line N of the first)
+# to SAM on STDOUT. For pairs the mate columns, the insert size and the
+# mate-strand / mate-unmapped flag bits are filled in on both records.
+sub export2sam {
+  my ($fh1, $fh2, $is_paired);
+  $is_paired = (@ARGV >= 2);
+  die("export2sam.pl <read1.export> [<read2.export>]\n") if (@ARGV == 0);
+  open($fh1, $ARGV[0]) || die("export2sam.pl: fail to open '$ARGV[0]': $!\n");
+  if ($is_paired) {
+    open($fh2, $ARGV[1]) || die("export2sam.pl: fail to open '$ARGV[1]': $!\n");
+  }
+  # conversion table
+  my @conv_table;
+  for (-64..64) {
+    $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499));
+  }
+  # core loop
+  my $warned_short = 0;
+  while (<$fh1>) {
+    my (@s1, @s2);
+    &export2sam_aux($_, \@s1, \@conv_table, $is_paired);
+    if ($is_paired) {
+      my $mate_line = <$fh2>;
+      if (defined($mate_line)) {
+        &export2sam_aux($mate_line, \@s2, \@conv_table, $is_paired);
+      } elsif (!$warned_short) {
+        # the second file ran out early: keep going without mate records
+        warn("export2sam.pl: '$ARGV[1]' has fewer lines than '$ARGV[0]'\n");
+        $warned_short = 1;
+      }
+      if (@s1 && @s2) { # then set mate coordinate
+        my $isize = 0;
+        if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize
+          my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3];
+          my $x2 = ($s2[1] & 0x10)? $s2[3] + length($s2[9]) : $s2[3];
+          $isize = $x2 - $x1;
+        }
+        # update mate coordinate
+        if ($s2[2] ne '*') {
+          @s1[6..8] = (($s2[2] eq $s1[2])? "=" : $s2[2], $s2[3], $isize);
+          $s1[1] |= 0x20 if ($s2[1] & 0x10);
+        } else {
+          $s1[1] |= 0x8;
+        }
+        if ($s1[2] ne '*') {
+          @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize);
+          $s2[1] |= 0x20 if ($s1[1] & 0x10);
+        } else {
+          $s2[1] |= 0x8;
+        }
+      }
+    }
+    print join("\t", @s1), "\n" if (@s1);
+    print join("\t", @s2), "\n" if (@s2 && $is_paired);
+  }
+  close($fh1);
+  close($fh2) if ($is_paired);
+}
+
+# Parses one line of an export file into SAM columns.
+#   $line      - the raw input line
+#   $s         - array ref that receives the SAM columns; left empty when
+#                column 22 of the line is not 'Y'
+#   $ct        - array ref, quality conversion table indexed by ord(char)
+#   $is_paired - true when converting paired-end data
+sub export2sam_aux {
+  my ($line, $s, $ct, $is_paired) = @_;
+  chomp($line);
+  my @t = split("\t", $line);
+  @$s = ();
+  return if ($t[21] ne 'Y');
+  # read name
+  my $name = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
+  # initial flag (will be updated later)
+  my $flag = 0;
+  if ($is_paired) {
+    $flag |= 1 | 1<<(5 + $t[7]);
+  }
+  # read & quality
+  my ($seq, $qual) = ($t[8], $t[9]);
+  if ($t[13] eq 'R') { # then reverse the sequence and quality
+    $seq = reverse($t[8]);
+    $seq =~ tr/ACGTacgt/TGCAtgca/;
+    $qual = reverse($t[9]);
+  }
+  $qual =~ s/(.)/$ct->[ord($1)]/eg; # change coding
+  # coor
+  my $has_coor = 0;
+  my $rname = "*";
+  my @aux = ();
+  if ($t[10] eq 'NM' || $t[10] eq 'QC') {
+    $flag |= 0x4; # unmapped
+  } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) {
+    $flag |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
+    push(@aux, "H0:i:$1", "H1:i:$2", "H2:i:$3")
+  } else {
+    $rname = $t[10];
+    $has_coor = 1;
+  }
+  my $pos = $has_coor? $t[12] : 0;
+  if ($has_coor && $t[13] eq 'R') {
+    $flag |= 0x10;
+  }
+  # mapQ (TODO: should I choose the larger between $t[15] and $t[16]?)
+  my $mapq = 0;
+  $mapq = $t[15] unless ($t[15] eq '');
+  if ($t[16] ne '' && $mapq < $t[16]) {
+    $mapq = $t[16];
+  }
+  # aux
+  push(@aux, "BC:Z:$t[6]") if ($t[6]);
+  push(@aux, "MD:Z:$t[14]") if ($has_coor);
+  push(@aux, "SM:i:$t[15]") if ($is_paired && $has_coor);
+  # assemble: the 11 mandatory columns (cigar is <read length>M, mate
+  # columns are placeholders) followed by the optional tags
+  @$s = ($name, $flag, $rname, $pos, $mapq, length($seq) . "M", '*', 0, 0, $seq, $qual, @aux);
+}
--- /dev/null
+#!/usr/bin/perl
+use strict;
+
+###Builds interpolated pileup from SAM file
+##@description counts bases between paired ends and piles up single end reads.
+##@output, uses a #header for the RNAME and then the number of reads per base
+##@author sm8@sanger.ac.uk, Stephen B. Montgomery
+
+##@caveats
+##Requires RNAME to have format as per example
+## chromosome:NCBI36:18:1:76117153:1
+## supercontig::NT_113883:1:137703:1
+## clone::AC138827.3:1:149397:1
+##Expects simple CIGAR characters, M, I and D
+##Expects SAM file to be sorted.
+##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77)
+
+##Verify and read in SAM file
+##Main script body: streams the SAM file once and prints, for every base of
+##every sequence region, the number of reads/pairs currently "open" over it.
+##A read is opened at its start position; the %close hash schedules the
+##position at which the count is decremented again.
+my $sam_file = $ARGV[0];
+if(!defined($sam_file)) { die("No sam file defined on arg 1"); }
+unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); }
+open(SAM, $sam_file) || die("Cannot open sam file");
+
+##Globals
+my $current_location = ""; ##Current RNAME being processed
+my $current_size = 0; ##Size of sequence region being processed
+my $current_position = 1; ##Current base being processed
+my $open = 0; ##Number of open reads (PE reads that have not been closed)
+my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the
+ ##contained value from those open and deletes the indexed position from the hash
+
+while (my $line = <SAM>) {
+ ##SAM columns used below: [1] FLAG, [2] RNAME, [3] POS, [5] CIGAR
+ my @tokens = split /\t/, $line;
+
+ if ($current_location ne $tokens[2]) { ##Start a new sequence region
+ for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region
+ if (defined($close{$i})) {
+ $open = $open - $close{$i};
+ delete $close{$i};
+ }
+ print $open . "\n";
+ }
+ if ($current_location ne "") {
+ print "\n";
+ }
+
+ ##Initiate a new sequence region
+ ##The region size is the fifth colon-separated field of RNAME
+ my @location_tokens = split /:/, $tokens[2];
+ $current_position = 1;
+ $current_location = $tokens[2];
+ $current_size = $location_tokens[4];
+ $open = 0;
+ %close = ();
+ print "#" . $tokens[2] . "\n";
+
+ ##Print pileup to just before the first read (will be 0)
+ ##NOTE: the loop variable is a new lexical that shadows the global of the
+ ##same name; the global is set explicitly right after the loop
+ for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) {
+ print $open . "\n";
+ }
+ $current_position = $tokens[3];
+
+ } else { ##Sequence region already open
+ if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position
+ ##cycle through to catch up to the current position
+ for (my $i = $current_position; $i < $tokens[3]; $i++) {
+ if (defined($close{$i})) {
+ $open = $open - $close{$i};
+ delete $close{$i};
+ }
+ print $open . "\n";
+ }
+ $current_position = $tokens[3];
+ }
+ }
+ $open++; ##Increment the number of open reads
+
+ if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition
+ ##the pair was already counted when its first read was seen: undo the increment above
+ $open--;
+ my $parsed_cig = &parseCigar($tokens[5]);
+ my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
+ if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
+ $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
+ } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition
+ my $parsed_cig = &parseCigar($tokens[5]);
+ my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
+ if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
+ $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
+ } else {
+ #do nothing
+ }
+}
+for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region
+ if (defined($close{$i})) {
+ $open = $open - $close{$i};
+ delete $close{$i};
+ }
+ print $open . "\n";
+}
+print "\n";
+close(SAM);
+exit(0);
+
+##reads and tokenizes simple cigarline
+##Tokenizes a simple CIGAR string.
+##Returns a hash ref holding the summed lengths under 'M', 'I' and 'D' and,
+##under 'events', an array ref of {t => operation, n => length} hashes in
+##CIGAR order. Any other operation letter is counted as 'M'; its own key is
+##still created with value 0.
+sub parseCigar() {
+ my ($cigar_line) = @_;
+ $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g;
+ my %parsed = ('M' => 0,
+ 'I' => 0,
+ 'D' => 0);
+ my @events = ();
+ foreach my $token (split(/\t/, $cigar_line)) {
+ next unless ($token =~ /([0-9]+)([A-Z]{1})/);
+ my ($count, $nt) = ($1, $2);
+ $parsed{$nt} = 0 unless (defined($parsed{$nt}));
+ unless ($nt eq "M" || $nt eq "D" || $nt eq "I") { $nt = "M"; }
+ $parsed{$nt} += $count;
+ push @events, { "t" => $nt, "n" => $count };
+ }
+ $parsed{'events'} = \@events;
+ return \%parsed;
+}
--- /dev/null
+#include <string.h>
+#include <zlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define PACKAGE_VERSION "0.1.2 (20090521)"
+
+//#define MAQ_LONGREADS
+
+#ifdef MAQ_LONGREADS
+# define MAX_READLEN 128
+#else
+# define MAX_READLEN 64
+#endif
+
+#define MAX_NAMELEN 36
+#define MAQMAP_FORMAT_OLD 0
+#define MAQMAP_FORMAT_NEW -1
+
+#define PAIRFLAG_FF 0x01
+#define PAIRFLAG_FR 0x02
+#define PAIRFLAG_RF 0x04
+#define PAIRFLAG_RR 0x08
+#define PAIRFLAG_PAIRED 0x10
+#define PAIRFLAG_DIFFCHR 0x20
+#define PAIRFLAG_NOMATCH 0x40
+#define PAIRFLAG_SW 0x80
+
+/* One alignment record, read from the map file as a raw struct image. */
+typedef struct
+{
+ uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */
+ /* each seq byte: top two bits index "ACGT", low six bits are the quality; an all-zero byte prints as 'N' */
+ uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual;
+ /* seqid indexes maqmap_t.ref_name; pos holds (0-based position << 1) | reverse-strand bit */
+ uint32_t seqid, pos;
+ int dist;
+ char name[MAX_NAMELEN];
+} maqmap1_t;
+
+/* Header of a map file: format marker, the n_ref reference names and the
+ * number of records announced by the header. mapped_reads is never filled in
+ * by the code in this file; it is only freed in maq_delete_maqmap(). */
+typedef struct
+{
+ int format, n_ref;
+ char **ref_name;
+ uint64_t n_mapped_reads;
+ maqmap1_t *mapped_reads;
+} maqmap_t;
+
+/* Allocates a zero-filled map header whose format field is set to
+ * MAQMAP_FORMAT_NEW. The allocation is not checked. Release the result with
+ * maq_delete_maqmap(). */
+maqmap_t *maq_new_maqmap()
+{
+    maqmap_t *fresh;
+    fresh = (maqmap_t*)calloc(1, sizeof(maqmap_t));
+    fresh->format = MAQMAP_FORMAT_NEW;
+    return fresh;
+}
+/* Frees a map header together with its reference names and read array.
+ * A null pointer is accepted and ignored. */
+void maq_delete_maqmap(maqmap_t *mm)
+{
+    if (mm != 0) {
+        int k = 0;
+        while (k < mm->n_ref) {
+            free(mm->ref_name[k]);
+            ++k;
+        }
+        free(mm->ref_name);
+        free(mm->mapped_reads);
+        free(mm);
+    }
+}
+/*
+ * Reads the header of a map file: the format marker, the number of
+ * reference sequences, each reference name (stored as a length followed by
+ * that many bytes) and the number of mapped reads.
+ *
+ * Returns a newly allocated header that the caller releases with
+ * maq_delete_maqmap(). Like the existing obsolete-format check, a truncated
+ * or inconsistent header terminates the program with exit status 3 instead
+ * of continuing with uninitialised data.
+ */
+maqmap_t *maqmap_read_header(gzFile fp)
+{
+    maqmap_t *mm;
+    int k, len;
+    mm = maq_new_maqmap();
+    if (gzread(fp, &mm->format, sizeof(int)) != (int)sizeof(int)) goto bad_header;
+    if (mm->format != MAQMAP_FORMAT_NEW) {
+        if (mm->format > 0) {
+            fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n");
+            exit(3);
+        }
+        assert(mm->format == MAQMAP_FORMAT_NEW);
+    }
+    if (gzread(fp, &mm->n_ref, sizeof(int)) != (int)sizeof(int) || mm->n_ref < 0) goto bad_header;
+    mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*));
+    if (mm->ref_name == 0 && mm->n_ref > 0) goto bad_header;
+    for (k = 0; k != mm->n_ref; ++k) {
+        if (gzread(fp, &len, sizeof(int)) != (int)sizeof(int) || len < 0) goto bad_header;
+        /* one extra byte: the name is printed with %s later, so it must be
+           NUL-terminated even when the stored bytes are not */
+        mm->ref_name[k] = (char*)malloc((size_t)len + 1);
+        if (mm->ref_name[k] == 0) goto bad_header;
+        if (len > 0 && gzread(fp, mm->ref_name[k], len) != len) goto bad_header;
+        mm->ref_name[k][len] = '\0';
+    }
+    /* read number of mapped reads */
+    if (gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t)) != (int)sizeof(uint64_t)) goto bad_header;
+    return mm;
+
+bad_header:
+    fprintf(stderr, "** The map header is truncated or corrupted.\n");
+    exit(3);
+}
+
+/*
+ * Converts every alignment record of an open map file to one SAM line on
+ * stdout.
+ *
+ *   fp - map file positioned at its start (the header is read here)
+ *   rg - read group written as an RG:Z: tag on every line, or NULL for none
+ *
+ * SAM flag bits produced: 0x1 paired, 0x2 proper pair, 0x4 unmapped,
+ * 0x8 mate unmapped, 0x10 read on the reverse strand, 0x20 mate on the
+ * reverse strand, 0x40/0x80 first/second read (from a "/1" or "/2" name
+ * suffix, which is stripped).
+ */
+void maq2tam_core(gzFile fp, const char *rg)
+{
+    maqmap_t *mm;
+    maqmap1_t mm1, *m1;
+    int ret;
+    m1 = &mm1;
+    mm = maqmap_read_header(fp);
+    while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) {
+        int j, flag = 0, se_mapq = m1->seq[MAX_READLEN-1];
+        const char *ref_name;
+        /* the record is a raw struct image: make sure the name is terminated */
+        m1->name[MAX_NAMELEN-1] = '\0';
+        if (m1->flag) flag |= 1;
+        if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2;
+        if (m1->flag == 192) flag |= 4;
+        if (m1->flag == 64) flag |= 8;
+        if (m1->pos&1) flag |= 0x10;
+        if ((flag&1) && m1->dist != 0) {
+            int c; /* 1 when the mate is on the reverse strand */
+            if (m1->dist > 0) {
+                if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0;
+                else if (m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1;
+                else c = m1->pos&1;
+            } else {
+                if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0;
+                else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1;
+                else c = m1->pos&1;
+            }
+            /* OR-ing c itself would only touch bit 0, which is already set
+               here; the mate strand belongs in bit 0x20 */
+            if (c) flag |= 0x20;
+        }
+        if (flag) {
+            int l = strlen(m1->name);
+            /* names shorter than two characters cannot carry a /1 or /2 suffix */
+            if (l >= 2 && m1->name[l-2] == '/') {
+                flag |= (m1->name[l-1] == '1')? 0x40 : 0x80;
+                m1->name[l-2] = '\0';
+            }
+        }
+        /* do not index past the reference table on a corrupted record */
+        ref_name = (mm->n_ref > 0 && m1->seqid < (uint32_t)mm->n_ref)? mm->ref_name[m1->seqid] : "*";
+        printf("%s\t%d\t", m1->name, flag);
+        printf("%s\t%d\t", ref_name, (m1->pos>>1)+1);
+        if (m1->flag == 130) {
+            /* in this case the last seq byte holds a signed indel length and
+               map_qual the number of bases before the indel */
+            int c = (int8_t)m1->seq[MAX_READLEN-1];
+            printf("%d\t", m1->alt_qual);
+            if (c == 0) printf("%dM\t", m1->size);
+            else {
+                if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c);
+                else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual);
+            }
+            se_mapq = 0; // zero SE mapQ for reads aligned by SW
+        } else {
+            if (flag&4) printf("0\t*\t");
+            else printf("%d\t%dM\t", m1->map_qual, m1->size);
+        }
+        printf("*\t0\t%d\t", m1->dist);
+        /* bases: top two bits of each byte; an all-zero byte is an N */
+        for (j = 0; j != m1->size; ++j) {
+            if (m1->seq[j] == 0) putchar('N');
+            else putchar("ACGT"[m1->seq[j]>>6&3]);
+        }
+        putchar('\t');
+        /* qualities: low six bits of each byte, offset by 33 */
+        for (j = 0; j != m1->size; ++j)
+            putchar((m1->seq[j]&0x3f) + 33);
+        putchar('\t');
+        if (rg) printf("RG:Z:%s\t", rg);
+        if (flag&4) { // unmapped
+            printf("MF:i:%d\n", m1->flag);
+        } else {
+            printf("MF:i:%d\t", m1->flag);
+            if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq);
+            printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]);
+        }
+    }
+    if (ret > 0)
+        fprintf(stderr, "Truncated! Continue anyway.\n");
+    maq_delete_maqmap(mm);
+}
+
+/* Usage: maq2sam <in.map> [<readGroup>]; "-" as the input name reads the map
+ * from standard input. Returns 1 on a usage error or when the input cannot
+ * be opened, 0 otherwise. */
+int main(int argc, char *argv[])
+{
+    gzFile fp;
+    if (argc == 1) {
+        fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+        fprintf(stderr, "Usage: maq2sam <in.map> [<readGroup>]\n");
+        return 1;
+    }
+    fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+    if (fp == 0) { /* gzopen/gzdopen return NULL on failure */
+        fprintf(stderr, "maq2sam: fail to open '%s'\n", argv[1]);
+        return 1;
+    }
+    maq2tam_core(fp, argc > 2? argv[2] : 0);
+    gzclose(fp);
+    return 0;
+}
--- /dev/null
+/*
+ **********************************************************************
+ ** md5.c **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm **
+ ** Created: 2/17/90 RLR **
+ ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ ** **
+ ** License to copy and use this software is granted provided that **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function. **
+ ** **
+ ** License is also granted to make and use derivative works **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all **
+ ** material mentioning or referencing the derived work. **
+ ** **
+ ** RSA Data Security, Inc. makes no representations concerning **
+ ** either the merchantability of this software or the suitability **
+ ** of this software for any particular purpose. It is provided "as **
+ ** is" without express or implied warranty of any kind. **
+ ** **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software. **
+ **********************************************************************
+ */
+
+#include "md5.h"
+
+/* forward declaration */
+static void Transform ();
+
+static unsigned char PADDING[64] = {
+ 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/* F, G and H are basic MD5 functions: selection, majority, parity */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
+/* Rotation is separate from addition to prevent recomputation */
+#define FF(a, b, c, d, x, s, ac) \
+ {(a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) \
+ {(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) \
+ {(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) \
+ {(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+/*
+ * MD5Init -- prepare a context for a new digest computation.
+ *
+ * Clears the 64-bit bit counter (i[0], i[1]) and loads the four
+ * initial chaining values into buf[0..3].  Must be called before the
+ * first MD5Update on the context.
+ */
+void MD5Init (MD5_CTX *mdContext)
+{
+  /* nothing has been hashed yet */
+  mdContext->i[0] = mdContext->i[1] = (UINT4)0;
+
+  /* Load magic initialization constants.
+   */
+  mdContext->buf[0] = (UINT4)0x67452301;
+  mdContext->buf[1] = (UINT4)0xefcdab89;
+  mdContext->buf[2] = (UINT4)0x98badcfe;
+  mdContext->buf[3] = (UINT4)0x10325476;
+}
+
+/*
+ * MD5Update -- feed inLen bytes starting at inBuf into the digest.
+ *
+ * Bytes are appended to mdContext->in; every time 64 bytes have been
+ * collected they are decoded as 16 little-endian 32-bit words and
+ * passed to Transform.  The bit counter in mdContext->i is advanced by
+ * inLen * 8 (with carry from i[0] into i[1]).
+ */
+void MD5Update (MD5_CTX *mdContext, unsigned char *inBuf, unsigned int inLen)
+{
+  UINT4 in[16];
+  int mdi;
+  unsigned int i, ii;
+
+  /* compute number of bytes mod 64 */
+  mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* update number of bits */
+  if ((mdContext->i[0] + ((UINT4)inLen << 3)) < mdContext->i[0])
+    mdContext->i[1]++;                      /* low word wrapped: carry */
+  mdContext->i[0] += ((UINT4)inLen << 3);
+  mdContext->i[1] += ((UINT4)inLen >> 29);  /* bits shifted out of the low word */
+
+  while (inLen--) {
+    /* add new character to buffer, increment mdi */
+    mdContext->in[mdi++] = *inBuf++;
+
+    /* transform if necessary */
+    if (mdi == 0x40) {
+      for (i = 0, ii = 0; i < 16; i++, ii += 4)
+        in[i] = (((UINT4)mdContext->in[ii+3]) << 24) |
+                (((UINT4)mdContext->in[ii+2]) << 16) |
+                (((UINT4)mdContext->in[ii+1]) << 8) |
+                ((UINT4)mdContext->in[ii]);
+      Transform (mdContext->buf, in);
+      mdi = 0;
+    }
+  }
+}
+
+/*
+ * MD5Final -- finish the computation and store the result.
+ *
+ * Pads the message to 56 bytes mod 64 using PADDING, appends the
+ * 64-bit message length in bits (captured before padding), runs the
+ * last Transform and writes the 16-byte result to mdContext->digest,
+ * each word of buf stored low byte first.
+ */
+void MD5Final (MD5_CTX *mdContext)
+{
+  UINT4 in[16];
+  int mdi;
+  unsigned int i, ii;
+  unsigned int padLen;
+
+  /* save number of bits (MD5Update below would change the counter) */
+  in[14] = mdContext->i[0];
+  in[15] = mdContext->i[1];
+
+  /* compute number of bytes mod 64 */
+  mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* pad out to 56 mod 64 */
+  padLen = (mdi < 56) ? (56 - mdi) : (120 - mdi);
+  MD5Update (mdContext, PADDING, padLen);
+
+  /* append length in bits and transform */
+  for (i = 0, ii = 0; i < 14; i++, ii += 4)
+    in[i] = (((UINT4)mdContext->in[ii+3]) << 24) |
+            (((UINT4)mdContext->in[ii+2]) << 16) |
+            (((UINT4)mdContext->in[ii+1]) << 8) |
+            ((UINT4)mdContext->in[ii]);
+  Transform (mdContext->buf, in);
+
+  /* store buffer in digest */
+  for (i = 0, ii = 0; i < 4; i++, ii += 4) {
+    mdContext->digest[ii] = (unsigned char)(mdContext->buf[i] & 0xFF);
+    mdContext->digest[ii+1] =
+      (unsigned char)((mdContext->buf[i] >> 8) & 0xFF);
+    mdContext->digest[ii+2] =
+      (unsigned char)((mdContext->buf[i] >> 16) & 0xFF);
+    mdContext->digest[ii+3] =
+      (unsigned char)((mdContext->buf[i] >> 24) & 0xFF);
+  }
+}
+
+/* Basic MD5 step. Transform buf based on in.
+ *
+ * buf points to the four 32-bit chaining words, in to the sixteen 32-bit
+ * words of one 64-byte block. Four rounds of sixteen steps each are
+ * applied to working copies a..d, which are then added back into buf.
+ * The S<round><n> macros are the per-step rotation amounts.
+ */
+static void Transform (buf, in)
+UINT4 *buf;
+UINT4 *in;
+{
+ UINT4 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
+
+ /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+ FF ( a, b, c, d, in[ 0], S11, 3614090360u); /* 1 */
+ FF ( d, a, b, c, in[ 1], S12, 3905402710u); /* 2 */
+ FF ( c, d, a, b, in[ 2], S13, 606105819u); /* 3 */
+ FF ( b, c, d, a, in[ 3], S14, 3250441966u); /* 4 */
+ FF ( a, b, c, d, in[ 4], S11, 4118548399u); /* 5 */
+ FF ( d, a, b, c, in[ 5], S12, 1200080426u); /* 6 */
+ FF ( c, d, a, b, in[ 6], S13, 2821735955u); /* 7 */
+ FF ( b, c, d, a, in[ 7], S14, 4249261313u); /* 8 */
+ FF ( a, b, c, d, in[ 8], S11, 1770035416u); /* 9 */
+ FF ( d, a, b, c, in[ 9], S12, 2336552879u); /* 10 */
+ FF ( c, d, a, b, in[10], S13, 4294925233u); /* 11 */
+ FF ( b, c, d, a, in[11], S14, 2304563134u); /* 12 */
+ FF ( a, b, c, d, in[12], S11, 1804603682u); /* 13 */
+ FF ( d, a, b, c, in[13], S12, 4254626195u); /* 14 */
+ FF ( c, d, a, b, in[14], S13, 2792965006u); /* 15 */
+ FF ( b, c, d, a, in[15], S14, 1236535329u); /* 16 */
+
+ /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+ GG ( a, b, c, d, in[ 1], S21, 4129170786u); /* 17 */
+ GG ( d, a, b, c, in[ 6], S22, 3225465664u); /* 18 */
+ GG ( c, d, a, b, in[11], S23, 643717713u); /* 19 */
+ GG ( b, c, d, a, in[ 0], S24, 3921069994u); /* 20 */
+ GG ( a, b, c, d, in[ 5], S21, 3593408605u); /* 21 */
+ GG ( d, a, b, c, in[10], S22, 38016083u); /* 22 */
+ GG ( c, d, a, b, in[15], S23, 3634488961u); /* 23 */
+ GG ( b, c, d, a, in[ 4], S24, 3889429448u); /* 24 */
+ GG ( a, b, c, d, in[ 9], S21, 568446438u); /* 25 */
+ GG ( d, a, b, c, in[14], S22, 3275163606u); /* 26 */
+ GG ( c, d, a, b, in[ 3], S23, 4107603335u); /* 27 */
+ GG ( b, c, d, a, in[ 8], S24, 1163531501u); /* 28 */
+ GG ( a, b, c, d, in[13], S21, 2850285829u); /* 29 */
+ GG ( d, a, b, c, in[ 2], S22, 4243563512u); /* 30 */
+ GG ( c, d, a, b, in[ 7], S23, 1735328473u); /* 31 */
+ GG ( b, c, d, a, in[12], S24, 2368359562u); /* 32 */
+
+ /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+ HH ( a, b, c, d, in[ 5], S31, 4294588738u); /* 33 */
+ HH ( d, a, b, c, in[ 8], S32, 2272392833u); /* 34 */
+ HH ( c, d, a, b, in[11], S33, 1839030562u); /* 35 */
+ HH ( b, c, d, a, in[14], S34, 4259657740u); /* 36 */
+ HH ( a, b, c, d, in[ 1], S31, 2763975236u); /* 37 */
+ HH ( d, a, b, c, in[ 4], S32, 1272893353u); /* 38 */
+ HH ( c, d, a, b, in[ 7], S33, 4139469664u); /* 39 */
+ HH ( b, c, d, a, in[10], S34, 3200236656u); /* 40 */
+ HH ( a, b, c, d, in[13], S31, 681279174u); /* 41 */
+ HH ( d, a, b, c, in[ 0], S32, 3936430074u); /* 42 */
+ HH ( c, d, a, b, in[ 3], S33, 3572445317u); /* 43 */
+ HH ( b, c, d, a, in[ 6], S34, 76029189u); /* 44 */
+ HH ( a, b, c, d, in[ 9], S31, 3654602809u); /* 45 */
+ HH ( d, a, b, c, in[12], S32, 3873151461u); /* 46 */
+ HH ( c, d, a, b, in[15], S33, 530742520u); /* 47 */
+ HH ( b, c, d, a, in[ 2], S34, 3299628645u); /* 48 */
+
+ /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+ II ( a, b, c, d, in[ 0], S41, 4096336452u); /* 49 */
+ II ( d, a, b, c, in[ 7], S42, 1126891415u); /* 50 */
+ II ( c, d, a, b, in[14], S43, 2878612391u); /* 51 */
+ II ( b, c, d, a, in[ 5], S44, 4237533241u); /* 52 */
+ II ( a, b, c, d, in[12], S41, 1700485571u); /* 53 */
+ II ( d, a, b, c, in[ 3], S42, 2399980690u); /* 54 */
+ II ( c, d, a, b, in[10], S43, 4293915773u); /* 55 */
+ II ( b, c, d, a, in[ 1], S44, 2240044497u); /* 56 */
+ II ( a, b, c, d, in[ 8], S41, 1873313359u); /* 57 */
+ II ( d, a, b, c, in[15], S42, 4264355552u); /* 58 */
+ II ( c, d, a, b, in[ 6], S43, 2734768916u); /* 59 */
+ II ( b, c, d, a, in[13], S44, 1309151649u); /* 60 */
+ II ( a, b, c, d, in[ 4], S41, 4149444226u); /* 61 */
+ II ( d, a, b, c, in[11], S42, 3174756917u); /* 62 */
+ II ( c, d, a, b, in[ 2], S43, 718787259u); /* 63 */
+ II ( b, c, d, a, in[ 9], S44, 3951481745u); /* 64 */
+
+ /* fold the working values back into the chaining state */
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+
+/* lh3: the following code is added by me */
+
+#ifdef MD5SUM_MAIN
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define HEX_STR "0123456789abcdef"
+
+/*
+ * Print the MD5 digest of one file (or of stdin when fn is "-") as 32
+ * lowercase hex digits followed by the file name.  Exits with status 1
+ * when the file cannot be opened.
+ */
+static void md5_one(const char *fn)
+{
+    unsigned char buf[4096];
+    MD5_CTX md5;
+    size_t n;   /* fread() returns size_t, not int */
+    int l;
+    FILE *fp;
+
+    /* binary mode: the digest must cover the raw bytes on every platform */
+    fp = strcmp(fn, "-")? fopen(fn, "rb") : stdin;
+    if (fp == 0) {
+        fprintf(stderr, "md5sum: %s: No such file or directory\n", fn);
+        exit(1);
+    }
+    MD5Init(&md5);
+    while ((n = fread(buf, 1, sizeof(buf), fp)) > 0)
+        MD5Update(&md5, buf, (unsigned int)n);
+    MD5Final(&md5);
+    if (fp != stdin) fclose(fp);
+    for (l = 0; l < 16; ++l)
+        printf("%c%c", HEX_STR[md5.digest[l]>>4&0xf], HEX_STR[md5.digest[l]&0xf]);
+    printf(" %s\n", fn);
+}
+/* md5sum driver: digest stdin when no arguments are given, otherwise
+ * digest each named file in order. */
+int main(int argc, char *argv[])
+{
+    int k = 1;
+
+    if (argc == 1) {
+        md5_one("-");
+        return 0;
+    }
+    while (k < argc)
+        md5_one(argv[k++]);
+    return 0;
+}
+#endif
--- /dev/null
+/*
+ **********************************************************************
+ ** md5.h -- Header file for implementation of MD5 **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm **
+ ** Created: 2/17/90 RLR **
+ ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version **
+ ** Revised (for MD5): RLR 4/27/91 **
+ ** -- G modified to have y&~z instead of y&z **
+ ** -- FF, GG, HH modified to add in last register done **
+ ** -- Access pattern: round 2 works mod 5, round 3 works mod 3 **
+ ** -- distinct additive constant for each step **
+ ** -- round 4 added, working mod 7 **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ ** **
+ ** License to copy and use this software is granted provided that **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function. **
+ ** **
+ ** License is also granted to make and use derivative works **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all **
+ ** material mentioning or referencing the derived work. **
+ ** **
+ ** RSA Data Security, Inc. makes no representations concerning **
+ ** either the merchantability of this software or the suitability **
+ ** of this software for any particular purpose. It is provided "as **
+ ** is" without express or implied warranty of any kind. **
+ ** **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software. **
+ **********************************************************************
+ */
+
+#ifndef MD5_H
+#define MD5_H
+
+#include <stdint.h>
+
+/* typedef a 32 bit type */
+typedef uint32_t UINT4;
+
+/* Data structure for MD5 (Message Digest) computation.
+ * Initialized by MD5Init, advanced by MD5Update and completed by
+ * MD5Final, after which the result is available in digest. */
+typedef struct {
+ UINT4 i[2]; /* number of _bits_ handled mod 2^64 (i[0] low word, i[1] high word) */
+ UINT4 buf[4]; /* scratch buffer (the four chaining words updated per 64-byte block) */
+ unsigned char in[64]; /* input buffer (bytes not yet forming a full 64-byte block) */
+ unsigned char digest[16]; /* actual digest after MD5Final call */
+} MD5_CTX;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /* Reset mdContext so a new digest can be computed. */
+ void MD5Init(MD5_CTX *mdContext);
+ /* Feed inLen bytes starting at inBuf into the running digest. */
+ void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, unsigned int inLen);
+ /* Finish the computation; the result is left in mdContext->digest. */
+ void MD5Final(MD5_CTX *mdContext);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include "md5.h"
+#include "kseq.h"
+
+#define HEX_STR "0123456789abcdef"
+
+KSEQ_INIT(gzFile, gzread)
+
+/*
+ * Print MD5 digests for the sequences in one (optionally gzipped) file,
+ * or in stdin when fn is "-".
+ *
+ * Each sequence is reduced in place to its alphabetic characters,
+ * upper-cased, and its digest is printed with the file and sequence
+ * names.  Two summary lines follow: ">ordered" is the digest of all
+ * normalized sequences concatenated in file order, ">unordered" is the
+ * XOR of the per-sequence digests.  Exits with status 1 if the file
+ * cannot be opened.
+ */
+static void md5_one(const char *fn)
+{
+    MD5_CTX md5_seq, md5_all;
+    int l;
+    size_t i;
+    unsigned int k;
+    gzFile fp;
+    kseq_t *seq;
+    unsigned char unordered[16];
+
+    for (l = 0; l < 16; ++l) unordered[l] = 0;
+    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+    if (fp == 0) {
+        fprintf(stderr, "md5fa: %s: No such file or directory\n", fn);
+        exit(1);
+    }
+
+    MD5Init(&md5_all);
+    seq = kseq_init(fp);
+    while ((l = kseq_read(seq)) >= 0) {
+        /* compact the sequence in place, keeping letters only (upper-cased);
+         * <ctype.h> functions need an unsigned char value */
+        for (i = 0, k = 0; i < (size_t)seq->seq.l; ++i) {
+            int c = (unsigned char)seq->seq.s[i];
+            if (islower(c)) seq->seq.s[k++] = (char)toupper(c);
+            else if (isupper(c)) seq->seq.s[k++] = (char)c;
+        }
+        MD5Init(&md5_seq);
+        MD5Update(&md5_seq, (unsigned char*)seq->seq.s, k);
+        MD5Final(&md5_seq);
+        for (l = 0; l < 16; ++l) {
+            printf("%c%c", HEX_STR[md5_seq.digest[l]>>4&0xf], HEX_STR[md5_seq.digest[l]&0xf]);
+            unordered[l] ^= md5_seq.digest[l];
+        }
+        printf(" %s %s\n", fn, seq->name.s);
+        MD5Update(&md5_all, (unsigned char*)seq->seq.s, k);
+    }
+    MD5Final(&md5_all);
+    kseq_destroy(seq);
+    gzclose(fp);   /* release the zlib stream; previously leaked per file */
+    for (l = 0; l < 16; ++l)
+        printf("%c%c", HEX_STR[md5_all.digest[l]>>4&0xf], HEX_STR[md5_all.digest[l]&0xf]);
+    printf(" %s >ordered\n", fn);
+    for (l = 0; l < 16; ++l)
+        printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]);
+    printf(" %s >unordered\n", fn);
+}
+
+/* md5fa driver: process stdin when no arguments are given, otherwise
+ * process each named file in order. */
+int main(int argc, char *argv[])
+{
+    int k = 1;
+
+    if (argc == 1) {
+        md5_one("-");
+        return 0;
+    }
+    while (k < argc)
+        md5_one(argv[k++]);
+    return 0;
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.3
+
+#Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com)
+
+#use strict;
+#use warnings;
+use Data::Dumper;
+use Getopt::Std;
+
+# Script entry point: run the converter, then terminate.
+&novo2sam;
+exit;
+
+# Cross-link two records of the same read pair.
+# Both arguments are array refs indexed like SAM columns: [1] flag,
+# [2] reference name, [3] position, [6..8] mate reference / mate position /
+# insert size, [9] read sequence.  The records are modified in place.
+sub mating {
+    my ($read1, $read2) = @_;
+    my ($ref1, $ref2) = ($read1->[2], $read2->[2]);
+    my $same_ref = ($ref1 eq $ref2);
+    my $isize = 0;
+    if ($ref1 ne '*' && $same_ref) {
+        # a reverse-strand read (0x10) is measured from position + read length
+        my $x1 = $read1->[3];
+        $x1 += length($read1->[9]) if ($read1->[1] & 0x10);
+        my $x2 = $read2->[3];
+        $x2 += length($read2->[9]) if ($read2->[1] & 0x10);
+        $isize = $x2 - $x1;
+    }
+    # record 1 receives the coordinate of record 2 ...
+    if ($ref2 eq '*') {
+        $read1->[1] |= 0x8;
+    } else {
+        $read1->[6] = $same_ref ? "=" : $ref2;
+        $read1->[7] = $read2->[3];
+        $read1->[8] = $isize;
+        $read1->[1] |= 0x20 if ($read2->[1] & 0x10);
+    }
+    # ... and record 2 the coordinate of record 1, with the sign flipped
+    if ($ref1 eq '*') {
+        $read2->[1] |= 0x8;
+    } else {
+        $read2->[6] = $same_ref ? "=" : $ref1;
+        $read2->[7] = $read1->[3];
+        $read2->[8] = -$isize;
+        $read2->[1] |= 0x20 if ($read1->[1] & 0x10);
+    }
+}
+
+# Main driver: read novoalign output from the files in @ARGV (or from
+# piped standard input) and print SAM records.  Consecutive records with
+# the same read name are treated as a pair and cross-linked by mating().
+sub novo2sam {
+    my %opts = ();
+    getopts("p", \%opts);
+    # only bail out when there is neither a file argument nor piped input
+    die("Usage: novo2sam.pl [-p] <aln.novo>\n") if (@ARGV == 0 && -t STDIN);
+    my $is_paired = defined($opts{p});
+    # core loop
+    my @s1 = ();
+    my @s2 = ();
+    my ($s_last, $s_curr) = (\@s1, \@s2);
+    while (<>) {
+        next if (/^#/);
+        next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/);
+        &novo2sam_aux($_, $s_curr, $is_paired);
+        # novo2sam_aux leaves @$s_curr empty for lines it does not convert;
+        # do not compare an undefined name in that case
+        if (@$s_last != 0 && @$s_curr != 0 && $s_last->[0] eq $s_curr->[0]) {
+            &mating($s_last, $s_curr);
+            print join("\t", @$s_last), "\n";
+            print join("\t", @$s_curr), "\n";
+            @$s_last = (); @$s_curr = ();
+        } else {
+            print join("\t", @$s_last), "\n" if (@$s_last != 0);
+            # the current record becomes the pending one
+            my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
+        }
+    }
+    print join("\t", @$s_last), "\n" if (@$s_last != 0);
+}
+
+# Convert one whitespace-separated input line into a SAM record stored in
+# the array referenced by $s.  @$s is left empty when column 5 ($t[4]) is
+# not 'U'.  Columns from index 13 onwards are treated as variation tokens
+# and drive the CIGAR, NM and MD fields.  $is_paired is accepted but not
+# used here.
+sub novo2sam_aux {
+ my ($line, $s, $is_paired) = @_;
+
+ chomp($line);
+ my @t = split(/\s+/, $line);
+ my @variations = @t[13 .. $#t];
+ @$s = ();
+ return if ($t[4] ne 'U');
+ my $len = length($t[2]);
+ # read name
+ # (first character dropped, trailing /1 or /2 removed)
+ $s->[0] = substr($t[0], 1);
+ $s->[0] =~ s/\/[12]$//g;
+ # initial flag (will be updated later)
+ $s->[1] = 0;
+ # 0x1 plus bit 6 when $t[1] is 'L', bit 7 otherwise
+ $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7);
+ $s->[1] |= 2 if ($t[10] eq '.');
+ # read & quality
+ # (reverse-complemented / reversed when $t[9] is 'R')
+ if ($t[9] eq 'R') {
+ $s->[9] = reverse($t[2]);
+ $s->[10] = reverse($t[3]);
+ $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
+ } else {
+ $s->[9] = $t[2]; $s->[10] = $t[3];
+ }
+ # cigar
+ my $cigarstring ="";
+ if (scalar @variations ==0 ) {
+ $s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment
+ } else {
+ #convert to correct CIGAR
+ # (only needed when some token contains '+' or '-')
+ my $tmpstr = join" ",@variations ;
+ if ( $tmpstr=~ /\+|\-/ ) {
+ $cigarstring = cigar_method($line,\@variations,$len);
+ $s->[5]=$cigarstring;
+ } else {
+ $s->[5]=$len. "M";
+ }
+}
+
+# coor
+ # (reference name is $t[7] without its first character)
+ $s->[2] = substr($t[7], 1); $s->[3] = $t[8];
+ $s->[1] |= 0x10 if ($t[9] eq 'R');
+ # mapQ
+ # (the larger of $t[5] and $t[6])
+ $s->[4] = $t[5] > $t[6]? $t[5] : $t[6];
+ # mate coordinate
+ $s->[6] = '*'; $s->[7] = $s->[8] = 0;
+ # aux
+ # NM is the number of columns beyond the first 13
+ push(@$s, "NM:i:".(@t-13));
+ my $md = '';
+ $md = mdtag($md,$line,\@variations,$len);
+ push(@$s, "MD:Z:$md");
+
+}
+
+# Build the value of the MD tag from a list of variation tokens.
+#   $oldmd, $line - accepted for compatibility with existing callers; unused
+#   $ref          - array ref of variation tokens
+#   $rdlen        - read length
+# Tokens are classified by indeltype(): '>' (substitution), '-' (deletion)
+# or '+' (insertion).  Returns the MD string.
+sub mdtag {
+    my ($oldmd, $line, $ref, $rdlen) = @_;
+    my $mdtag = "";
+    my $t = 1;           # running position advanced by matches, substitutions and deletions
+    my $q = 1;           # running position advanced by matches, substitutions and insertions
+    my $deleteflag = 0;  # 1 while the previous emitted item was a deletion
+    foreach my $string (@$ref) {
+        my ($indeltype, $insert) = indeltype($string);
+        $indeltype = "" unless defined($indeltype);
+        if ($indeltype eq "+") {
+            # insertions do not appear in MD; they only advance $q
+            $q += length($insert);
+            next;
+        }
+        # NOTE: "my $x = ... if COND" has undefined behaviour in Perl, so
+        # the position is captured with a list assignment instead
+        my ($pos) = $string =~ /^(\d+)/;
+        next unless defined($pos);
+        my $len = $pos - $t;
+        if ($len != 0 || ($deleteflag == 1 && $indeltype eq ">")) {
+            $mdtag .= $len;
+        }
+        $t += $len;
+        $q += $len;
+        if ($indeltype eq ">") {
+            $mdtag .= $insert;
+            $deleteflag = 0;
+            $t += 1;
+            $q += 1;
+        }
+        if ($indeltype eq "-") {
+            my ($deletedbase) = $string =~ /\d+\-([A-Z]+)/;
+            $deletedbase = "" unless defined($deletedbase);
+            # consecutive deletions share a single '^' marker
+            $mdtag .= "^" if ($deleteflag == 0);
+            $mdtag .= $deletedbase;
+            $deleteflag = 1;
+            $t += 1;
+        }
+    }
+    # trailing matched bases
+    my $tail = $rdlen - $q + 1;
+    $mdtag .= "$tail" if ($tail > 0);
+
+    return $mdtag;
+}
+
+# Classify one variation token.
+# Returns ($type, $bases):
+#   ('>', letters before '>')  for a token containing LETTERS>
+#   ('-', '')                  for a token containing '-'
+#   ('+', letters after '+')   for a token containing +LETTERS
+#   ('', '')                   when nothing matches (previously the type was
+#                              undefined, which made callers' string
+#                              comparisons warn under -w)
+sub indeltype {
+    my $string = shift;
+    my $insert = "";
+    my $indeltype = "";
+    if ($string =~ /([A-Z]+)\>/) {
+        $indeltype = ">";
+        $insert = $1;
+    } elsif ($string =~ /\-/) {
+        $indeltype = "-";
+    } elsif ($string =~ /\+([A-Z]+)/) {
+        $indeltype = "+";
+        $insert = $1;
+    }
+    return ($indeltype, $insert);
+}
+
+
+# Build a CIGAR string for a gapped alignment from variation tokens.
+#   $line  - accepted for compatibility with existing callers; unused
+#   $ref   - array ref of variation tokens; tokens containing '>' are skipped
+#   $rdlen - read length
+# Single-base D/I operations are emitted without a count and merged into
+# counted runs by newcigar() at the end.
+sub cigar_method {
+    my ($line, $ref, $rdlen) = @_;
+    my $t = 1;       # running position advanced by matches and deletions
+    my $q = 1;       # running position advanced by matches and insertions
+    my $cigar = "";
+    foreach my $string (@$ref) {
+        next if $string =~ />/;
+        # NOTE: "my $x = ... if COND" has undefined behaviour in Perl, so
+        # the position is captured with a list assignment instead
+        my ($pos) = $string =~ /^(\d+)/;
+        next unless defined($pos);
+
+        # classify afresh for every token so that a previous token's type
+        # cannot leak into this iteration
+        my ($indeltype, $insert) = ("", "");
+        if ($string =~ /\+([A-Z]+)/) {
+            ($indeltype, $insert) = ("+", $1);
+        } elsif ($string =~ /\-([A-Z]+)/) {
+            ($indeltype, $insert) = ("-", $1);
+        }
+
+        # matched stretch before this event
+        my $len = $pos - $t;
+        $cigar .= $len."M" if ($len > 0);
+        $t += $len;
+        $q += $len;
+
+        if ($indeltype eq "-") {
+            $cigar .= "D";
+            $t++;
+        }
+        if ($indeltype eq "+") {
+            $len = length($insert);
+            $cigar .= "I" if ($len == 1);
+            $cigar .= $len."I" if ($len > 1);
+            $q += $len;
+        }
+    }
+    # trailing matched bases
+    my $tail = $rdlen - $q + 1;
+    $cigar .= $tail."M" if ($tail > 0);
+
+    $cigar = newcigar($cigar,'D');
+    $cigar = newcigar($cigar,'I');
+
+    return $cigar;
+}
+
+
+
+# Collapse runs of the bare operation letter $char in $cigar into a
+# counted operation (e.g. "DD" becomes "2D").  Runs are first fenced off
+# with ';' separators, then each fenced run is replaced by its length
+# followed by the letter; all other pieces are copied unchanged.
+sub newcigar {
+    my ($cigar, $char) = @_;
+    my $marked = $cigar;
+    $marked =~ s/^($char+)/$1;/g;
+    $marked =~ s/([^0-9$char])($char+)/$1;$2;/g;
+    my @out;
+    for my $piece (split(/;/, $marked)) {
+        push(@out, ($piece =~ /^$char+$/) ? length($piece).$char : $piece);
+    }
+    return join("", @out);
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Author: lh3
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+# Version string shown by usage().
+my $version = '0.3.2 (r321)';
+&usage if (@ARGV < 1);
+
+# The first argument selects the sub-command; the rest stays in @ARGV
+# for the sub-command's own option parsing.
+my $command = shift(@ARGV);
+# NOTE(review): sub varStats exists in this file but is not registered here -- confirm whether that is intentional
+my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter);
+
+die("Unknown command \"$command\".\n") if (!defined($func{$command}));
+&{$func{$command}};
+exit(0);
+
+#
+# showALEN
+#
+
+# For every input line, insert after the sixth column the sum of the
+# lengths of all S, M and I operations found in that column.
+sub showALEN {
+    die(qq/Usage: samtools.pl showALEN <in.sam>\n/) if (@ARGV == 0 && -t STDIN);
+    while (my $record = <>) {
+        my @t = split(' ', $record);
+        my $cigar = $t[5];
+        my $alen = 0;
+        $alen += $1 while ($cigar =~ /(\d+)[SMI]/g);
+        my @head = @t[0..5];
+        my @tail = @t[6..$#t];
+        print join("\t", @head), "\t$alen\t", join("\t", @tail), "\n";
+    }
+}
+
+#
+# varFilter
+#
+
+# Filter variant lines of a consensus pileup.
+# Lines are held in @staging while later lines within $max_dist of them
+# may still change their filter flag; each staged element is
+# [score, flt, @columns], where score is -1 for SNPs and the indel
+# filtering score for indels, and flt is 0 (keep) or a filter code
+# (see varFilter_aux for the letter printed with -p).
+sub varFilter {
+    my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef);
+    # 'q:' was missing from the spec although -q is documented below and
+    # used for the indel mapping-quality filter
+    getopts('pq:d:D:l:Q:w:W:N:G:', \%opts);
+    die(qq/
+Usage: samtools.pl varFilter [options] <in.cns-pileup>
+
+Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
+ -q INT minimum RMS mapping quality for gaps [$opts{q}]
+ -d INT minimum read depth [$opts{d}]
+ -D INT maximum read depth [$opts{D}]
+
+ -G INT min indel score for nearby SNP filtering [$opts{G}]
+ -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
+
+ -W INT window size for filtering dense SNPs [$opts{W}]
+ -N INT max number of SNPs in a window [$opts{N}]
+
+ -l INT window size for filtering adjacent gaps [$opts{l}]
+
+ -p print filtered variants
+\n/) if (@ARGV == 0 && -t STDIN);
+
+    # calculate the window size
+    my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
+    my $max_dist = $ol > $ow? $ol : $ow;
+    $max_dist = $oW if ($max_dist < $oW);
+    # the core loop
+    my @staging; # (indel_filtering_score, flt_tag)
+    while (<>) {
+        my @t = split;
+        next if ($t[2] eq $t[3] || $t[3] eq '*/*'); # skip non-var sites
+        # clear the out-of-range elements
+        while (@staging) {
+            last if ($staging[0][2] eq $t[0] && $staging[0][3] + $max_dist >= $t[1]);
+            varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
+        }
+        my ($flt, $score) = (0, -1);
+        # first a simple filter
+        if ($t[7] < $opts{d}) {
+            $flt = 2;
+        } elsif ($t[7] > $opts{D}) {
+            $flt = 3;
+        }
+        # site dependent filters
+        if ($flt == 0) {
+            if ($t[2] eq '*') { # an indel
+                $flt = 1 if ($t[6] < $opts{q});
+                # filtering SNPs
+                if ($t[5] >= $opts{G}) {
+                    for my $x (@staging) {
+                        next if ($x->[0] >= 0 || $x->[3] + $ow < $t[1]);
+                        $x->[1] = 5 if ($x->[1] == 0);
+                    }
+                }
+                # calculate the filtering score (different from indel quality)
+                $score = $t[5];
+                $score += $opts{s} * $t[10] if ($t[8] ne '*');
+                $score += $opts{s} * $t[11] if ($t[9] ne '*');
+                # check the staging list for indel filtering
+                for my $x (@staging) {
+                    next if ($x->[0] < 0 || $x->[3] + $ol < $t[1]);
+                    if ($x->[0] < $score) {
+                        $x->[1] = 6;
+                    } else {
+                        $flt = 6; last;
+                    }
+                }
+            } else { # a SNP
+                $flt = 1 if ($t[6] < $opts{Q});
+                # check adjacent SNPs
+                my $k = 1;
+                for my $x (@staging) {
+                    ++$k if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
+                }
+                # filtering is necessary
+                if ($k > $opts{N}) {
+                    $flt = 4;
+                    for my $x (@staging) {
+                        $x->[1] = 4 if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && $x->[1] == 0);
+                    }
+                } else { # then check gap filter
+                    for my $x (@staging) {
+                        next if ($x->[0] < 0 || $x->[3] + $ow < $t[1]);
+                        if ($x->[0] >= $opts{G}) {
+                            $flt = 5; last;
+                        }
+                    }
+                }
+            }
+        }
+        push(@staging, [$score, $flt, @t]);
+    }
+    # output the last few elements in the staging list
+    while (@staging) {
+        varFilter_aux(shift @staging, $opts{p});
+    }
+}
+
+# Emit one staged element [score, flt, @columns].
+# Kept variants (flt == 0) go to STDOUT.  Filtered ones go to STDERR,
+# prefixed with the letter at index flt of "UQdDWGgX", but only when
+# $is_print is true.
+sub varFilter_aux {
+    my ($first, $is_print) = @_;
+    my ($flt, @cols) = @$first[1 .. $#$first];
+    if ($flt == 0) {
+        print join("\t", @cols), "\n";
+        return;
+    }
+    return unless ($is_print);
+    my $tag = substr("UQdDWGgX", $flt, 1);
+    print STDERR join("\t", $tag, @cols), "\n";
+}
+
+#
+# pileup2fq
+#
+
+# Convert a consensus pileup into FASTQ, one record per reference name.
+# Positions missing from the input are written as 'n' with quality '!'.
+# A base is upper-cased only when it passes the mapQ (-Q) and depth
+# (-d/-D) thresholds.  Positions of indel lines scoring at least -G are
+# collected so the surrounding window can be lower-cased afterwards.
+sub pileup2fq {
+    my %opts = (d=>3, D=>255, Q=>25, G=>25, l=>10);
+    getopts('d:D:Q:G:l:', \%opts);
+    die(qq/
+Usage: samtools.pl pileup2fq [options] <in.cns-pileup>
+
+Options: -d INT minimum depth [$opts{d}]
+ -D INT maximum depth [$opts{D}]
+ -Q INT min RMS mapQ [$opts{Q}]
+ -G INT minimum indel score [$opts{G}]
+ -l INT indel filter winsize [$opts{l}]\n
+/) if (@ARGV == 0 && -t STDIN);
+
+    my ($last_chr, $seq, $qual, @gaps, $last_pos);
+    my $_Q = $opts{Q};
+    my $_d = $opts{d};
+    my $_D = $opts{D};
+
+    $last_chr = '';
+    while (<>) {
+        my @t = split;
+        if ($last_chr ne $t[0]) {
+            # compare against '' rather than testing truthiness, so that a
+            # reference literally named "0" is still flushed
+            &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr ne '');
+            $last_chr = $t[0];
+            $last_pos = 0;
+            $seq = ''; $qual = '';
+            @gaps = ();
+        }
+        # pad positions skipped by the input
+        if ($t[1] - $last_pos != 1) {
+            $seq .= 'n' x ($t[1] - $last_pos - 1);
+            $qual .= '!' x ($t[1] - $last_pos - 1);
+        }
+        if ($t[2] eq '*') {
+            push(@gaps, $t[1]) if ($t[5] >= $opts{G});
+        } else {
+            $seq .= ($t[6] >= $_Q && $t[7] >= $_d && $t[7] <= $_D)? uc($t[3]) : lc($t[3]);
+            # quality character, capped at '~' (126)
+            my $q = $t[4] + 33;
+            $q = 126 if ($q > 126);
+            $qual .= chr($q);
+        }
+        $last_pos = $t[1];
+    }
+    # flush the final reference; nothing is printed for empty input
+    &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr ne '');
+}
+
+# Finish one reference: mask the windows around the recorded gaps, then
+# print the FASTQ record (name line, sequence, '+' line, qualities).
+# $seq and $qual are scalar refs, $gaps is an array ref.
+sub p2q_post_process {
+    my ($chr, $seq, $qual, $gaps, $l) = @_;
+    p2q_filter_gaps($seq, $gaps, $l);
+    print "\@$chr\n";
+    p2q_print_str($seq);
+    print "+\n";
+    p2q_print_str($qual);
+}
+
+# Lower-case a window of 2*$l characters of $$seq for every gap position
+# in @$gaps.  The window starts $l characters before the gap position, or
+# at offset 0 when the position is not greater than $l.
+sub p2q_filter_gaps {
+    my ($seq, $gaps, $l) = @_;
+    my $width = $l + $l;
+    foreach my $pos (@$gaps) {
+        my $start = 0;
+        $start = $pos - $l if ($pos > $l);
+        my $masked = lc(substr($$seq, $start, $width));
+        substr($$seq, $start, $width) = $masked;
+    }
+}
+
+# Print the string referenced by $s in lines of at most 60 characters.
+sub p2q_print_str {
+    my ($s) = @_;
+    my $total = length($$s);
+    my $offset = 0;
+    while ($offset < $total) {
+        print substr($$s, $offset, 60), "\n";
+        $offset += 60;
+    }
+}
+
+#
+# varStats
+#
+
+# Tally non-indel lines by quality bin and zygosity.
+# The quality is read from column -c (1-based), capped at 99 and binned
+# by tens; a call whose column 4 is a single A/C/G/T counts as
+# homozygous, anything else as heterozygous.  The tallies are only
+# accumulated in local variables; nothing is printed.
+sub varStats {
+    my %opts = (d=>'', c=>5);
+    getopts('d:c:', \%opts);
+    die("Usage: samtools.pl varStats [-d dbSNP.snp] [-c $opts{c}] <in.plp.snp>\n") if (@ARGV == 0 && -t STDIN);
+    my (@cnt, %hash);
+    my $col = $opts{c} - 1;
+    while (<>) {
+        my @t = split;
+        next if ($t[2] eq '*');   # indel lines are ignored
+        my $q = $t[$col];
+        $q = 99 if ($q > 99);
+        my $bin = int($q/10);
+        my $is_het = 1;
+        $is_het = 0 if ($t[3] =~ /^[ACGT]$/);
+        ++$cnt[$bin][$is_het];
+        $hash{$t[0],$t[1]} = $bin;
+    }
+}
+
+#
+# Usage
+#
+
+# Print the program banner (using the file-level $version) and the list
+# of commands, then terminate via die().
+sub usage {
+ die(qq/
+Program: samtools.pl (helper script for SAMtools)
+Version: $version
+Contact: Heng Li <lh3\@sanger.ac.uk>\n
+Usage: samtools.pl <command> [<arguments>]\n
+Command: varFilter filtering SNPs and short indels
+ pileup2fq generate fastq from `pileup -c'
+ showALEN print alignment length (ALEN) following CIGAR
+\n/);
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.1
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+# Script entry point: run the converter, then terminate.
+&soap2sam;
+exit;
+
+# Cross-link two records of the same read pair.
+# Both arguments are array refs indexed like SAM columns: [1] flag,
+# [2] reference name, [3] position, [6..8] mate reference / mate position /
+# insert size, [9] read sequence.  The records are modified in place.
+sub mating {
+    my ($read1, $read2) = @_;
+    my ($ref1, $ref2) = ($read1->[2], $read2->[2]);
+    my $same_ref = ($ref1 eq $ref2);
+    my $isize = 0;
+    if ($ref1 ne '*' && $same_ref) {
+        # a reverse-strand read (0x10) is measured from position + read length
+        my $x1 = $read1->[3];
+        $x1 += length($read1->[9]) if ($read1->[1] & 0x10);
+        my $x2 = $read2->[3];
+        $x2 += length($read2->[9]) if ($read2->[1] & 0x10);
+        $isize = $x2 - $x1;
+    }
+    # record 1 receives the coordinate of record 2 ...
+    if ($ref2 eq '*') {
+        $read1->[1] |= 0x8;
+    } else {
+        $read1->[6] = $same_ref ? "=" : $ref2;
+        $read1->[7] = $read2->[3];
+        $read1->[8] = $isize;
+        $read1->[1] |= 0x20 if ($read2->[1] & 0x10);
+    }
+    # ... and record 2 the coordinate of record 1, with the sign flipped
+    if ($ref1 eq '*') {
+        $read2->[1] |= 0x8;
+    } else {
+        $read2->[6] = $same_ref ? "=" : $ref1;
+        $read2->[7] = $read1->[3];
+        $read2->[8] = -$isize;
+        $read2->[1] |= 0x20 if ($read1->[1] & 0x10);
+    }
+}
+
+# Main driver: read SOAP alignments from the files in @ARGV or from
+# piped standard input and print SAM records.  Two consecutive records
+# with the same read name are cross-linked by mating() and printed
+# together; a record without a partner is printed on its own.
+sub soap2sam {
+    my %opts = ();
+    getopts("p", \%opts);
+    die("Usage: soap2sam.pl [-p] <aln.soap>\n") if (@ARGV == 0 && -t STDIN);
+    my $is_paired = defined($opts{p});
+    # two buffers that alternate between "pending" and "current"
+    my (@buf_a, @buf_b);
+    my ($pending, $current) = (\@buf_a, \@buf_b);
+    while (<>) {
+        s/[\177-\377]|[\000-\010]|[\012-\040]//g;
+        next if (&soap2sam_aux($_, $current, $is_paired) < 0);
+        unless (@$pending != 0 && $pending->[0] eq $current->[0]) {
+            # no partner: flush the pending record and keep the new one
+            print join("\t", @$pending), "\n" if (@$pending != 0);
+            ($pending, $current) = ($current, $pending);
+            next;
+        }
+        &mating($pending, $current);
+        print join("\t", @$pending), "\n";
+        print join("\t", @$current), "\n";
+        @$pending = ();
+        @$current = ();
+    }
+    print join("\t", @$pending), "\n" if (@$pending != 0);
+}
+
+# Convert one SOAP alignment line into a SAM record stored in @$s.
+# Returns -1 (leaving @$s untouched) for lines with fewer than nine
+# fields, leading whitespace or a false first field; returns 0 otherwise.
+# The MD tag is built from mismatch fields of the form BASE->OFFSET...,
+# taken in increasing offset order.
+sub soap2sam_aux {
+    my ($line, $s, $is_paired) = @_;
+    chomp($line);
+    my @t = split(/\s+/, $line);
+    return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]);
+    @$s = ();
+    # fix SOAP-2.1.x bugs
+    @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/);
+    # read name
+    $s->[0] = $t[0];
+    $s->[0] =~ s/\/[12]$//g;
+    # initial flag (will be updated later)
+    $s->[1] = 0;
+    $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7);
+    $s->[1] |= 2 if ($is_paired);
+    # read & quality
+    $s->[9] = $t[1];
+    $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2];
+    # cigar
+    $s->[5] = length($s->[9]) . "M";
+    # coor
+    $s->[2] = $t[7]; $s->[3] = $t[8];
+    $s->[1] |= 0x10 if ($t[6] eq '-');
+    # mapQ
+    $s->[4] = $t[3] == 1? 30 : 0;
+    # mate coordinate
+    $s->[6] = '*'; $s->[7] = $s->[8] = 0;
+    # aux
+    push(@$s, "NM:i:$t[9]");
+    my $md = '';
+    if ($t[9]) {
+        # collect [offset, base] pairs and sort them numerically; the
+        # previous zero-padded ("%.3d") string sort mis-ordered offsets
+        # of 1000 and above
+        my @mm;
+        for my $field (@t[10 .. $#t]) {
+            push(@mm, [$2, $1]) if ($field =~ /^([ACGT])->(\d+)/i);
+        }
+        my $next = 0;   # first offset not yet covered by the MD string
+        for my $m (sort { $a->[0] <=> $b->[0] || $a->[1] cmp $b->[1] } @mm) {
+            $md .= ($m->[0] - $next) . $m->[1];
+            $next = $m->[0] + 1;
+        }
+        $md .= length($t[1]) - $next;
+    } else {
+        $md = length($t[1]);
+    }
+    push(@$s, "MD:Z:$md");
+    return 0;
+}
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/* This program is separated from maq's read simulator with Colin
+ * Hercus' modification to allow longer indels. Colin is the chief
+ * developer of novoalign. */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+
+#define PACKAGE_VERSION "0.2.3"
+
+/* Byte-to-code lookup: 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2,
+ * 'T'/'t' -> 3, '-' -> 5, every other byte value -> 4. */
+const uint8_t nst_nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+/* 16-entry lookup with values 0..4.
+ * NOTE(review): presumably indexed by a 4-bit mask built from two base
+ * codes for SOLiD colour-space output -- its use is not visible in this
+ * part of the file; confirm against the caller. */
+const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
+
+/* Simple normal random number generator, copied from genran.c */
+
+/* Return a normally distributed random number drawn with drand48().
+ * Each pass of the rejection loop yields two values: one is returned at
+ * once and the other is cached in a static and handed out by the next
+ * call, so the generator only runs on every other call. */
+double ran_normal()
+{
+    static int have_spare = 0;
+    static double spare;
+    double scale, radius2, x, y;
+
+    if (have_spare != 0) {
+        have_spare = 0;
+        return spare;
+    }
+    /* pick a point inside the unit circle, excluding the origin */
+    do {
+        x = 2.0 * drand48() - 1.0;
+        y = 2.0 * drand48() - 1.0;
+        radius2 = x * x + y * y;
+    } while (radius2 >= 1.0 || radius2 == 0.0);
+    scale = sqrt(-2.0 * log(radius2) / radius2);
+    spare = x * scale;
+    have_spare = 1;
+    return y * scale;
+}
+
+/* FASTA parser, copied from seq.c */
+
+/* Growable sequence buffer filled by seq_read_fasta(); s is kept
+ * NUL-terminated at index l. */
+typedef struct {
+ int l, m; /* length and maximum buffer size */
+ unsigned char *s; /* sequence */
+} seq_t;
+
+#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
+
+/* Number of bytes by which seq_read_fasta() enlarges a sequence buffer
+ * each time it runs out of room. */
+static int SEQ_BLOCK_SIZE = 512;
+
+/* Override the growth step used by seq_read_fasta(); the value is
+ * stored as given, without validation. */
+void seq_set_block_size(int size)
+{
+ SEQ_BLOCK_SIZE = size;
+}
+
+/* Make sure seq->s exists and can hold at least `need` bytes.  The
+ * capacity tracked in *max grows in SEQ_BLOCK_SIZE steps.  Aborts when
+ * memory cannot be obtained, like err_xopen_core() does for files. */
+static void seq_reserve(seq_t *seq, int *max, int need)
+{
+    int step = SEQ_BLOCK_SIZE > 0? SEQ_BLOCK_SIZE : 512;
+    unsigned char *tmp;
+
+    if (seq->s != 0 && need <= *max) return;
+    while (*max < need) *max += step;
+    tmp = (unsigned char*)realloc(seq->s, sizeof(char) * (size_t)*max);
+    if (tmp == 0) {
+        fprintf(stderr, "[%s] out of memory. Abort!\n", __func__);
+        abort();
+    }
+    seq->s = tmp;
+}
+
+/* Read the next FASTA record from fp.
+ *
+ *   seq     - receives the sequence (letters, '-' and '.' only); its
+ *             buffer is reused and grown as needed, and is always
+ *             NUL-terminated on success, even for an empty sequence
+ *   locus   - receives the record name (text after '>' up to the first
+ *             blank, tab or newline); the caller provides the storage
+ *   comment - if non-NULL, receives the rest of the header line with
+ *             leading blanks/tabs skipped; the caller provides the storage
+ *
+ * Carriage returns are dropped from both header strings.  Returns the
+ * sequence length, or -1 if no further '>' is found.  A '>' that ends
+ * the sequence is pushed back for the next call.
+ */
+int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
+{
+    int c, l, max;
+    char *p;
+
+    /* skip to the start of the next record */
+    while ((c = fgetc(fp)) != EOF && c != '>');
+    if (c == EOF) return -1;
+
+    /* record name; EOF is tested explicitly so it is never stored */
+    p = locus;
+    while ((c = fgetc(fp)) != EOF && c != ' ' && c != '\t' && c != '\n')
+        if (c != '\r') *p++ = (char)c;
+    *p = '\0';
+
+    if (comment) {
+        p = comment;
+        if (c != '\n' && c != EOF) {
+            /* skip the separator, then copy up to the end of the line */
+            while ((c = fgetc(fp)) == ' ' || c == '\t');
+            while (c != EOF && c != '\n') {
+                if (c != '\r') *p++ = (char)c;
+                c = fgetc(fp);
+            }
+        }
+        *p = '\0';
+    } else {
+        /* discard the rest of the header line */
+        while (c != EOF && c != '\n') c = fgetc(fp);
+    }
+
+    l = 0; max = seq->m;
+    while ((c = fgetc(fp)) != EOF && c != '>') {
+        if (isalpha(c) || c == '-' || c == '.') {
+            seq_reserve(seq, &max, l + 2);   /* this character plus the terminator */
+            seq->s[l++] = (unsigned char)c;
+        }
+    }
+    if (c == '>') ungetc(c,fp);
+    /* a record without sequence data may arrive with no buffer at all */
+    seq_reserve(seq, &max, l + 1);
+    seq->s[l] = 0;
+    seq->m = max; seq->l = l;
+    return l;
+}
+
+/* Error-checking open, copied from utils.c */
+
+#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
+
+/* Open fn with the given fopen() mode.  The name "-" selects stdin when
+ * the mode contains 'r' and stdout otherwise.  When a real file cannot
+ * be opened, a message tagged with func is printed and the program
+ * aborts. */
+FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
+{
+    FILE *stream;
+
+    if (strcmp(fn, "-") != 0) {
+        stream = fopen(fn, mode);
+        if (stream == 0) {
+            fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
+            abort();
+        }
+        return stream;
+    }
+    if (strchr(mode, 'r') != 0)
+        return stdin;
+    return stdout;
+}
+
+/* wgsim */
+
+/* Mutation kind, stored in the top four bits of a mut_t (see mutmsk).
+ * maq_mut_diref() stores the number of inserted bases in those bits for
+ * insertions, so INSERT (0x1000) corresponds to a one-base insertion. */
+enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
+/* One reference position of a haplotype: base code in the low bits,
+ * mutation kind in the top four bits. */
+typedef unsigned short mut_t;
+/* Mask selecting the mutation-kind bits of a mut_t. */
+static mut_t mutmsk = (mut_t)0xf000;
+
+/* A haplotype: one mut_t per reference position. */
+typedef struct {
+ int l, m; /* length and maximum buffer size */
+ mut_t *s; /* sequence */
+} mutseq_t;
+
+/* Simulation parameters.
+ * NOTE(review): ERR_RATE, IS_SOLID and SHOW_MM_INFO are not used in the
+ * part of the file visible here; their meaning is inferred from the
+ * names only -- confirm against the read-generation code. */
+static double ERR_RATE = 0.02;
+static double MUT_RATE = 0.001; /* chance that a reference base starts a mutation */
+static double INDEL_FRAC = 0.1; /* share of mutations that are indels rather than substitutions */
+static double INDEL_EXTEND = 0.3; /* chance that an indel is extended by one more base */
+static int IS_SOLID = 0;
+static int SHOW_MM_INFO = 1;
+
+/* Generate two mutated haplotypes of seq.
+ *
+ * hap1 and hap2 receive freshly calloc'ed arrays with one mut_t per
+ * reference position (same l and m as seq).  Each entry starts as the
+ * nst_nt4_table code of the reference base; with probability MUT_RATE
+ * an A/C/G/T position is turned into a substitution, a deletion or an
+ * insertion.  When is_hap is non-zero every mutation is applied to both
+ * haplotypes; otherwise about one third are applied to both and the
+ * rest to one randomly chosen haplotype.  The allocations are not
+ * checked, and the arrays are owned by the caller afterwards.
+ */
+void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
+{
+ /* deleting: bit 0 / bit 1 set while a deletion is running on hap1 / hap2 */
+ int i, deleting = 0;
+ mutseq_t *ret[2];
+
+ ret[0] = hap1; ret[1] = hap2;
+ ret[0]->l = seq->l; ret[1]->l = seq->l;
+ ret[0]->m = seq->m; ret[1]->m = seq->m;
+ ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
+ ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
+ for (i = 0; i != seq->l; ++i) {
+ int c;
+ c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+ /* extend a running deletion with probability INDEL_EXTEND */
+ if (deleting) {
+ if (drand48() < INDEL_EXTEND) {
+ if (deleting & 1) ret[0]->s[i] |= DELETE;
+ if (deleting & 2) ret[1]->s[i] |= DELETE;
+ continue;
+ } else deleting = 0;
+ }
+ if (c < 4 && drand48() < MUT_RATE) { // mutation
+ if (drand48() >= INDEL_FRAC) { // substitution
+ double r = drand48();
+ /* add 1..3 modulo 4: always a base different from the reference */
+ c = (c + (int)(r * 3.0 + 1)) & 3;
+ if (is_hap || drand48() < 0.333333) { // hom
+ ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
+ } else { // het
+ ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
+ }
+ } else { // indel
+ if (drand48() < 0.5) { // deletion
+ if (is_hap || drand48() < 0.333333) { // hom-del
+ ret[0]->s[i] = ret[1]->s[i] = DELETE;
+ deleting = 3;
+ } else { // het-del
+ deleting = drand48()<0.5?1:2;
+ ret[deleting-1]->s[i] = DELETE;
+ }
+ } else { // insertion
+ /* draw 1..4 random bases, packed two bits each into ins */
+ int num_ins = 0, ins = 0;
+ do {
+ num_ins++;
+ ins = (ins << 2) | (int)(drand48() * 4.0);
+ } while (num_ins < 4 && drand48() < INDEL_EXTEND);
+
+ /* layout: count in bits 12-15, inserted bases from bit 4 up, reference base code in the low bits */
+ if (is_hap || drand48() < 0.333333) { // hom-ins
+ ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
+ } else { // het-ins
+ ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
+ }
+ }
+ }
+ }
+ }
+}
+/*
+ * Print the inserted bases packed in the mutation code `code`: the count is
+ * in the top four bits, the bases follow two bits each starting at bit 4 and
+ * are emitted lowest bits first (the order the read generator uses).
+ */
+static void maq_print_ins(int code)
+{
+    int n = (code & mutmsk) >> 12, ins = code >> 4;
+    while (n > 0) {
+        putchar("ACGTN"[ins & 0x3]);
+        ins >>= 2; /* advance to the next inserted base */
+        n--;
+    }
+}
+
+/*
+ * Print to stdout one tab-separated line per mutated reference position:
+ *
+ *     name <TAB> 1-based position <TAB> ref <TAB> alt <TAB> zygosity
+ *
+ * where zygosity is '-' when both haplotypes carry the same code and '+'
+ * otherwise; deletions print '-' as alt, insertions print '-' as ref and the
+ * inserted bases as alt. Positions whose reference code is >= 4 are skipped.
+ *
+ * Fixes relative to the previous version:
+ *  - the "is anything mutated" test now looks at both haplotypes (it tested
+ *    hap1 twice, so mutations present only on hap2 were never reported);
+ *  - the hom-insertion and first het-insertion branches now shift `ins`
+ *    between bases instead of repeating the first base n times;
+ *  - the first het-insertion branch requires an actual insertion on hap1,
+ *    so an unmutated hap1 no longer hides an insertion on hap2.
+ */
+void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
+{
+    int i;
+    for (i = 0; i != seq->l; ++i) {
+        int c[3];
+        c[0] = nst_nt4_table[(int)seq->s[i]];
+        c[1] = hap1->s[i]; c[2] = hap2->s[i];
+        if (c[0] >= 4) continue;
+        if ((c[1] & mutmsk) == NOCHANGE && (c[2] & mutmsk) == NOCHANGE) continue;
+        printf("%s\t%d\t", name, i+1);
+        if (c[1] == c[2]) { // hom
+            if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+                printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+            } else if ((c[1]&mutmsk) == DELETE) { // del
+                printf("%c\t-\t-\n", "ACGTN"[c[0]]);
+            } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
+                printf("-\t");
+                maq_print_ins(c[1]);
+                printf("\t-\n");
+            } else assert(0);
+        } else { // het
+            int n1 = (c[1] & mutmsk) >> 12, n2 = (c[2] & mutmsk) >> 12;
+            if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+                printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
+            } else if ((c[1]&mutmsk) == DELETE) {
+                printf("%c\t-\t+\n", "ACGTN"[c[0]]);
+            } else if ((c[2]&mutmsk) == DELETE) {
+                printf("%c\t-\t+\n", "ACGTN"[c[0]]);
+            } else if (n1 > 0 && n1 <= 4) { // ins1
+                printf("-\t");
+                maq_print_ins(c[1]);
+                printf("\t+\n");
+            } else if (n2 <= 5) { // ins2
+                printf("-\t");
+                maq_print_ins(c[2]);
+                printf("\t+\n");
+            } else assert(0);
+        }
+    }
+}
+
+/*
+ * Simulate read pairs from the FASTA stream fp_fa and write them as FASTQ to
+ * fpout1 / fpout2.
+ *
+ * fp_fa is read twice (a first pass sums the sequence lengths, then the
+ * stream is rewound), so it presumably has to be a seekable stream. Each
+ * sequence receives a share of the N pairs proportional to its length;
+ * sequences shorter than dist + 3*std_dev are skipped with a message. For
+ * every sequence two mutated haplotypes are generated and their mutations
+ * printed, then pairs are drawn with an outer distance taken from
+ * ran_normal() * std_dev + dist. size_l / size_r are the two read lengths;
+ * is_hap is passed through to maq_mut_diref(). The random generator is
+ * seeded from time(0).
+ *
+ * Changes relative to the previous version: the "skip sequence" diagnostic
+ * no longer reads "kkip", and 64-bit counters are cast to unsigned long long
+ * to match their %llu / %llx conversions (output is unchanged).
+ */
+void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
+{
+    seq_t seq;
+    mutseq_t rseq[2];
+    uint64_t tot_len, ii;
+    int i, l, n_ref;
+    char name[256], *qstr;
+    int size[2], Q;
+    uint8_t *tmp_seq[2];
+    mut_t *target;
+
+    INIT_SEQ(seq);
+    srand48(time(0));
+    seq_set_block_size(0x1000000);
+    l = size_l > size_r? size_l : size_r;
+    qstr = (char*)calloc(l+1, 1);
+    /* one spare base: color-space mode temporarily lengthens each read by 1 */
+    tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
+    tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
+    size[0] = size_l; size[1] = size_r;
+
+    /* constant quality character: Phred score of ERR_RATE, offset 33 */
+    Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
+
+    /* first pass: count sequences and total length */
+    tot_len = n_ref = 0;
+    while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
+        tot_len += l;
+        ++n_ref;
+    }
+    fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (unsigned long long)tot_len);
+    rewind(fp_fa);
+
+    /* second pass: simulate */
+    while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
+        uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
+        if (l < dist + 3 * std_dev) {
+            fprintf(stderr, "[wgsim_core] skip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
+            continue;
+        }
+
+        // generate mutations and print them out
+        maq_mut_diref(&seq, is_hap, rseq, rseq+1);
+        maq_print_mutref(name, &seq, rseq, rseq+1);
+
+        for (ii = 0; ii != n_pairs; ++ii) { // the core loop
+            double ran;
+            int d, pos, s[2], is_flip = 0;
+            int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
+            FILE *fpo[2];
+
+            do { // avoid boundary failure
+                ran = ran_normal();
+                ran = ran * std_dev + dist;
+                d = (int)(ran + 0.5);
+                pos = (int)((l - d + 1) * drand48());
+            } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
+
+            // flip or not
+            if (drand48() < 0.5) {
+                fpo[0] = fpout1; fpo[1] = fpout2;
+                s[0] = size[0]; s[1] = size[1];
+            } else {
+                fpo[1] = fpout1; fpo[0] = fpout2;
+                s[1] = size[0]; s[0] = size[1];
+                is_flip = 1;
+            }
+
+            // generate the read sequences
+            target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
+            n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
+
+/* Walk the haplotype from `start` in the direction given by `iter`, copying \
+   up to s[x] bases into tmp_seq[x]; ext_coor[x] becomes the first usable   \
+   position, or -10 when the read could not be filled completely. */
+#define __gen_read(x, start, iter) do { \
+        for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
+            int c = target[i], mut_type = c & mutmsk; \
+            if (ext_coor[x] < 0) { \
+                if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
+                ext_coor[x] = i; \
+            } \
+            if (mut_type == DELETE) ++n_indel[x]; \
+            else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
+                tmp_seq[x][k++] = c & 0xf; \
+                if (mut_type == SUBSTITUTE) ++n_sub[x]; \
+            } else { \
+                int n, ins; \
+                ++n_indel[x]; \
+                tmp_seq[x][k++] = c & 0xf; \
+                for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
+                    tmp_seq[x][k++] = ins & 0x3; \
+            } \
+        } \
+        if (k != s[x]) ext_coor[x] = -10; \
+    } while (0)
+
+            if (!IS_SOLID) {
+                __gen_read(0, pos, ++i);
+                __gen_read(1, pos + d - 1, --i);
+                for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
+            } else {
+                int c1, c2, c;
+                ++s[0]; ++s[1]; // temporarily increase read length by 1
+                if (is_flip) { // RR pair
+                    __gen_read(0, pos + s[0], --i);
+                    __gen_read(1, pos + d - 1, --i);
+                } else { // FF pair
+                    __gen_read(0, pos, ++i);
+                    __gen_read(1, pos + d - 1 - s[1], ++i);
+                    ++ext_coor[0]; ++ext_coor[1];
+                }
+                // change to color sequence: (0,1,2,3) -> (A,C,G,T)
+                for (j = 0; j < 2; ++j) {
+                    c1 = tmp_seq[j][0];
+                    for (i = 1; i < s[j]; ++i) {
+                        c2 = tmp_seq[j][i];
+                        c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
+                        tmp_seq[j][i-1] = c;
+                        c1 = c2;
+                    }
+                }
+                --s[0]; --s[1]; // change back
+            }
+            if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
+                --ii; // retry this pair; the ++ii of the loop restores the index
+                continue;
+            }
+
+            // generate sequencing errors
+            for (j = 0; j < 2; ++j) {
+                for (i = 0; i < s[j]; ++i) {
+                    int c = tmp_seq[j][i];
+                    if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
+                    else if (drand48() < ERR_RATE) {
+                        c = (c + (int)(drand48() * 3.0 + 1)) & 3;
+                        ++n_err[j];
+                    }
+                    tmp_seq[j][i] = c;
+                }
+            }
+
+            // print
+            for (j = 0; j < 2; ++j) {
+                for (i = 0; i < s[j]; ++i) qstr[i] = Q;
+                qstr[i] = 0;
+                if (SHOW_MM_INFO) {
+                    fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
+                            n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+                            (unsigned long long)ii, j==0? is_flip+1 : 2-is_flip);
+                } else {
+                    fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
+                            (unsigned long long)ii, j==0? is_flip+1 : 2-is_flip,
+                            n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
+                }
+                for (i = 0; i < s[j]; ++i)
+                    fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
+                fprintf(fpo[j], "\n+\n%s\n", qstr);
+            }
+        }
+        free(rseq[0].s); free(rseq[1].s);
+    }
+    free(seq.s); free(qstr);
+    free(tmp_seq[0]); free(tmp_seq[1]);
+}
+
+/* Write the command-line help to stderr; always returns 1 (used as exit status). */
+static int simu_usage()
+{
+    FILE *out = stderr;
+
+    fputs("\n"
+          "Program: wgsim (short read simulator)\n", out);
+    fprintf(out, "Version: %s\n", PACKAGE_VERSION);
+    fputs("Contact: Heng Li <lh3@sanger.ac.uk>\n\n"
+          "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n", out);
+    fprintf(out, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
+    fputs(" -d INT outer distance between the two ends [500]\n"
+          " -s INT standard deviation [50]\n"
+          " -N INT number of read pairs [1000000]\n"
+          " -1 INT length of the first read [70]\n"
+          " -2 INT length of the second read [70]\n", out);
+    fprintf(out, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
+    fprintf(out, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
+    fprintf(out, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
+    fputs(" -c generate reads in color space (SOLiD reads)\n"
+          " -C show mismatch info in comment rather than read name\n"
+          " -h haplotype mode\n"
+          "\n"
+          "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n", out);
+    return 1;
+}
+
+/*
+ * wgsim entry point: parse the options, open the reference FASTA and the two
+ * FASTQ outputs (xopen() aborts on failure, "-" selects stdin/stdout) and
+ * run the simulation. Prints the usage text and returns 1 when fewer than
+ * three positional arguments are given.
+ *
+ * Change relative to the previous version: -N is parsed with atoll(), since
+ * N is 64 bits wide and atoi() cannot represent pair counts above INT_MAX.
+ */
+int main(int argc, char *argv[])
+{
+    int64_t N;
+    int dist, std_dev, c, size_l, size_r, is_hap = 0;
+    FILE *fpout1, *fpout2, *fp_fa;
+
+    N = 1000000; dist = 500; std_dev = 50;
+    size_l = size_r = 70;
+    while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
+        switch (c) {
+        case 'd': dist = atoi(optarg); break;
+        case 's': std_dev = atoi(optarg); break;
+        case 'N': N = atoll(optarg); break;
+        case '1': size_l = atoi(optarg); break;
+        case '2': size_r = atoi(optarg); break;
+        case 'e': ERR_RATE = atof(optarg); break;
+        case 'r': MUT_RATE = atof(optarg); break;
+        case 'R': INDEL_FRAC = atof(optarg); break;
+        case 'X': INDEL_EXTEND = atof(optarg); break;
+        case 'c': IS_SOLID = 1; break;
+        case 'C': SHOW_MM_INFO = 0; break;
+        case 'h': is_hap = 1; break;
+        }
+    }
+    if (argc - optind < 3) return simu_usage();
+    fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
+    fpout1 = xopen(argv[optind+1], "w");
+    fpout2 = xopen(argv[optind+2], "w");
+    wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
+
+    fclose(fpout1); fclose(fpout2); fclose(fp_fa);
+    return 0;
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.3
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+# Script entry point: run the evaluation, then terminate.
+&wgsim_eval;
+exit;
+
+# Evaluate a SAM file of wgsim-simulated reads.
+#
+# For every mapped read whose name has the wgsim form <chr>_<pos1>_<pos2>_...,
+# the alignment is "correct" when it is on <chr> and the relevant end lies
+# within $gap bases of the simulated coordinate. Reads are binned by
+# int(mapQ/10); the final table prints, from the highest bin down, the wrong
+# and total counts of the bin, the cumulative total and the cumulative error
+# rate. Options: -p echoes wrong alignments with mapQ bin > 0 to STDERR,
+# -c applies the color-space (F3/R3) end rules.
+#
+# Changes relative to the previous version: SAM header lines (starting with
+# '@') and lines with fewer than six fields are skipped instead of being
+# parsed as alignments, and the error rate is printed as 0 when no read has
+# been counted yet instead of dying on a division by zero.
+sub wgsim_eval {
+  my %opts;
+  getopts('pc', \%opts);
+  die("Usage: wgsim_eval.pl [-pc] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
+  my (@c0, @c1);
+  my ($max_q, $flag) = (0, 0);
+  my $gap = 5;
+  $flag |= 1 if (defined $opts{p});
+  $flag |= 2 if (defined $opts{c});
+  while (<>) {
+    next if (/^\@/); # header line, not an alignment
+    my @t = split;
+    next if (@t < 6); # too short to carry mapQ and CIGAR
+    my $line = $_;
+    my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]);
+    $max_q = $q if ($q > $max_q);
+    # right coordinate
+    $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg;
+    --$rght;
+    # correct for soft clipping
+    $left -= $1 if (/^(\d+)S/);
+    $rght += $1 if (/(\d+)S$/);
+    # skip unmapped reads
+    next if (($t[1]&0x4) || $chr eq '*');
+    # parse read name and check
+    if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) {
+      if ($1 ne $chr) { # different chr
+        $is_correct = 0;
+      } else {
+        if ($flag & 2) {
+          if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward
+            $is_correct = 0 if (abs($2 - $left) > $gap);
+          } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse
+            $is_correct = 0 if (abs($3 - $rght) > $gap);
+          } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward
+            $is_correct = 0 if (abs($3 - $left) > $gap);
+          } else { # R3, reverse
+            $is_correct = 0 if (abs($2 - $rght) > $gap);
+          }
+        } else {
+          if ($t[1] & 0x10) { # reverse
+            $is_correct = 0 if (abs($3 - $rght) > $gap); # in case of indels that are close to the end of a reads
+          } else {
+            $is_correct = 0 if (abs($2 - $left) > $gap);
+          }
+        }
+      }
+    } else {
+      warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n");
+      next;
+    }
+    ++$c0[$q];
+    ++$c1[$q] unless ($is_correct);
+    print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
+  }
+  # print
+  my ($cc0, $cc1) = (0, 0);
+  for (my $i = $max_q; $i >= 0; --$i) {
+    $c0[$i] = 0 unless (defined $c0[$i]);
+    $c1[$i] = 0 unless (defined $c1[$i]);
+    $cc0 += $c0[$i]; $cc1 += $c1[$i];
+    # $cc0 stays 0 while the bins seen so far are empty
+    my $rate = $cc0 ? $cc1/$cc0 : 0;
+    printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $rate);
+  }
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.0
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+# Script entry point: run the converter, then terminate.
+&zoom2sam;
+exit;
+
+# Cross-link two SAM records (array refs) that belong to the same read pair.
+# Fills fields 6..8 (mate reference, mate position, insert size) of each
+# record from the other one, sets 0x20 when the mate is on the reverse strand
+# and 0x8 when the mate has no reference ('*'). The insert size is only
+# computed when both records are on the same, known reference.
+sub mating {
+  my ($s1, $s2) = @_;
+  my ($chr1, $chr2) = ($s1->[2], $s2->[2]);
+  my $same_chr = ($chr1 eq $chr2);
+  my $isize = 0;
+  if ($chr1 ne '*' && $same_chr) {
+    # for a reverse-strand record the coordinate is advanced by the length of field 9
+    my ($x1, $x2) = ($s1->[3], $s2->[3]);
+    $x1 += length($s1->[9]) if ($s1->[1] & 0x10);
+    $x2 += length($s2->[9]) if ($s2->[1] & 0x10);
+    $isize = $x2 - $x1;
+  }
+  # strand bits are read before any flag is modified; 0x8/0x20 do not touch 0x10
+  my ($rev1, $rev2) = ($s1->[1] & 0x10, $s2->[1] & 0x10);
+  # first record receives the second record's coordinates
+  if ($chr2 eq '*') {
+    $s1->[1] |= 0x8;
+  } else {
+    @$s1[6..8] = ($same_chr ? "=" : $chr2, $s2->[3], $isize);
+    $s1->[1] |= 0x20 if ($rev2);
+  }
+  # second record receives the first record's coordinates
+  if ($chr1 eq '*') {
+    $s2->[1] |= 0x8;
+  } else {
+    @$s2[6..8] = ($same_chr ? "=" : $chr1, $s1->[3], -$isize);
+    $s2->[1] |= 0x20 if ($rev1);
+  }
+}
+
+# Convert ZOOM alignments (files named in @ARGV after the read length) to SAM
+# on STDOUT. Two consecutive records with the same read name are treated as a
+# pair and cross-linked with mating(); every other record is printed on its
+# own. Option -p marks all reads as properly paired (handled in
+# zoom2sam_aux).
+sub zoom2sam {
+  my %opts = ();
+  getopts("p", \%opts);
+  if (@ARGV < 2) {
+    die("Usage: zoom2sam.pl [-p] <readLen> <aln.zoom>
+Warnings: This script only supports the default Illumina outputs.\n");
+  }
+  my $is_paired = defined($opts{p});
+  my $len = shift(@ARGV);
+  # two record buffers: $prev holds the pending record, $cur is filled next
+  my (@buf_a, @buf_b);
+  my ($prev, $cur) = (\@buf_a, \@buf_b);
+  while (my $line = <>) {
+    &zoom2sam_aux($line, $cur, $is_paired, $len);
+    my $is_mate = (@$prev != 0 && $prev->[0] eq $cur->[0]);
+    if ($is_mate) {
+      &mating($prev, $cur);
+      print join("\t", @$prev), "\n";
+      print join("\t", @$cur), "\n";
+      @$prev = ();
+      @$cur = ();
+    } else {
+      # flush the pending record, then make the new one pending
+      print join("\t", @$prev), "\n" unless (@$prev == 0);
+      ($prev, $cur) = ($cur, $prev);
+    }
+  }
+  print join("\t", @$prev), "\n" unless (@$prev == 0);
+}
+
+# Parse one tab-separated ZOOM line into the SAM record array referenced by
+# $s (fields 0..10 plus an NM tag). The read name is column 0, with a
+# trailing _F / _R marking first / second read of a pair; the reference name
+# is the first whitespace-separated word of column 1 and the 0-based position
+# follows the last ':' of that column; column 2 is the strand and column 3
+# the mismatch count. CIGAR is always <$len>M and mapQ is fixed at 30.
+sub zoom2sam_aux {
+  my ($line, $s, $is_paired, $len) = @_;
+  chomp($line);
+  my @t = split("\t", $line);
+  my $name = $t[0];
+  my $is_first = ($name =~ /_F$/);
+  my $is_second = ($name =~ /_R$/);
+  # initial flag (mate information is added later by mating())
+  my $flag = 0;
+  $flag |= 1 | 1<<6 if ($is_first);
+  $flag |= 1 | 1<<7 if ($is_second);
+  $flag |= 2 if ($is_paired);
+  # reference name and 1-based position
+  my @coor = split(/\s+/, $t[1]);
+  $t[1] =~ /:(\d+)$/;
+  my $pos = $1 + 1;
+  # strand
+  if ($name =~ /_[FR]$/) {
+    my $u = ($name =~ /_F$/)? 1 : 0;
+    my $w = ($t[2] eq '+')? 1 : 0;
+    $flag |= 0x10 if ($u ^ $w);
+    $name =~ s/_[FR]$//;
+  } elsif ($t[2] eq '-') {
+    $flag |= 0x10;
+  }
+  # name, flag, ref, pos, mapQ, cigar, mate ref, mate pos, isize, seq, qual, aux
+  @$s = ($name, $flag, $coor[0], $pos, 30, $len . "M", '*', 0, 0, "*", "*");
+  push(@$s, "NM:i:$t[3]");
+}
--- /dev/null
+/*
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NO_RAZF
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "razf.h"
+
+/*
+ * Fallback for zlib releases with ZLIB_VERNUM < 0x1221: declare a local
+ * struct of gzip-header fields (presumably mirroring zlib's gz_header, which
+ * such releases do not provide -- confirm against razf.h). As the #warning
+ * states, RAZF writing is disabled with these zlib versions.
+ */
+#if ZLIB_VERNUM < 0x1221
+struct _gz_header_s {
+ int text;
+ uLong time;
+ int xflags;
+ int os;
+ Bytef *extra;
+ uInt extra_len;
+ uInt extra_max;
+ Bytef *name;
+ uInt name_max;
+ Bytef *comment;
+ uInt comm_max;
+ int hcrc;
+ int done;
+};
+#warning "zlib < 1.2.2.1; RAZF writing is disabled."
+#endif
+
+/* memLevel argument passed to deflateInit2() when a file is opened for writing */
+#define DEF_MEM_LEVEL 8
+
+/* Reverse the byte order of a 32-bit value. */
+static inline uint32_t byte_swap_4(uint32_t v){
+    const uint32_t b0 = v & 0xFFU;
+    const uint32_t b1 = (v >> 8) & 0xFFU;
+    const uint32_t b2 = (v >> 16) & 0xFFU;
+    const uint32_t b3 = v >> 24;
+    return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+}
+
+/* Reverse the byte order of a 64-bit value. */
+static inline uint64_t byte_swap_8(uint64_t v){
+    uint64_t swapped = 0;
+    int k;
+    /* peel bytes off the low end of v and push them onto the low end of the result */
+    for(k = 0; k < 8; k++){
+        swapped = (swapped << 8) | (v & 0xFFU);
+        v >>= 8;
+    }
+    return swapped;
+}
+
+/* Return non-zero when the host stores the most significant byte of an int first. */
+static inline int is_big_endian(){
+    const int probe = 0x01;
+    const char *first_byte = (const char*)&probe;
+    /* on a little-endian host the low byte (0x01) comes first */
+    return *first_byte != 0x01;
+}
+
+#ifndef _RZ_READONLY
+/*
+ * Append one block boundary to the in-memory index. `out` is the compressed
+ * offset of the boundary; it is stored as a 64-bit base offset once every
+ * RZ_BIN_SIZE entries plus a per-entry offset relative to that base. `in` is
+ * not used. The arrays grow by a factor of 1.5 (+2) when full.
+ * NOTE(review): the realloc() results are not checked.
+ */
+static void add_zindex(RAZF *rz, int64_t in, int64_t out){
+    int slot, bin;
+    if(rz->index->size == rz->index->cap){
+        rz->index->cap = rz->index->cap * 1.5 + 2;
+        rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap);
+        rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1));
+    }
+    slot = rz->index->size;
+    bin = slot / RZ_BIN_SIZE;
+    /* the first entry of a bin defines the bin's base offset */
+    if(slot % RZ_BIN_SIZE == 0) rz->index->bin_offsets[bin] = out;
+    rz->index->cell_offsets[slot] = out - rz->index->bin_offsets[bin];
+    rz->index->size ++;
+}
+
+/*
+ * Write the block index to `fd` in big-endian byte order: the entry count,
+ * then the bin base offsets (64-bit), then the per-entry offsets (32-bit).
+ * On a little-endian host the in-memory arrays are byte-swapped in place
+ * before being written, so the index is not usable afterwards.
+ * NOTE(review): the write() results are not checked.
+ */
+static void save_zindex(RAZF *rz, int fd){
+    int32_t k, n_bins, swapped_size;
+    const int host_is_be = is_big_endian();
+    if(host_is_be){
+        write(fd, &rz->index->size, sizeof(int));
+    } else {
+        swapped_size = byte_swap_4((uint32_t)rz->index->size);
+        write(fd, &swapped_size, sizeof(uint32_t));
+    }
+    n_bins = rz->index->size / RZ_BIN_SIZE + 1;
+    if(!host_is_be){
+        for(k = 0; k < n_bins; k++)
+            rz->index->bin_offsets[k] = byte_swap_8((uint64_t)rz->index->bin_offsets[k]);
+        for(k = 0; k < rz->index->size; k++)
+            rz->index->cell_offsets[k] = byte_swap_4((uint32_t)rz->index->cell_offsets[k]);
+    }
+    write(fd, rz->index->bin_offsets, sizeof(int64_t) * n_bins);
+    write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size);
+}
+#endif
+
+/*
+ * Read the block index from the current position of `fd` (counterpart of
+ * save_zindex): entry count, bin base offsets, per-entry offsets, all stored
+ * big-endian and converted to host order here. Does nothing when
+ * rz->load_index is zero.
+ * NOTE(review): the read() and malloc() results are not checked.
+ */
+static void load_zindex(RAZF *rz, int fd){
+    int32_t k, n_bins;
+    int need_swap;
+    if(!rz->load_index) return;
+    if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
+    need_swap = !is_big_endian();
+    /* entry count */
+    read(fd, &rz->index->size, sizeof(int));
+    if(need_swap) rz->index->size = byte_swap_4((uint32_t)rz->index->size);
+    rz->index->cap = rz->index->size;
+    n_bins = rz->index->size / RZ_BIN_SIZE + 1;
+    /* bin base offsets */
+    rz->index->bin_offsets = malloc(sizeof(int64_t) * n_bins);
+    read(fd, rz->index->bin_offsets, sizeof(int64_t) * n_bins);
+    /* per-entry offsets */
+    rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size);
+    read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size);
+    if(need_swap){
+        for(k = 0; k < n_bins; k++)
+            rz->index->bin_offsets[k] = byte_swap_8((uint64_t)rz->index->bin_offsets[k]);
+        for(k = 0; k < rz->index->size; k++)
+            rz->index->cell_offsets[k] = byte_swap_4((uint32_t)rz->index->cell_offsets[k]);
+    }
+}
+
+#ifdef _RZ_READONLY
+/* Read-only build: writing is unavailable, so report that and return a null handle. */
+static RAZF* razf_open_w(int fd)
+{
+    RAZF *none = 0;
+    fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n");
+    return none;
+}
+#else
+/*
+ * Create a RAZF handle that writes a gzip stream to `fd`. The gzip header
+ * carries a 7-byte extra field: the tag "RAZF", one obsolete byte (1) and
+ * RZ_BLOCK_SIZE as a big-endian 16-bit number.
+ * NOTE(review): allocation and deflateInit2() results are not checked.
+ */
+static RAZF* razf_open_w(int fd){
+    RAZF *rz = calloc(1, sizeof(RAZF));
+    rz->mode = 'w';
+    rz->filedes = fd;
+    rz->stream = calloc(sizeof(z_stream), 1);
+    rz->inbuf = malloc(RZ_BUFFER_SIZE);
+    rz->outbuf = malloc(RZ_BUFFER_SIZE);
+    rz->index = calloc(sizeof(ZBlockIndex), 1);
+    /* WINDOW_BITS + 16 selects gzip framing in zlib */
+    deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+    rz->stream->next_out = rz->outbuf;
+    rz->stream->avail_out = RZ_BUFFER_SIZE;
+
+    /* gzip header with the RAZF extra field */
+    rz->header = calloc(sizeof(gz_header), 1);
+    rz->header->text = 0;
+    rz->header->time = 0;
+    rz->header->os = 0x03; //Unix
+    rz->header->hcrc = 0;
+    rz->header->name = rz->header->comment = 0;
+    rz->header->extra_len = 7;
+    rz->header->extra = malloc(7);
+    memcpy(rz->header->extra, "RAZF", 4); /* tag, no terminating NUL */
+    rz->header->extra[4] = 1; // obsolete field
+    // block size = RZ_BLOCK_SIZE, Big-Endian
+    rz->header->extra[5] = RZ_BLOCK_SIZE >> 8;
+    rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF;
+    deflateSetHeader(rz->stream, rz->header);
+
+    rz->block_pos = rz->block_off = 0;
+    return rz;
+}
+
+/*
+ * Feed `size` bytes of `data` to deflate() with Z_NO_FLUSH. Each time the
+ * output buffer fills up it is written to rz->filedes and reset. rz->out is
+ * advanced by the compressed bytes produced; rz->in and rz->block_off are
+ * advanced by the input bytes consumed.
+ * NOTE(review): the deflate() and write() results are not checked.
+ */
+static void _razf_write(RAZF* rz, const void *data, int size){
+ int tout;
+ rz->stream->avail_in = size;
+ rz->stream->next_in = (void*)data;
+ while(1){
+ tout = rz->stream->avail_out;
+ deflate(rz->stream, Z_NO_FLUSH);
+ rz->out += tout - rz->stream->avail_out;
+ /* output buffer not full: leave the loop, the pending output stays buffered */
+ if(rz->stream->avail_out) break;
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ if(rz->stream->avail_in == 0) break;
+ };
+ rz->in += size - rz->stream->avail_in;
+ rz->block_off += size - rz->stream->avail_in;
+}
+
+/*
+ * End the current block: compress whatever is still in the input buffer,
+ * write out the pending compressed bytes, then call deflate() with
+ * Z_FULL_FLUSH until it no longer fills the output buffer. Afterwards
+ * rz->block_pos is the compressed offset reached (rz->out) and
+ * rz->block_off is reset to 0.
+ * NOTE(review): the deflate() and write() results are not checked.
+ */
+static void razf_flush(RAZF *rz){
+ uint32_t tout;
+ if(rz->buf_len){
+ _razf_write(rz, rz->inbuf, rz->buf_len);
+ rz->buf_off = rz->buf_len = 0;
+ }
+ /* write out compressed bytes left over from earlier deflate() calls */
+ if(rz->stream->avail_out){
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ }
+ while(1){
+ tout = rz->stream->avail_out;
+ deflate(rz->stream, Z_FULL_FLUSH);
+ rz->out += tout - rz->stream->avail_out;
+ /* a completely filled buffer means deflate() may have more to emit */
+ if(rz->stream->avail_out == 0){
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ } else break;
+ }
+ rz->block_pos = rz->out;
+ rz->block_off = 0;
+}
+
+/*
+ * Finish the compressed stream: compress the remaining buffered input, then
+ * call deflate() with Z_FINISH and write its output until a call leaves the
+ * output buffer empty.
+ * NOTE(review): the deflate() and write() results are not checked.
+ */
+static void razf_end_flush(RAZF *rz){
+ uint32_t tout;
+ if(rz->buf_len){
+ _razf_write(rz, rz->inbuf, rz->buf_len);
+ rz->buf_off = rz->buf_len = 0;
+ }
+ while(1){
+ tout = rz->stream->avail_out;
+ deflate(rz->stream, Z_FINISH);
+ rz->out += tout - rz->stream->avail_out;
+ /* anything in the output buffer (including earlier leftovers) is written out */
+ if(rz->stream->avail_out < RZ_BUFFER_SIZE){
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ } else break;
+ }
+}
+
+/*
+ * Append `size` bytes of `data` to the input buffer rz->inbuf, handing the
+ * buffer to _razf_write() every time it becomes full (RZ_BUFFER_SIZE bytes).
+ *
+ * Changes relative to the previous version: the source pointer is advanced
+ * through a `const char *` (arithmetic on `void *` is a compiler extension,
+ * not ISO C) and the byte-by-byte copy loops are replaced with memcpy().
+ */
+static void _razf_buffered_write(RAZF *rz, const void *data, int size){
+    const char *src = (const char*)data;
+    int n;
+    while(1){
+        if(rz->buf_len == RZ_BUFFER_SIZE){
+            _razf_write(rz, rz->inbuf, rz->buf_len);
+            rz->buf_len = 0;
+        }
+        if(size + rz->buf_len < RZ_BUFFER_SIZE){
+            /* everything that is left fits: buffer it and stop */
+            memcpy((char*)rz->inbuf + rz->buf_len, src, size);
+            rz->buf_len += size;
+            return;
+        }
+        /* fill the buffer to the brim; the next iteration flushes it */
+        n = RZ_BUFFER_SIZE - rz->buf_len;
+        memcpy((char*)rz->inbuf + rz->buf_len, src, n);
+        size -= n;
+        src += n;
+        rz->buf_len += n;
+    }
+}
+
+/*
+ * Write `size` bytes of `data` to a RAZF stream opened for writing and
+ * return `size`. Whenever the uncompressed position reaches a multiple of
+ * RZ_BLOCK_SIZE the stream is flushed (razf_flush) and the boundary is
+ * recorded in the index (add_zindex).
+ *
+ * Change relative to the previous version: the source pointer is advanced
+ * through a `const char *`; arithmetic on `void *` is a compiler extension,
+ * not ISO C.
+ */
+int razf_write(RAZF* rz, const void *data, int size){
+    const char *src = (const char*)data;
+    int ori_size, n;
+    int64_t next_block;
+    ori_size = size;
+    next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+    while(rz->in + rz->buf_len + size >= next_block){
+        /* write exactly up to the block boundary, then close the block */
+        n = next_block - rz->in - rz->buf_len;
+        _razf_buffered_write(rz, src, n);
+        src += n;
+        size -= n;
+        razf_flush(rz);
+        add_zindex(rz, rz->in, rz->out);
+        next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+    }
+    _razf_buffered_write(rz, src, size);
+    return ori_size;
+}
+#endif
+
+/* gzip flag byte */
+#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
+#define COMMENT 0x10 /* bit 4 set: file comment present */
+#define RESERVED 0xE0 /* bits 5..7: reserved */
+
+/*
+ * Parse a gzip header held in data[0..size). Returns the header length in
+ * bytes, or 0 when the buffer does not start with a deflate-method gzip
+ * header or an extra field / header CRC does not fit in the buffer. Once the
+ * method and flag bytes have been accepted, *extra_off and *extra_len
+ * receive the offset and length of the extra-field payload (length 0 when
+ * the header has none).
+ */
+static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){
+    int flags, cursor, payload;
+    if(size < 2 || data[0] != 0x1f || data[1] != 0x8b) return 0;
+    if(size < 4) return 0;
+    flags = data[3];
+    if(data[2] != Z_DEFLATED) return 0;
+    if(flags & RESERVED) return 0;
+    cursor = 4 + 6; // Skip 6 bytes
+    *extra_off = cursor + 2;
+    *extra_len = 0;
+    if(flags & EXTRA_FIELD){
+        if(size < cursor + 2) return 0;
+        /* payload length is a little-endian 16-bit number */
+        payload = data[cursor] | ((int)data[cursor + 1] << 8);
+        cursor += 2;
+        *extra_off = cursor;
+        /* the whole payload has to be inside the buffer */
+        if(payload > size - cursor) return 0;
+        cursor += payload;
+        *extra_len = payload;
+    }
+    if(flags & ORIG_NAME){
+        /* skip the NUL-terminated file name */
+        while(cursor < size){
+            if(data[cursor++] == 0) break;
+        }
+    }
+    if(flags & COMMENT){
+        /* skip the NUL-terminated comment */
+        while(cursor < size){
+            if(data[cursor++] == 0) break;
+        }
+    }
+    if(flags & HEAD_CRC){
+        if(cursor + 2 > size) return 0;
+        cursor += 2;
+    }
+    return cursor;
+}
+
+/*
+ * Create a RAZF handle that reads from `fd`. The first RZ_BUFFER_SIZE bytes
+ * decide the file type:
+ *  - no gzip header (or inflateInit2 failure): FILE_TYPE_PLAIN, the bytes
+ *    already read are served from rz->outbuf;
+ *  - gzip header without a matching "RAZF" extra field, or with a block
+ *    size other than RZ_BLOCK_SIZE: FILE_TYPE_GZ;
+ *  - otherwise FILE_TYPE_RZ. The last 16 bytes of the file hold two
+ *    big-endian 64-bit numbers (stored in rz->src_end and rz->end); the
+ *    index is read from offset rz->end by load_zindex(), which does nothing
+ *    when _load_index is zero.
+ * NOTE(review): `fd` validity and the read()/malloc() results are not
+ * checked; a failed first read() leaves n negative.
+ */
+static RAZF* razf_open_r(int fd, int _load_index){
+ RAZF *rz;
+ int ext_off, ext_len;
+ int n, is_be, ret;
+ int64_t end;
+ unsigned char c[] = "RAZF";
+ rz = calloc(1, sizeof(RAZF));
+ rz->mode = 'r';
+ rz->filedes = fd;
+ rz->stream = calloc(sizeof(z_stream), 1);
+ rz->inbuf = malloc(RZ_BUFFER_SIZE);
+ rz->outbuf = malloc(RZ_BUFFER_SIZE);
+ /* "unknown" marker; razf_get_data_size() tests for this value */
+ rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL;
+ n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+ ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len);
+ if(ret == 0){
+ PLAIN_FILE:
+ /* not gzip: hand the bytes already read to the reader unchanged */
+ rz->in = n;
+ rz->file_type = FILE_TYPE_PLAIN;
+ memcpy(rz->outbuf, rz->inbuf, n);
+ rz->buf_len = n;
+ free(rz->stream);
+ rz->stream = NULL;
+ return rz;
+ }
+ rz->header_size = ret;
+ /* the gzip header was parsed above, so inflate is set up with -WINDOW_BITS */
+ ret = inflateInit2(rz->stream, -WINDOW_BITS);
+ if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
+ rz->stream->avail_in = n - rz->header_size;
+ rz->stream->next_in = rz->inbuf + rz->header_size;
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ rz->file_type = FILE_TYPE_GZ;
+ rz->in = rz->header_size;
+ rz->block_pos = rz->header_size;
+ rz->next_block_pos = rz->header_size;
+ rz->block_off = 0;
+ /* a RAZF file carries an extra field of at least 7 bytes starting with "RAZF" */
+ if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz;
+ /* bytes 5 and 6 of the extra field: block size, big-endian */
+ if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){
+ fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
+ return rz;
+ }
+ rz->load_index = _load_index;
+ rz->file_type = FILE_TYPE_RZ;
+ if(lseek(fd, -16, SEEK_END) == -1){
+ UNSEEKABLE:
+ /* no trailer available: no index, end positions stay "unknown" */
+ rz->seekable = 0;
+ rz->index = NULL;
+ rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
+ } else {
+ is_be = is_big_endian();
+ rz->seekable = 1;
+ /* trailer: two big-endian int64 values, src_end first, then end */
+ read(fd, &end, sizeof(int64_t));
+ if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end);
+ else rz->src_end = end;
+ read(fd, &end, sizeof(int64_t));
+ if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
+ else rz->end = end;
+ /* the first read went past rz->end: drop the surplus from the inflate input */
+ if(n > rz->end){
+ rz->stream->avail_in -= n - rz->end;
+ n = rz->end;
+ }
+ if(rz->end > rz->src_end){
+ lseek(fd, rz->in, SEEK_SET);
+ goto UNSEEKABLE;
+ }
+ if(lseek(fd, rz->end, SEEK_SET) != rz->end){
+ lseek(fd, rz->in, SEEK_SET);
+ goto UNSEEKABLE;
+ }
+ load_zindex(rz, fd);
+ /* continue reading right after the bytes already buffered */
+ lseek(fd, n, SEEK_SET);
+ }
+ return rz;
+}
+
+/*
+ * Wrap an already opened descriptor. `mode` is "r" or "w" (case-insensitive);
+ * any other mode yields NULL. Read handles load the block index.
+ */
+RAZF* razf_dopen(int fd, const char *mode){
+    if(!strcasecmp(mode, "w")) return razf_open_w(fd);
+    if(!strcasecmp(mode, "r")) return razf_open_r(fd, 1);
+    return NULL;
+}
+
+/*
+ * Same as razf_dopen(), except that a read handle does not load the block
+ * index.
+ */
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+    if(!strcasecmp(mode, "w")) return razf_open_w(fd);
+    if(!strcasecmp(mode, "r")) return razf_open_r(fd, 0);
+    return NULL;
+}
+
+/*
+ * Open `filename` and wrap it in a RAZF handle. `mode` is "r" or "w"
+ * (case-insensitive); "w" creates or truncates the file with permissions
+ * 0644. `_load_index` is forwarded to razf_open_r(). Returns NULL for an
+ * unknown mode, when the file cannot be opened, or when no handle could be
+ * created.
+ *
+ * Changes relative to the previous version: a failed open() now yields NULL
+ * instead of passing descriptor -1 on to the reader/writer setup, and the
+ * descriptor is closed again when the setup returns no handle.
+ */
+static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
+    int fd;
+    RAZF *rz;
+    if(strcasecmp(mode, "r") == 0){
+        fd = open(filename, O_RDONLY);
+        if(fd < 0) return NULL;
+        rz = razf_open_r(fd, _load_index);
+    } else if(strcasecmp(mode, "w") == 0){
+        fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+        if(fd < 0) return NULL;
+        rz = razf_open_w(fd);
+    } else return NULL;
+    if(rz == NULL) close(fd); /* nobody owns the descriptor otherwise */
+    return rz;
+}
+
+/* Open `filename` in mode "r" or "w"; a read handle loads the block index. */
+RAZF* razf_open(const char *filename, const char *mode){
+ return _razf_open(filename, mode, 1);
+}
+
+/* Open `filename` in mode "r" or "w"; a read handle does not load the block index. */
+RAZF* razf_open2(const char *filename, const char *mode){
+ return _razf_open(filename, mode, 0);
+}
+
+/*
+ * Report the uncompressed (*u_size) and compressed (*c_size) data sizes of a
+ * handle opened for reading. Returns 1 when the sizes were stored, 0
+ * otherwise (write handle, FILE_TYPE_GZ, an RZ file whose two end positions
+ * are equal, or a plain file whose position cannot be queried). For a plain
+ * file both sizes are the file length, determined with lseek() on first use.
+ */
+int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){
+    if(rz->mode != 'r' && rz->mode != 'R') return 0;
+    if(rz->file_type == FILE_TYPE_PLAIN){
+        if(rz->end == 0x7fffffffffffffffLL){
+            /* length not known yet: seek to the end and back */
+            int64_t here = lseek(rz->filedes, 0, SEEK_CUR);
+            if(here == -1) return 0;
+            rz->end = lseek(rz->filedes, 0, SEEK_END);
+            lseek(rz->filedes, here, SEEK_SET);
+        }
+        *u_size = *c_size = rz->end;
+        return 1;
+    }
+    if(rz->file_type == FILE_TYPE_RZ && rz->src_end != rz->end){
+        *u_size = rz->src_end;
+        *c_size = rz->end;
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Produce up to `size` bytes of uncompressed data in `data` and return the
+ * number of bytes produced. For a plain file this is a read() on the
+ * descriptor. Otherwise compressed input is refilled from the descriptor
+ * (never reading past rz->end) and passed through inflate() with Z_BLOCK;
+ * the loop stops early when inflate() reports a block boundary, in which
+ * case rz->buf_flush is set and rz->next_block_pos records the compressed
+ * position. rz->z_eof is set at end of data, rz->z_err on an inflate error.
+ *
+ * Change relative to the previous version: a failed read() (-1) is treated
+ * as end of data. Before, the -1 was returned as a length for plain files
+ * and stored in the unsigned avail_in for compressed files, which made
+ * inflate() consume bytes far beyond the input buffer.
+ */
+static int _razf_read(RAZF* rz, void *data, int size){
+    int ret, tin, got;
+    if(rz->z_eof || rz->z_err) return 0;
+    if (rz->file_type == FILE_TYPE_PLAIN) {
+        ret = read(rz->filedes, data, size);
+        if (ret <= 0) { /* end of file or read error */
+            rz->z_eof = 1;
+            return 0;
+        }
+        return ret;
+    }
+    rz->stream->avail_out = size;
+    rz->stream->next_out = data;
+    while(rz->stream->avail_out){
+        if(rz->stream->avail_in == 0){
+            if(rz->in >= rz->end){ rz->z_eof = 1; break; }
+            /* refill, but never read beyond the end of the compressed data */
+            if(rz->end - rz->in < RZ_BUFFER_SIZE){
+                got = read(rz->filedes, rz->inbuf, rz->end -rz->in);
+            } else {
+                got = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+            }
+            if(got <= 0){ /* end of file or read error */
+                rz->z_eof = 1;
+                break;
+            }
+            rz->stream->avail_in = got;
+            rz->stream->next_in = rz->inbuf;
+        }
+        tin = rz->stream->avail_in;
+        ret = inflate(rz->stream, Z_BLOCK);
+        rz->in += tin - rz->stream->avail_in;
+        if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
+            fprintf(stderr, "[_razf_read] inflate error: %d (at %s:%d)\n", ret, __FILE__, __LINE__);
+            rz->z_err = 1;
+            break;
+        }
+        if(ret == Z_STREAM_END){
+            rz->z_eof = 1;
+            break;
+        }
+        /* data_type bit 7 set and bit 6 clear: inflate stopped at a block
+           boundary that is not the end of the last block */
+        if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){
+            rz->buf_flush = 1;
+            rz->next_block_pos = rz->in;
+            break;
+        }
+    }
+    return size - rz->stream->avail_out;
+}
+
+/*
+ * Read up to `size` uncompressed bytes into `data`; returns the number of
+ * bytes delivered, which is smaller than `size` only at end of data or after
+ * an error. Data is served from rz->outbuf, which is refilled through
+ * _razf_read(). rz->block_off counts the bytes delivered since the last
+ * block boundary; when a buffer that ended on a boundary has been used up,
+ * rz->block_pos moves to the new block and rz->block_off restarts at 0.
+ *
+ * Changes relative to the previous version:
+ *  - the loop also stops when rz->z_err is set and no data is left; before,
+ *    an inflate error without end-of-file made the loop spin forever;
+ *  - a negative refill result is treated as an error instead of being used
+ *    as a length;
+ *  - the destination is advanced through a `char *` (arithmetic on `void *`
+ *    is a compiler extension) and the copies use memcpy().
+ */
+int razf_read(RAZF *rz, void *data, int size){
+    char *dst = (char*)data;
+    int ori_size;
+    ori_size = size;
+    while(size > 0){
+        if(rz->buf_len){
+            if(size < rz->buf_len){
+                /* request satisfied from the buffer, some data stays behind */
+                memcpy(dst, (char*)rz->outbuf + rz->buf_off, size);
+                rz->buf_off += size;
+                rz->buf_len -= size;
+                dst += size;
+                rz->block_off += size;
+                size = 0;
+                break;
+            } else {
+                /* drain the buffer completely */
+                memcpy(dst, (char*)rz->outbuf + rz->buf_off, rz->buf_len);
+                dst += rz->buf_len;
+                size -= rz->buf_len;
+                rz->block_off += rz->buf_len;
+                rz->buf_off = 0;
+                rz->buf_len = 0;
+                if(rz->buf_flush){
+                    rz->block_pos = rz->next_block_pos;
+                    rz->block_off = 0;
+                    rz->buf_flush = 0;
+                }
+            }
+        } else if(rz->buf_flush){
+            rz->block_pos = rz->next_block_pos;
+            rz->block_off = 0;
+            rz->buf_flush = 0;
+        }
+        if(rz->buf_flush) continue;
+        rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+        if(rz->buf_len < 0){ rz->buf_len = 0; rz->z_err = 1; }
+        if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break;
+    }
+    rz->out += ori_size - size;
+    return ori_size - size;
+}
+
+/*
+ * Discard up to `size` uncompressed bytes; returns the number of bytes
+ * skipped. Performs the same bookkeeping as razf_read() (rz->out,
+ * rz->block_off, rz->block_pos) without copying any data.
+ *
+ * Changes relative to the previous version:
+ *  - rz->block_off is advanced before rz->buf_len is cleared; it used to be
+ *    advanced afterwards, i.e. by zero, which left the block offset wrong
+ *    after skipping a whole buffer;
+ *  - data that arrives together with the end-of-file condition is consumed
+ *    instead of being left in the buffer while the skip is cut short, which
+ *    matches the loop exit used by razf_read();
+ *  - the loop also stops on rz->z_err (it could spin forever after an
+ *    inflate error), and a negative refill result is treated as an error.
+ */
+int razf_skip(RAZF* rz, int size){
+    int ori_size;
+    ori_size = size;
+    while(size > 0){
+        if(rz->buf_len){
+            if(size < rz->buf_len){
+                rz->buf_off += size;
+                rz->buf_len -= size;
+                rz->block_off += size;
+                size = 0;
+                break;
+            } else {
+                size -= rz->buf_len;
+                rz->block_off += rz->buf_len;
+                rz->buf_off = 0;
+                rz->buf_len = 0;
+                if(rz->buf_flush){
+                    rz->block_pos = rz->next_block_pos;
+                    rz->block_off = 0;
+                    rz->buf_flush = 0;
+                }
+            }
+        } else if(rz->buf_flush){
+            rz->block_pos = rz->next_block_pos;
+            rz->block_off = 0;
+            rz->buf_flush = 0;
+        }
+        if(rz->buf_flush) continue;
+        rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+        if(rz->buf_len < 0){ rz->buf_len = 0; rz->z_err = 1; }
+        if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break;
+    }
+    rz->out += ori_size - size;
+    return ori_size - size;
+}
+
+/*
+ * Restart decompression at compressed offset `in`, which corresponds to
+ * uncompressed offset `out`: reposition the descriptor, reset the inflate
+ * state, drop all buffered data and clear the end-of-file / error flags.
+ */
+static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){
+    lseek(rz->filedes, in, SEEK_SET);
+    /* positions */
+    rz->in = in;
+    rz->out = out;
+    rz->block_pos = in;
+    rz->next_block_pos = in;
+    rz->block_off = 0;
+    /* status flags and buffered output */
+    rz->buf_flush = 0;
+    rz->z_eof = 0;
+    rz->z_err = 0;
+    rz->buf_off = 0;
+    rz->buf_len = 0;
+    /* decompressor: fresh state, no pending input */
+    inflateReset(rz->stream);
+    rz->stream->avail_in = 0;
+}
+
+/*
+ * Move the read position to `block_offset` uncompressed bytes after the
+ * block that starts at compressed offset `block_start`. For a plain file
+ * this is an lseek() to block_start + block_offset, whose result is
+ * returned. Otherwise decompression restarts at block_start (0 is replaced
+ * by the header size) unless the target lies ahead within the current
+ * block, and rz->block_off is returned.
+ */
+int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){
+    int64_t pos;
+    rz->z_eof = 0;
+    if(rz->file_type == FILE_TYPE_PLAIN){
+        rz->buf_off = rz->buf_len = 0;
+        pos = lseek(rz->filedes, block_start + block_offset, SEEK_SET);
+        rz->out = rz->in = pos;
+        return pos;
+    }
+    if(block_start == rz->block_pos && block_offset >= rz->block_off){
+        /* target is ahead in the current block: needn't reset inflate */
+        block_offset -= rz->block_off;
+    } else {
+        if(block_start == 0) block_start = rz->header_size; // Automatically revise wrong block_start
+        _razf_reset_read(rz, block_start, 0);
+    }
+    if(block_offset) razf_skip(rz, block_offset);
+    return rz->block_off;
+}
+
+/*
+ * Move the read position to an offset expressed in uncompressed bytes.
+ *
+ * pos:   target offset, interpreted according to `where`
+ * where: SEEK_SET (absolute), SEEK_CUR (relative to rz->out) or
+ *        SEEK_END (relative to rz->src_end)
+ *
+ * Returns the resulting uncompressed position. When the request cannot be
+ * honoured (backward seek on a plain gzip stream, or a target beyond
+ * src_end) the current position is returned unchanged.
+ */
+int64_t razf_seek(RAZF* rz, int64_t pos, int where){
+ int64_t idx;
+ int64_t seek_pos, new_out;
+ rz->z_eof = 0;
+ /* Turn relative requests into an absolute uncompressed offset. */
+ if (where == SEEK_CUR) pos += rz->out;
+ else if (where == SEEK_END) pos += rz->src_end;
+ if(rz->file_type == FILE_TYPE_PLAIN){
+ /* Plain file: compressed and uncompressed offsets coincide. */
+ seek_pos = lseek(rz->filedes, pos, SEEK_SET);
+ rz->buf_off = rz->buf_len = 0;
+ rz->out = rz->in = seek_pos;
+ return seek_pos;
+ } else if(rz->file_type == FILE_TYPE_GZ){
+ /* Gzip without index: only forward movement, done by skipping. */
+ if(pos >= rz->out) goto SKIP;
+ return rz->out;
+ }
+ if(pos == rz->out) return pos;
+ if(pos > rz->src_end) return rz->out;
+ if(!rz->seekable || !rz->load_index){
+ if(pos >= rz->out) goto SKIP;
+ /* NOTE(review): a backward seek without a usable index falls through to
+ the index lookup below; confirm rz->index is always valid here. */
+ }
+ /* Locate the last indexed block boundary at or before pos; idx < 0 means
+ pos lies in the first block, which starts right after the header. */
+ idx = pos / RZ_BLOCK_SIZE - 1;
+ seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+ new_out = (idx + 1) * RZ_BLOCK_SIZE;
+ /* Forward seek whose block boundary is not ahead of the current position:
+ skipping from here is enough, no need to reset the inflate state. */
+ if(pos > rz->out && new_out <= rz->out) goto SKIP;
+ _razf_reset_read(rz, seek_pos, new_out);
+ SKIP:
+ /* NOTE(review): the remaining distance is narrowed to int for razf_skip(). */
+ razf_skip(rz, (int)(pos - rz->out));
+ return rz->out;
+}
+
+/*
+ * Report the current position as a packed 64-bit value: the compressed
+ * start of the current block in the upper bits (shifted left by 16) and the
+ * low 16 bits of the offset inside that block. razf_seek2() accepts the
+ * same encoding.
+ */
+uint64_t razf_tell2(RAZF *rz)
+{
+    const uint64_t block_part = (uint64_t)rz->block_pos << 16;
+    const uint64_t offset_part = (uint64_t)(rz->block_off & 0xffff);
+    return block_part | offset_part;
+}
+
+/*
+ * Seek to a packed position as produced by razf_tell2(): bits 16 and up
+ * hold the block start, the low 16 bits the offset inside the block.
+ * Only SEEK_SET is supported; any other `where` yields -1. Otherwise the
+ * result of razf_jump() is returned.
+ */
+int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
+{
+    if (where == SEEK_SET) {
+        const uint64_t block_start = voffset >> 16;
+        const uint64_t in_block = voffset & 0xffff;
+        return razf_jump(rz, block_start, in_block);
+    }
+    return -1;
+}
+
+/*
+ * Finish and release a RAZF handle.
+ *
+ * Write mode (unless built read-only): flush the deflate stream, append the
+ * block index and then the total compressed and uncompressed sizes as two
+ * big-endian 64-bit integers. Read mode: tear down the inflate stream.
+ * In both modes every owned buffer is freed, the file descriptor is closed
+ * and `rz` itself is freed, so the handle must not be used afterwards.
+ */
+void razf_close(RAZF *rz){
+    if(rz->mode == 'w'){
+#ifndef _RZ_READONLY
+        uint64_t total_in, total_out;
+        razf_end_flush(rz);
+        deflateEnd(rz->stream);
+        save_zindex(rz, rz->filedes);
+        /* The trailer is stored big-endian: swap on little-endian hosts. */
+        if(is_big_endian()){
+            total_in = (uint64_t)rz->in;
+            total_out = (uint64_t)rz->out;
+        } else {
+            total_in = byte_swap_8((uint64_t)rz->in);
+            total_out = byte_swap_8((uint64_t)rz->out);
+        }
+        write(rz->filedes, &total_in, sizeof(int64_t));
+        write(rz->filedes, &total_out, sizeof(int64_t));
+#endif
+    } else if(rz->mode == 'r'){
+        if(rz->stream) inflateEnd(rz->stream);
+    }
+    /* free(NULL) is a no-op, so the buffers need no guards */
+    free(rz->inbuf);
+    free(rz->outbuf);
+    if(rz->header){
+        free(rz->header->extra);
+        free(rz->header->name);
+        free(rz->header->comment);
+        free(rz->header);
+    }
+    if(rz->index){
+        free(rz->index->bin_offsets);
+        free(rz->index->cell_offsets);
+        free(rz->index);
+    }
+    free(rz->stream);
+    close(rz->filedes);
+    free(rz);
+}
+
+#endif
--- /dev/null
+ /*-
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef __RAZF_RJ_H
+#define __RAZF_RJ_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+
+/* With a zlib older than 0x1221 the library is built read-only
+ * (_RZ_READONLY) and gz_header is mapped onto an opaque placeholder type. */
+#if ZLIB_VERNUM < 0x1221
+#define _RZ_READONLY
+struct _gz_header_s;
+typedef struct _gz_header_s _gz_header;
+#define gz_header _gz_header
+#endif
+
+#define WINDOW_BITS 15
+
+/* Uncompressed bytes between two index entries (overridable at build time). */
+#ifndef RZ_BLOCK_SIZE
+#define RZ_BLOCK_SIZE (1<<WINDOW_BITS)
+#endif
+
+/* Size of the buffer passed to the internal read routine (overridable). */
+#ifndef RZ_BUFFER_SIZE
+#define RZ_BUFFER_SIZE 4096
+#endif
+
+#ifndef RZ_COMPRESS_LEVEL
+#define RZ_COMPRESS_LEVEL 6
+#endif
+
+/* Number of blocks that share one 64-bit base offset in the index
+ * (2^32 uncompressed bytes worth of blocks). */
+#define RZ_BIN_SIZE ((1LLU << 32) / RZ_BLOCK_SIZE)
+
+/*
+ * Block index. razf_seek() computes the compressed offset of block i as
+ * cell_offsets[i] + bin_offsets[i / RZ_BIN_SIZE].
+ */
+typedef struct {
+ uint32_t *cell_offsets; // indexed by block number i
+ int64_t *bin_offsets; // indexed by i / RZ_BIN_SIZE
+ int size;
+ int cap;
+} ZBlockIndex;
+/* When storing the index, bytes are written in big-endian order everywhere */
+
+#define FILE_TYPE_RZ 1
+#define FILE_TYPE_PLAIN 2
+#define FILE_TYPE_GZ 3
+
+/* State of one open RAZF stream (reader or writer). */
+typedef struct RandomAccessZFile {
+ char mode; /* 'w' : write mode; 'r' : read mode */
+ int file_type;
+ /* one of the FILE_TYPE_* values; razf_read also supports a plain file as input, in which case it works as a buffered fread */
+ int filedes; /* the file descriptor */
+ z_stream *stream;
+ ZBlockIndex *index;
+ int64_t in, out, end, src_end;
+ /* in: n bytes total in; out: n bytes total out; */
+ /* end: the end of all data blocks, which is also the start of the index; src_end: the true end position in the uncompressed file */
+ int buf_flush; // the buffer should be flushed: suspend inflate until the buffer is empty
+ int64_t block_pos, block_off, next_block_pos;
+ /* block_pos: the start position of the current block in the compressed file */
+ /* block_off: how many bytes have been read from the current block */
+ void *inbuf, *outbuf;
+ int header_size;
+ gz_header *header;
+ /* header is used to transfer inflate_state->mode from HEAD to TYPE after a call to inflateReset */
+ int buf_off, buf_len;
+ int z_err, z_eof;
+ int seekable;
+ /* indicates whether the source is seekable */
+ int load_index;
+ /* set to 0 in mode 'w' to have the index discarded (the original comment called this flag has_index) */
+} RAZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /* Open a stream on an existing descriptor / on a file name; mode is "r" or
+ "w" as used by razip (implementations are not shown in this header). */
+ RAZF* razf_dopen(int data_fd, const char *mode);
+ RAZF *razf_open(const char *fn, const char *mode);
+ /* Write / read `size` bytes of uncompressed data; razip treats a result
+ <= 0 from razf_read as the end of input. */
+ int razf_write(RAZF* rz, const void *data, int size);
+ int razf_read(RAZF* rz, void *data, int size);
+ /* Seek to an uncompressed offset (SEEK_SET/SEEK_CUR/SEEK_END); returns the
+ resulting uncompressed position. */
+ int64_t razf_seek(RAZF* rz, int64_t pos, int where);
+ /* Finalize the stream, close the descriptor and free rz. */
+ void razf_close(RAZF* rz);
+
+/* Current position in uncompressed bytes. */
+#define razf_tell(rz) ((rz)->out)
+
+ /* Variants working with packed positions: razf_tell2 returns
+ (block start << 16) | (offset in block & 0xffff), and razf_seek2 accepts
+ such a value with where == SEEK_SET only (-1 otherwise).
+ NOTE(review): razf_open2/razf_dopen2 are not visible here; presumably
+ they open a stream for use with these positions. */
+ RAZF* razf_open2(const char *filename, const char *mode);
+ RAZF* razf_dopen2(int fd, const char *mode);
+ uint64_t razf_tell2(RAZF *rz);
+ int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include "razf.h"
+
+#define WINDOW_SIZE 4096
+
+/*
+ * Print the razip command-line help to stdout.
+ * Every option accepted by getopt() in main() is listed, including -f.
+ * Always returns 0 so callers can `return razf_main_usage();`.
+ */
+static int razf_main_usage()
+{
+    printf("\n");
+    printf("Usage: razip [options] [file] ...\n\n");
+    printf("Options: -c write on standard output, keep original files unchanged\n");
+    printf(" -d decompress\n");
+    printf(" -l list compressed file contents\n");
+    printf(" -b INT decompress at INT position in the uncompressed file\n");
+    printf(" -s INT decompress INT bytes in the uncompressed file\n");
+    printf(" -f overwrite files without asking\n");
+    printf(" -h give this help\n");
+    printf("\n");
+    return 0;
+}
+
+/*
+ * Open `fn` for writing and return its file descriptor.
+ *
+ * fn:        output file name
+ * is_forced: non-zero to overwrite an existing file without asking
+ *
+ * Without is_forced the file is first created exclusively; if it already
+ * exists the user is asked on stdin whether to overwrite it. The process
+ * exits with status 1 when the user declines, when no answer can be read,
+ * or when the file cannot be opened.
+ */
+static int write_open(const char *fn, int is_forced)
+{
+    int fd = -1;
+    char c;
+    if (!is_forced) {
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644)) < 0 && errno == EEXIST) {
+            printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+            fflush(stdout); /* the prompt has no newline; make sure it is visible */
+            /* A failed read (e.g. EOF on stdin) leaves c unset: treat it as "no". */
+            if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+                printf("razip: not overwritten\n");
+                exit(1);
+            }
+        }
+    }
+    if (fd < 0) {
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) {
+            fprintf(stderr, "razip: %s: Fail to write\n", fn);
+            exit(1);
+        }
+    }
+    return fd;
+}
+
+/*
+ * razip entry point.
+ *
+ * Default: compress the named file into <file>.rz (or stdin to stdout with
+ * -c and no file). -d decompresses, optionally restricted to the region
+ * selected with -b/-s; -l prints compressed/uncompressed sizes.
+ * The input file is removed after a successful (de)compression unless the
+ * result was written to stdout with -c.
+ *
+ * Returns 0 on success and 1 on errors.
+ */
+int main(int argc, char **argv)
+{
+    int c, compress, pstdout, is_forced;
+    RAZF *rz;
+    void *buffer;
+    long start, end, size;
+
+    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+    while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){
+        switch(c){
+            case 'h': return razf_main_usage();
+            case 'd': compress = 0; break;
+            case 'c': pstdout = 1; break;
+            case 'l': compress = 2; break;
+            case 'b': start = atol(optarg); break;
+            case 's': size = atol(optarg); break;
+            case 'f': is_forced = 1; break;
+        }
+    }
+    if (size >= 0) end = start + size;
+    if(end >= 0 && end < start){
+        fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+        return 1;
+    }
+    if(compress == 1){
+        int f_src, f_dst = -1;
+        if(argc > optind){
+            if((f_src = open(argv[optind], O_RDONLY)) < 0){
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if(pstdout){
+                f_dst = fileno(stdout);
+            } else {
+                /* room for the name, the ".rz" suffix and the terminator */
+                char *name = malloc(strlen(argv[optind]) + 5);
+                if (name == NULL) {
+                    fprintf(stderr, "razip: out of memory\n");
+                    return 1;
+                }
+                strcpy(name, argv[optind]);
+                strcat(name, ".rz");
+                f_dst = write_open(name, is_forced);
+                free(name);
+                if (f_dst < 0) return 1;
+            }
+        } else if(pstdout){
+            f_src = fileno(stdin);
+            f_dst = fileno(stdout);
+        } else return razf_main_usage();
+        rz = razf_dopen(f_dst, "w");
+        buffer = malloc(WINDOW_SIZE);
+        if (rz == NULL || buffer == NULL) {
+            fprintf(stderr, "razip: fail to initialize the compressor\n");
+            return 1;
+        }
+        while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
+        razf_close(rz); // f_dst will be closed here
+        /* -c promises to keep the original file unchanged */
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        free(buffer);
+        close(f_src);
+        return 0;
+    } else {
+        if(argc <= optind) return razf_main_usage();
+        if(compress == 2){
+            rz = razf_open(argv[optind], "r");
+            if (rz == NULL) {
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if(rz->file_type == FILE_TYPE_RZ) {
+                printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name");
+                printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end,
+                       argv[optind]);
+            } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]);
+        } else {
+            int f_dst;
+            if (argc > optind && !pstdout) {
+                char *name;
+                size_t len = strlen(argv[optind]);
+                /* The name must END in ".rz"; compare the suffix directly
+                 * instead of doing arithmetic on a possibly NULL strstr(). */
+                if (len < 3 || strcmp(argv[optind] + len - 3, ".rz") != 0) {
+                    printf("razip: %s: unknown suffix -- ignored\n", argv[optind]);
+                    return 1;
+                }
+                name = strdup(argv[optind]);
+                if (name == NULL) {
+                    fprintf(stderr, "razip: out of memory\n");
+                    return 1;
+                }
+                name[len - 3] = '\0';
+                f_dst = write_open(name, is_forced);
+                free(name);
+            } else f_dst = fileno(stdout);
+            rz = razf_open(argv[optind], "r");
+            buffer = malloc(WINDOW_SIZE);
+            if (rz == NULL || buffer == NULL) {
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                free(buffer);
+                return 1;
+            }
+            razf_seek(rz, start, SEEK_SET);
+            while(1){
+                if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE);
+                else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+                if(c <= 0) break;
+                start += c;
+                write(f_dst, buffer, c);
+                if(end >= 0 && start >= end) break;
+            }
+            free(buffer);
+            if (!pstdout) unlink(argv[optind]);
+        }
+        razf_close(rz);
+        return 0;
+    }
+}
+
--- /dev/null
+#include <string.h>
+#include "sam.h"
+
+#define TYPE_BAM 1
+#define TYPE_READ 2
+
+/*
+ * Make a copy of a BAM header that owns its own text, target name/length
+ * arrays and (when present) read-group-to-library map. The hash of the
+ * copy is left empty.
+ */
+bam_header_t *bam_header_dup(const bam_header_t *h0)
+{
+    bam_header_t *copy = bam_header_init();
+    int t = 0;
+
+    /* start from a shallow copy, then replace every owned pointer */
+    *copy = *h0;
+    copy->hash = 0;
+
+    copy->text = (char*)calloc(copy->l_text + 1, 1);
+    memcpy(copy->text, h0->text, copy->l_text);
+
+    copy->target_len = (uint32_t*)calloc(copy->n_targets, 4);
+    copy->target_name = (char**)calloc(copy->n_targets, sizeof(void*));
+    while (t < copy->n_targets) {
+        copy->target_len[t] = h0->target_len[t];
+        copy->target_name[t] = strdup(h0->target_name[t]);
+        ++t;
+    }
+
+    if (h0->rg2lib) copy->rg2lib = bam_strmap_dup(h0->rg2lib);
+    return copy;
+}
+/*
+ * Append `len` bytes of `text` to header->text and keep the result
+ * NUL-terminated. Nothing happens when `text` is null.
+ *
+ * The buffer is always resized to the rounded-up new length: the capacity
+ * of an existing header->text is not known here (it may have been allocated
+ * with exactly l_text + 1 bytes, or not at all), so skipping the realloc
+ * based on an assumed capacity could write past the allocation.
+ */
+static void append_header_text(bam_header_t *header, char* text, int len)
+{
+    int y;
+    char *grown;
+    if (text == 0) return;
+    y = header->l_text + len + 1; // 1 byte null
+    kroundup32(y);
+    grown = (char*)realloc(header->text, y);
+    if (grown == 0) { // keep the old text intact on failure
+        fprintf(stderr, "[append_header_text] out of memory.\n");
+        return;
+    }
+    header->text = grown;
+    strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here.
+    header->l_text += len;
+    header->text[header->l_text] = 0;
+}
+
+/*
+ * Open a SAM or BAM file for reading or writing.
+ *
+ * fn:   file name; "-" selects stdin (reading) or stdout (writing)
+ * mode: "r"/"w" optionally followed by 'b' (BAM); when writing, 'u' selects
+ *       uncompressed BAM and 'h' makes SAM output include the header
+ * aux:  writing: the bam_header_t to copy into the new handle (required);
+ *       reading SAM text without @SQ lines: name of the reference list file
+ *
+ * Returns the new handle, or 0 when the file cannot be opened or a header
+ * is missing in write mode. Nothing is leaked on failure.
+ */
+samfile_t *samopen(const char *fn, const char *mode, const void *aux)
+{
+    samfile_t *fp;
+    fp = (samfile_t*)calloc(1, sizeof(samfile_t));
+    if (mode[0] == 'r') { // read
+        fp->type |= TYPE_READ;
+        if (mode[1] == 'b') { // binary
+            fp->type |= TYPE_BAM;
+            fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+            if (fp->x.bam == 0) goto open_err_ret;
+            fp->header = bam_header_read(fp->x.bam);
+        } else { // text
+            fp->x.tamr = sam_open(fn);
+            if (fp->x.tamr == 0) goto open_err_ret;
+            fp->header = sam_header_read(fp->x.tamr);
+            if (fp->header->n_targets == 0) { // no @SQ fields
+                if (aux) { // check if aux is present
+                    // take the targets from the list file, keep the text of the SAM header
+                    bam_header_t *textheader = fp->header;
+                    fp->header = sam_header_read2((const char*)aux);
+                    append_header_text(fp->header, textheader->text, textheader->l_text);
+                    bam_header_destroy(textheader);
+                }
+                if (fp->header->n_targets == 0)
+                    fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
+            } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
+        }
+        sam_header_parse_rg(fp->header);
+    } else if (mode[0] == 'w') { // write
+        if (aux == 0) goto open_err_ret; // a header is mandatory for writing
+        fp->header = bam_header_dup((const bam_header_t*)aux);
+        if (mode[1] == 'b') { // binary
+            char bmode[3];
+            bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0;
+            fp->type |= TYPE_BAM;
+            fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
+            if (fp->x.bam == 0) goto open_err_ret;
+            bam_header_write(fp->x.bam, fp->header);
+        } else { // text
+            // open file
+            fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
+            if (fp->x.tamw == 0) goto open_err_ret;
+            // write header
+            if (strstr(mode, "h")) {
+                int i;
+                bam_header_t *alt;
+                // parse the header text; alt only borrows the text buffer
+                alt = bam_header_init();
+                alt->l_text = fp->header->l_text; alt->text = fp->header->text;
+                sam_header_parse(alt);
+                alt->l_text = 0; alt->text = 0; // detach before alt is destroyed
+                // check if there are @SQ lines in the header
+                fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
+                if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
+                    if (alt->n_targets != fp->header->n_targets)
+                        fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+                } else { // then dump ->target_{name,len}
+                    for (i = 0; i < fp->header->n_targets; ++i)
+                        fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
+                }
+                bam_header_destroy(alt);
+            }
+        }
+    }
+    return fp;
+
+open_err_ret:
+    // in write mode the header copy already exists when the open fails
+    if (fp->header) bam_header_destroy(fp->header);
+    free(fp);
+    return 0;
+}
+
+/*
+ * Close a handle returned by samopen(): destroy its header, close the
+ * underlying BAM/SAM reader or text writer, and free the handle.
+ * A null handle is ignored.
+ */
+void samclose(samfile_t *fp)
+{
+    if (fp == 0) return;
+    if (fp->header) bam_header_destroy(fp->header);
+    switch (fp->type & (TYPE_BAM | TYPE_READ)) {
+    case TYPE_BAM:
+    case TYPE_BAM | TYPE_READ: // BAM, either direction
+        bam_close(fp->x.bam);
+        break;
+    case TYPE_READ: // SAM text reader
+        sam_close(fp->x.tamr);
+        break;
+    default: // SAM text writer
+        fclose(fp->x.tamw);
+        break;
+    }
+    free(fp);
+}
+
+/*
+ * Read the next alignment into `b`.
+ * Returns -1 when `fp` is null or was not opened for reading; otherwise the
+ * result of the BAM or SAM record reader is passed through.
+ */
+int samread(samfile_t *fp, bam1_t *b)
+{
+    if (fp != 0 && (fp->type & TYPE_READ)) {
+        return (fp->type & TYPE_BAM)
+            ? bam_read1(fp->x.bam, b)
+            : sam_read1(fp->x.tamr, fp->header, b);
+    }
+    return -1; // not open for reading
+}
+
+/*
+ * Write one alignment.
+ * Returns -1 when `fp` is null or was opened for reading. For BAM output
+ * the result of bam_write1() is returned; for SAM output the formatted line
+ * is written followed by a newline and its length plus one is returned.
+ */
+int samwrite(samfile_t *fp, const bam1_t *b)
+{
+    if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
+    if (!(fp->type & TYPE_BAM)) {
+        char *line = bam_format1(fp->header, b);
+        int n_chars = strlen(line);
+        fputs(line, fp->x.tamw);
+        fputc('\n', fp->x.tamw);
+        free(line);
+        return n_chars + 1;
+    }
+    return bam_write1(fp->x.bam, b);
+}
+
+/*
+ * Run a pileup over every alignment readable from `fp`.
+ * Each record is pushed into a pileup buffer configured with `mask`; a
+ * final null push marks the end of input. `func` is the callback handed to
+ * the buffer together with `func_data`. Always returns 0.
+ */
+int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
+{
+    bam1_t *rec = bam_init1();
+    bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+
+    bam_plbuf_set_mask(plbuf, mask);
+    for (;;) {
+        if (samread(fp, rec) < 0) break;
+        bam_plbuf_push(rec, plbuf);
+    }
+    bam_plbuf_push(0, plbuf); // end of input
+
+    bam_plbuf_destroy(plbuf);
+    bam_destroy1(rec);
+    return 0;
+}
--- /dev/null
+#ifndef BAM_SAM_H
+#define BAM_SAM_H
+
+#include "bam.h"
+
+/*!
+ @header
+
+ This file provides higher level of I/O routines and unifies the APIs
+ for SAM and BAM formats. These APIs are more convenient and
+ recommended.
+
+ @copyright Genome Research Ltd.
+ */
+
+/*! @typedef
+ @abstract SAM/BAM file handler
+ @field type type of the handler: bitwise OR of 1 (BAM) and 2 (open for reading)
+ @field bam BAM file handler; valid if (type&1) == 1
+ @field tamr SAM file handler for reading; valid if type == 2
+ @field tamw SAM file handler for writing; valid if type == 0
+ @field header header struct
+ @discussion bam, tamr and tamw share storage (union x); only the member
+ selected by type may be used.
+ */
+typedef struct {
+ int type;
+ union {
+ tamFile tamr;
+ bamFile bam;
+ FILE *tamw;
+ } x;
+ bam_header_t *header;
+} samfile_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*!
+ @abstract Open a SAM/BAM file
+
+ @param fn SAM/BAM file name; "-" is recognized as stdin (for
+ reading) or stdout (for writing).
+
+ @param mode open mode /[rw](b?)(u?)(h?)/: 'r' for reading, 'w' for
+ writing, 'b' for BAM I/O, 'u' for uncompressed BAM output and 'h'
+ for outputting the header in SAM. If 'b' is present, it must
+ immediately follow 'r' or 'w'. Valid modes are "r", "w", "wh", "rb",
+ "wb" and "wbu" exclusively.
+
+ @param aux auxiliary data; if mode[0]=='w', aux points to the
+ bam_header_t to be copied into the handler; if the mode is "r" (SAM
+ text input) and @SQ header lines are absent from the SAM, aux points
+ to the file name of the list of reference names and lengths; aux is
+ not used otherwise.
+
+ @return SAM/BAM file handler, or a null pointer if the file cannot
+ be opened
+ */
+ samfile_t *samopen(const char *fn, const char *mode, const void *aux);
+
+ /*!
+ @abstract Close a SAM/BAM handler
+ @param fp file handler to be closed; a null pointer is ignored
+ @discussion The header owned by the handler is destroyed and the
+ handler itself is freed.
+ */
+ void samclose(samfile_t *fp);
+
+ /*!
+ @abstract Read one alignment
+ @param fp file handler
+ @param b alignment
+ @return bytes read; a negative value when no record could be read
+ (-1 in particular if fp is null or was not opened for reading)
+ */
+ int samread(samfile_t *fp, bam1_t *b);
+
+ /*!
+ @abstract Write one alignment
+ @param fp file handler
+ @param b alignment
+ @return bytes written; -1 if fp is null or was not opened for
+ writing
+ */
+ int samwrite(samfile_t *fp, const bam1_t *b);
+
+ /*!
+ @abstract Get the pileup for a whole alignment file
+ @param fp file handler
+ @param mask mask transferred to bam_plbuf_set_mask()
+ @param func user defined function called in the pileup process
+ @param data user provided data for func()
+ @return always 0
+ */
+ int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "sam.h"
+
+// Filters set from the command line in main_samview() and read by
+// __g_skip_aln(): minimum mapping quality (-q), flag bits that must all be
+// set (-f) and flag bits that must all be clear (-F).
+static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;
+// Library (-l) and read group (-r) to keep; strdup()ed strings, null when unset.
+static char *g_library, *g_rg;
+
+// Decide whether an alignment is filtered out by the global options.
+// Returns 1 to skip the alignment and 0 to keep it.
+static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b)
+{
+    uint8_t *rg_tag;
+    char *rg_name;
+    const char *lib;
+
+    // mapping quality and flag filters
+    if (b->core.qual < g_min_mapQ) return 1;
+    if ((b->core.flag & g_flag_on) != g_flag_on) return 1;
+    if (b->core.flag & g_flag_off) return 1;
+
+    // no read group / library restriction requested
+    if (!g_library && !g_rg) return 0;
+
+    // a restriction is active: alignments without an RG tag are dropped
+    rg_tag = bam_aux_get(b, "RG");
+    if (rg_tag == 0) return 1;
+    rg_name = (char*)(rg_tag + 1);
+
+    if (g_rg && strcmp(g_rg, rg_name) == 0) return 0;
+    if (!g_library) return 1;
+
+    // map the read group to its library and compare with the requested one
+    lib = bam_strmap_get(h->rg2lib, rg_name);
+    if (lib && strcmp(lib, g_library) == 0) return 0;
+    return 1;
+}
+
+// callback function for bam_fetch()
+// Callback for bam_fetch(): `data` is the output samfile_t. Alignments that
+// pass the global filters are written to it. Always returns 0.
+static int view_func(const bam1_t *b, void *data)
+{
+    samfile_t *out = (samfile_t*)data;
+    if (__g_skip_aln(out->header, b)) return 0;
+    samwrite(out, b);
+    return 0;
+}
+
+static int usage(void);
+
+/*
+ * `samtools view`: convert between SAM and BAM and/or extract alignments.
+ *
+ * Without region arguments the whole input is copied to the output (subject
+ * to the -q/-f/-F/-l/-r filters); with regions, only alignments overlapping
+ * them are fetched through the BAM index.
+ *
+ * Returns 0 on success and 1 on a usage error, when a file cannot be
+ * opened, when the input is truncated or when the index is unavailable.
+ * All option strings, including the global filters, are released (and the
+ * globals reset) on every exit path.
+ */
+int main_samview(int argc, char *argv[])
+{
+    int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0;
+    samfile_t *in = 0, *out = 0;
+    char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0;
+
+    /* parse command-line options */
+    strcpy(in_mode, "r"); strcpy(out_mode, "w");
+    while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:")) >= 0) {
+        switch (c) {
+        case 'S': is_bamin = 0; break;
+        case 'b': is_bamout = 1; break;
+        // free() first so that a repeated option does not leak the earlier value
+        case 't': free(fn_list); fn_list = strdup(optarg); is_bamin = 0; break;
+        case 'h': is_header = 1; break;
+        case 'H': is_header_only = 1; break;
+        case 'o': free(fn_out); fn_out = strdup(optarg); break;
+        case 'f': g_flag_on = strtol(optarg, 0, 0); break;
+        case 'F': g_flag_off = strtol(optarg, 0, 0); break;
+        case 'q': g_min_mapQ = atoi(optarg); break;
+        case 'u': is_uncompressed = 1; break;
+        case 'l': free(g_library); g_library = strdup(optarg); break;
+        case 'r': free(g_rg); g_rg = strdup(optarg); break;
+        default: ret = usage(); goto view_end;
+        }
+    }
+    if (is_uncompressed) is_bamout = 1;
+    if (is_header_only) is_header = 1;
+    if (is_bamout) strcat(out_mode, "b");
+    if (is_bamin) strcat(in_mode, "b");
+    if (is_header) strcat(out_mode, "h");
+    if (is_uncompressed) strcat(out_mode, "u");
+    if (argc == optind) {
+        ret = usage();
+        goto view_end;
+    }
+
+    // open file handlers
+    if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+        fprintf(stderr, "[main_samview] fail to open file for reading.\n");
+        ret = 1;
+        goto view_end;
+    }
+    if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
+        fprintf(stderr, "[main_samview] fail to open file for writing.\n");
+        ret = 1;
+        goto view_end;
+    }
+    if (is_header_only) goto view_end; // no need to print alignments
+
+    if (argc == optind + 1) { // convert/print the entire file
+        bam1_t *b = bam_init1();
+        int r;
+        while ((r = samread(in, b)) >= 0) // read one alignment from `in'
+            if (!__g_skip_aln(in->header, b))
+                samwrite(out, b); // write the alignment to `out'
+        if (r < -1) {
+            fprintf(stderr, "[main_samview] truncated file.\n");
+            ret = 1;
+        }
+        bam_destroy1(b);
+    } else { // retrieve alignments in specified regions
+        int i;
+        bam_index_t *idx = 0;
+        if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index
+        if (idx == 0) { // index is unavailable
+            fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n");
+            ret = 1;
+            goto view_end;
+        }
+        for (i = optind + 1; i < argc; ++i) {
+            int tid, beg, end;
+            bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200'
+            if (tid < 0) { // reference name is not found
+                fprintf(stderr, "[main_samview] fail to get the reference name. Continue anyway.\n");
+                continue;
+            }
+            bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments
+        }
+        bam_index_destroy(idx); // destroy the BAM index
+    }
+
+view_end:
+    // close files, free and return
+    free(fn_list); free(fn_out); free(g_library); free(g_rg);
+    g_library = 0; g_rg = 0; // the globals outlive this call: do not leave them dangling
+    samclose(in);
+    samclose(out);
+    return ret;
+}
+
+// Print the help text of `samtools view` to stderr.
+// Always returns 1, so callers can use the result as an error status.
+static int usage()
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]\n\n");
+ fprintf(stderr, "Options: -b output BAM\n");
+ fprintf(stderr, " -h print header for the SAM output\n");
+ fprintf(stderr, " -H print header only (no alignments)\n");
+ fprintf(stderr, " -S input is SAM\n");
+ fprintf(stderr, " -u uncompressed BAM output (force -b)\n");
+ fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n");
+ fprintf(stderr, " -o FILE output file name [stdout]\n");
+ fprintf(stderr, " -f INT required flag, 0 for unset [0]\n");
+ fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n");
+ fprintf(stderr, " -q INT minimum mapping quality [0]\n");
+ fprintf(stderr, " -l STR only output reads in library STR [null]\n");
+ fprintf(stderr, " -r STR only output reads in read group STR [null]\n");
+ // The notes are one string literal continued across lines with backslashes.
+ fprintf(stderr, "\n\
+Notes:\n\
+\n\
+ 1. By default, this command assumes the file on the command line is in\n\
+ the BAM format and it prints the alignments in SAM. If `-t' is\n\
+ applied, the input file is assumed to be in the SAM format. The\n\
+ file supplied with `-t' is SPACE/TAB delimited with the first two\n\
+ fields of each line consisting of the reference name and the\n\
+ corresponding sequence length. The `.fai' file generated by `faidx'\n\
+ can be used here. This file may be empty if reads are unaligned.\n\
+\n\
+ 2. SAM->BAM conversion: `samtools view -bt ref.fa.fai in.sam.gz'.\n\
+\n\
+ 3. BAM->SAM conversion: `samtools view in.bam'.\n\
+\n\
+ 4. A region should be presented in one of the following formats:\n\
+ `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\
+ specified, the input alignment file must be an indexed BAM file.\n\
+\n\
+ 5. Option `-u' is preferred over `-b' when the output is piped to\n\
+ another samtools command.\n\
+\n");
+ return 1;
+}
+
+/*
+ * `samtools import <in.ref_list> <in.sam> <out.bam>`: an alias of
+ * `samtools view -bt <in.ref_list> -o <out.bam> <in.sam>`.
+ *
+ * Builds the equivalent argument vector and delegates to main_samview().
+ * Returns 1 on a usage error or allocation failure, otherwise the result
+ * of main_samview().
+ */
+int main_import(int argc, char *argv[])
+{
+    int argc2, ret;
+    char **argv2;
+    if (argc != 4) {
+        fprintf(stderr, "Usage: samtools import <in.ref_list> <in.sam> <out.bam>\n");
+        return 1;
+    }
+    argc2 = 6;
+    // one extra zeroed slot keeps the vector null-terminated like a real argv
+    argv2 = calloc(argc2 + 1, sizeof(char*));
+    if (argv2 == 0) {
+        fprintf(stderr, "[main_import] out of memory.\n");
+        return 1;
+    }
+    argv2[0] = "import";
+    argv2[1] = "-o";
+    argv2[2] = argv[3]; // <out.bam>
+    argv2[3] = "-bt";
+    argv2[4] = argv[1]; // <in.ref_list>
+    argv2[5] = argv[2]; // <in.sam>
+    ret = main_samview(argc2, argv2);
+    free(argv2);
+    return ret;
+}
--- /dev/null
+.TH samtools 1 "6 July 2009" "samtools-0.1.5" "Bioinformatics tools"
+.SH NAME
+.PP
+samtools - Utilities for the Sequence Alignment/Map (SAM) format
+.SH SYNOPSIS
+.PP
+samtools view -bt ref_list.txt -o aln.bam aln.sam.gz
+.PP
+samtools sort aln.bam aln.sorted
+.PP
+samtools index aln.sorted.bam
+.PP
+samtools view aln.sorted.bam chr2:20,100,000-20,200,000
+.PP
+samtools merge out.bam in1.bam in2.bam in3.bam
+.PP
+samtools faidx ref.fasta
+.PP
+samtools pileup -f ref.fasta aln.sorted.bam
+.PP
+samtools tview aln.sorted.bam ref.fasta
+
+.SH DESCRIPTION
+.PP
+Samtools is a set of utilities that manipulate alignments in the BAM
+format. It imports from and exports to the SAM (Sequence Alignment/Map)
+format, does sorting, merging and indexing, and allows reads in any
+region to be retrieved swiftly.
+
+Samtools is designed to work on a stream. It regards an input file `-'
+as the standard input (stdin) and an output file `-' as the standard
+output (stdout). Several commands can thus be combined with Unix
+pipes. Samtools always outputs warning and error messages to the standard
+error output (stderr).
+
+Samtools is also able to open a BAM (not SAM) file on a remote FTP
+server if the BAM file name starts with `ftp://'. Samtools checks the
+current working directory for the index file and will download the index
+upon absence. Samtools achieves random FTP file access with the `REST'
+ftp command. It does not retrieve the entire alignment file unless it is
+asked to do so.
+
+.SH COMMANDS AND OPTIONS
+
+.TP 10
+.B import
+samtools import <in.ref_list> <in.sam> <out.bam>
+
+Since 0.1.4, this command is an alias of:
+
+samtools view -bt <in.ref_list> -o <out.bam> <in.sam>
+
+.TP
+.B sort
+samtools sort [-n] [-m maxMem] <in.bam> <out.prefix>
+
+Sort alignments by leftmost coordinates. File
+.I <out.prefix>.bam
+will be created. This command may also create temporary files
+.I <out.prefix>.%d.bam
+when the whole alignment cannot be fitted into memory (controlled by
+option -m).
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -n
+Sort by read names rather than by chromosomal coordinates
+.TP
+.B -m INT
+Approximately the maximum required memory. [500000000]
+.RE
+
+.TP
+.B merge
+samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]
+
+Merge multiple sorted alignments. The header of
+.I <in1.bam>
+will be copied to
+.I <out.bam>
+and the headers of other files will be ignored.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -n
+The input alignments are sorted by read names rather than by chromosomal
+coordinates
+.RE
+
+.TP
+.B index
+samtools index <aln.bam>
+
+Index sorted alignment for fast random access. Index file
+.I <aln.bam>.bai
+will be created.
+
+.TP
+.B view
+samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F
+skipFlag] [-q minMapQ] [-l library] [-r readGroup] <in.bam>|<in.sam> [region1 [...]]
+
+Extract/print all or sub alignments in SAM or BAM format. If no region
+is specified, all the alignments will be printed; otherwise only
+alignments overlapping the specified regions will be output. An
+alignment may be given multiple times if it is overlapping several
+regions. A region can be presented, for example, in the following
+format: `chr2', `chr2:1000000' or `chr2:1,000,000-2,000,000'. The
+coordinate is 1-based.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -b
+Output in the BAM format.
+.TP
+.B -u
+Output uncompressed BAM. This option saves time spent on
+compression/decompression and is thus preferred when the output is piped
+to another samtools command.
+.TP
+.B -h
+Include the header in the output.
+.TP
+.B -H
+Output the header only.
+.TP
+.B -S
+Input is in SAM. If @SQ header lines are absent, the
+.B `-t'
+option is required.
+.TP
+.B -t FILE
+This file is TAB-delimited. Each line must contain the reference name
+and the length of the reference, one line for each distinct reference;
+additional fields are ignored. This file also defines the order of the
+reference sequences in sorting. If you run `samtools faidx <ref.fa>',
+the resultant index file
+.I <ref.fa>.fai
+can be used as this
+.I <in.ref_list>
+file.
+.TP
+.B -o FILE
+Output file [stdout]
+.TP
+.B -f INT
+Only output alignments with all bits in INT present in the FLAG
+field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
+.TP
+.B -F INT
+Skip alignments with bits present in INT [0]
+.TP
+.B -q INT
+Skip alignments with MAPQ smaller than INT [0]
+.TP
+.B -l STR
+Only output reads in library STR [null]
+.TP
+.B -r STR
+Only output reads in read group STR [null]
+.RE
+
+.TP
+.B faidx
+samtools faidx <ref.fasta> [region1 [...]]
+
+Index reference sequence in the FASTA format or extract subsequence from
+indexed reference sequence. If no region is specified,
+.B faidx
+will index the file and create
+.I <ref.fasta>.fai
+on the disk. If regions are specified, the subsequences will be
+retrieved and printed to stdout in the FASTA format. The input file can
+be compressed in the
+.B RAZF
+format.
+
+.TP
+.B pileup
+samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list]
+[-iscgS2] [-T theta] [-N nHap] [-r pairDiffRate] <in.bam>|<in.sam>
+
+Print the alignment in the pileup format. In the pileup format, each
+line represents a genomic position, consisting of chromosome name,
+coordinate, reference base, read bases, read qualities and alignment
+mapping qualities. Information on match, mismatch, indel, strand,
+mapping quality and start and end of a read are all encoded at the read
+base column. At this column, a dot stands for a match to the reference
+base on the forward strand, a comma for a match on the reverse strand,
+`ACGTN' for a mismatch on the forward strand and `acgtn' for a mismatch
+on the reverse strand. A pattern `\\+[0-9]+[ACGTNacgtn]+' indicates
+there is an insertion between this reference position and the next
+reference position. The length of the insertion is given by the integer
+in the pattern, followed by the inserted sequence. Similarly, a pattern
+`-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. The
+deleted bases will be presented as `*' in the following lines. Also at
+the read base column, a symbol `^' marks the start of a read segment
+which is a contiguous subsequence on the read separated by `N/S/H' CIGAR
+operations. The ASCII of the character following `^' minus 33 gives the
+mapping quality. A symbol `$' marks the end of a read segment.
+
+If option
+.B -c
+is applied, the consensus base, consensus quality, SNP quality and RMS
+mapping quality of the reads covering the site will be inserted between
+the `reference base' and the `read bases' columns. An indel occupies an
+additional line. Each indel line consists of chromosome name,
+coordinate, a star, the genotype, consensus quality, SNP quality, RMS
+mapping quality, # covering reads, the first allele, the second allele,
+# reads supporting the first allele, # reads supporting the second
+allele and # reads containing indels different from the top two alleles.
+
+.B OPTIONS:
+.RS
+
+.TP 10
+.B -s
+Print the mapping quality as the last column. This option makes the
+output easier to parse, although this format is not space efficient.
+
+.TP
+.B -S
+The input file is in SAM.
+
+.TP
+.B -i
+Only output pileup lines containing indels.
+
+.TP
+.B -f FILE
+The reference sequence in the FASTA format. Index file
+.I FILE.fai
+will be created if
+absent.
+
+.TP
+.B -M INT
+Cap mapping quality at INT [60]
+
+.TP
+.B -t FILE
+List of reference names and sequence lengths, in the format described
+for the
+.B import
+command. If this option is present, samtools assumes the input
+.I <in.alignment>
+is in SAM format; otherwise it assumes in BAM format.
+
+.TP
+.B -l FILE
+List of sites at which pileup is output. This file is space
+delimited. The first two columns are required to be chromosome and
+1-based coordinate. Additional columns are ignored. It is
+recommended to use option
+.B -s
+together with
+.B -l
+as in the default format we may not know the mapping quality.
+
+.TP
+.B -c
+Call the consensus sequence using MAQ consensus model. Options
+.B -T,
+.B -N,
+.B -I
+and
+.B -r
+are only effective when
+.B -c
+or
+.B -g
+is in use.
+
+.TP
+.B -g
+Generate genotype likelihood in the binary GLFv3 format. This option
+suppresses -c, -i and -s.
+
+.TP
+.B -T FLOAT
+The theta parameter (error dependency coefficient) in the MAQ consensus
+calling model [0.85]
+
+.TP
+.B -N INT
+Number of haplotypes in the sample (>=2) [2]
+
+.TP
+.B -r FLOAT
+Expected fraction of differences between a pair of haplotypes [0.001]
+
+.TP
+.B -I INT
+Phred probability of an indel in sequencing/prep. [40]
+
+.RE
+
+.TP
+.B tview
+samtools tview <in.sorted.bam> [ref.fasta]
+
+Text alignment viewer (based on the ncurses library). In the viewer,
+press `?' for help and press `g' to check the alignment starting from a
+region in the format like `chr10:10,000,000'. Note that if the region
+shown on the screen contains no mapped reads, a blank screen will be
+seen. This is a known issue and will be improved later.
+
+.RE
+
+.TP
+.B fixmate
+samtools fixmate <in.nameSrt.bam> <out.bam>
+
+Fill in mate coordinates, ISIZE and mate related flags from a
+name-sorted alignment.
+
+.TP
+.B rmdup
+samtools rmdup <input.srt.bam> <out.bam>
+
+Remove potential PCR duplicates: if multiple read pairs have identical
+external coordinates, only retain the pair with highest mapping quality.
+This command
+.B ONLY
+works with FR orientation and requires that ISIZE is correctly set.
+
+.RE
+
+.TP
+.B rmdupse
+samtools rmdupse <input.srt.bam> <out.bam>
+
+Remove potential duplicates for single-ended reads. This command will
+treat all reads as single-ended even if they are paired in fact.
+
+.RE
+
+.TP
+.B fillmd
+samtools fillmd [-e] <aln.bam> <ref.fasta>
+
+Generate the MD tag. If the MD tag is already present, this command will
+give a warning if the MD tag generated is different from the existing
+tag.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -e
+Convert a read base to = if it is identical to the aligned reference
+base. Indel caller does not support the = bases at the moment.
+
+.RE
+
+.SH SAM FORMAT
+
+SAM is TAB-delimited. Apart from the header lines, which are started
+with the `@' symbol, each alignment line consists of:
+
+.TS
+center box;
+cb | cb | cb
+n | l | l .
+Col Field Description
+_
+1 QNAME Query (pair) NAME
+2 FLAG bitwise FLAG
+3 RNAME Reference sequence NAME
+4 POS 1-based leftmost POSition/coordinate of clipped sequence
+5 MAPQ MAPping Quality (Phred-scaled)
+6 CIGAR extended CIGAR string
+7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
+8 MPOS 1-based Mate POSition
+9 ISIZE Inferred insert SIZE
+10 SEQ query SEQuence on the same strand as the reference
+11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
+12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
+.TE
+
+.PP
+Each bit in the FLAG field is defined as:
+
+.TS
+center box;
+cb | cb
+l | l .
+Flag Description
+_
+0x0001 the read is paired in sequencing
+0x0002 the read is mapped in a proper pair
+0x0004 the query sequence itself is unmapped
+0x0008 the mate is unmapped
+0x0010 strand of the query (1 for reverse)
+0x0020 strand of the mate
+0x0040 the read is the first read in a pair
+0x0080 the read is the second read in a pair
+0x0100 the alignment is not primary
+0x0200 the read fails platform/vendor quality checks
+0x0400 the read is either a PCR or an optical duplicate
+.TE
+
+.SH LIMITATIONS
+.PP
+.IP o 2
+Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c.
+.IP o 2
+CIGAR operation P is not properly handled at the moment.
+
+.SH AUTHOR
+.PP
+Heng Li from the Sanger Institute wrote the C version of samtools. Bob
+Handsaker from the Broad Institute implemented the BGZF library and Jue
+Ruan from Beijing Genomics Institute wrote the RAZF library. Various
+people in the 1000 Genomes Project contributed to the SAM format
+specification.
+
+.SH SEE ALSO
+.PP
+Samtools website: http://samtools.sourceforge.net
--- /dev/null
+digraph {
+ faidx[label="faidx.c\n(faidx)"]
+ import[label="bam_import.c\n(import)"]
+ plcmd[label="bam_plcmd.c\n(pileup)"]
+ sort[label="bam_sort.c\n(sort, merge)"]
+ index[label="bam_index.c\n(index)"]
+ tview[label="bam_tview.c\n(tview)"]
+ glf[label="glf.c\n(glfview)"]
+ rmdup[label="bam_rmdup.c\n(rmdup)"]
+ fixmate[label="bam_mate.c\n(fixmate)"]
+ "bam_aux.c" -> {"bam.c", import}
+ glf -> {"bam_maqcns.c", plcmd}
+ "bgzf.c" -> {"bam.c", glf}
+ "bam.c" -> {index, "bam_pileup.c", sort, import, rmdup, fixmate}
+ "bam_pileup.c" -> {"bam_lpileup.c", plcmd}
+ {"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview
+ {import, faidx, "bam_maqcns.c"} -> plcmd
+ {tview, plcmd, faidx, sort, import, index, glf, rmdup, fixmate} -> "bamtk.c\n(view)"
+}
\ No newline at end of file