Imported Upstream version 0.1.8

author Charles Plessy <plessy@debian.org>

Tue, 13 Jul 2010 12:02:49 +0000 (21:02 +0900)

committer Charles Plessy <plessy@debian.org>

Tue, 13 Jul 2010 12:02:49 +0000 (21:02 +0900)
author Charles Plessy <plessy@debian.org>
Tue, 13 Jul 2010 12:02:49 +0000 (21:02 +0900)
committer Charles Plessy <plessy@debian.org>
Tue, 13 Jul 2010 12:02:49 +0000 (21:02 +0900)
diff --git a/AUTHORS b/AUTHORS

index 435431c9663d9fd55f5bc4dce07dad828a01fa49..95afabb14a29ac827a673003cdca7881fb43bb97 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -14,3 +14,7 @@ used in `faidx' for indexing RAZF compressed fasta files.
  
  Colin Hercus updated novo2sam.pl to support gapped alignment by
  novoalign.
+
+Petr Danecek contributed the header parsing library sam_header.c and
+the sam2vcf.pl script, and added knet support to the RAZF library.
+
diff --git a/ChangeLog b/ChangeLog

index 6b1a695e5918557697bf2c031e2e02236c5733b7..6b0ff6cf4e80c0a9a1e989847e86bd1bba9434fe 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,521 @@
+------------------------------------------------------------------------
+r612 | lh3lh3 | 2010-07-11 21:08:56 -0400 (Sun, 11 Jul 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/knetfile.c
+
+fixed a compiling issue for Windows
+
+------------------------------------------------------------------------
+r611 | lh3lh3 | 2010-07-11 20:59:15 -0400 (Sun, 11 Jul 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_sort.c
+
+fixed a bug in sorting when output to stdout (by Peter Chines)
+
+------------------------------------------------------------------------
+r610 | lh3lh3 | 2010-07-09 17:05:10 -0400 (Fri, 09 Jul 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/NEWS
+   M /trunk/samtools/bam_plcmd.c
+
+change the command line option of pileup
+
+------------------------------------------------------------------------
+r609 | lh3lh3 | 2010-07-09 00:39:34 -0400 (Fri, 09 Jul 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_pileup.c
+   A /trunk/samtools/examples/toy.fa
+   A /trunk/samtools/examples/toy.sam
+
+make pileup work with CIGAR with I/D at the beginning or in the end
+
+------------------------------------------------------------------------
+r608 | lh3lh3 | 2010-07-08 22:36:12 -0400 (Thu, 08 Jul 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_maqcns.h
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bam_tview.c
+
+ * make tview more friendly
+ * a temporary remedy for an issue in indel calling
+
+------------------------------------------------------------------------
+r607 | lh3lh3 | 2010-07-08 14:43:52 -0400 (Thu, 08 Jul 2010) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-r607
+ * improved the genotype accuracy for indels
+ * use the SOAPsnp model for SNP calling by default.
+
+------------------------------------------------------------------------
+r606 | lh3lh3 | 2010-07-08 01:05:19 -0400 (Thu, 08 Jul 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/misc/Makefile
+
+removed a debugging example
+
+------------------------------------------------------------------------
+r605 | lh3lh3 | 2010-07-08 01:04:09 -0400 (Thu, 08 Jul 2010) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam_maqcns.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-18 (r605)
+ * fixed an issue when a deletion and mismatch occur at the same time
+   and the base quality is higher than 40 (if -I40).
+
+------------------------------------------------------------------------
+r604 | lh3lh3 | 2010-07-02 19:32:24 -0400 (Fri, 02 Jul 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/misc/Makefile
+
+fixed a minor bug in idxstats
+
+------------------------------------------------------------------------
+r601 | lh3lh3 | 2010-06-16 09:03:59 -0400 (Wed, 16 Jun 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+
+fixed a minor bug in indexing
+
+------------------------------------------------------------------------
+r600 | lh3lh3 | 2010-06-15 10:17:53 -0400 (Tue, 15 Jun 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bam.c
+
+change printf() to puts in exporting
+
+------------------------------------------------------------------------
+r599 | lh3lh3 | 2010-06-13 21:41:11 -0400 (Sun, 13 Jun 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+
+minor fix. No actual effect.
+
+------------------------------------------------------------------------
+r598 | lh3lh3 | 2010-06-13 21:32:45 -0400 (Sun, 13 Jun 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+
+added Makefile targets to compile shared/dynamic library
+
+------------------------------------------------------------------------
+r596 | lh3lh3 | 2010-06-13 19:48:07 -0400 (Sun, 13 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-17 (r596)
+ * also keep the number of coor-less reads in the index file
+
+------------------------------------------------------------------------
+r595 | lh3lh3 | 2010-06-13 18:54:26 -0400 (Sun, 13 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-16 (r595)
+ * write additional information to bam index
+
+------------------------------------------------------------------------
+r594 | lh3lh3 | 2010-06-13 17:29:52 -0400 (Sun, 13 Jun 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+
+fixed a bug for unmapped sequences in indexing
+
+------------------------------------------------------------------------
+r593 | lh3lh3 | 2010-06-12 18:11:32 -0400 (Sat, 12 Jun 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/samtools.1
+
+rename iterf as iter
+
+------------------------------------------------------------------------
+r592 | lh3lh3 | 2010-06-12 18:02:38 -0400 (Sat, 12 Jun 2010) | 4 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bam_aux.c
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-15 (r592)
+ * fixed a few minor memory leaks in the new pileup code
+ * improved the functionality of mpileup
+
+------------------------------------------------------------------------
+r591 | lh3lh3 | 2010-06-12 14:09:22 -0400 (Sat, 12 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-14 (r591)
+ * elementary multi-way pileup. More testing and more functionality to be done.
+
+------------------------------------------------------------------------
+r590 | lh3lh3 | 2010-06-12 01:00:24 -0400 (Sat, 12 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-13 (r590)
+ * added mpileup APIs. No compiling errors, but not tested at all. It is late.
+
+------------------------------------------------------------------------
+r589 | lh3lh3 | 2010-06-11 22:37:09 -0400 (Fri, 11 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_pileup.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-12 (r589)
+ * added iterator-like APIs for pileup
+
+------------------------------------------------------------------------
+r588 | lh3lh3 | 2010-06-11 17:41:13 -0400 (Fri, 11 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-11 (r588)
+ * ported a few improvements from tabix back to samtools
+
+------------------------------------------------------------------------
+r587 | lh3lh3 | 2010-06-11 17:33:16 -0400 (Fri, 11 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_index.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-10 (r587)
+ * added iterator interface for bam_fetch (ported back from tabix)
+
+------------------------------------------------------------------------
+r586 | lh3lh3 | 2010-06-11 13:23:53 -0400 (Fri, 11 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   A /trunk/samtools/bam_reheader.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/bgzf.c
+
+ * samtools-0.1.7-9 (r586)
+ * added "reheader" to replace the BAM header
+
+------------------------------------------------------------------------
+r585 | lh3lh3 | 2010-06-11 12:22:06 -0400 (Fri, 11 Jun 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/kstring.h
+
+ * samtools-0.1.7-8 (r585)
+ * speed up "view"
+
+------------------------------------------------------------------------
+r584 | lh3lh3 | 2010-06-11 12:00:41 -0400 (Fri, 11 Jun 2010) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/bgzf.c
+   M /trunk/samtools/bgzf.h
+   M /trunk/samtools/kstring.h
+   M /trunk/samtools/misc/wgsim_eval.pl
+
+ * samtools-0.1.7-7 (r584)
+ * ported tabix BGZF to samtools
+ * flush BGZF after writing the BAM header and between alignment boundaries
+
+------------------------------------------------------------------------
+r583 | petulda | 2010-06-11 11:58:20 -0400 (Fri, 11 Jun 2010) | 1 line
+Changed paths:
+   A /trunk/samtools/misc/varfilter.py
+
+Initial release on behalf of Aylwyn Scally
+------------------------------------------------------------------------
+r561 | petulda | 2010-05-07 08:41:56 -0400 (Fri, 07 May 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/samtools.1
+
+Added a note about the indels coordinates
+------------------------------------------------------------------------
+r551 | petulda | 2010-04-23 09:42:13 -0400 (Fri, 23 Apr 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/sam2vcf.pl
+
+Added the possibility to print or not to print the reference allele
+------------------------------------------------------------------------
+r546 | petulda | 2010-04-15 04:33:55 -0400 (Thu, 15 Apr 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/sam_header.c
+
+More descriptive message for space separated tags
+------------------------------------------------------------------------
+r545 | petulda | 2010-04-14 11:44:50 -0400 (Wed, 14 Apr 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/sam2vcf.pl
+
+Speedup with -i, no need to query the reference all the time
+------------------------------------------------------------------------
+r541 | petulda | 2010-03-15 10:03:51 -0400 (Mon, 15 Mar 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/sam_header.c
+
+Fixed the order of sequences in the header
+------------------------------------------------------------------------
+r540 | petulda | 2010-03-04 06:28:35 -0500 (Thu, 04 Mar 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/sam2vcf.pl
+
+Added possibility to select indels only and fixed a bug in reporting homozygous indels.
+------------------------------------------------------------------------
+r539 | jmarshall | 2010-02-27 06:48:17 -0500 (Sat, 27 Feb 2010) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+
+Improve the invalid 'BAM\1' magic number error message, and also print it
+when no bytes can be read from the alleged BAM file, e.g., in the common
+user error case when a SAM file has accidentally been supplied.
+
+------------------------------------------------------------------------
+r538 | petulda | 2010-02-26 10:51:40 -0500 (Fri, 26 Feb 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/AUTHORS
+   M /trunk/samtools/bam.h
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/sam_header.c
+
+Improved efficiency of header parsing
+------------------------------------------------------------------------
+r537 | lh3lh3 | 2010-02-23 21:08:48 -0500 (Tue, 23 Feb 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/misc/export2sam.pl
+
+Updated export2sam.pl by Chris Saunders from Illumina.
+
+
+------------------------------------------------------------------------
+r536 | petulda | 2010-02-17 08:32:53 -0500 (Wed, 17 Feb 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/samtools.pl
+
+Fixed filtering of SNPs near indels. Added min indel and SNP quality filter.
+------------------------------------------------------------------------
+r535 | petulda | 2010-02-12 04:52:37 -0500 (Fri, 12 Feb 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/sam2vcf.pl
+
+Print an error for pileups in simple format
+------------------------------------------------------------------------
+r534 | lh3lh3 | 2010-02-11 14:01:41 -0500 (Thu, 11 Feb 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+
+added a hidden option in pileup to output the base position (for Erin)
+
+------------------------------------------------------------------------
+r533 | petulda | 2010-02-09 10:12:14 -0500 (Tue, 09 Feb 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/sam2vcf.pl
+
+Added possibility to specify a custom column title for the data column
+------------------------------------------------------------------------
+r532 | petulda | 2010-02-09 09:46:09 -0500 (Tue, 09 Feb 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+
+Added the -d option to limit maximum depth for indels.
+------------------------------------------------------------------------
+r531 | petulda | 2010-02-03 07:57:27 -0500 (Wed, 03 Feb 2010) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/sam2vcf.pl
+
+Added VCF header
+------------------------------------------------------------------------
+r530 | lh3lh3 | 2010-02-01 09:13:19 -0500 (Mon, 01 Feb 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/misc/samtools.pl
+   M /trunk/samtools/misc/wgsim.c
+
+ * samtools-0.1.7-6
+ * fixed a bug in faidx
+
+------------------------------------------------------------------------
+r529 | jmarshall | 2010-01-11 18:51:49 -0500 (Mon, 11 Jan 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/faidx.c
+
+Put the right filename in the error message.
+
+------------------------------------------------------------------------
+r528 | lh3lh3 | 2009-12-14 11:26:47 -0500 (Mon, 14 Dec 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-5 (r528)
+ * further add new consensus generation strategy
+
+------------------------------------------------------------------------
+r527 | petulda | 2009-12-11 12:31:05 -0500 (Fri, 11 Dec 2009) | 1 line
+Changed paths:
+   M /trunk/samtools/knetfile.c
+
+Fixed a bug in knet_seek
+------------------------------------------------------------------------
+r526 | petulda | 2009-12-11 07:51:18 -0500 (Fri, 11 Dec 2009) | 1 line
+Changed paths:
+   M /trunk/samtools/misc/sam2vcf.pl
+
+Small fix in VCF format: dot for the empty INFO field
+------------------------------------------------------------------------
+r525 | petulda | 2009-12-11 04:36:18 -0500 (Fri, 11 Dec 2009) | 1 line
+Changed paths:
+   M /trunk/samtools/sam_header.c
+
+Allow tabs in the CO header field
+------------------------------------------------------------------------
+r524 | jmarshall | 2009-12-10 10:03:58 -0500 (Thu, 10 Dec 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/Makefile.mingw
+
+Depend on libbam.a rather than the phony target, so that samtools is not
+unnecessarily rebuilt every time.  Also clean bgzip.
+
+------------------------------------------------------------------------
+r523 | jmarshall | 2009-12-10 09:45:32 -0500 (Thu, 10 Dec 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/Makefile.mingw
+
+Fix a bug in compiling bgzip: this also needs knetfile.o when _USE_KNETFILE
+is defined.  Also introduce $(KNETFILE_O) which can be set to empty to
+facilitate non-knet builds.
+
+------------------------------------------------------------------------
+r522 | lh3lh3 | 2009-12-01 13:02:36 -0500 (Tue, 01 Dec 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.7-4 (r522)
+ * fixed a bug in "view -r"
+ * added a new option "view -R" to read required read groups from a file
+
+------------------------------------------------------------------------
+r521 | lh3lh3 | 2009-12-01 10:00:12 -0500 (Tue, 01 Dec 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_md.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.7-3 (r521)
+ * calmd: optionally mask matching bases as N
+
+------------------------------------------------------------------------
+r520 | lh3lh3 | 2009-12-01 09:37:17 -0500 (Tue, 01 Dec 2009) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/misc/samtools.pl
+
+ * samtools-0.1.7-2 (r520)
+ * fixed a few issues with compilation in Windows (on behalf of John)
+ * choose a random base as the consensus (for population genetics studies)
+
+------------------------------------------------------------------------
+r519 | jmarshall | 2009-11-30 10:53:02 -0500 (Mon, 30 Nov 2009) | 6 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+
+Put libraries at the end, so they can resolve references from libbam.a
+as well, even with old-fashioned linkers.
+
+Also use libbam.a explicitly rather than "-L. -lbam" to ensure that we get
+the freshly built library, not some other libbam.a lying around the system.
+
+------------------------------------------------------------------------
+r518 | jmarshall | 2009-11-30 08:44:56 -0500 (Mon, 30 Nov 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+   M /trunk/samtools/misc/Makefile
+
+Also clean *.exe (for Cygwin users using this makefile).
+
+------------------------------------------------------------------------
+r517 | jmarshall | 2009-11-30 07:09:04 -0500 (Mon, 30 Nov 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam_index.c
+
+Index files should be opened in binary mode, not text mode.
+
+------------------------------------------------------------------------
+r516 | lh3lh3 | 2009-11-27 15:18:59 -0500 (Fri, 27 Nov 2009) | 2 lines
+Changed paths:
+   A /trunk/samtools/examples/bam2bed.c
+
+another example program
+
+------------------------------------------------------------------------
+r515 | lh3lh3 | 2009-11-27 10:44:56 -0500 (Fri, 27 Nov 2009) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam_import.c
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/misc/wgsim_eval.pl
+   M /trunk/samtools/sam.c
+
+ * samtools-0.1.7-1 (r515)
+ * report an error when .fai contains duplicated names, instead of segfault
+
+------------------------------------------------------------------------
+r514 | jmarshall | 2009-11-24 09:45:35 -0500 (Tue, 24 Nov 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam.c
+
+Format 'c'-encoded auxiliary fields correctly, as *signed* integers.
+
+------------------------------------------------------------------------
+r513 | lh3lh3 | 2009-11-16 10:13:07 -0500 (Mon, 16 Nov 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile.mingw
+
+Update Makefile.mingw for the same reason
+
+------------------------------------------------------------------------
+r512 | lh3lh3 | 2009-11-16 10:00:08 -0500 (Mon, 16 Nov 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/Makefile
+
+Fixed a bug in compiling razip
+
+------------------------------------------------------------------------
+r510 | lh3lh3 | 2009-11-10 10:55:41 -0500 (Tue, 10 Nov 2009) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/NEWS
+   M /trunk/samtools/bamtk.c
+   M /trunk/samtools/samtools.1
+
+Release samtools-0.1.7 (r510)
+
  ------------------------------------------------------------------------
  r509 | lh3lh3 | 2009-11-06 09:17:09 -0500 (Fri, 06 Nov 2009) | 3 lines
  Changed paths:
diff --git a/Makefile b/Makefile

index f3fb7a01365ee567c6bda338899c178c15230d73..35d578ffe50ded7ff4eb81caa084dbb245b17e5a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,10 @@
  CC=                    gcc
  CFLAGS=                -g -Wall -O2 #-m64 #-arch ppc
  DFLAGS=                -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1
+KNETFILE_O=    knetfile.o
  LOBJS=         bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
-                       bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o     \
-                       bam_sort.o sam_header.o
+                       bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \
+                       $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o
  AOBJS=         bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
                         bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o     \
                         bamtk.o kaln.o
@@ -30,19 +31,22 @@ all-recur lib-recur clean-recur cleanlocal-recur install-recur:
  
  all:$(PROG)
  
+.PHONY:all lib clean cleanlocal
+.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
  lib:libbam.a
  
  libbam.a:$(LOBJS)
                 $(AR) -cru $@ $(LOBJS)
  
-samtools:lib $(AOBJS)
-               $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) $(LIBCURSES) -lz -L. -lbam
+samtools:$(AOBJS) libbam.a
+               $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz
  
-razip:razip.o razf.o knetfile.o
-               $(CC) $(CFLAGS) -o $@ razf.o razip.o knetfile.o -lz
+razip:razip.o razf.o $(KNETFILE_O)
+               $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz
  
-bgzip:bgzip.o bgzf.o
-               $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz
+bgzip:bgzip.o bgzf.o $(KNETFILE_O)
+               $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz
  
  razip.o:razf.h
  bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
@@ -62,7 +66,23 @@ sam_header.o:sam_header.h khash.h
  faidx.o:faidx.h razf.h khash.h
  faidx_main.o:faidx.h razf.h
  
+
+libbam.1.dylib-local:$(LOBJS)
+               libtool -dynamic $(LOBJS) -o libbam.1.dylib -lc -lz
+
+libbam.so.1-local:$(LOBJS)
+               $(CC) -shared -Wl,-soname,libbam.so -o libbam.so.1 $(LOBJS) -lc -lz
+
+dylib:
+               @$(MAKE) cleanlocal; \
+               case `uname` in \
+                       Linux) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.so.1-local;; \
+                       Darwin) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.1.dylib-local;; \
+                       *) echo 'Unknown OS';; \
+               esac
+
+
  cleanlocal:
-               rm -fr gmon.out *.o a.out *.dSYM razip $(PROG) *~ *.a
+               rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib
  
  clean:cleanlocal-recur
diff --git a/Makefile.mingw b/Makefile.mingw

index f1ae1be081ee976f9a883defdb64cee1724bb9fc..9df4b9ad8166a66ff8545b0a4c7d9e908ecd60a6 100644 (file)
--- a/Makefile.mingw
+++ b/Makefile.mingw
@@ -2,9 +2,10 @@ CC=                    gcc.exe
  AR=                    ar.exe
  CFLAGS=                -g -Wall -O2
  DFLAGS=                -D_CURSES_LIB=2 -D_USE_KNETFILE
+KNETFILE_O=    knetfile.o
  LOBJS=         bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
                         bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bam_sort.o \
-                       knetfile.o
+                       $(KNETFILE_O)
  AOBJS=         bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
                         bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o     \
                         bamtk.o kaln.o sam_header.o
@@ -20,19 +21,22 @@ LIBPATH=
  
  all:$(PROG)
  
+.PHONY:all lib clean cleanlocal
+.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
  lib:libbam.a
  
  libbam.a:$(LOBJS)
                 $(AR) -cru $@ $(LOBJS)
  
-samtools:lib $(AOBJS)
+samtools:$(AOBJS) libbam.a
                 $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32
  
-razip:razip.o razf.o knetfile.o
-               $(CC) $(CFLAGS) -o $@ razf.o razip.o knetfile.o -lz
+razip:razip.o razf.o $(KNETFILE_O)
+               $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz
  
-bgzip:bgzip.o bgzf.o
-               $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz
+bgzip:bgzip.o bgzf.o $(KNETFILE_O)
+               $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz
  
  razip.o:razf.h
  bam.o:bam.h razf.h bam_endian.h kstring.h
@@ -52,4 +56,4 @@ faidx.o:faidx.h razf.h khash.h
  faidx_main.o:faidx.h razf.h
  
  clean:
-               rm -fr gmon.out *.o *.exe *.dSYM razip $(PROG) *~ *.a
+               rm -fr gmon.out *.o *.exe *.dSYM razip bgzip $(PROG) *~ *.a
diff --git a/NEWS b/NEWS

index 8db09960a431676632e7e03649f17001076ddf06..28d6aaa81269b72f0b8cb26a971c5bd55e33c050 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,82 @@
+Beta Release 0.1.8 (11 July, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable functional changes:
+
+ * Added the `reheader' command which replaces a BAM header with a new
+   header. This command is much faster than replacing header by
+   BAM->SAM->BAM conversions.
+
+ * Added the `mpileup' command which computes the pileup of multiple
+   alignments.
+
+ * The `index' command now stores the number of mapped and unmapped
+   reads in the index file. This information can be retrieved quickly by
+   the new `idxstats' command.
+
+ * By default, pileup uses the SOAPsnp model for SNP calling. This
+   avoids the floating-point overflow in the MAQ model which leads to
+   spurious calls in repetitive regions, although these calls will be
+   immediately filtered by varFilter.
+
+ * The `tview' command now correctly handles CIGARs like 7I10M and
+   10M1P1I10M which cause assertion failure in earlier versions.
+
+ * Tview accepts a region like `=10,000' where `=' stands for the
+   current sequence name. This saves typing for long sequence names.
+
+ * Added the `-d' option to `pileup' which avoids slow indel calling
+   in ultradeep regions by subsampling reads locally.
+
+ * Added the `-R' option to `view' which retrieves alignments in read
+   groups listed in the specified file.
+
+Performance improvements:
+
+ * The BAM->SAM conversion is up to twice as fast, depending on the
+   characteristics of the input.
+
+ * Parsing SAM headers with a lot of reference sequences is now much
+   faster.
+
+ * The number of lseek() calls per query is reduced when the query
+   region contains no read alignments.
+
+Bug fixes:
+
+ * Fixed an issue in the indel caller that leads to miscall of indels.
+   Note that this solution may not work well when the sequencing indel
+   error rate is higher than the rate of SNPs.
+
+ * Fixed another issue in the indel caller which may lead to incorrect
+   genotype.
+
+ * Fixed a bug in `sort' when option `-o' is applied.
+
+ * Fixed a bug in `view -r'.
+
+APIs and other changes:
+
+ * Added iterator interfaces to random access and pileup. The callback
+   interfaces directly call the iterator interfaces.
+
+ * The BGZF blocks holding the BAM header are independent of alignment
+   BGZF blocks. Alignment records shorter than 64kB are guaranteed to be
+   fully contained in one BGZF block. This change is fully compatible
+   with the old version of samtools/picard.
+
+Changes in other utilities:
+
+ * Updated export2sam.pl by Chris Saunders.
+
+ * Improved the sam2vcf.pl script.
+
+ * Added a Python version of varfilter.py by Aylwyn Scally.
+
+(0.1.8: 11 July 2010, r613)
+
+
+
  Beta Release 0.1.7 (10 November, 2009)
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  
diff --git a/bam.c b/bam.c

index ee7642b3434248c9b44ef918435f150f101cae00..94b0aa8bb85bef4ae4a787af3337f1765ed8899e 100644 (file)
--- a/bam.c
+++ b/bam.c
@@ -70,6 +70,7 @@ bam_header_t *bam_header_read(bamFile fp)
  {
         bam_header_t *header;
         char buf[4];
+       int magic_len;
         int32_t i = 1, name_len;
         // check EOF
         i = bgzf_check_EOF(fp);
@@ -80,9 +81,9 @@ bam_header_t *bam_header_read(bamFile fp)
         }
         else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n");
         // read "BAM1"
-       if (bam_read(fp, buf, 4) != 4) return 0;
-       if (strncmp(buf, "BAM\001", 4)) {
-               fprintf(stderr, "[bam_header_read] wrong header\n");
+       magic_len = bam_read(fp, buf, 4);
+       if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
+               fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
                 return 0;
         }
         header = bam_header_init();
@@ -140,6 +141,7 @@ int bam_header_write(bamFile fp, const bam_header_t *header)
                         bam_write(fp, &x, 4);
                 } else bam_write(fp, &header->target_len[i], 4);
         }
+       bgzf_flush(fp);
         return 0;
  }
  
@@ -207,6 +209,7 @@ inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8
         x[5] = c->mtid;
         x[6] = c->mpos;
         x[7] = c->isize;
+       bgzf_flush_try(fp, 4 + block_len);
         if (bam_is_be) {
                 for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
                 y = block_len;
@@ -232,8 +235,8 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
         kstring_t str;
         str.l = str.m = 0; str.s = 0;
  
-       ksprintf(&str, "%s\t", bam1_qname(b));
-       if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag);
+       kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str);
+       if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); }
         else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
         else { // BAM_OFSTR
                 for (i = 0; i < 16; ++i)
@@ -241,41 +244,43 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
                                 kputc(bam_flag2char_table[i], &str);
                 kputc('\t', &str);
         }
-       if (c->tid < 0) kputs("*\t", &str);
-       else ksprintf(&str, "%s\t", header->target_name[c->tid]);
-       ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual);
+       if (c->tid < 0) kputsn("*\t", 2, &str);
+       else { kputs(header->target_name[c->tid], &str); kputc('\t', &str); }
+       kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
         if (c->n_cigar == 0) kputc('*', &str);
         else {
-               for (i = 0; i < c->n_cigar; ++i)
-                       ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]);
+               for (i = 0; i < c->n_cigar; ++i) {
+                       kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
+                       kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
+               }
         }
         kputc('\t', &str);
-       if (c->mtid < 0) kputs("*\t", &str);
-       else if (c->mtid == c->tid) kputs("=\t", &str);
-       else ksprintf(&str, "%s\t", header->target_name[c->mtid]);
-       ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize);
+       if (c->mtid < 0) kputsn("*\t", 2, &str);
+       else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
+       else { kputs(header->target_name[c->mtid], &str); kputc('\t', &str); }
+       kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
         if (c->l_qseq) {
                 for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
                 kputc('\t', &str);
                 if (t[0] == 0xff) kputc('*', &str);
                 else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
-       } else ksprintf(&str, "*\t*");
+       } else kputsn("*\t*", 3, &str);
         s = bam1_aux(b);
         while (s < b->data + b->data_len) {
                 uint8_t type, key[2];
                 key[0] = s[0]; key[1] = s[1];
                 s += 2; type = *s; ++s;
-               ksprintf(&str, "\t%c%c:", key[0], key[1]);
-               if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; }
-               else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; }
-               else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; }
-               else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; }
-               else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; }
-               else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; }
-               else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; }
+               kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
+               if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
+               else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }
+               else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }
+               else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; }
+               else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; }
+               else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; }
+               else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
                 else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
                 else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
-               else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; }
+               else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
         }
         return str.s;
  }
@@ -288,7 +293,7 @@ char *bam_format1(const bam_header_t *header, const bam1_t *b)
  void bam_view1(const bam_header_t *header, const bam1_t *b)
  {
         char *s = bam_format1(header, b);
-       printf("%s\n", s);
+       puts(s);
         free(s);
  }
  
diff --git a/bam.h b/bam.h

index 291b30314f75d3d882b1d5dc9960e7b709a50c02..8e26ea69d3b7c6b008aba6112052310a4072b08a 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -87,7 +87,7 @@ typedef struct {
         char **target_name;
         uint32_t *target_len;
         void *dict, *hash, *rg2lib;
-       int l_text;
+       size_t l_text, n_text;
         char *text;
  } bam_header_t;
  
@@ -190,6 +190,8 @@ typedef struct {
         uint8_t *data;
  } bam1_t;
  
+typedef struct __bam_iter_t *bam_iter_t;
+
  #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
  #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
  
@@ -272,6 +274,10 @@ extern char bam_nt16_nt4_table[];
  extern "C" {
  #endif
  
+       /*********************
+        * Low-level SAM I/O *
+        *********************/
+
         /*! @abstract TAM file handler */
         typedef struct __tamFile_t *tamFile;
  
@@ -323,6 +329,7 @@ extern "C" {
           be destroyed in the first place.
          */
         int sam_header_parse(bam_header_t *h);
+       int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
  
         /*!
           @abstract       Parse @RG lines a update a header struct
@@ -336,12 +343,22 @@ extern "C" {
  
  #define sam_write1(header, b) bam_view1(header, b)
  
+
+       /********************************
+        * APIs for string dictionaries *
+        ********************************/
+
         int bam_strmap_put(void *strmap, const char *rg, const char *lib);
         const char *bam_strmap_get(const void *strmap, const char *rg);
         void *bam_strmap_dup(const void*);
         void *bam_strmap_init();
         void bam_strmap_destroy(void *strmap);
  
+
+       /*********************
+        * Low-level BAM I/O *
+        *********************/
+
         /*!
           @abstract Initialize a header structure.
           @return   the pointer to the header structure
@@ -440,6 +457,11 @@ extern "C" {
  
         const char *bam_get_library(bam_header_t *header, const bam1_t *b);
  
+
+       /***************
+        * pileup APIs *
+        ***************/
+
         /*! @typedef
           @abstract Structure for one alignment covering the pileup position.
           @field  b      pointer to the alignment
@@ -461,11 +483,25 @@ extern "C" {
                 uint32_t is_del:1, is_head:1, is_tail:1;
         } bam_pileup1_t;
  
-       struct __bam_plbuf_t;
-       /*! @abstract pileup buffer */
-       typedef struct __bam_plbuf_t bam_plbuf_t;
+       typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
  
-       void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+       struct __bam_plp_t;
+       typedef struct __bam_plp_t *bam_plp_t;
+
+       bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
+       int bam_plp_push(bam_plp_t iter, const bam1_t *b);
+       const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+       const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+       void bam_plp_set_mask(bam_plp_t iter, int mask);
+       void bam_plp_reset(bam_plp_t iter);
+       void bam_plp_destroy(bam_plp_t iter);
+
+       struct __bam_mplp_t;
+       typedef struct __bam_mplp_t *bam_mplp_t;
+
+       bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
+       void bam_mplp_destroy(bam_mplp_t iter);
+       int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
  
         /*! @typedef
           @abstract    Type of function to be called by bam_plbuf_push().
@@ -478,44 +514,16 @@ extern "C" {
          */
         typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
  
-       /*!
-         @abstract     Reset a pileup buffer for another pileup process
-         @param  buf   the pileup buffer to be reset
-        */
-       void bam_plbuf_reset(bam_plbuf_t *buf);
+       typedef struct {
+               bam_plp_t iter;
+               bam_pileup_f func;
+               void *data;
+       } bam_plbuf_t;
  
-       /*!
-         @abstract     Initialize a buffer for pileup.
-         @param  func  fucntion to be called by bam_pileup_core()
-         @param  data  user provided data
-         @return       pointer to the pileup buffer
-        */
+       void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+       void bam_plbuf_reset(bam_plbuf_t *buf);
         bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
-
-       /*!
-         @abstract    Destroy a pileup buffer.
-         @param  buf  pointer to the pileup buffer
-        */
         void bam_plbuf_destroy(bam_plbuf_t *buf);
-
-       /*!
-         @abstract    Push an alignment to the pileup buffer.
-         @param  b    alignment to be pushed
-         @param  buf  pileup buffer
-         @see         bam_plbuf_init()
-         @return      always 0 currently
-
-         @discussion If all the alignments covering a particular site have
-         been collected, this function will call the user defined function
-         as is provided to bam_plbuf_init(). The coordinate of the site and
-         all the alignments will be transferred to the user defined
-         function as function parameters.
-        
-         When all the alignments are pushed to the buffer, this function
-         needs to be called with b equal to NULL. This will flush the
-         buffer. A pileup buffer can only be reused when bam_plbuf_reset()
-         is called.
-        */
         int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
  
         int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
@@ -534,6 +542,11 @@ extern "C" {
         /*! @abstract  bam_plbuf_push() equivalent with level calculated. */
         int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
  
+
+       /*********************
+        * BAM indexing APIs *
+        *********************/
+
         struct __bam_index_t;
         typedef struct __bam_index_t bam_index_t;
  
@@ -582,6 +595,10 @@ extern "C" {
          */
         int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
  
+       bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
+       int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
+       void bam_iter_destroy(bam_iter_t iter);
+
         /*!
           @abstract       Parse a region in the format: "chr2:100,000-200,000".
           @discussion     bam_header_t::hash will be initialized if empty.
@@ -594,6 +611,11 @@ extern "C" {
          */
         int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
  
+
+       /**************************
+        * APIs for optional tags *
+        **************************/
+
         /*!
           @abstract       Retrieve data of a tag
           @param  b       pointer to an alignment struct
@@ -617,6 +639,11 @@ extern "C" {
         void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
         uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
  
+
+       /*****************
+        * Miscellaneous *
+        *****************/
+
         /*!  
           @abstract Calculate the rightmost coordinate of an alignment on the
           reference genome.
diff --git a/bam_aux.c b/bam_aux.c

index 89e99f281adb652e144eea0a77074a1c6cec023a..fbcd9822b233dab64f6c861fb332846acb951ce7 100644 (file)
--- a/bam_aux.c
+++ b/bam_aux.c
@@ -115,7 +115,7 @@ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *be
         *ref_id = kh_value(h, iter);
         if (i == k) { /* dump the whole sequence */
                 *begin = 0; *end = 1<<29; free(s);
-               return -1;
+               return 0;
         }
         for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
         *begin = atoi(p);
diff --git a/bam_import.c b/bam_import.c

index 9d463d102bbb254ddf5a5b9bf32b5f216aac2187..9d84328bdbdafb5f4e76a19e7edf176121aca5b4 100644 (file)
--- a/bam_import.c
+++ b/bam_import.c
@@ -116,7 +116,7 @@ static bam_header_t *hash2header(const kh_ref_t *hash)
  bam_header_t *sam_header_read2(const char *fn)
  {
         bam_header_t *header;
-       int c, dret, ret;
+       int c, dret, ret, error = 0;
         gzFile fp;
         kstream_t *ks;
         kstring_t *str;
@@ -135,6 +135,10 @@ bam_header_t *sam_header_read2(const char *fn)
                 ks_getuntil(ks, 0, str, &dret);
                 len = atoi(str->s);
                 k = kh_put(ref, hash, s, &ret);
+               if (ret == 0) {
+                       fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s);
+                       error = 1;
+               }
                 kh_value(hash, k) = (uint64_t)len<<32 | i;
                 if (dret != '\n')
                         while ((c = ks_getc(ks)) != '\n' && c != -1);
@@ -143,6 +147,7 @@ bam_header_t *sam_header_read2(const char *fn)
         gzclose(fp);
         free(str->s); free(str);
         fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+       if (error) return 0;
         header = hash2header(hash);
         kh_destroy(ref, hash);
         return header;
@@ -163,9 +168,24 @@ static inline void parse_error(int64_t n_lines, const char * __restrict msg)
  }
  static inline void append_text(bam_header_t *header, kstring_t *str)
  {
-       int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
+       size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
         kroundup32(x); kroundup32(y);
-       if (x < y) header->text = (char*)realloc(header->text, y);
+       if (x < y) 
+    {
+        header->n_text = y;
+        header->text = (char*)realloc(header->text, y);
+        if ( !header->text ) 
+        {
+            fprintf(stderr,"realloc failed to alloc %ld bytes\n", y);
+            abort();
+        }
+    }
+    // Sanity check
+    if ( header->l_text+str->l+1 >= header->n_text )
+    {
+        fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n",  header->l_text+str->l+1,header->n_text,x,y);
+        abort();
+    }
         strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
         header->l_text += str->l + 1;
         header->text[header->l_text] = 0;
diff --git a/bam_index.c b/bam_index.c

index a6278843583c605799360affb0b9c3c21fbd02c4..4152f2022f3178ef4fd0ee6b6001effae8f2ce3c 100644 (file)
--- a/bam_index.c
+++ b/bam_index.c
@@ -42,6 +42,8 @@
  // 1<<14 is the size of minimum bin.
  #define BAM_LIDX_SHIFT    14
  
+#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
+
  typedef struct {
         uint64_t u, v;
  } pair64_t;
@@ -63,6 +65,7 @@ KHASH_MAP_INIT_INT(i, bam_binlist_t)
  
  struct __bam_index_t {
         int32_t n;
+       uint64_t n_no_coor; // unmapped reads without coordinate
         khash_t(i) **index;
         bam_lidx_t *index2;
  };
@@ -98,8 +101,12 @@ static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset
                 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
                 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
         }
-       for (i = beg + 1; i <= end; ++i)
-               if (index2->offset[i] == 0) index2->offset[i] = offset;
+       if (beg == end) {
+               if (index2->offset[beg] == 0) index2->offset[beg] = offset;
+       } else {
+               for (i = beg; i <= end; ++i)
+                       if (index2->offset[i] == 0) index2->offset[i] = offset;
+       }
         index2->n = end + 1;
  }
  
@@ -113,7 +120,7 @@ static void merge_chunks(bam_index_t *idx)
                 index = idx->index[i];
                 for (k = kh_begin(index); k != kh_end(index); ++k) {
                         bam_binlist_t *p;
-                       if (!kh_exist(index, k)) continue;
+                       if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
                         p = &kh_value(index, k);
                         m = 0;
                         for (l = 1; l < p->n; ++l) {
@@ -130,6 +137,17 @@ static void merge_chunks(bam_index_t *idx)
  #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
  }
  
+static void fill_missing(bam_index_t *idx)
+{ // give every still-zero linear-index slot the offset stored in the slot before it
+       int i, j;
+       for (i = 0; i < idx->n; ++i) { // one linear index (index2[i]) per entry counted by idx->n
+               bam_lidx_t *idx2 = &idx->index2[i];
+               for (j = 1; j < idx2->n; ++j) // slot 0 has no predecessor and is left untouched
+                       if (idx2->offset[j] == 0)
+                               idx2->offset[j] = idx2->offset[j-1];
+       }
+}
+
  bam_index_t *bam_index_core(bamFile fp)
  {
         bam1_t *b;
@@ -139,7 +157,7 @@ bam_index_t *bam_index_core(bamFile fp)
         uint32_t last_bin, save_bin;
         int32_t last_coor, last_tid, save_tid;
         bam1_core_t *c;
-       uint64_t save_off, last_off;
+       uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
  
         idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
         b = (bam1_t*)calloc(1, sizeof(bam1_t));
@@ -154,7 +172,10 @@ bam_index_t *bam_index_core(bamFile fp)
  
         save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
         save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+    n_mapped = n_unmapped = n_no_coor = off_end = 0;
+       off_beg = off_end = bam_tell(fp);
         while ((ret = bam_read1(fp, b)) >= 0) {
+               if (c->tid < 0) ++n_no_coor;
                 if (last_tid != c->tid) { // change of chromosomes
                         last_tid = c->tid;
                         last_bin = 0xffffffffu;
@@ -163,10 +184,17 @@ bam_index_t *bam_index_core(bamFile fp)
                                         bam1_qname(b), last_coor, c->pos, c->tid+1);
                         exit(1);
                 }
-               if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+               if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
                 if (c->bin != last_bin) { // then possibly write the binning index
                         if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
                                 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
+                       if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
+                               off_end = last_off;
+                               insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
+                               insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+                               n_mapped = n_unmapped = 0;
+                               off_beg = off_end;
+                       }
                         save_off = last_off;
                         save_bin = last_bin = c->bin;
                         save_tid = c->tid;
@@ -177,13 +205,23 @@ bam_index_t *bam_index_core(bamFile fp)
                                         (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
                         exit(1);
                 }
+               if (c->flag & BAM_FUNMAP) ++n_unmapped;
+               else ++n_mapped;
                 last_off = bam_tell(fp);
                 last_coor = b->core.pos;
         }
-       if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+       if (save_tid >= 0) {
+               insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+               insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
+               insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+       }
         merge_chunks(idx);
+       fill_missing(idx);
+       if (ret >= 0)
+               while ((ret = bam_read1(fp, b)) >= 0) ++n_no_coor;
         if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
         free(b->data); free(b);
+       idx->n_no_coor = n_no_coor;
         return idx;
  }
  
@@ -261,6 +299,11 @@ void bam_index_save(const bam_index_t *idx, FILE *fp)
                                 bam_swap_endian_8p(&index2->offset[x]);
                 } else fwrite(index2->offset, 8, index2->n, fp);
         }
+       { // write the number of reads coor-less records.
+               uint64_t x = idx->n_no_coor;
+               if (bam_is_be) bam_swap_endian_8p(&x);
+               fwrite(&x, 8, 1, fp);
+       }
         fflush(fp);
  }
  
@@ -322,6 +365,8 @@ static bam_index_t *bam_index_load_core(FILE *fp)
                 if (bam_is_be)
                         for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
         }
+       if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
+       if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
         return idx;
  }
  
@@ -339,13 +384,13 @@ bam_index_t *bam_index_load_local(const char *_fn)
         } else fn = strdup(_fn);
         fnidx = (char*)calloc(strlen(fn) + 5, 1);
         strcpy(fnidx, fn); strcat(fnidx, ".bai");
-       fp = fopen(fnidx, "r");
+       fp = fopen(fnidx, "rb");
         if (fp == 0) { // try "{base}.bai"
                 char *s = strstr(fn, "bam");
                 if (s == fn + strlen(fn) - 3) {
                         strcpy(fnidx, fn);
                         fnidx[strlen(fn)-1] = 'i';
-                       fp = fopen(fnidx, "r");
+                       fp = fopen(fnidx, "rb");
                 }
         }
         free(fnidx); free(fn);
@@ -375,7 +420,7 @@ static void download_from_remote(const char *url)
                 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
                 return;
         }
-       if ((fp = fopen(fn, "w")) == 0) {
+       if ((fp = fopen(fn, "wb")) == 0) {
                 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
                 knet_close(fp_remote);
                 return;
@@ -425,7 +470,7 @@ int bam_index_build2(const char *fn, const char *_fnidx)
                 fnidx = (char*)calloc(strlen(fn) + 5, 1);
                 strcpy(fnidx, fn); strcat(fnidx, ".bai");
         } else fnidx = strdup(_fnidx);
-       fpidx = fopen(fnidx, "w");
+       fpidx = fopen(fnidx, "wb");
         if (fpidx == 0) {
                 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
                 free(fnidx);
@@ -446,7 +491,7 @@ int bam_index_build(const char *fn)
  int bam_index(int argc, char *argv[])
  {
         if (argc < 2) {
-               fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
+               fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
                 return 1;
         }
         if (argc >= 3) bam_index_build2(argv[1], argv[2]);
@@ -454,11 +499,43 @@ int bam_index(int argc, char *argv[])
         return 0;
  }
  
-#define MAX_BIN 37450 // =(8^6-1)/7+1
+int bam_idxstats(int argc, char *argv[])
+{ // per target: print name, length and the two counters kept in the BAM_MAX_BIN meta bin; last line: coor-less count
+       bam_index_t *idx;
+       bam_header_t *header;
+       bamFile fp;
+       int i;
+       if (argc < 2) {
+               fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
+               return 1;
+       }
+       fp = bam_open(argv[1], "r");
+       if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+       header = bam_header_read(fp);
+       bam_close(fp);
+       idx = bam_index_load(argv[1]);
+       if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); bam_header_destroy(header); return 1; }
+       for (i = 0; i < idx->n; ++i) {
+               khint_t k;
+               khash_t(i) *h = idx->index[i];
+               printf("%s\t%u", header->target_name[i], (unsigned)header->target_len[i]); // target_len is uint32_t, so print it unsigned
+               k = kh_get(i, h, BAM_MAX_BIN);
+               if (k != kh_end(h) && kh_val(h, k).n >= 2) // bam_index_core() stores (n_mapped, n_unmapped) as the bin's second entry
+                       printf("\t%llu\t%llu", (unsigned long long)kh_val(h, k).list[1].u, (unsigned long long)kh_val(h, k).list[1].v);
+               else printf("\t0\t0"); // no complete meta bin for this target
+               putchar('\n');
+       }
+       printf("*\t0\t0\t%llu\n", (unsigned long long)idx->n_no_coor); // %llu needs an unsigned long long argument
+       bam_header_destroy(header);
+       bam_index_destroy(idx);
+       return 0;
+}
  
-static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
  {
         int i = 0, k;
+       if (beg >= end) return 0;
+       if (end >= 1u<<29) end = 1u<<29;
         --end;
         list[i++] = 0;
         for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
@@ -476,8 +553,15 @@ static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
         return (rend > beg && rbeg < end);
  }
  
+struct __bam_iter_t { // state carried between successive bam_iter_read() calls
+       int from_first; // read from the first record; no random access
+       int tid, beg, end, n_off, i, finished; // queried region; number of chunks in off[]; index of the current chunk (-1 before the first); set once reading is over
+       uint64_t curr_off; // bam_tell() position after the last seek or read; 0 until the first seek
+       pair64_t *off; // chunk list filled in by bam_iter_query(); released by bam_iter_destroy()
+};
+
  // bam_fetch helper function retrieves 
-pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off)
+bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
  {
         uint16_t *bins;
         int i, n_bins, n_off;
@@ -485,17 +569,34 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e
         khint_t k;
         khash_t(i) *index;
         uint64_t min_off;
-
-       bins = (uint16_t*)calloc(MAX_BIN, 2);
+       bam_iter_t iter = 0;
+
+       if (beg < 0) beg = 0;
+       if (end < beg) return 0;
+       // initialize iter
+       iter = calloc(1, sizeof(struct __bam_iter_t));
+       iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
+       //
+       bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
         n_bins = reg2bins(beg, end, bins);
         index = idx->index[tid];
-       min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+       if (idx->index2[tid].n > 0) {
+               min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+                       : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+               if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
+                       int n = beg>>BAM_LIDX_SHIFT;
+                       if (n > idx->index2[tid].n) n = idx->index2[tid].n;
+                       for (i = n - 1; i >= 0; --i)
+                               if (idx->index2[tid].offset[i] != 0) break;
+                       if (i >= 0) min_off = idx->index2[tid].offset[i];
+               }
+       } else min_off = 0; // tabix 0.1.2 may produce such index files
         for (i = n_off = 0; i < n_bins; ++i) {
                 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
                         n_off += kh_value(index, k).n;
         }
         if (n_off == 0) {
-               free(bins); return 0;
+               free(bins); return iter;
         }
         off = (pair64_t*)calloc(n_off, 16);
         for (i = n_off = 0; i < n_bins; ++i) {
@@ -534,41 +635,62 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e
                 }
                 bam_destroy1(b);
         }
-       *cnt_off = n_off;
+       iter->n_off = n_off; iter->off = off;
+       return iter;
+}
+
+pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
+{ // for pysam compatibility: returns the chunk list (caller frees it) and stores its length in *cnt_off
+       pair64_t *off;
+       bam_iter_t iter = bam_iter_query(idx, tid, beg, end);
+       if (iter == 0) { *cnt_off = 0; return 0; } // bam_iter_query() yields 0 when end < beg: report no chunks
+       off = iter->off; *cnt_off = iter->n_off;
+       free(iter); // only the iterator shell is freed; ownership of off passes to the caller
         return off;
  }
  
-int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+void bam_iter_destroy(bam_iter_t iter)
  {
-       int n_off;
-       pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off);
-       if (off == 0) return 0;
-       {
-               // retrive alignments
-               uint64_t curr_off;
-               int i, ret, n_seeks;
-               n_seeks = 0; i = -1; curr_off = 0;
-               bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
-               for (;;) {
-                       if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
-                               if (i == n_off - 1) break; // no more chunks
-                               if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
-                               if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
-                                       bam_seek(fp, off[i+1].u, SEEK_SET);
-                                       curr_off = bam_tell(fp);
-                                       ++n_seeks;
-                               }
-                               ++i;
+       if (iter) { free(iter->off); free(iter); }
+}
+
+int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
+{ // read the next record overlapping the iterator's region into b; returns bam_read1()'s result, or -1 when done
+       if (iter == 0 || iter->finished) return -1; // bam_iter_query() returns 0 for an invalid region (end < beg)
+       if (iter->from_first) {
+               int ret = bam_read1(fp, b);
+               if (ret < 0) iter->finished = 1;
+               return ret;
+       }
+       if (iter->off == 0) return -1;
+       for (;;) {
+               int ret;
+               if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
+                       if (iter->i == iter->n_off - 1) break; // no more chunks
+                       if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
+                       if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
+                               bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
+                               iter->curr_off = bam_tell(fp);
                         }
-                       if ((ret = bam_read1(fp, b)) > 0) {
-                               curr_off = bam_tell(fp);
-                               if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
-                               else if (is_overlap(beg, end, b)) func(b, data);
-                       } else break; // end of file
+                       ++iter->i;
                 }
-//             fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
-               bam_destroy1(b);
+               if ((ret = bam_read1(fp, b)) > 0) {
+                       iter->curr_off = bam_tell(fp);
+                       if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed
+                       else if (is_overlap(iter->beg, iter->end, b)) return ret;
+               } else break; // end of file
         }
-       free(off);
+       iter->finished = 1;
+       return -1;
+}
+
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{ // call func(b, data) on every record bam_iter_read() yields for the region on tid; always returns 0
+       bam_iter_t iter;
+       bam1_t *b;
+       b = bam_init1();
+       iter = bam_iter_query(idx, tid, beg, end); // 0 when end < beg
+       while (iter != 0 && bam_iter_read(fp, iter, b) >= 0) func(b, data); // never hand a null iterator to bam_iter_read()
+       bam_iter_destroy(iter); bam_destroy1(b); // release the iterator and its chunk list as well as the record
         return 0;
  }
diff --git a/bam_maqcns.c b/bam_maqcns.c

index 71c2185df0f27d53f74192d1d90625ca7ccb14ac..cad63d772268db289467c6a4fd625e50b7ac3889 100644 (file)
--- a/bam_maqcns.c
+++ b/bam_maqcns.c
@@ -310,6 +310,7 @@ bam_maqindel_opt_t *bam_maqindel_opt_init()
         bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
         mi->q_indel = 40;
         mi->r_indel = 0.00015;
+       mi->r_snp = 0.001;
         //
         mi->mm_penalty = 3;
         mi->indel_err = 4;
@@ -406,7 +407,8 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
         }
         { // the core part
                 char *ref2, *rs, *inscns = 0;
-               int k, l, *score, *pscore, max_ins = types[n_types-1];
+               int qr_snp, k, l, *score, *pscore, max_ins = types[n_types-1];
+               qr_snp = (int)(-4.343 * log(mi->r_snp) + .499);
                 if (max_ins > 0) { // get the consensus of inserted sequences
                         int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
                         // count occurrences
@@ -446,12 +448,18 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                 for (i = 0; i < n_types; ++i) {
                         ka_param_t ap = ka_param_blast;
                         ap.band_width = 2 * types[n_types - 1] + 2;
+                       ap.gap_end = 0;
                         // write ref2
                         for (k = 0, j = left; j <= pos; ++j)
                                 ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
                         if (types[i] <= 0) j += -types[i];
                         else for (l = 0; l < types[i]; ++l)
                                          ref2[k++] = bam_nt16_nt4_table[(int)inscns[i*max_ins + l]];
+                       if (types[0] < 0) { // mask deleted sequences
+                               int jj, tmp = types[i] >= 0? -types[0] : -types[0] + types[i];
+                               for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j)
+                                       ref2[k++] = 4;
+                       }
                         for (; j < right && ref[j]; ++j)
                                 ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
                         if (j < right) right = j;
@@ -482,22 +490,27 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                                 if (op == BAM_CMATCH) {
                                                         int k;
                                                         for (k = 0; k < len; ++k)
-                                                               if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k];
+                                                               if (ref2[x+k] != rs[y+k] && ref2[x+k] < 4)
+                                                                       ps += bam1_qual(p->b)[y+k] < qr_snp? bam1_qual(p->b)[y+k] : qr_snp;
                                                         x += len; y += len;
                                                 } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
-                                                       if (op == BAM_CINS) ps += mi->q_indel * len;
+                                                       if (op == BAM_CINS && l > 0 && l < n_acigar - 1) ps += mi->q_indel * len;
                                                         y += len;
                                                 } else if (op == BAM_CDEL) {
-                                                       ps += mi->q_indel * len;
+                                                       if (l > 0 && l < n_acigar - 1) ps += mi->q_indel * len;
                                                         x += len;
                                                 }
                                         }
                                         pscore[i*n+j] = ps;
-                                       /*if (pos == 2618517) { // for debugging only
-                                               fprintf(stderr, "pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend);
-                                               for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); fprintf(stderr, "\n");
-                                               for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l]], stderr); fputc('\n', stderr);
-                                               for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); fputc('\n', stderr);
+                                       /*if (1) { // for debugging only
+                                               fprintf(stderr, "id=%d, pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, %d, ",
+                                                               j, pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend, mi->q_indel);
+                                               for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]);
+                                               fprintf(stderr, "\n");
+                                               for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l+tbeg-left]], stderr);
+                                               fputc('\n', stderr);
+                                               for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr);
+                                               fputc('\n', stderr);
                                                 }*/
                                         free(acigar);
                                 }
@@ -560,7 +573,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                 ret->gl[0] = ret->gl[1] = 0;
                                 for (j = 0; j < n; ++j) {
                                         int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
-                                       //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2);
+                                       //fprintf(stderr, "id=%d, %d, %d, %d, %d, %d\n", j, pl[j].b->core.pos+1, types[max1_i], types[max2_i], s1, s2);
                                         if (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err;
                                         else ret->gl[1] += s2 - s1 < seq_err? s2 - s1 : seq_err;
                                 }
diff --git a/bam_maqcns.h b/bam_maqcns.h

index fa5489d13caad7c5c20489091302bfb5b2a5799d..6cc5355fc2db2b1b19e6a3d2d330306a240fa15a 100644 (file)
--- a/bam_maqcns.h
+++ b/bam_maqcns.h
@@ -16,8 +16,9 @@ typedef struct {
  } bam_maqcns_t;
  
  typedef struct {
-       int q_indel;
-       float r_indel;
+       int q_indel; // indel sequencing error, phred scaled
+       float r_indel; // indel prior
+       float r_snp; // snp prior
         // hidden parameters, unchangeable from command line
         int mm_penalty, indel_err, ambi_thres;
  } bam_maqindel_opt_t;
diff --git a/bam_md.c b/bam_md.c

index 3ca730993cbc01065eb2c76bd66df1640808e032..17b0a4a4fa6943313be1b6d78b18c5255fe44982 100644 (file)
--- a/bam_md.c
+++ b/bam_md.c
@@ -6,7 +6,7 @@
  #include "sam.h"
  #include "kstring.h"
  
-void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm)
  {
         uint8_t *seq = bam1_seq(b);
         uint32_t *cigar = bam1_cigar(b);
@@ -53,6 +53,26 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
                 }
         }
         ksprintf(str, "%d", u);
+       // apply max_nm
+       if (max_nm > 0 && nm >= max_nm) {
+               for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+                       int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+                       if (op == BAM_CMATCH) {
+                               for (j = 0; j < l; ++j) {
+                                       int z = y + j;
+                                       int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+                                       if (ref[x+j] == 0) break; // out of boundary
+                                       if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+                                               seq[z/2] |= (z&1)? 0x0f : 0xf0;
+                                               bam1_qual(b)[z] = 0;
+                                       }
+                               }
+                               if (j < l) break;
+                               x += l; y += l;
+                       } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
+                       else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+               }
+       }
         // update NM
         old_nm = bam_aux_get(b, "NM");
         if (c->flag & BAM_FUNMAP) return;
@@ -83,9 +103,14 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
         free(str->s); free(str);
  }
  
+void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+{
+       bam_fillmd1_core(b, ref, is_equal, 0);
+}
+
  int bam_fillmd(int argc, char *argv[])
  {
-       int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed;
+       int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0;
         samfile_t *fp, *fpout = 0;
         faidx_t *fai;
         char *ref = 0, mode_w[8], mode_r[8];
@@ -94,12 +119,13 @@ int bam_fillmd(int argc, char *argv[])
         is_bam_out = is_sam_in = is_uncompressed = 0;
         mode_w[0] = mode_r[0] = 0;
         strcpy(mode_r, "r"); strcpy(mode_w, "w");
-       while ((c = getopt(argc, argv, "eubS")) >= 0) {
+       while ((c = getopt(argc, argv, "eubSn:")) >= 0) {
                 switch (c) {
                 case 'e': is_equal = 1; break;
                 case 'b': is_bam_out = 1; break;
                 case 'u': is_uncompressed = is_bam_out = 1; break;
                 case 'S': is_sam_in = 1; break;
+               case 'n': max_nm = atoi(optarg); break;
                 default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
                 }
         }
@@ -136,7 +162,7 @@ int bam_fillmd(int argc, char *argv[])
                                         fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
                                                         fp->header->target_name[tid]);
                         }
-                       if (ref) bam_fillmd1(b, ref, is_equal);
+                       if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm);
                 }
                 samwrite(fpout, b);
         }
diff --git a/bam_pileup.c b/bam_pileup.c

index f68f400ab5cd086dc88a33e14899abe6cd4e53bb..3c41a169127fcc5d3977df3e8febd944b7ced77e 100644 (file)
--- a/bam_pileup.c
+++ b/bam_pileup.c
@@ -73,18 +73,28 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
                                 p->qpos = y + (pos - x);
                                 if (x == pos && is_restart) p->is_head = 1;
                                 if (x + l - 1 == pos) { // come to the end of a match
-                                       if (k < c->n_cigar - 1) { // there are additional operation(s)
+                                       int has_next_match = 0;
+                                       unsigned i;
+                                       for (i = k + 1; i < c->n_cigar; ++i) {
+                                               uint32_t cigar = bam1_cigar(b)[i];
+                                               int opi = cigar&BAM_CIGAR_MASK;
+                                               if (opi == BAM_CMATCH) {
+                                                       has_next_match = 1;
+                                                       break;
+                                               } else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break;
+                                       }
+                                       if (!has_next_match) p->is_tail = 1;
+                                       if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s)
                                                 uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
                                                 int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
                                                 if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
                                                 else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
-                                               if (op_next == BAM_CDEL || op_next == BAM_CINS) {
-                                                       if (k + 2 < c->n_cigar) op_next = bam1_cigar(b)[k+2]&BAM_CIGAR_MASK;
-                                                       else p->is_tail = 1;
+                                               else if (op_next == BAM_CPAD && k + 2 < c->n_cigar) { // not working for adjacent padding
+                                                       cigar = bam1_cigar(b)[k+2]; op_next = cigar&BAM_CIGAR_MASK;
+                                                       if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
+                                                       else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
                                                 }
-                                               if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
-                                                       p->is_tail = 1; // tail
-                                       } else p->is_tail = 1; // this is the last operation; set tail
+                                       }
                                 }
                         }
                         x += l; y += l;
@@ -96,7 +106,8 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
                         x += l;
                 } else if (op == BAM_CREF_SKIP) x += l;
                 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
-               is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
+               if (is_restart) is_restart ^= (op == BAM_CMATCH);
+               else is_restart ^= (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
                 if (x > pos) {
                         if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
                         break;
@@ -108,119 +119,167 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
  
  /* --- END: Auxiliary functions */
  
-struct __bam_plbuf_t {
+/*******************
+ * pileup iterator *
+ *******************/
+
+struct __bam_plp_t {
         mempool_t *mp;
         lbnode_t *head, *tail, *dummy;
-       bam_pileup_f func;
-       void *func_data;
         int32_t tid, pos, max_tid, max_pos;
-       int max_pu, is_eof;
-       bam_pileup1_t *pu;
-       int flag_mask;
+       int is_eof, flag_mask, max_plp, error;
+       bam_pileup1_t *plp;
+       // for the "auto" interface only
+       bam1_t *b;
+       bam_plp_auto_f func;
+       void *data;
  };
  
-void bam_plbuf_reset(bam_plbuf_t *buf)
+bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
  {
-       lbnode_t *p, *q;
-       buf->max_tid = buf->max_pos = -1;
-       buf->tid = buf->pos = 0;
-       buf->is_eof = 0;
-       for (p = buf->head; p->next;) {
-               q = p->next;
-               mp_free(buf->mp, p);
-               p = q;
+       bam_plp_t iter;
+       iter = calloc(1, sizeof(struct __bam_plp_t));
+       iter->mp = mp_init();
+       iter->head = iter->tail = mp_alloc(iter->mp);
+       iter->dummy = mp_alloc(iter->mp);
+       iter->max_tid = iter->max_pos = -1;
+       iter->flag_mask = BAM_DEF_MASK;
+       if (func) {
+               iter->func = func;
+               iter->data = data;
+               iter->b = bam_init1();
         }
-       buf->head = buf->tail;
+       return iter;
  }
  
-void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
-{
-       if (mask < 0) buf->flag_mask = BAM_DEF_MASK;
-       else buf->flag_mask = BAM_FUNMAP | mask;
-}
-
-bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+void bam_plp_destroy(bam_plp_t iter)
  {
-       bam_plbuf_t *buf;
-       buf = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t));
-       buf->func = func; buf->func_data = data;
-       buf->mp = mp_init();
-       buf->head = buf->tail = mp_alloc(buf->mp);
-       buf->dummy = mp_alloc(buf->mp);
-       buf->max_tid = buf->max_pos = -1;
-       buf->flag_mask = BAM_DEF_MASK;
-       return buf;
+       mp_free(iter->mp, iter->dummy);
+       mp_free(iter->mp, iter->head);
+       if (iter->mp->cnt != 0)
+               fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
+       mp_destroy(iter->mp);
+       if (iter->b) bam_destroy1(iter->b);
+       free(iter->plp);
+       free(iter);
  }
  
-void bam_plbuf_destroy(bam_plbuf_t *buf)
+const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
  {
-       mp_free(buf->mp, buf->dummy);
-       mp_free(buf->mp, buf->head);
-       if (buf->mp->cnt != 0)
-               fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", buf->mp->cnt);
-       mp_destroy(buf->mp);
-       free(buf->pu);
-       free(buf);
+       if (iter->error) { *_n_plp = -1; return 0; }
+       *_n_plp = 0;
+       if (iter->is_eof && iter->head->next == 0) return 0;
+       while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
+               int n_plp = 0;
+               lbnode_t *p, *q;
+               // write iter->plp at iter->pos
+               iter->dummy->next = iter->head;
+               for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
+                       if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
+                               q->next = p->next; mp_free(iter->mp, p); p = q;
+                       } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
+                               if (n_plp == iter->max_plp) { // then double the capacity
+                                       iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
+                                       iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
+                               }
+                               iter->plp[n_plp].b = &p->b;
+                               if (resolve_cigar(iter->plp + n_plp, iter->pos)) ++n_plp; // skip the read if we are looking at ref-skip
+                       }
+               }
+               iter->head = iter->dummy->next; // dummy->next may be changed
+               *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
+               // update iter->tid and iter->pos
+               if (iter->head->next) {
+                       if (iter->tid > iter->head->b.core.tid) {
+                               fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
+                               iter->error = 1;
+                               *_n_plp = -1;
+                               return 0;
+                       }
+               }
+               if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
+                       iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
+               } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
+                       iter->pos = iter->head->beg; // jump to the next position
+               } else ++iter->pos; // scan contiguously
+               // return
+               if (n_plp) return iter->plp;
+               if (iter->is_eof && iter->head->next == 0) break;
+       }
+       return 0;
  }
  
-int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+int bam_plp_push(bam_plp_t iter, const bam1_t *b)
  {
-       if (b) { // fill buffer
+       if (iter->error) return -1;
+       if (b) {
                 if (b->core.tid < 0) return 0;
-               if (b->core.flag & buf->flag_mask) return 0;
-               bam_copy1(&buf->tail->b, b);
-               buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
-               if (b->core.tid < buf->max_tid) {
+               if (b->core.flag & iter->flag_mask) return 0;
+               bam_copy1(&iter->tail->b, b);
+               iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
+               if (b->core.tid < iter->max_tid) {
                         fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+                       iter->error = 1;
                         return -1;
                 }
-               if ((b->core.tid == buf->max_tid) && (buf->tail->beg < buf->max_pos)) {
+               if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
                         fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+                       iter->error = 1;
                         return -1;
                 }
-               buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
-               if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) {
-                       buf->tail->next = mp_alloc(buf->mp);
-                       buf->tail = buf->tail->next;
-               }
-       } else buf->is_eof = 1;
-       while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) {
-               int n_pu = 0;
-               lbnode_t *p, *q;
-               buf->dummy->next = buf->head;
-               for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) {
-                       if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
-                               q->next = p->next; mp_free(buf->mp, p); p = q;
-                       } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
-                               if (n_pu == buf->max_pu) { // then double the capacity
-                                       buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
-                                       buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
-                               }
-                               buf->pu[n_pu].b = &p->b;
-                               if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
-                       }
+               iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
+               if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
+                       iter->tail->next = mp_alloc(iter->mp);
+                       iter->tail = iter->tail->next;
                 }
-               buf->head = buf->dummy->next; // dummy->next may be changed
-               if (n_pu) { // then call user defined function
-                       buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data);
-               }
-               // update tid and pos
-               if (buf->head->next) {
-                       if (buf->tid > buf->head->b.core.tid) {
-                               fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n");
-                               return 1;
+       } else iter->is_eof = 1;
+       return 0;
+}
+
+const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+       const bam_pileup1_t *plp;
+       if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
+       if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
+       else {
+               *_n_plp = 0;
+               if (iter->is_eof) return 0;
+               while (iter->func(iter->data, iter->b) >= 0) {
+                       if (bam_plp_push(iter, iter->b) < 0) {
+                               *_n_plp = -1;
+                               return 0;
                         }
+                       if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
                 }
-               if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
-                       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
-               } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
-                       buf->pos = buf->head->beg; // jump to the next position
-               } else ++buf->pos; // scan contiguously
-               if (buf->is_eof && buf->head->next == 0) break;
+               bam_plp_push(iter, 0);
+               if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
+               return 0;
         }
-       return 0;
  }
  
+void bam_plp_reset(bam_plp_t iter)
+{
+       lbnode_t *p, *q;
+       iter->max_tid = iter->max_pos = -1;
+       iter->tid = iter->pos = 0;
+       iter->is_eof = 0;
+       for (p = iter->head; p->next;) {
+               q = p->next;
+               mp_free(iter->mp, p);
+               p = q;
+       }
+       iter->head = iter->tail;
+}
+
+void bam_plp_set_mask(bam_plp_t iter, int mask)
+{
+       iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask);
+}
+
+/*****************
+ * callback APIs *
+ *****************/
+
  int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
  {
         bam_plbuf_t *buf;
@@ -236,3 +295,102 @@ int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
         bam_destroy1(b);
         return 0;
  }
+
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+       bam_plp_set_mask(buf->iter, mask);
+}
+
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{
+       bam_plp_reset(buf->iter);
+}
+
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+       bam_plbuf_t *buf;
+       buf = calloc(1, sizeof(bam_plbuf_t));
+       buf->iter = bam_plp_init(0, 0);
+       buf->func = func;
+       buf->data = data;
+       return buf;
+}
+
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+       bam_plp_destroy(buf->iter);
+       free(buf);
+}
+
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{
+       int ret, n_plp, tid, pos;
+       const bam_pileup1_t *plp;
+       ret = bam_plp_push(buf->iter, b);
+       if (ret < 0) return ret;
+       while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0)
+               buf->func(tid, pos, n_plp, plp, buf->data);
+       return 0;
+}
+
+/***********
+ * mpileup *
+ ***********/
+
+struct __bam_mplp_t {
+       int n;
+       uint64_t min, *pos;
+       bam_plp_t *iter;
+       int *n_plp;
+       const bam_pileup1_t **plp;
+};
+
+bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+{
+       int i;
+       bam_mplp_t iter;
+       iter = calloc(1, sizeof(struct __bam_mplp_t));
+       iter->pos = calloc(n, 8);
+       iter->n_plp = calloc(n, sizeof(int));
+       iter->plp = calloc(n, sizeof(void*));
+       iter->iter = calloc(n, sizeof(void*));
+       iter->n = n;
+       iter->min = (uint64_t)-1;
+       for (i = 0; i < n; ++i) {
+               iter->iter[i] = bam_plp_init(func, data[i]);
+               iter->pos[i] = iter->min;
+       }
+       return iter;
+}
+
+void bam_mplp_destroy(bam_mplp_t iter)
+{
+       int i;
+       for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
+       free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
+       free(iter);
+}
+
+int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+{
+       int i, ret = 0;
+       uint64_t new_min = (uint64_t)-1;
+       for (i = 0; i < iter->n; ++i) {
+               if (iter->pos[i] == iter->min) {
+                       int tid, pos;
+                       iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
+                       iter->pos[i] = (uint64_t)tid<<32 | pos;
+               }
+               if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
+       }
+       iter->min = new_min;
+       if (new_min == (uint64_t)-1) return 0;
+       *_tid = new_min>>32; *_pos = (uint32_t)new_min;
+       for (i = 0; i < iter->n; ++i) {
+               if (iter->pos[i] == iter->min) {
+                       n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
+                       ++ret;
+               } else n_plp[i] = 0, plp[i] = 0;
+       }
+       return ret;
+}
diff --git a/bam_plcmd.c b/bam_plcmd.c

index ba787a98bd78726503f48af1fd16582d89a357dd..6804795bd107405982cada8544051cc073fdf55e 100644 (file)
--- a/bam_plcmd.c
+++ b/bam_plcmd.c
@@ -18,6 +18,10 @@ KHASH_MAP_INIT_INT64(64, indel_list_t)
  #define BAM_PLF_GLF        0x08
  #define BAM_PLF_VAR_ONLY   0x10
  #define BAM_PLF_2ND        0x20
+#define BAM_PLF_RANBASE    0x40
+#define BAM_PLF_1STBASE    0x80
+#define BAM_PLF_ALLBASE    0x100
+#define BAM_PLF_READPOS    0x200
  
  typedef struct {
         bam_header_t *h;
@@ -28,6 +32,7 @@ typedef struct {
         uint32_t format;
         int tid, len, last_pos;
         int mask;
+    int max_depth;  // for indel calling, use at most this many reads per site; 0 for unlimited
         char *ref;
         glfFile fp_glf; // for glf output only
  } pu_data_t;
@@ -121,10 +126,11 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu,
         g3->offset = pos - d->last_pos;
         d->last_pos = pos;
         glf3_write1(d->fp_glf, g3);
-       if (pos < d->len) {
+    if (pos < d->len) {
+        int m = (!d->max_depth || d->max_depth>n) ? n : d->max_depth;
                 if (proposed_indels)
-                       r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
-               else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
+                       r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+               else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0);
         }
         if (r) { // then write indel line
                 int het = 3 * n, min;
@@ -152,11 +158,37 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu,
         return 0;
  }
  
+static void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
+{
+       if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
+       if (!p->is_del) {
+               int j, rb, c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+               rb = (ref && pos < ref_len)? ref[pos] : 'N';
+               if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+               else c = bam1_strand(p->b)? tolower(c) : toupper(c);
+               putchar(c);
+               if (p->indel > 0) {
+                       printf("+%d", p->indel);
+                       for (j = 1; j <= p->indel; ++j) {
+                               c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+                               putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                       }
+               } else if (p->indel < 0) {
+                       printf("%d", p->indel);
+                       for (j = 1; j <= -p->indel; ++j) {
+                               c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
+                               putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                       }
+               }
+       } else putchar('*');
+       if (p->is_tail) putchar('$');
+}
+
  static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
  {
         pu_data_t *d = (pu_data_t*)data;
         bam_maqindel_ret_t *r = 0;
-       int i, j, rb, rms_mapq = -1, *proposed_indels = 0;
+       int i, rb, rms_mapq = -1, *proposed_indels = 0;
         uint64_t rms_aux;
         uint32_t cns = 0;
  
@@ -171,7 +203,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
         // update d->ref if necessary
         if (d->fai && (int)tid != d->tid) {
                 free(d->ref);
-               d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+               d->ref = faidx_fetch_seq(d->fai, d->h->target_name[tid], 0, 0x7fffffff, &d->len);
                 d->tid = tid;
         }
         rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
@@ -182,12 +214,31 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
                 if (i == n) return 0;
         }
         // call the consensus and indel
-       if (d->format & BAM_PLF_CNS) // call consensus
-               cns = bam_maqcns_call(n, pu, d->c);
-       if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels
-               if (proposed_indels) // the first element gives the size of the array
-                       r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
-               else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
+       if (d->format & BAM_PLF_CNS) { // call consensus
+               if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE)) { // use a random base or the 1st base as the consensus call
+                       const bam_pileup1_t *p = (d->format & BAM_PLF_1STBASE)? pu : pu + (int)(drand48() * n);
+                       int q = bam1_qual(p->b)[p->qpos];
+                       int mapQ = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
+                       uint32_t b = bam1_seqi(bam1_seq(p->b), p->qpos);
+                       cns = b<<28 | 0xf<<24 | mapQ<<16 | q<<8;
+               } else if (d->format & BAM_PLF_ALLBASE) { // collapse all bases
+                       uint64_t rmsQ = 0;
+                       uint32_t b = 0;
+                       for (i = 0; i < n; ++i) {
+                               const bam_pileup1_t *p = pu + i;
+                               int q = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
+                               b |= bam1_seqi(bam1_seq(p->b), p->qpos);
+                               rmsQ += q * q;
+                       }
+                       rmsQ = (uint64_t)(sqrt((double)rmsQ / n) + .499);
+                       cns = b<<28 | 0xf<<24 | rmsQ<<16 | 60<<8;
+               } else cns = bam_maqcns_call(n, pu, d->c);
+       }
+    if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels
+        int m = (!d->max_depth || d->max_depth>n) ? n : d->max_depth;
+        if (proposed_indels) // the first element gives the size of the array
+            r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+        else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0);
         }
         // when only variant sites are asked for, test if the site is a variant
         if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) {
@@ -218,27 +269,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
                 const bam_pileup1_t *p = pu + i;
                 int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
                 rms_aux += tmp * tmp;
-               if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
-               if (!p->is_del) {
-                       int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
-                       if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
-                       else c = bam1_strand(p->b)? tolower(c) : toupper(c);
-                       putchar(c);
-                       if (p->indel > 0) {
-                               printf("+%d", p->indel);
-                               for (j = 1; j <= p->indel; ++j) {
-                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
-                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
-                               }
-                       } else if (p->indel < 0) {
-                               printf("%d", p->indel);
-                               for (j = 1; j <= -p->indel; ++j) {
-                                       c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
-                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
-                               }
-                       }
-               } else putchar('*');
-               if (p->is_tail) putchar('$');
+               pileup_seq(p, pos, d->len, d->ref);
         }
         // finalize rms_mapq
         rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499);
@@ -275,6 +306,15 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
                         putchar(c);
                 }
         }
+       // print read position
+       if (d->format & BAM_PLF_READPOS) {
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       int x = pu[i].qpos;
+                       int l = pu[i].b->core.l_qseq;
+                       printf("%d,", x < l/2? x+1 : -((l-1)-x+1));
+               }
+       }
         putchar('\n');
         // print the indel line if r has been calculated. This only happens if:
         // a) -c or -i are flagged, AND b) the reference sequence is available
@@ -298,29 +338,40 @@ int bam_pileup(int argc, char *argv[])
         int c, is_SAM = 0;
         char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
         pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
+    d->max_depth = 0;
         d->tid = -1; d->mask = BAM_DEF_MASK;
         d->c = bam_maqcns_init();
+       d->c->is_soap = 1; // change the default model
         d->ido = bam_maqindel_opt_init();
-       while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2a")) >= 0) {
+       while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PA")) >= 0) {
                 switch (c) {
                 case 'a': d->c->is_soap = 1; break;
+               case 'A': d->c->is_soap = 0; break;
                 case 's': d->format |= BAM_PLF_SIMPLE; break;
                 case 't': fn_list = strdup(optarg); break;
                 case 'l': fn_pos = strdup(optarg); break;
                 case 'f': fn_fa = strdup(optarg); break;
                 case 'T': d->c->theta = atof(optarg); break;
                 case 'N': d->c->n_hap = atoi(optarg); break;
-               case 'r': d->c->het_rate = atof(optarg); break;
+               case 'r': d->c->het_rate = atof(optarg); d->ido->r_snp = d->c->het_rate; break;
                 case 'M': d->c->cap_mapQ = atoi(optarg); break;
+               case 'd': d->max_depth = atoi(optarg); break;
                 case 'c': d->format |= BAM_PLF_CNS; break;
                 case 'i': d->format |= BAM_PLF_INDEL_ONLY; break;
                 case 'v': d->format |= BAM_PLF_VAR_ONLY; break;
                 case 'm': d->mask = strtol(optarg, 0, 0); break;
                 case 'g': d->format |= BAM_PLF_GLF; break;
                 case '2': d->format |= BAM_PLF_2ND; break;
+               case 'P': d->format |= BAM_PLF_READPOS; break;
                 case 'I': d->ido->q_indel = atoi(optarg); break;
                 case 'G': d->ido->r_indel = atof(optarg); break;
                 case 'S': is_SAM = 1; break;
+               case 'R':
+                       if (strcmp(optarg, "random") == 0) d->format |= BAM_PLF_RANBASE;
+                       else if (strcmp(optarg, "first") == 0) d->format |= BAM_PLF_1STBASE;
+                       else if (strcmp(optarg, "all") == 0) d->format |= BAM_PLF_ALLBASE;
+                       else fprintf(stderr, "[bam_pileup] unrecognized -R\n");
+                       break;
                 default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1;
                 }
         }
@@ -330,15 +381,16 @@ int bam_pileup(int argc, char *argv[])
                 fprintf(stderr, "Usage:  samtools pileup [options] <in.bam>|<in.sam>\n\n");
                 fprintf(stderr, "Option: -s        simple (yet incomplete) pileup format\n");
                 fprintf(stderr, "        -S        the input is in SAM\n");
-               fprintf(stderr, "        -a        use the SOAPsnp model for SNP calling\n");
+               fprintf(stderr, "        -A        use the MAQ model for SNP calling\n");
                 fprintf(stderr, "        -2        output the 2nd best call and quality\n");
                 fprintf(stderr, "        -i        only show lines/consensus with indels\n");
                 fprintf(stderr, "        -m INT    filtering reads with bits in INT [%d]\n", d->mask);
                 fprintf(stderr, "        -M INT    cap mapping quality at INT [%d]\n", d->c->cap_mapQ);
+        fprintf(stderr, "        -d INT    limit maximum depth for indels [unlimited]\n");
                 fprintf(stderr, "        -t FILE   list of reference sequences (force -S)\n");
                 fprintf(stderr, "        -l FILE   list of sites at which pileup is output\n");
                 fprintf(stderr, "        -f FILE   reference sequence in the FASTA format\n\n");
-               fprintf(stderr, "        -c        output the maq consensus sequence\n");
+               fprintf(stderr, "        -c        output the SOAPsnp consensus sequence\n");
                 fprintf(stderr, "        -v        print variants only (for -c)\n");
                 fprintf(stderr, "        -g        output in the GLFv3 format (suppressing -c/-i/-s)\n");
                 fprintf(stderr, "        -T FLOAT  theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
@@ -350,6 +402,7 @@ int bam_pileup(int argc, char *argv[])
                 free(fn_list); free(fn_fa); free(d);
                 return 1;
         }
+       if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE|BAM_PLF_ALLBASE)) d->format |= BAM_PLF_CNS;
         if (fn_fa) d->fai = fai_load(fn_fa);
         if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling
         if (d->format & BAM_PLF_GLF) { // for glf output
@@ -390,3 +443,128 @@ int bam_pileup(int argc, char *argv[])
         free(d->ido); free(d->ref); free(d);
         return 0;
  }
+
+/***********
+ * mpileup *
+ ***********/
+
+typedef struct { // options shared by all inputs of one mpileup run
+       char *reg; // region string handed to bam_parse_region(); NULL when -r was not given
+       faidx_t *fai; // FASTA index from fai_load(); NULL when -f was not given
+} mplp_conf_t;
+
+typedef struct { // per-input state passed to mplp_func() as its opaque data pointer
+       bamFile fp; // handle returned by bam_open() for this input
+       bam_iter_t iter; // bam_iter_query() result when a region is set; left zero by calloc otherwise
+} mplp_aux_t;
+
+/* Read callback given to bam_mplp_init(): fetch the next record of one input into b.
+ * `data` is that input's mplp_aux_t; the result of the underlying reader is returned as is. */
+static int mplp_func(void *data, bam1_t *b)
+{
+       mplp_aux_t *aux = (mplp_aux_t*)data;
+       // with a region iterator go through the index, otherwise stream the file sequentially
+       return aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
+}
+
+/* Print a multi-way pileup of the n BAM files named in fn[] to stdout.
+ * conf->reg, when set, restricts output to that region and requires an index for
+ * every input; conf->fai, when set, supplies the reference base column ('N' otherwise).
+ * Target names are taken from the header of the first input only.
+ * Returns 0; exits the process when an input, its header, its index or memory is unavailable. */
+static int mpileup(mplp_conf_t *conf, int n, char **fn)
+{
+       mplp_aux_t **data;
+       // ref_len starts at 0 so that it is never read uninitialised when no FASTA is given
+       int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid;
+       const bam_pileup1_t **plp;
+       bam_mplp_t iter;
+       bam_header_t *h = 0;
+       char *ref;
+       // allocate; sizes are tied to the pointee types (n_plp holds ints, not pointers)
+       data = calloc(n, sizeof(*data));
+       plp = calloc(n, sizeof(*plp));
+       n_plp = calloc(n, sizeof(*n_plp));
+       if (n > 0 && (data == 0 || plp == 0 || n_plp == 0)) {
+               fprintf(stderr, "[%s] out of memory.\n", __func__);
+               exit(1);
+       }
+       // read the header and initialize data
+       for (i = 0; i < n; ++i) {
+               bam_header_t *h_tmp;
+               data[i] = calloc(1, sizeof(mplp_aux_t));
+               if (data[i] == 0) {
+                       fprintf(stderr, "[%s] out of memory.\n", __func__);
+                       exit(1);
+               }
+               data[i]->fp = bam_open(fn[i], "r");
+               if (data[i]->fp == 0) {
+                       fprintf(stderr, "[%s] fail to open %d-th input.\n", __func__, i+1);
+                       exit(1);
+               }
+               h_tmp = bam_header_read(data[i]->fp);
+               if (h_tmp == 0) {
+                       fprintf(stderr, "[%s] fail to read the header of %d-th input.\n", __func__, i+1);
+                       exit(1);
+               }
+               if (conf->reg) {
+                       int beg, end;
+                       bam_index_t *idx;
+                       idx = bam_index_load(fn[i]);
+                       if (idx == 0) {
+                               fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
+                               exit(1);
+                       }
+                       if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
+                               fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
+                               exit(1);
+                       }
+                       if (i == 0) beg0 = beg, end0 = end; // the first input defines the output window
+                       data[i]->iter = bam_iter_query(idx, tid, beg, end);
+                       bam_index_destroy(idx);
+               }
+               if (i == 0) h = h_tmp;
+               else {
+                       // FIXME: to check consistency
+                       bam_header_destroy(h_tmp);
+               }
+       }
+       // mpileup
+       ref_tid = -1; ref = 0;
+       iter = bam_mplp_init(n, mplp_func, (void**)data);
+       while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
+               if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
+               if (tid != ref_tid) { // moved to another target: swap the cached reference sequence
+                       free(ref);
+                       if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len);
+                       ref_tid = tid;
+               }
+               printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
+               for (i = 0; i < n; ++i) {
+                       int j;
+                       printf("\t%d\t", n_plp[i]);
+                       if (n_plp[i] == 0) printf("*\t*");
+                       else {
+                               for (j = 0; j < n_plp[i]; ++j)
+                                       pileup_seq(plp[i] + j, pos, ref_len, ref);
+                               putchar('\t');
+                               for (j = 0; j < n_plp[i]; ++j) {
+                                       const bam_pileup1_t *p = plp[i] + j;
+                                       int c = bam1_qual(p->b)[p->qpos] + 33;
+                                       if (c > 126) c = 126; // keep the quality character printable
+                                       putchar(c);
+                               }
+                       }
+               }
+               putchar('\n');
+       }
+       bam_mplp_destroy(iter);
+       bam_header_destroy(h);
+       for (i = 0; i < n; ++i) {
+               bam_close(data[i]->fp);
+               if (data[i]->iter) bam_iter_destroy(data[i]->iter);
+               free(data[i]);
+       }
+       free(data); free(plp); free(ref); free(n_plp);
+       return 0;
+}
+
+/* Entry point of `samtools mpileup`: parse -f (FASTA, loaded with fai_load) and
+ * -r (region), then run mpileup() over the remaining arguments.
+ * Returns 1 on a usage error or when the FASTA index cannot be loaded,
+ * otherwise the value returned by mpileup(). */
+int bam_mpileup(int argc, char *argv[])
+{
+       int c, ret = 1;
+       mplp_conf_t mplp;
+       memset(&mplp, 0, sizeof(mplp_conf_t));
+       while ((c = getopt(argc, argv, "f:r:")) >= 0) {
+               switch (c) {
+               case 'f':
+                       if (mplp.fai) fai_destroy(mplp.fai); // a repeated -f replaces the earlier index
+                       mplp.fai = fai_load(optarg);
+                       if (mplp.fai == 0) goto end;
+                       break;
+               case 'r':
+                       free(mplp.reg); // a repeated -r replaces the earlier region
+                       mplp.reg = strdup(optarg);
+                       break;
+               }
+       }
+       if (optind >= argc) { // no input file is left once the options are consumed
+               fprintf(stderr, "Usage: samtools mpileup [-r reg] [-f in.fa] in1.bam [in2.bam [...]]\n");
+               goto end;
+       }
+       ret = mpileup(&mplp, argc - optind, argv + optind);
+end:
+       // single exit so the region string and the FASTA index are released on every path
+       free(mplp.reg);
+       if (mplp.fai) fai_destroy(mplp.fai);
+       return ret;
+}
diff --git a/bam_reheader.c b/bam_reheader.c

new file mode 100644 (file)

index 0000000..bae97c7
--- /dev/null
+++ b/bam_reheader.c
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "bgzf.h"
+#include "bam.h"
+
+#define BUF_SIZE 0x10000
+
+/* Write header h followed by the alignment data of `in` to file descriptor fd.
+ * The old header of `in` is read and discarded; what is left of the current
+ * uncompressed block is re-compressed, and the rest of the input is copied
+ * through as raw compressed bytes.
+ * Returns 0 on success, -1 if `in` is not open for reading or on an
+ * allocation, open, write or close failure. */
+int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
+{
+       BGZF *fp;
+       bam_header_t *old;
+       FILE *out;
+       int len, ret = 0;
+       uint8_t *buf;
+       if (in->open_mode != 'r') return -1;
+       buf = malloc(BUF_SIZE);
+       if (buf == 0) return -1;
+       old = bam_header_read(in); // only needed to move `in` past its header
+       if (old) bam_header_destroy(old);
+       fp = bgzf_fdopen(fd, "w");
+       if (fp == 0) {
+               free(buf);
+               return -1;
+       }
+       bam_header_write(fp, h);
+       if (in->block_offset < in->block_length) {
+               bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
+               bgzf_flush(fp);
+       }
+       // the output stream member is named differently in the two builds, like the input one
+#ifdef _USE_KNETFILE
+       out = fp->x.fpw;
+       while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) {
+#else
+       out = fp->file;
+       while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
+#endif
+               if (fwrite(buf, 1, len, out) != (size_t)len) { // short write
+                       ret = -1;
+                       break;
+               }
+       }
+       free(buf);
+       fp->block_offset = in->block_offset = 0;
+       if (bgzf_close(fp) != 0) ret = -1;
+       return ret;
+}
+
+/* Entry point of `samtools reheader <in.header.sam> <in.bam>`: read a header from
+ * the SAM file and write it, followed by the alignments of in.bam ("-" for stdin),
+ * to stdout. Returns 0 on success and 1 on a usage, open, parse or write error. */
+int main_reheader(int argc, char *argv[])
+{
+       bam_header_t *h;
+       BGZF *in;
+       int ret;
+       if (argc != 3) {
+               fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
+               return 1;
+       }
+       { // read the header
+               tamFile fph = sam_open(argv[1]);
+               if (fph == 0) {
+                       fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+                       return 1;
+               }
+               h = sam_header_read(fph);
+               sam_close(fph);
+       }
+       if (h == 0) {
+               fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+               return 1;
+       }
+       in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r");
+       if (in == 0) {
+               fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]);
+               bam_header_destroy(h);
+               return 1;
+       }
+       ret = bam_reheader(in, h, fileno(stdout)) < 0? 1 : 0; // report a failed rewrite to the shell
+       bgzf_close(in);
+       bam_header_destroy(h);
+       return ret;
+}
diff --git a/bam_sort.c b/bam_sort.c

index 9884f3d6273f680709ee0e44532a8371413fddf4..12b1b5455280e8d8a788a82929844083a4efd084 100644 (file)
--- a/bam_sort.c
+++ b/bam_sort.c
@@ -294,7 +294,7 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size
                 mem += ret;
                 ++k;
                 if (mem >= max_mem) {
-                       sort_blocks(n++, k, buf, prefix, header, is_stdout);
+                       sort_blocks(n++, k, buf, prefix, header, 0);
                         mem = 0; k = 0;
                 }
         }
@@ -304,7 +304,7 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size
         else { // then merge
                 char **fns, *fnout;
                 fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1);
-               sort_blocks(n++, k, buf, prefix, header, is_stdout);
+               sort_blocks(n++, k, buf, prefix, header, 0);
                 fnout = (char*)calloc(strlen(prefix) + 20, 1);
                 if (is_stdout) sprintf(fnout, "-");
                 else sprintf(fnout, "%s.bam", prefix);
diff --git a/bam_tview.c b/bam_tview.c

index 4c121e7cbfa012ac0c3fa61d574f03c893e6d271..7b326fc40e7bf276b7adc3848c0667d5628a68ad 100644 (file)
--- a/bam_tview.c
+++ b/bam_tview.c
@@ -280,7 +280,7 @@ int tv_draw_aln(tview_t *tv, int tid, int pos)
  
  static void tv_win_goto(tview_t *tv, int *tid, int *pos)
  {
-       char str[256];
+       char str[256], *p;
         int i, l = 0;
         wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
         mvwprintw(tv->wgoto, 1, 2, "Goto: ");
@@ -291,10 +291,18 @@ static void tv_win_goto(tview_t *tv, int *tid, int *pos)
                         --l;
                 } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
                         int _tid = -1, _beg, _end;
-                       bam_parse_region(tv->header, str, &_tid, &_beg, &_end);
-                       if (_tid >= 0) {
-                               *tid = _tid; *pos = _beg;
-                               return;
+                       if (str[0] == '=') {
+                               _beg = strtol(str+1, &p, 10);
+                               if (_beg > 0) {
+                                       *pos = _beg;
+                                       return;
+                               }
+                       } else {
+                               bam_parse_region(tv->header, str, &_tid, &_beg, &_end);
+                               if (_tid >= 0) {
+                                       *tid = _tid; *pos = _beg;
+                                       return;
+                               }
                         }
                 } else if (isgraph(c)) {
                         if (l < TV_MAX_GOTO) str[l++] = c;
@@ -351,6 +359,7 @@ void tv_loop(tview_t *tv)
                         case '?': tv_win_help(tv); break;
                         case '\033':
                         case 'q': goto end_loop;
+                       case '/': 
                         case 'g': tv_win_goto(tv, &tid, &pos); break;
                         case 'm': tv->color_for = TV_COLOR_MAPQ; break;
                         case 'b': tv->color_for = TV_COLOR_BASEQ; break;
diff --git a/bamtk.c b/bamtk.c

index 48ac76b911ccd532cf811562991d1919b4e56059..94c4d3fdb34be8698e21e9cf20166cadd1ae06c8 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,11 +9,12 @@
  #endif
  
  #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.7 (r510)"
+#define PACKAGE_VERSION "0.1.8 (r613)"
  #endif
  
  int bam_taf2baf(int argc, char *argv[]);
  int bam_pileup(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
  int bam_merge(int argc, char *argv[]);
  int bam_index(int argc, char *argv[]);
  int bam_sort(int argc, char *argv[]);
@@ -22,9 +23,10 @@ int bam_mating(int argc, char *argv[]);
  int bam_rmdup(int argc, char *argv[]);
  int bam_flagstat(int argc, char *argv[]);
  int bam_fillmd(int argc, char *argv[]);
-
+int bam_idxstats(int argc, char *argv[]);
  int main_samview(int argc, char *argv[]);
  int main_import(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
  
  int faidx_main(int argc, char *argv[]);
  int glf3_view_main(int argc, char *argv[]);
@@ -78,17 +80,20 @@ static int usage()
         fprintf(stderr, "Command: view        SAM<->BAM conversion\n");
         fprintf(stderr, "         sort        sort alignment file\n");
         fprintf(stderr, "         pileup      generate pileup output\n");
+       fprintf(stderr, "         mpileup     multi-way pileup\n");
         fprintf(stderr, "         faidx       index/extract FASTA\n");
  #if _CURSES_LIB != 0
         fprintf(stderr, "         tview       text alignment viewer\n");
  #endif
         fprintf(stderr, "         index       index alignment\n");
+       fprintf(stderr, "         idxstats    BAM index stats (r595 or later)\n");
         fprintf(stderr, "         fixmate     fix mate information\n");
         fprintf(stderr, "         glfview     print GLFv3 file\n");
         fprintf(stderr, "         flagstat    simple stats\n");
         fprintf(stderr, "         calmd       recalculate MD/NM tags and '=' bases\n");
         fprintf(stderr, "         merge       merge sorted alignments\n");
         fprintf(stderr, "         rmdup       remove PCR duplicates\n");
+       fprintf(stderr, "         reheader    replace BAM header\n");
         fprintf(stderr, "\n");
         return 1;
  }
@@ -106,9 +111,11 @@ int main(int argc, char *argv[])
         if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1);
         else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1);
         else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1);
+       else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1);
         else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1);
         else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);
         else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);
+       else if (strcmp(argv[1], "idxstats") == 0) return bam_idxstats(argc-1, argv+1);
         else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
         else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
         else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
@@ -117,6 +124,7 @@ int main(int argc, char *argv[])
         else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);
         else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);
         else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
+       else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1);
  #if _CURSES_LIB != 0
         else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
  #endif
diff --git a/bgzf.c b/bgzf.c

index 59f902ff85501bade5a5a2f079ed9e8f7b52c38b..a6923daa0cc1cfa8d0d0bc2269255326be8439e3 100644 (file)
--- a/bgzf.c
+++ b/bgzf.c
@@ -203,9 +203,7 @@ bgzf_open(const char* __restrict path, const char* __restrict mode)
                 if (fd == -1) return 0;
          fp = open_write(fd, strstr(mode, "u")? 1 : 0);
      }
-    if (fp != NULL) {
-        fp->owned_file = 1;
-    }
+    if (fp != NULL) fp->owned_file = 1;
      return fp;
  }
  
@@ -429,20 +427,19 @@ static void cache_block(BGZF *fp, int size)
         memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
  }
  
-static
  int
-read_block(BGZF* fp)
+bgzf_read_block(BGZF* fp)
  {
      bgzf_byte_t header[BLOCK_HEADER_LENGTH];
-       int size = 0;
+       int count, size = 0;
  #ifdef _USE_KNETFILE
      int64_t block_address = knet_tell(fp->x.fpr);
         if (load_block_from_cache(fp, block_address)) return 0;
-    int count = knet_read(fp->x.fpr, header, sizeof(header));
+    count = knet_read(fp->x.fpr, header, sizeof(header));
  #else
      int64_t block_address = ftello(fp->file);
         if (load_block_from_cache(fp, block_address)) return 0;
-    int count = fread(header, 1, sizeof(header), fp->file);
+    count = fread(header, 1, sizeof(header), fp->file);
  #endif
      if (count == 0) {
          fp->block_length = 0;
@@ -472,9 +469,7 @@ read_block(BGZF* fp)
      }
         size += count;
      count = inflate_block(fp, block_length);
-    if (count < 0) {
-        return -1;
-    }
+    if (count < 0) return -1;
      if (fp->block_length != 0) {
          // Do not reset offset if this read follows a seek.
          fp->block_offset = 0;
@@ -501,7 +496,7 @@ bgzf_read(BGZF* fp, void* data, int length)
      while (bytes_read < length) {
          int available = fp->block_length - fp->block_offset;
          if (available <= 0) {
-            if (read_block(fp) != 0) {
+            if (bgzf_read_block(fp) != 0) {
                  return -1;
              }
              available = fp->block_length - fp->block_offset;
@@ -528,19 +523,16 @@ bgzf_read(BGZF* fp, void* data, int length)
      return bytes_read;
  }
  
-static
-int
-flush_block(BGZF* fp)
+int bgzf_flush(BGZF* fp)
  {
      while (fp->block_offset > 0) {
-        int block_length = deflate_block(fp, fp->block_offset);
-        if (block_length < 0) {
-            return -1;
-        }
+        int count, block_length;
+               block_length = deflate_block(fp, fp->block_offset);
+        if (block_length < 0) return -1;
  #ifdef _USE_KNETFILE
-        int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
+        count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
  #else
-        int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+        count = fwrite(fp->compressed_block, 1, block_length, fp->file);
  #endif
          if (count != block_length) {
              report_error(fp, "write failed");
@@ -551,17 +543,22 @@ flush_block(BGZF* fp)
      return 0;
  }
  
-int
-bgzf_write(BGZF* fp, const void* data, int length)
+/* Flush the pending block only when `size` more bytes would not fit into it.
+ * Returns the result of bgzf_flush() when a flush is attempted and -1 when the
+ * data still fits and nothing is flushed. */
+int bgzf_flush_try(BGZF *fp, int size)
+{
+       int fits = fp->block_offset + size <= fp->uncompressed_block_size;
+       if (fits) return -1; // nothing to do
+       return bgzf_flush(fp);
+}
+
+int bgzf_write(BGZF* fp, const void* data, int length)
  {
      if (fp->open_mode != 'w') {
          report_error(fp, "file not open for writing");
          return -1;
      }
  
-    if (fp->uncompressed_block == NULL) {
+    if (fp->uncompressed_block == NULL)
          fp->uncompressed_block = malloc(fp->uncompressed_block_size);
-    }
  
      const bgzf_byte_t* input = data;
      int block_length = fp->uncompressed_block_size;
@@ -574,7 +571,7 @@ bgzf_write(BGZF* fp, const void* data, int length)
          input += copy_length;
          bytes_written += copy_length;
          if (fp->block_offset == block_length) {
-            if (flush_block(fp) != 0) {
+            if (bgzf_flush(fp) != 0) {
                  break;
              }
          }
@@ -582,13 +579,10 @@ bgzf_write(BGZF* fp, const void* data, int length)
      return bytes_written;
  }
  
-int
-bgzf_close(BGZF* fp)
+int bgzf_close(BGZF* fp)
  {
      if (fp->open_mode == 'w') {
-        if (flush_block(fp) != 0) {
-            return -1;
-        }
+        if (bgzf_flush(fp) != 0) return -1;
                 { // add an empty block
                         int count, block_length = deflate_block(fp, 0);
  #ifdef _USE_KNETFILE
@@ -613,9 +607,7 @@ bgzf_close(BGZF* fp)
                 else ret = knet_close(fp->x.fpr);
          if (ret != 0) return -1;
  #else
-        if (fclose(fp->file) != 0) {
-            return -1;
-        }
+        if (fclose(fp->file) != 0) return -1;
  #endif
      }
      free(fp->uncompressed_block);
@@ -625,12 +617,6 @@ bgzf_close(BGZF* fp)
      return 0;
  }
  
-int64_t
-bgzf_tell(BGZF* fp)
-{
-    return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
-}
-
  void bgzf_set_cache_size(BGZF *fp, int cache_size)
  {
         if (fp) fp->cache_size = cache_size;
@@ -655,9 +641,11 @@ int bgzf_check_EOF(BGZF *fp)
         return (memcmp(magic, buf, 28) == 0)? 1 : 0;
  }
  
-int64_t
-bgzf_seek(BGZF* fp, int64_t pos, int where)
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
  {
+       int block_offset;
+       int64_t block_address;
+
      if (fp->open_mode != 'r') {
          report_error(fp, "file not open for read");
          return -1;
@@ -666,8 +654,8 @@ bgzf_seek(BGZF* fp, int64_t pos, int where)
          report_error(fp, "unimplemented seek option");
          return -1;
      }
-    int block_offset = pos & 0xFFFF;
-    int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+    block_offset = pos & 0xFFFF;
+    block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
  #ifdef _USE_KNETFILE
      if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
  #else
diff --git a/bgzf.h b/bgzf.h

index 91b33177ff07c667b16f4db2ab724081beee5499..099ae9a6da1785ae132be55932a2e5c930ba49cc 100644 (file)
--- a/bgzf.h
+++ b/bgzf.h
@@ -106,7 +106,7 @@ int bgzf_write(BGZF* fp, const void* data, int length);
   * Return value is non-negative on success.
   * Returns -1 on error.
   */
-int64_t bgzf_tell(BGZF* fp);
+#define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
  
  /*
   * Set the file to read from the location specified by pos, which must
@@ -126,9 +126,32 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
  void bgzf_set_cache_size(BGZF *fp, int cache_size);
  
  int bgzf_check_EOF(BGZF *fp);
+int bgzf_read_block(BGZF* fp);
+int bgzf_flush(BGZF* fp);
+int bgzf_flush_try(BGZF *fp, int size);
  
  #ifdef __cplusplus
  }
  #endif
  
+/* Read one byte from a BGZF stream.
+ * Returns the byte as an unsigned char value, -1 at end-of-file and -2 when
+ * bgzf_read_block() fails. */
+static inline int bgzf_getc(BGZF *fp)
+{
+       int ch;
+       if (fp->block_offset >= fp->block_length) { // current block used up: load the next one
+               if (bgzf_read_block(fp) != 0) return -2; /* error */
+               if (fp->block_length == 0) return -1; /* end-of-file */
+       }
+       ch = ((unsigned char*)fp->uncompressed_block)[fp->block_offset];
+       ++fp->block_offset;
+       if (fp->block_offset != fp->block_length) return ch;
+       // that was the last byte of the block: record the current file position as the
+       // block address and mark the buffer empty
+#ifdef _USE_KNETFILE
+       fp->block_address = knet_tell(fp->x.fpr);
+#else
+       fp->block_address = ftello(fp->file);
+#endif
+       fp->block_offset = 0;
+       fp->block_length = 0;
+       return ch;
+}
+
  #endif
diff --git a/examples/bam2bed.c b/examples/bam2bed.c

new file mode 100644 (file)

index 0000000..bb937d1
--- /dev/null
+++ b/examples/bam2bed.c
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include "sam.h"
+/* Print one alignment as a tab-separated line: target name, start, end, read name,
+ * mapping quality and strand. `data` is the samfile_t the record came from.
+ * Records without a target (tid < 0) are skipped. Always returns 0. */
+static int fetch_func(const bam1_t *b, void *data)
+{
+       samfile_t *sf = (samfile_t*)data;
+       const bam1_core_t *core = &b->core;
+       uint32_t *ops = bam1_cigar(b);
+       int k, span = 0;
+       char strand;
+       if (core->tid < 0) return 0;
+       // sum the lengths of the match, deletion and reference-skip operations
+       for (k = 0; k < core->n_cigar; ++k) {
+               int op = ops[k]&0xf;
+               if (op != BAM_CMATCH && op != BAM_CDEL && op != BAM_CREF_SKIP) continue;
+               span += ops[k]>>4;
+       }
+       strand = (core->flag&BAM_FREVERSE)? '-' : '+';
+       printf("%s\t%d\t%d\t%s\t%d\t%c\n", sf->header->target_name[core->tid],
+                  core->pos, core->pos + span, bam1_qname(b), core->qual, strand);
+       return 0;
+}
+/* bam2bed <in.bam> [region]: print one BED-like line per alignment with a target, for
+ * the whole file or, when a region is given, for the alignments fetched through
+ * the BAM index. Returns 0 on success and 1 on a usage, open, index or region error. */
+int main(int argc, char *argv[])
+{
+       samfile_t *fp;
+       int ret = 1;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: bam2bed <in.bam> [region]\n");
+               return 1;
+       }
+       if ((fp = samopen(argv[1], "rb", 0)) == 0) {
+               fprintf(stderr, "bam2bed: Fail to open BAM file %s\n", argv[1]);
+               return 1;
+       }
+       if (argc == 2) { /* if a region is not specified */
+               bam1_t *b = bam_init1();
+               while (samread(fp, b) >= 0) fetch_func(b, fp);
+               bam_destroy1(b);
+               ret = 0;
+       } else {
+               int ref = -1, beg, end; // ref stays negative unless the region parser sets it
+               bam_index_t *idx;
+               if ((idx = bam_index_load(argv[1])) == 0) {
+                       fprintf(stderr, "bam2bed: BAM indexing file is not available.\n");
+               } else {
+                       bam_parse_region(fp->header, argv[2], &ref, &beg, &end);
+                       if (ref < 0) {
+                               fprintf(stderr, "bam2bed: Invalid region %s\n", argv[2]);
+                       } else {
+                               bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func);
+                               ret = 0;
+                       }
+                       bam_index_destroy(idx); // released on the invalid-region path as well
+               }
+       }
+       samclose(fp); // closed on every path after a successful samopen()
+       return ret;
+}
diff --git a/examples/toy.fa b/examples/toy.fa

new file mode 100644 (file)

index 0000000..38312c1
--- /dev/null
+++ b/examples/toy.fa
@@ -0,0 +1,2 @@
+>ref
+AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT
diff --git a/examples/toy.sam b/examples/toy.sam

new file mode 100644 (file)

index 0000000..baf7388
--- /dev/null
+++ b/examples/toy.sam
@@ -0,0 +1,7 @@
+@SQ    SN:ref  LN:45
+r001   163     ref     7       30      8M2I4M1D3M      =       37      39      TTAGATAAAGGATACTG       *
+r002   0       ref     9       30      1S2I6M1P1I4M2I  *       0       0       AAAAGATAAGGATAAA        *
+r003   0       ref     9       30      5H6M    *       0       0       AGCTAA  *
+r004   0       ref     16      30      6M14N1I5M       *       0       0       ATAGCTCTCAGC    *
+r003   16      ref     29      30      6H5M    *       0       0       TAGGC   *
+r001   83      ref     37      30      9M      =       7       -39     CAGCGCCAT       *
+\ No newline at end of file
diff --git a/faidx.c b/faidx.c

index 811bdf8858ff9ef18479fb644981ad4780bf7a8d..dbd8b3e48db4c7a041304b7419fd7ab868badd90 100644 (file)
--- a/faidx.c
+++ b/faidx.c
@@ -197,7 +197,7 @@ int fai_build(const char *fn)
         sprintf(str, "%s.fai", fn);
         rz = razf_open(fn, "r");
         if (rz == 0) {
-               fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",str);
+               fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
                 free(str);
                 return -1;
         }
diff --git a/knetfile.c b/knetfile.c

index 994babb6a384e2757098bd6b042970a4dd54a801..e1be4d669b845babe504dc1bfd2e2ccbe806f593 100644 (file)
--- a/knetfile.c
+++ b/knetfile.c
@@ -38,9 +38,7 @@
  #include <unistd.h>
  #include <sys/types.h>
  
-#ifdef _WIN32
-#include <winsock.h>
-#else
+#ifndef _WIN32
  #include <netdb.h>
  #include <arpa/inet.h>
  #include <sys/socket.h>
@@ -566,7 +564,7 @@ off_t knet_seek(knetFile *fp, int64_t off, int whence)
          else if (whence==SEEK_SET)
              fp->offset = off;
                 fp->is_ready = 0;
-               return fp->offset;
+               return 0;
         }
         errno = EINVAL;
      fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
diff --git a/kstring.h b/kstring.h

index f4e5a99df5b03364166338e55799edbc41b1cae4..925117a38646ab4c1b82db01e46091f9e073eefd 100644 (file)
--- a/kstring.h
+++ b/kstring.h
@@ -58,6 +58,40 @@ static inline int kputc(int c, kstring_t *s)
         return c;
  }
  
+/* Append the decimal representation of the signed integer c to s, growing the
+ * buffer when needed and keeping it NUL-terminated.
+ * Returns 0, except for c == 0, where the result of kputc('0', s) is returned. */
+static inline int kputw(int c, kstring_t *s)
+{
+       char buf[16];
+       int l, i;
+       unsigned x;
+       if (c == 0) return kputc('0', s);
+       // take the magnitude in unsigned arithmetic: -c is undefined behaviour for INT_MIN
+       x = c < 0? 0u - (unsigned)c : (unsigned)c;
+       for (l = 0; x > 0; x /= 10) buf[l++] = x%10 + '0'; // digits, least significant first
+       if (c < 0) buf[l++] = '-';
+       if (s->l + l + 1 >= s->m) {
+               s->m = s->l + l + 2;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+       }
+       for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; // copy back in reading order
+       s->s[s->l] = 0;
+       return 0;
+}
+
+/* Append the decimal representation of the unsigned integer c to s, growing the
+ * buffer when needed and keeping it NUL-terminated.
+ * Returns 0, except for c == 0, where the result of kputc('0', s) is returned. */
+static inline int kputuw(unsigned c, kstring_t *s)
+{
+       char digits[16];
+       int n = 0;
+       unsigned rest = c;
+       if (c == 0) return kputc('0', s);
+       while (rest > 0) { // collect the digits, least significant first
+               digits[n++] = rest%10 + '0';
+               rest /= 10;
+       }
+       if (s->l + n + 1 >= s->m) {
+               s->m = s->l + n + 2;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+       }
+       while (n > 0) s->s[s->l++] = digits[--n]; // emit them in reading order
+       s->s[s->l] = 0;
+       return 0;
+}
+
  static inline int *ksplit(kstring_t *s, int delimiter, int *n)
  {
         int max = 0, *offsets = 0;
diff --git a/misc/Makefile b/misc/Makefile

index 4404cccd1e8169b1c3a40538099a4e9d9285cfa8..2d7139d4e8a5c62068afb381f47493f8408bc681 100644 (file)
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -1,6 +1,6 @@
  CC=                    gcc
  CXX=           g++
-CFLAGS=                -g -Wall -O2 -m64 #-arch ppc
+CFLAGS=                -g -Wall #-O2 #-m64 #-arch ppc
  CXXFLAGS=      $(CFLAGS)
  DFLAGS=                -D_FILE_OFFSET_BITS=64
  OBJS=          
@@ -27,6 +27,9 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur:
  
  lib:
  
+afs2:afs2.o
+               $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. -lbam
+
  wgsim:wgsim.o
                 $(CC) $(CFLAGS) -o $@ wgsim.o -lm
  
@@ -48,7 +51,10 @@ maq2sam-long:maq2sam.c
  md5fa.o:md5.h md5fa.c
                 $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c
  
+afs2.o:afs2.c ../bam.h
+               $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c
+
  cleanlocal:
-               rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+               rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a
  
  clean:cleanlocal-recur
diff --git a/misc/export2sam.pl b/misc/export2sam.pl

index 8e3e2800c3ac82cf9d3d3581b02768ddfd9a16f2..a2a436c5e9df54cb82e219ec4965bde40fd4e7e5 100755 (executable)
--- a/misc/export2sam.pl
+++ b/misc/export2sam.pl
@@ -1,107 +1,461 @@
-#!/usr/bin/perl -w
-
+#!/usr/bin/env perl
+#
+#
+# Script to convert GERALD export files to SAM format.
+#
+#
+#
+########## License:
+#
+# The MIT License
+#
+# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd.
+# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+#
+#
+########## ChangeLog:
+#
+# Version: 2.0.0 (15FEB2010)
+#   Script updated by Illumina in conjunction with CASAVA 1.7.0 release.
+#   Major changes are as follows:
+#   - The CIGAR string has been updated to include all gaps from ELANDv2 alignments.
+#   - The ELAND single read alignment score is always stored in the optional "SM" field
+#       and the ELAND paired read alignment score is stored in the optional "AS" field
+#       when it exists.
+#   - The MAPQ value is set to the higher of the two alignment scores, but no greater
+#       than 254,  i.e. min(254,max(SM,AS))
+#   - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's
+#       expected orientation and insert size criteria.
+#   - The default quality score translation is set for export files which contain
+#       Phred+64 quality values. An option, "--qlogodds", has been added to
+#       translate quality values from the Solexa+64 format used in export files prior
+#       to Pipeline 1.3
+#   - The export match descriptor is now reverse-complemented when necessary such that
+#       it always corresponds to the forward strand of the reference, to be consistent
+#       with other information in the SAM record. It is now written to the optional
+#       'XD' field (rather than 'MD') to acknowledge its minor differences from the 
+#       samtools match descriptor (see additional detail below).
+#   - An option, "--nofilter", has been added to include reads which have failed
+#       primary analysis quality filtration. Such reads will have the corresponding
+#       SAM flag bit (0x0200) set.
+#   - Labels in the export 'contig' field are preserved by setting RNAME to
+#       "$export_chromosome/$export_contig" when the contig label exists.
+#
+#
  # Contact: lh3
  # Version: 0.1.2 (03JAN2009)
+#
+#
+#
+########## Known Conversion Limitations:
+#
+# - Export records for reads that map to a position < 1 (allowed in export format), are converted
+#     to unmapped reads in the SAM record.
+# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the
+#     aligner could not map the read to the reference sequence set, and "QC" means that the 
+#     aligner did not attempt to map the read due to some technical limitation. Both of these 
+#     alignment types are collapsed to the single unmapped alignment state in the SAM record.
+# - The export match descriptor is slightly different than the samtools match descriptor. For
+#     this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the
+#     export match descriptor differs from the samtools version in two respects: (1) indels 
+#     are explicitly closed with the '$' character and (2) insertions must be enumerated in
+#     the match descriptor. For example a 35-base read with a two-base insertion is described
+#     as: 20^2$14
+#
+#
+#
+
+my $version = "2.0.0";
  
  use strict;
  use warnings;
-use Getopt::Std;
+
+use File::Spec qw(splitpath);
+use Getopt::Long;
+use List::Util qw(min max);
+
+
+use constant {
+  EXPORT_INDEX => 6,
+  EXPORT_READNO => 7,
+  EXPORT_READ => 8,
+  EXPORT_QUAL => 9,
+  EXPORT_CHROM => 10,
+  EXPORT_CONTIG => 11,
+  EXPORT_POS => 12,
+  EXPORT_STRAND => 13, 
+  EXPORT_MD => 14,
+  EXPORT_SEMAP => 15,
+  EXPORT_PEMAP => 16,
+  EXPORT_PASSFILT => 21,
+};
+
+
+use constant {
+  SAM_QNAME => 0,
+  SAM_FLAG => 1,
+  SAM_RNAME => 2,
+  SAM_POS => 3,
+  SAM_MAPQ => 4,
+  SAM_CIGAR => 5,
+  SAM_MRNM => 6,
+  SAM_MPOS => 7,
+  SAM_ISIZE => 8,
+  SAM_SEQ => 9,
+  SAM_QUAL => 10,
+};
+
+
+# function prototypes for Richard's code
+sub match_desc_to_cigar($);
+sub match_desc_frag_length($);
+sub reverse_compl_match_descriptor($);
+sub write_header($;$;$);
+
  
  &export2sam;
  exit;
  
+
+
+
  sub export2sam {
+
+  my $cmdline = $0 . " " . join(" ",@ARGV);
+  my $arg_count = scalar @ARGV;
+  my @spval = File::Spec->splitpath($0);
+  my $progname = $spval[2];
+
+  my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
+  my $is_nofilter = 0;
+  my $read1file;
+  my $read2file;
+  my $print_version = 0;
+  my $help = 0;
+
+  my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, 
+                           "nofilter" => \$is_nofilter,
+                           "read1=s"  => \$read1file,
+                           "read2=s"  => \$read2file,
+                           "version"  => \$print_version,
+                           "help"     => \$help );
+
+  my $usage = <<END;
+
+$progname converts GERALD export files to SAM format.
+
+Usage: $progname --read1=FILENAME [ options ] | --version | --help
+
+  --read1=FILENAME  read1 export file (mandatory)
+  --read2=FILENAME  read2 export file
+  --nofilter        include reads that failed the pipeline/RTA purity filter
+  --qlogodds        assume export file(s) use logodds quality values as reported
+                      by pipeline prior to v1.3 (default: phred quality values)
+
+END
+
+  my $version_msg = <<END;
+
+$progname version: $version
+
+END
+
+  if((not $result) or $help or ($arg_count==0)) {
+    die($usage);
+  }  
+
+  if(@ARGV) {
+    print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
+    die($usage);
+  }
+
+  if($print_version) {
+    die($version_msg);
+  }
+
+  if(not defined($read1file)) {
+    print STDERR "\nERROR: read1 export file must be specified\n\n";
+    die($usage);
+  }
+
    my ($fh1, $fh2, $is_paired);
-  $is_paired = (@ARGV >= 2);
-  die("export2sam.pl <read1.export> [<read2.export>]\n") if (@ARGV == 0);
-  open($fh1, $ARGV[0]) || die;
+
+  open($fh1, $read1file) or die("\nERROR: Can't open read1 export file: $read1file\n\n");
+  $is_paired = defined $read2file;
    if ($is_paired) {
-       open($fh2, $ARGV[1]) || die;
+    open($fh2, $read2file) or die("\nERROR: Can't open read2 export file: $read2file\n\n");
    }
-  # conversion table
+  # quality value conversion table
    my @conv_table;
-  for (-64..64) {
-       $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499));
+  if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
+    for (-64..64) {
+      $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
+    }
+  } else {               # convert from phred+64 quality values (pipeline v1.3+):
+    for (-64..-1) {
+      $conv_table[$_+64] = undef;
+    }
+    for (0..64) {
+      $conv_table[$_+64] = int(33 + $_);
+    }
    }
+  # write the header
+  print write_header( $progname, $version, $cmdline );
    # core loop
+  my $export_line_count = 0;
    while (<$fh1>) {
-       my (@s1, @s2);
-       &export2sam_aux($_, \@s1, \@conv_table, $is_paired);
-       if ($is_paired) {
-         $_ = <$fh2>;
-         &export2sam_aux($_, \@s2, \@conv_table, $is_paired);
-         if (@s1 && @s2) { # then set mate coordinate
-               my $isize = 0;
-               if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize
-                 my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3];
-                 my $x2 = ($s2[1] & 0x10)? $s2[3] + length($s2[9]) : $s2[3];
-                 $isize = $x2 - $x1;
-               }
-               # update mate coordinate
-               if ($s2[2] ne '*') {
-                 @s1[6..8] = (($s2[2] eq $s1[2])? "=" : $s2[2], $s2[3], $isize);
-                 $s1[1] |= 0x20 if ($s2[1] & 0x10);
-               } else {
-                 $s1[1] |= 0x8;
-               }
-               if ($s1[2] ne '*') {
-                 @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize);
-                 $s2[1] |= 0x20 if ($s1[1] & 0x10);
-               } else {
-                 $s2[1] |= 0x8;
-               }
-         }
-       }
-       print join("\t", @s1), "\n" if (@s1);
-       print join("\t", @s2), "\n" if (@s2 && $is_paired);
+    $export_line_count++;
+    my (@s1, @s2);
+    &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
+    if ($is_paired) {
+      my $read2line = <$fh2>;
+      if(not $read2line){
+        die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n  Extra reads observed in read1 file at line no: $export_line_count.\n\n");
+      }
+      &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
+       
+      if (@s1 && @s2) { # then set mate coordinate
+        if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
+          die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n  Read1: $_  Read2: $read2line\n");
+        }
+
+        my $isize = 0;
+        if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
+          my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
+          my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
+          $isize = $x2 - $x1;
+        }
+
+        foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ 
+          my ($sa,$sb,$is) = @{$_};
+          if ($sb->[SAM_RNAME] ne '*') {
+            $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
+            $sa->[SAM_MPOS] = $sb->[SAM_POS];
+            $sa->[SAM_ISIZE] = $is;
+            $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
+          } else {
+            $sa->[SAM_FLAG] |= 0x8;
+          }
+        } 
+      }
+    }
+    print join("\t", @s1), "\n" if (@s1);
+    print join("\t", @s2), "\n" if (@s2 && $is_paired);
    }
    close($fh1);
-  close($fh2) if ($is_paired);
+  if($is_paired) {
+    while(my $read2line = <$fh2>){
+      $export_line_count++;
+      die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n  Extra reads observed in read2 file at line no: $export_line_count.\n\n");
+    }
+    close($fh2);
+  }
  }
  
  sub export2sam_aux {
-  my ($line, $s, $ct, $is_paired) = @_;
+  my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
    chomp($line);
    my @t = split("\t", $line);
    @$s = ();
-  return if ($t[21] ne 'Y');
+  my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
+  return if(not ($isPassFilt or $is_nofilter));
    # read name
-  $s->[0] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
+  $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
    # initial flag (will be updated later)
-  $s->[1] = 0;
-  $s->[1] |= 1 | 1<<(5 + $t[7]) if ($is_paired);
+  $s->[SAM_FLAG] = 0;
+  if($is_paired) {
+    if($t[EXPORT_READNO] != $read_no){
+      die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
+    }
+    $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
+  }
+  $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
+
    # read & quality
-  $s->[9] = $t[8]; $s->[10] = $t[9];
-  if ($t[13] eq 'R') { # then reverse the sequence and quality
-       $s->[9] = reverse($t[8]);
-       $s->[9] =~ tr/ACGTacgt/TGCAtgca/;
-       $s->[10] = reverse($t[9]);
+  my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
+  if ($is_export_rev) { # then reverse the sequence and quality
+    $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
+    $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
+    $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
+  } else {
+    $s->[SAM_SEQ] = $t[EXPORT_READ];
+    $s->[SAM_QUAL] = $t[EXPORT_QUAL];
    }
-  $s->[10] =~ s/(.)/$ct->[ord($1)]/eg; # change coding
-  # cigar
-  $s->[5] = length($s->[9]) . "M";
+  my @convqual = ();
+  foreach (unpack('C*', $s->[SAM_QUAL])){
+    my $val=$ct->[$_];
+    if(not defined $val){
+      my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
+      if( $_ < 64 ) { $msg .= "  Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
+      die($msg . "\n");
+    }
+    push @convqual,$val;
+  }
+
+  $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
+
+
    # coor
    my $has_coor = 0;
-  $s->[2] = "*";
-  if ($t[10] eq 'NM' || $t[10] eq 'QC') {
-       $s->[1] |= 0x4; # unmapped
-  } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) {
-       $s->[1] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
-       push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
+  $s->[SAM_RNAME] = "*";
+  if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') {
+    $s->[SAM_FLAG] |= 0x4; # unmapped
+  } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
+    $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
+    push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
+  } elsif ($t[EXPORT_POS] < 1) {
+    $s->[SAM_FLAG] |= 0x4; # unmapped
+  } else {
+    $s->[SAM_RNAME] = $t[EXPORT_CHROM];
+    $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
+    $has_coor = 1;
+  }
+  $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
+
+#  print STDERR "t[14] = " . $t[14] . "\n";
+  my $matchDesc = '';
+  $s->[SAM_CIGAR] = "*";
+  if($has_coor){
+    $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
+
+    if($matchDesc =~ /\^/){
+      # construct CIGAR string using Richard's function
+      $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
+    } else {
+      $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
+    }
+  }
+
+#  print STDERR "cigar_string = $cigar_string\n";
+
+  $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
+  if($has_coor){
+    my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
+    my $pemap = 0;
+    if($is_paired) {
+      $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
+
+      # set `proper pair' bit if non-blank, non-zero PE alignment score:
+      $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
+    }
+    $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
    } else {
-       $s->[2] = $t[10];
-       $has_coor = 1;
+    $s->[SAM_MAPQ] = 0;
    }
-  $s->[3] = $has_coor? $t[12] : 0;
-  $s->[1] |= 0x10 if ($has_coor && $t[13] eq 'R');
-  # mapQ (TODO: should I choose the larger between $t[15] and $t[16]?)
-  $s->[4] = 0;
-  $s->[4] = $t[15] if ($t[15] ne '');
-  $s->[4] = $t[16] if ($t[16] ne '' && $s->[4] < $t[16]);
    # mate coordinate
-  $s->[6] = '*'; $s->[7] = $s->[8] = 0;
+  $s->[SAM_MRNM] = '*';
+  $s->[SAM_MPOS] = 0;
+  $s->[SAM_ISIZE] = 0;
    # aux
-  push(@$s, "BC:Z:$t[6]") if ($t[6]);
-  push(@$s, "MD:Z:$t[14]") if ($has_coor);
-  push(@$s, "SM:i:$t[15]") if ($is_paired && $has_coor);
+  push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
+  if($has_coor){
+    # The export match descriptor differs slightly from the samtools match descriptor.
+    # In order for the converted SAM files to be as compliant as possible,
+    # we put the export match descriptor in optional field 'XD' rather than 'MD':
+    push(@$s, "XD:Z:$matchDesc"); 
+    push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
+    push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
+  }
+}
+
+
+
+# 
+# the following code is taken from Richard Shaw's sorted2sam.pl file
+#
+# Rewrite an export match descriptor so that it reads along the opposite
+# strand: the descriptor is reversed, the bases A/C/G/T are complemented and
+# the indel markers '^' and '$' are exchanged.
+sub reverse_compl_match_descriptor($)
+{
+    my ($match_desc) = @_;
+
+    # Reverse the whole string, then complement bases and swap '^' with '$'.
+    my $flipped = scalar reverse($match_desc);
+    $flipped =~ tr/ACGT\^\$/TGCA\$\^/;
+
+    # The reversal also reversed the digits of every number; put each run of
+    # digits back into its original order.
+    $flipped =~ s/(\d+)/scalar reverse($1)/ge;
+
+    return $flipped;
+}
+
+
+
+# Build a CIGAR string from an export match descriptor.
+#
+# The descriptor is cut into '^...$' indel blocks and the stretches between
+# them. A '^BASES$' block becomes a deletion ('D') of that many bases, a
+# '^NUMBER$' block becomes an insertion ('I') of NUMBER bases, and anything
+# else becomes a match ('M') whose length is computed by
+# match_desc_frag_length().
+sub match_desc_to_cigar($)
+{
+    my ($match_desc) = @_;
+    my $cigar = '';
+
+    for my $part (split(/(\^.*?\$)/, $match_desc)) {
+        # Skip the empty pieces split() produces around adjacent blocks.
+        next unless $part;
+
+        if ($part =~ /^\^([ACGTN]+)\$$/) {
+            $cigar .= length($1) . 'D';
+        } elsif ($part =~ /^\^(\d+)\$$/) {
+            $cigar .= $1 . 'I';
+        } else {
+            $cigar .= match_desc_frag_length($part) . 'M';
+        }
+    }
+
+    return $cigar;
+}
+
+
+#------------------------------------------------------------------------------
+
+# Return the number of read bases covered by an indel-free match descriptor
+# fragment: each number contributes its value and each run of base letters
+# (ACGTN) contributes its length.
+sub match_desc_frag_length($)
+{
+    my ($match_desc_str) = @_;
+    my $total = 0;
+
+    # The capturing split keeps the base runs as separate pieces.
+    for my $piece (split(/([ACGTN]+)/, $match_desc_str)) {
+        next if ($piece eq '');
+
+        if ($piece =~ /(\d+)/) {
+            $total += $1;
+        } else {
+            $total += length($piece);
+        }
+    }
+
+    return $total;
+}
+
+
+# Build the SAM header text: a single @PG line carrying the program name
+# (ID), its version (VN) and the command line (CL) passed in by the caller.
+sub write_header($;$;$)
+{
+       my ($progname,$version,$cl) = @_;
+
+       return "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
  }
diff --git a/misc/sam2vcf.pl b/misc/sam2vcf.pl

index ede7bd8a35717ede14f470a11ad39e78d49d4f90..afaf91e1dcf60894f195bed3580b6712dc0457df 100755 (executable)
--- a/misc/sam2vcf.pl
+++ b/misc/sam2vcf.pl
@@ -1,9 +1,9 @@
  #!/usr/bin/perl -w
  # 
-# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2
-
+# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
+# 
  # Contact: pd3@sanger
-# Version: 2009-10-08
+# Version: 2010-04-23
  
  use strict;
  use warnings;
@@ -23,8 +23,12 @@ sub error
      die
          "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
          "Options:\n",
-        "   -r, -refseq <file.fa>            The reference sequence, required when indels are present.\n",
-        "   -h, -?, --help                   This help message.\n",
+        "   -h, -?, --help                  This help message.\n",
+        "   -i, --indels-only               Ignore SNPs.\n",
+        "   -r, --refseq <file.fa>          The reference sequence, required when indels are present.\n",
+        "   -R, --keep-ref                  Print reference alleles as well.\n",
+        "   -s, --snps-only                 Ignore indels.\n",
+        "   -t, --column-title <string>     The column title.\n",
          "\n";
  }
  
@@ -38,7 +42,11 @@ sub parse_params
  
      while (my $arg=shift(@ARGV))
      {
+        if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; }
          if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; }
+        if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; }
+        if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; }
+        if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; }
          if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
  
          error("Unknown parameter \"$arg\". Run -h for help.\n");
@@ -59,13 +67,14 @@ sub iupac_to_gtype
              );
      if ( !exists($iupac{$base}) ) 
      { 
-        if ( $ref eq $base ) { return ('.','0|0'); }
-        return ($base,'1|1');
+        if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); }
+        if ( $ref eq $base ) { return ('.','0/0'); }
+        return ($base,'1/1');
      }
      my $gt = $iupac{$base};
-    if ( $$gt[0] eq $ref  ) { return ($$gt[1],'0|1'); }
-    elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0|1'); }
-    return ("$$gt[0],$$gt[1]",'1|2');
+    if ( $$gt[0] eq $ref  ) { return ($$gt[1],'0/1'); }
+    elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); }
+    return ("$$gt[0],$$gt[1]",'1/2');
  }
  
  
@@ -97,53 +106,96 @@ sub do_pileup_to_vcf
      my $fh_out = $$opts{fh_out};
      my ($prev_chr,$prev_pos,$prev_ref);
      my $refseq;
+    my $ignore_indels = $$opts{snps_only} ? 1 : 0;
+    my $ignore_snps   = $$opts{indels_only} ? 1 : 0;
+    my $keep_ref      = $$opts{keep_ref} ? 1 : 0;
+    my $title = exists($$opts{title}) ? $$opts{title} : 'data';
+
+    print $fh_out 
+        qq[##fileformat=VCFv3.3\n],
+        qq[##INFO=DP,1,Integer,"Total Depth"\n],
+        qq[##FORMAT=GT,1,String,"Genotype"\n],
+        qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n],
+        qq[##FORMAT=DP,1,Integer,"Read Depth"\n],
+        qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n]
+        ;
  
      while (my $line=<$fh_in>)
      {
          chomp($line);
-        my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,@items) = split(/\t/,$line);
+        my (@items) = split(/\t/,$line);
+        if ( scalar @items<8 ) 
+        { 
+            error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n"); 
+        }
+        my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items;
+        $ref  = uc($ref);
+        $cons = uc($cons);
  
          my ($alt,$gt);
          if ( $ref eq '*' )
          {
              # An indel is involved.
-            if ($chr ne $prev_chr || $pos ne $prev_pos) 
+            if ( $ignore_indels )
+            { 
+                $prev_ref = $ref;
+                $prev_pos = $pos;
+                $prev_chr = $chr;
+                next; 
+            }
+
+            if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos) 
              {
                  if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
                  if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); }
                  $ref = $refseq->get_base($chr,$pos);
+                $ref = uc($ref);
              }
              else { $ref = $prev_ref; }
  
-            # One of the alleles can be a reference and it can come in arbitrary order
+            # One of the alleles can be a reference and it can come in arbitrary order. In some
+            #   cases */* can be encountered. In such a case, look in the additional columns.
              my ($al1,$al2) = split(m{/},$cons);
+            if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; }
              my $alt1 = parse_indel($al1);
              my $alt2 = parse_indel($al2);
              if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); }
-            if ( $alt1 && $alt2 && $alt1 eq $alt2 ) { $alt2=''; }
              if ( !$alt1 ) 
              { 
                  $alt=$alt2; 
-                $gt='0|1'; 
+                $gt='0/1'; 
              }
              elsif ( !$alt2 ) 
              { 
                  $alt=$alt1; 
-                $gt='0|1'; 
+                $gt='0/1'; 
              }
-            else 
+            elsif ( $alt1 eq $alt2 )
+            { 
+                $alt="$alt1"; 
+                $gt='1/1'; 
+            }
+            else
              { 
                  $alt="$alt1,$alt2"; 
-                $gt='1|2'; 
+                $gt='1/2'; 
              }
          }
          else
          {
+            if ( $ignore_snps || (!$keep_ref && $ref eq $cons) ) 
+            { 
+                $prev_ref = $ref;
+                $prev_pos = $pos;
+                $prev_chr = $chr;
+                next; 
+            }
+
              # SNP
              ($alt,$gt) = iupac_to_gtype($ref,$cons);
          }
  
-        print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\t\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";
+        print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";
  
          $prev_ref = $ref;
          $prev_pos = $pos;
@@ -167,7 +219,8 @@ use Carp;
  sub Fasta::new
  {
      my ($class,@args) = @_;
-    my $self = @args ? {@args} : {};
+    my $self = {@args};
+    bless $self, ref($class) || $class;
      if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); }
      $$self{chr}  = undef;
      $$self{from} = undef;
diff --git a/misc/samtools.pl b/misc/samtools.pl

index 320e8aaeda8ff2fab7fd91ab5e02073d5f3ba8fe..9f48b8f7586899a9b9456b3fdc09fb8a28faa3d3 100755 (executable)
--- a/misc/samtools.pl
+++ b/misc/samtools.pl
@@ -11,7 +11,7 @@ my $version = '0.3.3';
  
  my $command = shift(@ARGV);
  my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter,
-                       unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr);
+                       unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq);
  
  die("Unknown command \"$command\".\n") if (!defined($func{$command}));
  &{$func{$command}};
@@ -46,10 +46,12 @@ sub showALEN {
  # G close to a high-quality indel (SNP only)
  # Q low RMS mapping quality (SNP only)
  # g close to another indel with higher quality (indel only)
+# s low SNP quality (SNP only)
+# i low indel quality (indel only)
  
  sub varFilter {
-  my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef);
-  getopts('pq:d:D:l:Q:w:W:N:G:', \%opts);
+  my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, S=>'', i=>'');
+  getopts('pq:d:D:l:Q:w:W:N:G:S:i:', \%opts);
    die(qq/
  Usage:   samtools.pl varFilter [options] <in.cns-pileup>
  
@@ -57,6 +59,8 @@ Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
           -q INT    minimum RMS mapping quality for gaps [$opts{q}]
           -d INT    minimum read depth [$opts{d}]
           -D INT    maximum read depth [$opts{D}]
+         -S INT    minimum SNP quality [$opts{S}]
+         -i INT    minimum indel quality [$opts{i}]
  
           -G INT    min indel score for nearby SNP filtering [$opts{G}]
           -w INT    SNP within INT bp around a gap to be filtered [$opts{w}]
@@ -80,7 +84,8 @@ Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
         next if (uc($t[2]) eq uc($t[3]) || $t[3] eq '*/*'); # skip non-var sites
         # clear the out-of-range elements
         while (@staging) {
-         last if ($staging[0][2] eq $t[0] && $staging[0][3] + $max_dist >= $t[1]);
+      # Still on the same chromosome and the first element's window still affects this position?  
+         last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
           varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
         }
         my ($flt, $score) = (0, -1);
@@ -90,14 +95,32 @@ Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
         } elsif ($t[7] > $opts{D}) {
           $flt = 3;
         }
+    if ($t[2] eq '*') { # an indel
+        if ($opts{i} && $opts{i}>$t[5]) { $flt = 8; }
+    }
+    elsif ($opts{S} && $opts{S}>$t[5]) { $flt = 7; }    # SNP
+
         # site dependent filters
+    my $len=0;
         if ($flt == 0) {
           if ($t[2] eq '*') { # an indel
+        
+        # If deletion, remember the length of the deletion
+        my ($a,$b) = split(m{/},$t[3]);
+        my $alen = length($a) - 1;
+        my $blen = length($b) - 1;
+        if ( $alen>$blen )
+        {
+            if ( substr($a,0,1) eq '-' ) { $len=$alen; }
+        }
+        elsif ( substr($b,0,1) eq '-' ) { $len=$blen; }
+
                 $flt = 1 if ($t[6] < $opts{q});
                 # filtering SNPs
                 if ($t[5] >= $opts{G}) {
                   for my $x (@staging) {
-                       next if ($x->[0] >= 0 || $x->[3] + $ow < $t[1]);
+            # Is it a SNP and is it outside the SNP filter window?
+                       next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
                         $x->[1] = 5 if ($x->[1] == 0);
                   }
                 }
@@ -107,7 +130,8 @@ Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
                 $score += $opts{s} * $t[11] if ($t[9] ne '*');
                 # check the staging list for indel filtering
                 for my $x (@staging) {
-                 next if ($x->[0] < 0 || $x->[3] + $ol < $t[1]);
+          # Is it a SNP and is it outside the gap filter window
+                 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
                   if ($x->[0] < $score) {
                         $x->[1] = 6;
                   } else {
@@ -119,17 +143,17 @@ Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
                 # check adjacent SNPs
                 my $k = 1;
                 for my $x (@staging) {
-                 ++$k if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
+                 ++$k if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
                 }
                 # filtering is necessary
                 if ($k > $opts{N}) {
                   $flt = 4;
                   for my $x (@staging) {
-                        $x->[1] = 4 if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && $x->[1] == 0);
+                        $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
                   }
                 } else { # then check gap filter
                   for my $x (@staging) {
-                       next if ($x->[0] < 0 || $x->[3] + $ow < $t[1]);
+                       next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
                         if ($x->[0] >= $opts{G}) {
                           $flt = 5; last;
                         }
@@ -137,7 +161,7 @@ Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
                 }
           }
         }
-       push(@staging, [$score, $flt, @t]);
+       push(@staging, [$score, $flt, $len, @t]);
    }
    # output the last few elements in the staging list
    while (@staging) {
@@ -148,9 +172,9 @@ Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
  sub varFilter_aux {
    my ($first, $is_print) = @_;
    if ($first->[1] == 0) {
-       print join("\t", @$first[2 .. @$first-1]), "\n";
+       print join("\t", @$first[3 .. @$first-1]), "\n";
    } elsif ($is_print) {
-       print STDERR join("\t", substr("UQdDWGgX", $first->[1], 1), @$first[2 .. @$first-1]), "\n";
+       print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
    }
  }
  
@@ -226,6 +250,49 @@ sub p2q_print_str {
    }
  }
  
+#
+# sam2fq: write SAM reads as FASTQ, spread round-robin over -n gzip pipes named <prefix>.NNN.fq.gz
+#
+
+sub sam2fq {
+  my %opts = (n=>20, p=>'');
+  getopts('n:p:', \%opts);
+  die("Usage: samtools.pl sam2fq [-n 20] [-p <prefix>] <inp.sam>\n") if (@ARGV == 0 && -t STDIN);
+  if ($opts{p} && $opts{n} > 1) {
+       my $pre = $opts{p};
+       my @fh; # one gzip pipe per output chunk
+       for (0 .. $opts{n}-1) {
+         open($fh[$_], sprintf("| gzip > %s.%.3d.fq.gz", $pre, $_)) || die("Fail to open pipe to gzip for $pre: $!\n"); # prefix passed as an argument so a '%' in it is not read as a format directive
+       }
+       my $i = 0; # index of the pipe that receives the next read
+       while (<>) {
+         next if (/^@/); # skip SAM header lines
+         chomp;
+         my @t = split("\t");
+         next if ($t[9] eq '*'); # no sequence stored for this record
+         my ($name, $seq, $qual);
+         if ($t[1] & 16) { # reverse strand
+               $seq = reverse($t[9]);
+               $qual = reverse($t[10]);
+               $seq =~ tr/ACGTacgt/TGCAtgca/; # complement the reversed bases
+         } else {
+               ($seq, $qual) = @t[9,10];
+         }
+         $name = $t[0];
+         $name .= "/1" if ($t[1] & 0x40);
+         $name .= "/2" if ($t[1] & 0x80);
+         print {$fh[$i]} "\@$name\n$seq\n";
+         if ($qual ne '*') { # quality lines are omitted when the SAM record has none
+               print {$fh[$i]} "+\n$qual\n";
+         }
+         $i = 0 if (++$i == $opts{n}); # wrap around to the first pipe
+       }
+       close($fh[$_]) for (0 .. $opts{n}-1);
+  } else {
+       die("To be implemented.\n");
+  }
+}
+
  #
  # sra2hdr
  #
@@ -285,10 +352,11 @@ sub sra2hdr {
  
  sub unique {
    my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3);
-  getopts('Qf:q:r:a:b:', \%opts);
+  getopts('Qf:q:r:a:b:m', \%opts);
    die("Usage: samtools.pl unique [-f $opts{f}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
    my $last = '';
    my $recal_Q = !defined($opts{Q});
+  my $multi_only = defined($opts{m});
    my @a;
    while (<>) {
         my $score = -1;
@@ -306,16 +374,16 @@ sub unique {
         }
         $score = 1 if ($score < 1);
         if ($t[0] ne $last) {
-         &unique_aux(\@a, $opts{f}, $recal_Q) if (@a);
+         &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a);
           $last = $t[0];
         }
         push(@a, [$score, \@t]);
    }
-  &unique_aux(\@a, $opts{f}, $recal_Q) if (@a);
+  &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a);
  }
  
  sub unique_aux {
-  my ($a, $fac, $is_recal) = @_;
+  my ($a, $fac, $is_recal, $multi_only) = @_;
    my ($max, $max2, $max_i) = (0, 0, -1);
    for (my $i = 0; $i < @$a; ++$i) {
         if ($a->[$i][0] > $max) {
@@ -325,9 +393,11 @@ sub unique_aux {
         }
    }
    if ($is_recal) {
-       my $q = int($fac * ($max - $max2) / $max + .499);
-       $q = 250 if ($q > 250);
-       $a->[$max_i][1][4] = $q < 250? $q : 250;
+       if (!$multi_only || @$a > 1) {
+         my $q = int($fac * ($max - $max2) / $max + .499);
+         $q = 250 if ($q > 250);
+         $a->[$max_i][1][4] = $q < 250? $q : 250;
+       }
    }
    print join("\t", @{$a->[$max_i][1]});
    @$a = ();
diff --git a/misc/varfilter.py b/misc/varfilter.py

new file mode 100755 (executable)

index 0000000..03ce395
--- /dev/null
+++ b/misc/varfilter.py
@@ -0,0 +1,205 @@
+#!/software/bin/python
+
+# Author: lh3, converted to python and modified to add -C option by Aylwyn Scally
+#
+# About:
+#   varfilter.py is a port of Heng's samtools.pl varFilter script into 
+#   python, with an additional -C INT option. This option sets a minimum 
+#   consensus score, above which the script will output a pileup line 
+#   wherever it _could have_ called a variant, even if none is actually 
+#   called (i.e. hom-ref positions). This is important if you want to
+#   subsequently merge the calls with those for another individual to get a
+#   synoptic view of calls at each site. Without this option, and in all 
+#   other respects, it behaves like samtools.pl varFilter.
+#   
+#   Aylwyn Scally as6@sanger.ac.uk
+
+
+# Filtration code:
+#
+# C low CNS quality (hom-ref only)
+# d low depth
+# D high depth
+# W too many SNPs in a window (SNP only)
+# G close to a high-quality indel (SNP only)
+# Q low RMS mapping quality (SNP only)
+# g close to another indel with higher quality (indel only)
+# s low SNP quality (SNP only)
+# i low indel quality (indel only)
+
+
+import sys
+import getopt
+
+def usage():
+       print '''usage: varfilter.py [options] [cns-pileup]
+
+Options: -Q INT        minimum RMS mapping quality for SNPs
+                -q INT minimum RMS mapping quality for gaps
+                -d INT minimum read depth 
+                -D INT maximum read depth
+                -S INT minimum SNP quality
+                -i INT minimum indel quality
+                -C INT minimum consensus quality for hom-ref sites
+
+                -G INT min indel score for nearby SNP filtering
+                -w INT SNP within INT bp around a gap to be filtered
+
+                -W INT window size for filtering dense SNPs
+                -N INT max number of SNPs in a window
+
+                -l INT window size for filtering adjacent gaps
+
+                -p print filtered variants'''
+
+def varFilter_aux(first, is_print):
+       try:
+               if first[1] == 0:
+                       sys.stdout.write("\t".join(first[4:]) + "\n")
+               elif is_print:
+                       sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n")
+       except IOError:
+               sys.exit()
+ 
+mindepth = 3
+maxdepth = 100
+gapgapwin = 30
+minsnpmapq = 25
+mingapmapq = 10
+minindelscore = 25
+scorefactor = 100
+snpgapwin = 10
+densesnpwin = 10
+densesnps = 2
+printfilt = False
+minsnpq = 0
+minindelq = 0
+mincnsq = 0
+
+try:
+       options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:s:', [])  # 's:' is required for the -s (scorefactor) branch below to be reachable
+except getopt.GetoptError:
+       usage()
+       sys.exit(2)
+for (oflag, oarg) in options:
+       if oflag == '-d': mindepth = int(oarg)
+       if oflag == '-D': maxdepth = int(oarg)
+       if oflag == '-l': gapgapwin = int(oarg)
+       if oflag == '-Q': minsnpmapq = int(oarg)
+       if oflag == '-q': mingapmapq = int(oarg)
+       if oflag == '-G': minindelscore = int(oarg)
+       if oflag == '-s': scorefactor = int(oarg)
+       if oflag == '-w': snpgapwin = int(oarg)
+       if oflag == '-W': densesnpwin = int(oarg)
+       if oflag == '-C': mincnsq = int(oarg)
+       if oflag == '-N': densesnps = int(oarg)
+       if oflag == '-p': printfilt = True
+       if oflag == '-S': minsnpq = int(oarg)
+       if oflag == '-i': minindelq = int(oarg)
+
+if len(args) < 1:
+       inp = sys.stdin
+else:
+       inp = open(args[0])
+
+# calculate the window size
+max_dist = max(gapgapwin, snpgapwin, densesnpwin)
+
+staging = []
+for t in (line.strip().split() for line in inp):
+       (flt, score) = (0, -1)
+       # non-var sites
+       if t[3] == '*/*':
+               continue
+       is_snp = t[2].upper() != t[3].upper()
+       if not (is_snp or mincnsq):
+               continue
+       # clear the out-of-range elements
+       while staging:
+               # Still on the same chromosome and the first element's window still affects this position?  
+               if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]):
+                       break
+               varFilter_aux(staging.pop(0), printfilt)
+       
+       # first a simple filter
+       if int(t[7]) < mindepth:
+               flt = 2
+       elif int(t[7]) > maxdepth:
+               flt = 3
+       if t[2] == '*': # an indel
+               if minindelq and minindelq > int(t[5]):
+                       flt = 8
+       elif is_snp:
+               if minsnpq and minsnpq> int(t[5]):
+                       flt = 7
+       else:
+               if mincnsq and mincnsq > int(t[4]):
+                       flt = 9
+
+       # site dependent filters
+       dlen = 0
+       if flt == 0:
+               if t[2] == '*': # an indel
+                       # If deletion, remember the length of the deletion
+                       (a,b) = t[3].split('/')
+                       alen = len(a) - 1
+                       blen = len(b) - 1
+                       if alen>blen:
+                               if a[0] == '-': dlen=alen 
+                       elif b[0] == '-': dlen=blen 
+
+                       if int(t[6]) < mingapmapq:
+                               flt = 1
+                       # filtering SNPs
+                       if int(t[5]) >= minindelscore:
+                               for x in (y for y in staging if y[3]):
+                                       # Is it a SNP and is it outside the SNP filter window?
+                                       if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
+                                               continue
+                                       if x[1] == 0:
+                                               x[1] = 5
+                       
+                       # calculate the filtering score (different from indel quality)
+                       score = int(t[5])
+                       if t[8] != '*':
+                               score += scorefactor * int(t[10])
+                       if t[9] != '*':
+                               score += scorefactor * int(t[11])
+                       # check the staging list for indel filtering
+                       for x in (y for y in staging if y[3]):
+                         # Is it a SNP and is it outside the gap filter window
+                               if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]):
+                                       continue
+                               if x[0] < score:
+                                       x[1] = 6
+                               else:
+                                       flt = 6
+                                       break
+               else: # a SNP or hom-ref
+                       if int(t[6]) < minsnpmapq:
+                               flt = 1
+                       # check adjacent SNPs
+                       k = 1
+                       for x in (y for y in staging if y[3]):
+                               if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5):
+                                       k += 1
+                       
+                       # filtering is necessary
+                       if k > densesnps:
+                               flt = 4
+                               for x in (y for y in staging if y[3]):
+                                       if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0:
+                                               x[1] = 4
+                       else: # then check gap filter
+                               for x in (y for y in staging if y[3]):
+                                       if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
+                                               continue
+                                       if x[0] >= minindelscore:
+                                               flt = 5
+                                               break
+       
+       staging.append([score, flt, dlen, is_snp] + t)
+  
+# output the last few elements in the staging list
+while staging:
+       varFilter_aux(staging.pop(0), printfilt)
diff --git a/misc/wgsim.c b/misc/wgsim.c

index 1522eee484903740390edac608e688f627b77c2e..7b5f095d910b06f3242419066ac8879b7cdb7454 100644 (file)
--- a/misc/wgsim.c
+++ b/misc/wgsim.c
@@ -238,7 +238,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq
                 c[0] = nst_nt4_table[(int)seq->s[i]];
                 c[1] = hap1->s[i]; c[2] = hap2->s[i];
                 if (c[0] >= 4) continue;
-               if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) {
+               if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
                         printf("%s\t%d\t", name, i+1);
                         if (c[1] == c[2]) { // hom
                                 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
@@ -304,7 +304,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
         tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
         size[0] = size_l; size[1] = size_r;
  
-       Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
+       Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
  
         tot_len = n_ref = 0;
         while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
diff --git a/misc/wgsim_eval.pl b/misc/wgsim_eval.pl

index 01038f1fab67c6585e2788eb19e2ae4a6e015ab2..f919a0643a0d2950d8f817c6213a3183f4f9a6dc 100755 (executable)
--- a/misc/wgsim_eval.pl
+++ b/misc/wgsim_eval.pl
@@ -12,9 +12,9 @@ exit;
  
  sub wgsim_eval {
    my %opts = (g=>5);
-  getopts('pcg:', \%opts);
-  die("Usage: wgsim_eval.pl [-pc] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
-  my (@c0, @c1);
+  getopts('pcag:', \%opts);
+  die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
+  my (@c0, @c1, %fnfp);
    my ($max_q, $flag) = (0, 0);
    my $gap = $opts{g};
    $flag |= 1 if (defined $opts{p});
@@ -66,14 +66,26 @@ sub wgsim_eval {
         }
         ++$c0[$q];
         ++$c1[$q] unless ($is_correct);
+       @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]});
+       ++$fnfp{$t[4]}[0];
+       ++$fnfp{$t[4]}[1] unless ($is_correct);
         print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
    }
    # print
    my ($cc0, $cc1) = (0, 0);
-  for (my $i = $max_q; $i >= 0; --$i) {
-       $c0[$i] = 0 unless (defined $c0[$i]);
-       $c1[$i] = 0 unless (defined $c1[$i]);
-       $cc0 += $c0[$i]; $cc1 += $c1[$i];
-       printf("%.2dx %12d / %-12d  %12d  %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0);
+  if (!defined($opts{a})) {
+       for (my $i = $max_q; $i >= 0; --$i) {
+         $c0[$i] = 0 unless (defined $c0[$i]);
+         $c1[$i] = 0 unless (defined $c1[$i]);
+         $cc0 += $c0[$i]; $cc1 += $c1[$i];
+         printf("%.2dx %12d / %-12d  %12d  %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0);
+       }
+  } else {
+       for (reverse(sort {$a<=>$b} (keys %fnfp))) {
+         next if ($_ == 0);
+         $cc0 += $fnfp{$_}[0];
+         $cc1 += $fnfp{$_}[1];
+         print join("\t", $_, $cc0, $cc1), "\n";
+       }
    }
  }
diff --git a/sam.c b/sam.c

index ad4325bb4b8e8436d21e921478ed00467aac0e6b..ecdee02dddb98a32d47a59e3154179356acecadf 100644 (file)
--- a/sam.c
+++ b/sam.c
@@ -55,6 +55,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
                                 if (aux) { // check if aux is present
                                         bam_header_t *textheader = fp->header;
                                         fp->header = sam_header_read2((const char*)aux);
+                                       if (fp->header == 0) goto open_err_ret;
                                         append_header_text(fp->header, textheader->text, textheader->l_text);
                                         bam_header_destroy(textheader);
                                 }
diff --git a/sam_header.c b/sam_header.c

index a119c02b9c4cfaa0e6c4251454fd86cd12db3743..05d75deb2c4ced014c6ae5f8612260d05b34660a 100644 (file)
--- a/sam_header.c
+++ b/sam_header.c
@@ -10,6 +10,7 @@ KHASH_MAP_INIT_STR(str, const char *)
  
  struct _HeaderList
  {
+    struct _HeaderList *last;   // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
      struct _HeaderList *next;
      void *data;
  };
@@ -58,6 +59,34 @@ static void debug(const char *format, ...)
      va_end(ap);
  }
  
+#if 0
+// Replaced by list_append_to_end
+static list_t *list_prepend(list_t *root, void *data)
+{
+    list_t *l = malloc(sizeof(list_t));
+    l->next = root;
+    l->data = data;
+    return l;
+}
+#endif
+
+// Appends data as a new tail node without walking the list; returns root, or the new node when root is NULL.
+//  Relies on root->last being correct: do not mix with the other list_* routines unless they update root->last too.
+static list_t *list_append_to_end(list_t *root, void *data)
+{
+    list_t *l = malloc(sizeof(list_t)); // NOTE(review): the malloc result is not checked before use
+    l->last = l; // a lone node is its own tail; only meaningful while this node is the root
+    l->next = NULL;
+    l->data = data;
+
+    if ( !root )
+        return l;
+
+    root->last->next = l; // link after the current tail
+    root->last = l;
+    return root;
+}
+
  static list_t *list_append(list_t *root, void *data)
  {
      list_t *l = root;
@@ -322,7 +351,7 @@ static HeaderLine *sam_header_line_parse(const char *headerLine)
  
      while (*to && *to!='\t') to++;
      if ( to-from != 2 ) {
-               debug("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine);
+               debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
                 return 0;
         }
      
@@ -345,7 +374,11 @@ static HeaderLine *sam_header_line_parse(const char *headerLine)
          while (*to && *to!='\t') to++;
  
          if ( !required_tags[itype] && !optional_tags[itype] )
+        {
+            // CO is a special case, it can contain anything, including tabs
+            if ( *to ) { to++; continue; }
              tag = new_tag("  ",from,to-1);
+        }
          else
              tag = new_tag(from,from+3,to-1);
  
@@ -539,7 +572,8 @@ void *sam_header_parse2(const char *headerText)
      {
          hline = sam_header_line_parse(buf);
          if ( hline && sam_header_line_validate(hline) )
-            hlines = list_append(hlines, hline);
+            // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
+            hlines = list_append_to_end(hlines, hline);
          else
          {
                         if (hline) sam_header_line_free(hline);
diff --git a/sam_view.c b/sam_view.c

index 06dd01a5ffa951dcb950a97681e7c5aca0b88971..3b10e2e5372e98a2b9ebda4bd5791dc1b837a20f 100644 (file)
--- a/sam_view.c
+++ b/sam_view.c
@@ -6,7 +6,12 @@
  #include "sam_header.h"
  #include "sam.h"
  #include "faidx.h"
+#include "khash.h"
+KHASH_SET_INIT_STR(rg)
  
+typedef khash_t(rg) *rghash_t;
+
+rghash_t g_rghash = 0;
  static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;
  static char *g_library, *g_rg;
  static int g_sol2sanger_tbl[128];
@@ -32,9 +37,15 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b)
  {
         if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off))
                 return 1;
-       if (g_rg) {
+       if (g_rg || g_rghash) {
                 uint8_t *s = bam_aux_get(b, "RG");
-               if (s && strcmp(g_rg, (char*)(s + 1)) == 0) return 0;
+               if (s) {
+                       if (g_rg) return (strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1;
+                       if (g_rghash) {
+                               khint_t k = kh_get(rg, g_rghash, (char*)(s + 1));
+                               return (k != kh_end(g_rghash))? 0 : 1;
+                       }
+               }
         }
         if (g_library) {
                 const char *p = bam_get_library((bam_header_t*)h, b);
@@ -58,11 +69,11 @@ int main_samview(int argc, char *argv[])
         int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0;
         int of_type = BAM_OFDEC, is_long_help = 0;
         samfile_t *in = 0, *out = 0;
-       char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
+       char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0;
  
         /* parse command-line options */
         strcpy(in_mode, "r"); strcpy(out_mode, "w");
-       while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:C")) >= 0) {
+       while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) {
                 switch (c) {
                 case 'C': slx2sngr = 1; break;
                 case 'S': is_bamin = 0; break;
@@ -77,6 +88,7 @@ int main_samview(int argc, char *argv[])
                 case 'u': is_uncompressed = 1; break;
                 case 'l': g_library = strdup(optarg); break;
                 case 'r': g_rg = strdup(optarg); break;
+               case 'R': fn_rg = strdup(optarg); break;
                 case 'x': of_type = BAM_OFHEX; break;
                 case 'X': of_type = BAM_OFSTR; break;
                 case '?': is_long_help = 1; break;
@@ -94,7 +106,19 @@ int main_samview(int argc, char *argv[])
         if (is_bamin) strcat(in_mode, "b");
         if (is_header) strcat(out_mode, "h");
         if (is_uncompressed) strcat(out_mode, "u");
-       if (argc == optind) return usage(is_long_help);
+       if (argc == optind) return usage(is_long_help); // potential memory leak...
+
+       // read the list of read groups
+       if (fn_rg) {
+               FILE *fp_rg;
+               char buf[1024];
+               int ret;
+               g_rghash = kh_init(rg);
+               fp_rg = fopen(fn_rg, "r");
+               while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but bear me...
+                       kh_put(rg, g_rghash, strdup(buf), &ret); // we'd better check duplicates...
+               fclose(fp_rg);
+       }
  
         // generate the fn_list if necessary
         if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
@@ -147,7 +171,13 @@ int main_samview(int argc, char *argv[])
  
  view_end:
         // close files, free and return
-       free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg);
+       free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);
+       if (g_rghash) {
+               khint_t k;
+               for (k = 0; k < kh_end(g_rghash); ++k)
+                       if (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k));
+               kh_destroy(rg, g_rghash);
+       }
         samclose(in);
         samclose(out);
         return ret;
@@ -167,6 +197,7 @@ static int usage(int is_long_help)
         fprintf(stderr, "         -t FILE  list of reference names and lengths (force -S) [null]\n");
         fprintf(stderr, "         -T FILE  reference sequence file (force -S) [null]\n");
         fprintf(stderr, "         -o FILE  output file name [stdout]\n");
+       fprintf(stderr, "         -R FILE  list of read groups to be outputted [null]\n");
         fprintf(stderr, "         -f INT   required flag, 0 for unset [0]\n");
         fprintf(stderr, "         -F INT   filtering flag, 0 for unset [0]\n");
         fprintf(stderr, "         -q INT   minimum mapping quality [0]\n");
diff --git a/samtools.1 b/samtools.1

index 31375f323f14f1df17a1f2b3b6185974d94c7b47..d79d176468ebe53f41bd45e7447cd1d2b0704a4e 100644 (file)
--- a/samtools.1
+++ b/samtools.1
@@ -1,4 +1,4 @@
-.TH samtools 1 "10 November 2009" "samtools-0.1.7" "Bioinformatics tools"
+.TH samtools 1 "11 July 2010" "samtools-0.1.8" "Bioinformatics tools"
  .SH NAME
  .PP
  samtools - Utilities for the Sequence Alignment/Map (SAM) format
@@ -10,6 +10,8 @@ samtools sort aln.bam aln.sorted
  .PP
  samtools index aln.sorted.bam
  .PP
+samtools idxstats aln.sorted.bam
+.PP
  samtools view aln.sorted.bam chr2:20,100,000-20,200,000
  .PP
  samtools merge out.bam in1.bam in2.bam in3.bam
@@ -18,6 +20,8 @@ samtools faidx ref.fasta
  .PP
  samtools pileup -f ref.fasta aln.sorted.bam
  .PP
+samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
+.PP
  samtools tview aln.sorted.bam ref.fasta
  
  .SH DESCRIPTION
@@ -42,81 +46,9 @@ entire alignment file unless it is asked to do so.
  .SH COMMANDS AND OPTIONS
  
  .TP 10
-.B import
-samtools import <in.ref_list> <in.sam> <out.bam>
-
-Since 0.1.4, this command is an alias of:
-
-samtools view -bt <in.ref_list> -o <out.bam> <in.sam>
-
-.TP
-.B sort
-samtools sort [-n] [-m maxMem] <in.bam> <out.prefix>
-
-Sort alignments by leftmost coordinates. File
-.I <out.prefix>.bam
-will be created. This command may also create temporary files
-.I <out.prefix>.%d.bam
-when the whole alignment cannot be fitted into memory (controlled by
-option -m).
-
-.B OPTIONS:
-.RS
-.TP 8
-.B -n
-Sort by read names rather than by chromosomal coordinates
-.TP
-.B -m INT
-Approximately the maximum required memory. [500000000]
-.RE
-
-.TP
-.B merge
-samtools merge [-h inh.sam] [-n] <out.bam> <in1.bam> <in2.bam> [...]
-
-Merge multiple sorted alignments.
-The header reference lists of all the input BAM files, and the @SQ headers of
-.IR inh.sam ,
-if any, must all refer to the same set of reference sequences.
-The header reference list and (unless overridden by
-.BR -h )
-`@' headers of
-.I in1.bam
-will be copied to
-.IR out.bam ,
-and the headers of other files will be ignored.
-
-.B OPTIONS:
-.RS
-.TP 8
-.B -h FILE
-Use the lines of
-.I FILE
-as `@' headers to be copied to
-.IR out.bam ,
-replacing any header lines that would otherwise be copied from
-.IR in1.bam .
-.RI ( FILE
-is actually in SAM format, though any alignment records it may contain
-are ignored.)
-.TP
-.B -n
-The input alignments are sorted by read names rather than by chromosomal
-coordinates
-.RE
-
-.TP
-.B index
-samtools index <aln.bam>
-
-Index sorted alignment for fast random access. Index file
-.I <aln.bam>.bai
-will be created.
-
-.TP
  .B view
  samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F
-skipFlag] [-q minMapQ] [-l library] [-r readGroup] <in.bam>|<in.sam> [region1 [...]]
+skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] <in.bam>|<in.sam> [region1 [...]]
  
  Extract/print all or sub alignments in SAM or BAM format. If no region
  is specified, all the alignments will be printed; otherwise only
@@ -178,22 +110,21 @@ Only output reads in library STR [null]
  .TP
  .B -r STR
  Only output reads in read group STR [null]
+.TP
+.B -R FILE
+Output reads in read groups listed in
+.I FILE
+[null]
  .RE
  
  .TP
-.B faidx
-samtools faidx <ref.fasta> [region1 [...]]
+.B tview
+samtools tview <in.sorted.bam> [ref.fasta]
  
-Index reference sequence in the FASTA format or extract subsequence from
-indexed reference sequence. If no region is specified,
-.B faidx
-will index the file and create
-.I <ref.fasta>.fai
-on the disk. If regions are speficified, the subsequences will be
-retrieved and printed to stdout in the FASTA format. The input file can
-be compressed in the
-.B RAZF
-format.
+Text alignment viewer (based on the ncurses library). In the viewer,
+press `?' for help and press `g' to check the alignment start from a
+region in the format like `chr10:10,000,000' or `=10,000,000' when
+viewing the same reference sequence.
  
  .TP
  .B pileup
@@ -232,33 +163,40 @@ covering reads, the first alllele, the second allele, # reads supporting
  the first allele, # reads supporting the second allele and # reads
  containing indels different from the top two alleles.
  
+The position of indels is offset by -1.
+
  .B OPTIONS:
  .RS
-
  .TP 10
  .B -s
  Print the mapping quality as the last column. This option makes the
  output easier to parse, although this format is not space efficient.
-
  .TP
  .B -S
  The input file is in SAM.
-
  .TP
  .B -i
  Only output pileup lines containing indels.
-
  .TP
  .B -f FILE
  The reference sequence in the FASTA format. Index file
  .I FILE.fai
  will be created if
  absent.
-
  .TP
  .B -M INT
  Cap mapping quality at INT [60]
-
+.TP
+.B -m INT
+Filter reads with flag containing bits in
+.I
+INT
+[1796]
+.TP
+.B -d INT
+Use the first
+.I INT
+reads in the pileup for indel calling for speed up. Zero for unlimited. [0]
  .TP
  .B -t FILE
  List of reference names ane sequence lengths, in the format described
@@ -267,7 +205,6 @@ for the
  command. If this option is present, samtools assumes the input
  .I <in.alignment>
  is in SAM format; otherwise it assumes in BAM format.
-
  .TP
  .B -l FILE
  List of sites at which pileup is output. This file is space
@@ -278,10 +215,9 @@ recommended to use option
  together with
  .B -l
  as in the default format we may not know the mapping quality.
-
  .TP
  .B -c
-Call the consensus sequence using MAQ consensus model. Options
+Call the consensus sequence using SOAPsnp consensus model. Options
  .B -T,
  .B -N,
  .B -I
@@ -292,38 +228,147 @@ are only effective when
  or
  .B -g
  is in use.
-
  .TP
  .B -g
  Generate genotype likelihood in the binary GLFv3 format. This option
  suppresses -c, -i and -s.
-
  .TP
  .B -T FLOAT
  The theta parameter (error dependency coefficient) in the maq consensus
  calling model [0.85]
-
  .TP
  .B -N INT
  Number of haplotypes in the sample (>=2) [2]
-
  .TP
  .B -r FLOAT
  Expected fraction of differences between a pair of haplotypes [0.001]
-
  .TP
  .B -I INT
  Phred probability of an indel in sequencing/prep. [40]
+.RE
+
+.TP
+.B mpileup
+samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]]
  
+Generate pileup for multiple BAM files. Consensus calling is not
+implemented.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -r STR
+Only generate pileup in region
+.I STR
+[all sites]
+.TP
+.B -f FILE
+The reference file [null]
  .RE
  
  .TP
-.B tview
-samtools tview <in.sorted.bam> [ref.fasta]
+.B reheader
+samtools reheader <in.header.sam> <in.bam>
  
-Text alignment viewer (based on the ncurses library). In the viewer,
-press `?' for help and press `g' to check the alignment start from a
-region in the format like `chr10:10,000,000'.
+Replace the header in
+.I in.bam
+with the header in
+.IR in.header.sam .
+This command is much faster than replacing the header with a
+BAM->SAM->BAM conversion.
+
+.TP
+.B sort
+samtools sort [-no] [-m maxMem] <in.bam> <out.prefix>
+
+Sort alignments by leftmost coordinates. File
+.I <out.prefix>.bam
+will be created. This command may also create temporary files
+.I <out.prefix>.%d.bam
+when the whole alignment cannot be fitted into memory (controlled by
+option -m).
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -o
+Output the final alignment to the standard output.
+.TP
+.B -n
+Sort by read names rather than by chromosomal coordinates
+.TP
+.B -m INT
+Approximately the maximum required memory. [500000000]
+.RE
+
+.TP
+.B merge
+samtools merge [-h inh.sam] [-nr] <out.bam> <in1.bam> <in2.bam> [...]
+
+Merge multiple sorted alignments.
+The header reference lists of all the input BAM files, and the @SQ headers of
+.IR inh.sam ,
+if any, must all refer to the same set of reference sequences.
+The header reference list and (unless overridden by
+.BR -h )
+`@' headers of
+.I in1.bam
+will be copied to
+.IR out.bam ,
+and the headers of other files will be ignored.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -h FILE
+Use the lines of
+.I FILE
+as `@' headers to be copied to
+.IR out.bam ,
+replacing any header lines that would otherwise be copied from
+.IR in1.bam .
+.RI ( FILE
+is actually in SAM format, though any alignment records it may contain
+are ignored.)
+.TP
+.B -r
+Attach an RG tag to each alignment. The tag value is inferred from file names.
+.TP
+.B -n
+The input alignments are sorted by read names rather than by chromosomal
+coordinates
+.RE
+
+.TP
+.B index
+samtools index <aln.bam>
+
+Index sorted alignment for fast random access. Index file
+.I <aln.bam>.bai
+will be created.
+
+.TP
+.B idxstats
+samtools idxstats <aln.bam>
+
+Retrieve and print stats in the index file. The output is TAB delimited
+with each line consisting of reference sequence name, sequence length, #
+mapped reads and # unmapped reads.
+
+.TP
+.B faidx
+samtools faidx <ref.fasta> [region1 [...]]
+
+Index reference sequence in the FASTA format or extract subsequence from
+indexed reference sequence. If no region is specified,
+.B faidx
+will index the file and create
+.I <ref.fasta>.fai
+on the disk. If regions are speficified, the subsequences will be
+retrieved and printed to stdout in the FASTA format. The input file can
+be compressed in the
+.B RAZF
+format.
  
  .TP
  .B fixmate
@@ -334,28 +379,34 @@ name-sorted alignment.
  
  .TP
  .B rmdup
-samtools rmdup <input.srt.bam> <out.bam>
+samtools rmdup [-sS] <input.srt.bam> <out.bam>
  
  Remove potential PCR duplicates: if multiple read pairs have identical
  external coordinates, only retain the pair with highest mapping quality.
-This command
+In the paired-end mode, this command
  .B ONLY
-works with FR orientation and requires ISIZE is correctly set.
-
-.TP
-.B rmdupse
-samtools rmdupse <input.srt.bam> <out.bam>
+works with FR orientation and requires ISIZE is correctly set. It does
+not work for unpaired reads (e.g. two ends mapped to different
+chromosomes or orphan reads).
  
-Remove potential duplicates for single-ended reads. This command will
-treat all reads as single-ended even if they are paired in fact.
+.B OPTIONS:
+.RS
+.TP 8
+.B -s
+Remove duplicates for single-end reads. By default, the command works for
+paired-end reads only.
+.TP 8
+.B -S
+Treat paired-end reads as single-end reads.
+.RE
  
  .TP
-.B fillmd
-samtools fillmd [-e] <aln.bam> <ref.fasta>
+.B calmd
+samtools calmd [-eubS] <aln.bam> <ref.fasta>
  
  Generate the MD tag. If the MD tag is already present, this command will
  give a warning if the MD tag generated is different from the existing
-tag.
+tag. Output SAM by default.
  
  .B OPTIONS:
  .RS
@@ -363,7 +414,15 @@ tag.
  .B -e
  Convert a the read base to = if it is identical to the aligned reference
  base. Indel caller does not support the = bases at the moment.
-
+.TP
+.B -u
+Output uncompressed BAM
+.TP
+.B -b
+Output compressed BAM
+.TP
+.B -S
+The input is SAM with header lines
  .RE
  
  .SH SAM FORMAT
@@ -396,21 +455,21 @@ Each bit in the FLAG field is defined as:
  
  .TS
  center box;
-cb | cb
-l | l .
-Flag   Description
+cb | cb | cb
+l | c | l .
+Flag   Chr     Description
  _
-0x0001 the read is paired in sequencing
-0x0002 the read is mapped in a proper pair
-0x0004 the query sequence itself is unmapped
-0x0008 the mate is unmapped
-0x0010 strand of the query (1 for reverse)
-0x0020 strand of the mate
-0x0040 the read is the first read in a pair
-0x0080 the read is the second read in a pair
-0x0100 the alignment is not primary
-0x0200 the read fails platform/vendor quality checks
-0x0400 the read is either a PCR or an optical duplicate
+0x0001 p       the read is paired in sequencing
+0x0002 P       the read is mapped in a proper pair
+0x0004 u       the query sequence itself is unmapped
+0x0008 U       the mate is unmapped
+0x0010 r       strand of the query (1 for reverse)
+0x0020 R       strand of the mate
+0x0040 1       the read is the first read in a pair
+0x0080 2       the read is the second read in a pair
+0x0100 s       the alignment is not primary
+0x0200 f       the read fails platform/vendor quality checks
+0x0400 d       the read is either a PCR or an optical duplicate
  .TE
  
  .SH LIMITATIONS
@@ -418,23 +477,23 @@ _
  .IP o 2
  Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c.
  .IP o 2
-CIGAR operation P is not properly handled at the moment.
-.IP o 2
  In merging, the input files are required to have the same number of
  reference sequences. The requirement can be relaxed. In addition,
  merging does not reconstruct the header dictionaries
  automatically. Endusers have to provide the correct header. Picard is
  better at merging.
  .IP o 2
-Samtools' rmdup does not work for single-end data and does not remove
-duplicates across chromosomes. Picard is better.
+Samtools paired-end rmdup does not work for unpaired reads (e.g. orphan
+reads or ends mapped to different chromosomes). If this is a concern,
+please use Picard's MarkDuplicates, which correctly handles these cases,
+although it is a little slower.
  
  .SH AUTHOR
  .PP
  Heng Li from the Sanger Institute wrote the C version of samtools. Bob
  Handsaker from the Broad Institute implemented the BGZF library and Jue
  Ruan from Beijing Genomics Institute wrote the RAZF library. Various
-people in the 1000Genomes Project contributed to the SAM format
+people in the 1000 Genomes Project contributed to the SAM format
  specification.
  
  .SH SEE ALSO
diff --git a/samtools.txt b/samtools.txt

index feec2386e7c9694c12164fda385ce33d06d1f9f7..20e6c15cd707d11a7d1ed41f15b9d54732cd67ec 100644 (file)
--- a/samtools.txt
+++ b/samtools.txt
@@ -12,6 +12,8 @@ SYNOPSIS
  
         samtools index aln.sorted.bam
  
+       samtools idxstats aln.sorted.bam
+
         samtools view aln.sorted.bam chr2:20,100,000-20,200,000
  
         samtools merge out.bam in1.bam in2.bam in3.bam
@@ -20,6 +22,8 @@ SYNOPSIS
  
         samtools pileup -f ref.fasta aln.sorted.bam
  
+       samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
+
         samtools tview aln.sorted.bam ref.fasta
  
  
@@ -43,70 +47,19 @@ DESCRIPTION
  
  
  COMMANDS AND OPTIONS
-       import    samtools import <in.ref_list> <in.sam> <out.bam>
-
-                 Since 0.1.4, this command is an alias of:
-
-                 samtools view -bt <in.ref_list> -o <out.bam> <in.sam>
-
-
-       sort      samtools sort [-n] [-m maxMem] <in.bam> <out.prefix>
-
-                 Sort  alignments  by  leftmost  coordinates.  File  <out.pre-
-                 fix>.bam will be created. This command may also create tempo-
-                 rary files <out.prefix>.%d.bam when the whole alignment  can-
-                 not be fitted into memory (controlled by option -m).
-
-                 OPTIONS:
-
-                 -n      Sort by read names rather than by chromosomal coordi-
-                         nates
-
-                 -m INT  Approximately   the    maximum    required    memory.
-                         [500000000]
-
-
-       merge     samtools   merge   [-h   inh.sam]  [-n]  <out.bam>  <in1.bam>
-                 <in2.bam> [...]
-
-                 Merge multiple sorted alignments.  The header reference lists
-                 of  all  the input BAM files, and the @SQ headers of inh.sam,
-                 if  any,  must  all  refer  to  the  same  set  of  reference
-                 sequences.   The header reference list and (unless overridden
-                 by -h) `@' headers of in1.bam will be copied to out.bam,  and
-                 the headers of other files will be ignored.
-
-                 OPTIONS:
-
-                 -h FILE Use  the lines of FILE as `@' headers to be copied to
-                         out.bam, replacing any header lines that would other-
-                         wise  be  copied  from in1.bam.  (FILE is actually in
-                         SAM format, though any alignment records it may  con-
-                         tain are ignored.)
-
-                 -n      The  input alignments are sorted by read names rather
-                         than by chromosomal coordinates
-
-
-       index     samtools index <aln.bam>
-
-                 Index sorted alignment for fast  random  access.  Index  file
-                 <aln.bam>.bai will be created.
-
-
         view      samtools  view  [-bhuHS]  [-t  in.refList]  [-o  output]  [-f
-                 reqFlag] [-F skipFlag] [-q minMapQ] [-l  library]  [-r  read-
-                 Group] <in.bam>|<in.sam> [region1 [...]]
+                 reqFlag]  [-F  skipFlag]  [-q minMapQ] [-l library] [-r read-
+                 Group] [-R rgFile] <in.bam>|<in.sam> [region1 [...]]
  
-                 Extract/print  all or sub alignments in SAM or BAM format. If
-                 no region is specified, all the alignments will  be  printed;
-                 otherwise  only  alignments overlapping the specified regions
-                 will be output. An alignment may be given multiple  times  if
+                 Extract/print all or sub alignments in SAM or BAM format.  If
+                 no  region  is specified, all the alignments will be printed;
+                 otherwise only alignments overlapping the  specified  regions
+                 will  be  output. An alignment may be given multiple times if
                   it is overlapping several regions. A region can be presented,
-                 for example, in  the  following  format:  `chr2'  (the  whole
-                 chr2),  `chr2:1000000'  (region starting from 1,000,000bp) or
-                 `chr2:1,000,000-2,000,000'  (region  between  1,000,000   and
-                 2,000,000bp  including  the  end  points).  The coordinate is
+                 for  example,  in  the  following  format:  `chr2' (the whole
+                 chr2), `chr2:1000000' (region starting from  1,000,000bp)  or
+                 `chr2:1,000,000-2,000,000'   (region  between  1,000,000  and
+                 2,000,000bp including the  end  points).  The  coordinate  is
                   1-based.
  
                   OPTIONS:
@@ -114,27 +67,27 @@ COMMANDS AND OPTIONS
                   -b      Output in the BAM format.
  
                   -u      Output uncompressed BAM. This option saves time spent
-                         on  compression/decomprssion  and  is  thus preferred
+                         on compression/decompression and  is  thus  preferred
                           when the output is piped to another samtools command.
  
                   -h      Include the header in the output.
  
                   -H      Output the header only.
  
-                 -S      Input  is in SAM. If @SQ header lines are absent, the
+                 -S      Input is in SAM. If @SQ header lines are absent,  the
                           `-t' option is required.
  
-                 -t FILE This file is TAB-delimited. Each  line  must  contain
-                         the  reference  name and the length of the reference,
-                         one line  for  each  distinct  reference;  additional
-                         fields  are ignored. This file also defines the order
-                         of the reference sequences in  sorting.  If  you  run
-                         `samtools  faidx  <ref.fa>', the resultant index file
-                         <ref.fa>.fai can be used as this <in.ref_list>  file.
+                 -t FILE This  file  is  TAB-delimited. Each line must contain
+                         the reference name and the length of  the  reference,
+                         one  line  for  each  distinct  reference; additional
+                         fields are ignored. This file also defines the  order
+                         of  the  reference  sequences  in sorting. If you run
+                         `samtools faidx <ref.fa>', the resultant  index  file
+                         <ref.fa>.fai  can be used as this <in.ref_list> file.
  
                   -o FILE Output file [stdout]
  
-                 -f INT  Only  output  alignments with all bits in INT present
+                 -f INT  Only output alignments with all bits in  INT  present
                           in the FLAG field. INT can be in hex in the format of
                           /^0x[0-9A-F]+/ [0]
  
@@ -146,125 +99,197 @@ COMMANDS AND OPTIONS
  
                   -r STR  Only output reads in read group STR [null]
  
+                 -R FILE Output reads in read groups listed in FILE [null]
  
-       faidx     samtools faidx <ref.fasta> [region1 [...]]
  
-                 Index  reference sequence in the FASTA format or extract sub-
-                 sequence from indexed reference sequence.  If  no  region  is
-                 specified,   faidx   will   index   the   file   and   create
-                 <ref.fasta>.fai on the disk. If regions are speficified,  the
-                 subsequences  will  be retrieved and printed to stdout in the
-                 FASTA format. The input file can be compressed  in  the  RAZF
-                 format.
+       tview     samtools tview <in.sorted.bam> [ref.fasta]
+
+                 Text alignment viewer (based on the ncurses library). In  the
+                 viewer,  press `?' for help and press `g' to check the align-
+                 ment   start   from   a   region   in   the    format    like
+                 `chr10:10,000,000'  or  `=10,000,000'  when  viewing the same
+                 reference sequence.
  
  
-       pileup    samtools   pileup  [-f  in.ref.fasta]  [-t  in.ref_list]  [-l
-                 in.site_list]   [-iscgS2]   [-T   theta]   [-N   nHap]    [-r
+       pileup    samtools  pileup  [-f  in.ref.fasta]  [-t  in.ref_list]   [-l
+                 in.site_list]    [-iscgS2]   [-T   theta]   [-N   nHap]   [-r
                   pairDiffRate] <in.bam>|<in.sam>
  
-                 Print  the alignment in the pileup format. In the pileup for-
-                 mat, each line represents a genomic position,  consisting  of
+                 Print the alignment in the pileup format. In the pileup  for-
+                 mat,  each  line represents a genomic position, consisting of
                   chromosome name, coordinate, reference base, read bases, read
-                 qualities and alignment  mapping  qualities.  Information  on
+                 qualities  and  alignment  mapping  qualities. Information on
                   match, mismatch, indel, strand, mapping quality and start and
-                 end of a read are all encoded at the  read  base  column.  At
-                 this  column,  a dot stands for a match to the reference base
-                 on the forward strand, a comma for a  match  on  the  reverse
-                 strand,  `ACGTN'  for  a  mismatch  on the forward strand and
-                 `acgtn' for a mismatch  on  the  reverse  strand.  A  pattern
-                 `\+[0-9]+[ACGTNacgtn]+'   indicates  there  is  an  insertion
-                 between this reference position and the next reference  posi-
-                 tion.  The length of the insertion is given by the integer in
-                 the pattern, followed by the inserted sequence. Similarly,  a
+                 end  of  a  read  are all encoded at the read base column. At
+                 this column, a dot stands for a match to the  reference  base
+                 on  the  forward  strand,  a comma for a match on the reverse
+                 strand, `ACGTN' for a mismatch  on  the  forward  strand  and
+                 `acgtn'  for  a  mismatch  on  the  reverse strand. A pattern
+                 `\+[0-9]+[ACGTNacgtn]+'  indicates  there  is  an   insertion
+                 between  this reference position and the next reference posi-
+                 tion. The length of the insertion is given by the integer  in
+                 the  pattern, followed by the inserted sequence. Similarly, a
                   pattern `-[0-9]+[ACGTNacgtn]+' represents a deletion from the
-                 reference. The deleted bases will be presented as `*' in  the
-                 following  lines.  Also at the read base column, a symbol `^'
-                 marks the start of a read segment which is a contiguous  sub-
-                 sequence  on  the read separated by `N/S/H' CIGAR operations.
-                 The ASCII of the character following `^' minus 33  gives  the
-                 mapping  quality.  A  symbol `$' marks the end of a read seg-
+                 reference.  The deleted bases will be presented as `*' in the
+                 following lines. Also at the read base column, a  symbol  `^'
+                 marks  the start of a read segment which is a contiguous sub-
+                 sequence on the read separated by `N/S/H'  CIGAR  operations.
+                 The  ASCII  of the character following `^' minus 33 gives the
+                 mapping quality. A symbol `$' marks the end of  a  read  seg-
                   ment.
  
-                 If option -c is applied,  the  consensus  base,  Phred-scaled
-                 consensus  quality, SNP quality (i.e. the Phred-scaled proba-
+                 If  option  -c  is  applied, the consensus base, Phred-scaled
+                 consensus quality, SNP quality (i.e. the Phred-scaled  proba-
                   bility of the consensus being identical to the reference) and
-                 root  mean square (RMS) mapping quality of the reads covering
-                 the site will be inserted between the  `reference  base'  and
-                 the  `read  bases'  columns.  An indel occupies an additional
-                 line. Each indel line consists of  chromosome  name,  coordi-
-                 nate,  a  star, the genotype, consensus quality, SNP quality,
+                 root mean square (RMS) mapping quality of the reads  covering
+                 the  site  will  be inserted between the `reference base' and
+                 the `read bases' columns. An  indel  occupies  an  additional
+                 line.  Each  indel  line consists of chromosome name, coordi-
+                 nate, a star, the genotype, consensus quality,  SNP  quality,
                   RMS mapping quality, # covering reads, the first alllele, the
-                 second  allele,  # reads supporting the first allele, # reads
-                 supporting the second allele and #  reads  containing  indels
+                 second allele, # reads supporting the first allele,  #  reads
+                 supporting  the  second  allele and # reads containing indels
                   different from the top two alleles.
  
-                 OPTIONS:
+                 The position of indels is offset by -1.
  
+                 OPTIONS:
  
-                 -s        Print  the mapping quality as the last column. This
-                           option makes the output easier to  parse,  although
+                 -s        Print the mapping quality as the last column.  This
+                           option  makes  the output easier to parse, although
                             this format is not space efficient.
  
-
                   -S        The input file is in SAM.
  
-
                   -i        Only output pileup lines containing indels.
  
-
-                 -f FILE   The  reference  sequence in the FASTA format. Index
+                 -f FILE   The reference sequence in the FASTA  format.  Index
                             file FILE.fai will be created if absent.
  
-
                   -M INT    Cap mapping quality at INT [60]
  
+                 -m INT    Filter  reads  with  flag  containing  bits  in INT
+                           [1796]
  
-                 -t FILE   List of reference names ane  sequence  lengths,  in
-                           the  format  described  for  the import command. If
-                           this option is present, samtools assumes the  input
+                 -d INT    Use the first INT reads in  the  pileup  for  indel
+                           calling for speed up. Zero for unlimited. [0]
+
+                 -t FILE   List  of  reference  names and sequence lengths, in
+                           the format described for  the  import  command.  If
+                           this  option is present, samtools assumes the input
                             <in.alignment>  is  in  SAM  format;  otherwise  it
                             assumes in BAM format.
  
-
-                 -l FILE   List of sites at which pileup is output. This  file
-                           is  space  delimited.  The  first  two  columns are
-                           required to be chromosome and  1-based  coordinate.
-                           Additional  columns  are ignored. It is recommended
+                 -l FILE   List  of sites at which pileup is output. This file
+                           is space  delimited.  The  first  two  columns  are
+                           required  to  be chromosome and 1-based coordinate.
+                           Additional columns are ignored. It  is  recommended
                             to use option -s together with -l as in the default
                             format we may not know the mapping quality.
  
-
-                 -c        Call  the  consensus  sequence  using MAQ consensus
+                 -c        Call the consensus sequence using SOAPsnp consensus
                             model. Options -T, -N, -I and -r are only effective
                             when -c or -g is in use.
  
-
-                 -g        Generate  genotype  likelihood  in the binary GLFv3
+                 -g        Generate genotype likelihood in  the  binary  GLFv3
                             format. This option suppresses -c, -i and -s.
  
-
-                 -T FLOAT  The theta parameter (error dependency  coefficient)
+                 -T FLOAT  The  theta parameter (error dependency coefficient)
                             in the maq consensus calling model [0.85]
  
-
                   -N INT    Number of haplotypes in the sample (>=2) [2]
  
-
-                 -r FLOAT  Expected  fraction of differences between a pair of
+                 -r FLOAT  Expected fraction of differences between a pair  of
                             haplotypes [0.001]
  
-
-                 -I INT    Phred probability of an indel  in  sequencing/prep.
+                 -I INT    Phred  probability  of an indel in sequencing/prep.
                             [40]
  
  
+       mpileup   samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]]
  
-       tview     samtools tview <in.sorted.bam> [ref.fasta]
+                 Generate pileup for multiple BAM files. Consensus calling  is
+                 not implemented.
+
+                 OPTIONS:
+
+                 -r STR  Only generate pileup in region STR [all sites]
+
+                 -f FILE The reference file [null]
+
+
+       reheader  samtools reheader <in.header.sam> <in.bam>
+
+                 Replace   the   header   in   in.bam   with   the  header  in
+                 in.header.sam.  This command is much  faster  than  replacing
+                 the header with a BAM->SAM->BAM conversion.
+
+
+       sort      samtools sort [-no] [-m maxMem] <in.bam> <out.prefix>
+
+                 Sort  alignments  by  leftmost  coordinates.  File  <out.pre-
+                 fix>.bam will be created. This command may also create tempo-
+                 rary  files <out.prefix>.%d.bam when the whole alignment can-
+                 not be fitted into memory (controlled by option -m).
+
+                 OPTIONS:
+
+                 -o      Output the final alignment to the standard output.
+
+                 -n      Sort by read names rather than by chromosomal coordi-
+                         nates
+
+                 -m INT  Approximately    the    maximum    required   memory.
+                         [500000000]
+
+
+       merge     samtools  merge  [-h  inh.sam]  [-nr]   <out.bam>   <in1.bam>
+                 <in2.bam> [...]
+
+                 Merge multiple sorted alignments.  The header reference lists
+                 of all the input BAM files, and the @SQ headers  of  inh.sam,
+                 if  any,  must  all  refer  to  the  same  set  of  reference
+                 sequences.  The header reference list and (unless  overridden
+                 by  -h) `@' headers of in1.bam will be copied to out.bam, and
+                 the headers of other files will be ignored.
+
+                 OPTIONS:
+
+                 -h FILE Use the lines of FILE as `@' headers to be copied  to
+                         out.bam, replacing any header lines that would other-
+                         wise be copied from in1.bam.  (FILE  is  actually  in
+                         SAM  format, though any alignment records it may con-
+                         tain are ignored.)
+
+                 -r      Attach an RG tag to each alignment. The tag value  is
+                         inferred from file names.
+
+                 -n      The  input alignments are sorted by read names rather
+                         than by chromosomal coordinates
+
+
+       index     samtools index <aln.bam>
+
+                 Index sorted alignment for fast  random  access.  Index  file
+                 <aln.bam>.bai will be created.
+
+
+       idxstats  samtools idxstats <aln.bam>
  
-                 Text  alignment viewer (based on the ncurses library). In the
-                 viewer, press `?' for help and press `g' to check the  align-
-                 ment    start    from   a   region   in   the   format   like
-                 `chr10:10,000,000'.
+                 Retrieve and print stats in the index file. The output is TAB
+                 delimited with each line  consisting  of  reference  sequence
+                 name, sequence length, # mapped reads and # unmapped reads.
+
+
+       faidx     samtools faidx <ref.fasta> [region1 [...]]
+
+                 Index  reference sequence in the FASTA format or extract sub-
+                 sequence from indexed reference sequence.  If  no  region  is
+                 specified,   faidx   will   index   the   file   and   create
+                 <ref.fasta>.fai on the disk. If regions are  specified,  the
+                 subsequences  will  be retrieved and printed to stdout in the
+                 FASTA format. The input file can be compressed  in  the  RAZF
+                 format.
  
  
         fixmate   samtools fixmate <in.nameSrt.bam> <out.bam>
@@ -273,26 +298,28 @@ COMMANDS AND OPTIONS
                   name-sorted alignment.
  
  
-       rmdup     samtools rmdup <input.srt.bam> <out.bam>
+       rmdup     samtools rmdup [-sS] <input.srt.bam> <out.bam>
  
-                 Remove  potential PCR duplicates: if multiple read pairs have
-                 identical external coordinates, only  retain  the  pair  with
-                 highest  mapping  quality.   This  command ONLY works with FR
-                 orientation and requires ISIZE is correctly set.
+                 Remove potential PCR duplicates: if multiple read pairs  have
+                 identical  external  coordinates,  only  retain the pair with
+                 highest mapping quality.  In the paired-end mode,  this  com-
+                 mand  ONLY  works  with  FR orientation and requires ISIZE is
+                 correctly set. It does not work for unpaired reads (e.g.  two
+                 ends mapped to different chromosomes or orphan reads).
  
+                 OPTIONS:
  
-       rmdupse   samtools rmdupse <input.srt.bam> <out.bam>
+                 -s      Remove  duplicates  for single-end reads. By default,
+                         the command works for paired-end reads only.
  
-                 Remove potential duplicates for single-ended reads. This com-
-                 mand  will  treat  all reads as single-ended even if they are
-                 paired in fact.
+                 -S      Treat paired-end reads as single-end reads.
  
  
-       fillmd    samtools fillmd [-e] <aln.bam> <ref.fasta>
+       calmd     samtools calmd [-eubS] <aln.bam> <ref.fasta>
  
                   Generate the MD tag. If the MD tag is already  present,  this
                   command  will  give a warning if the MD tag generated is dif-
-                 ferent from the existing tag.
+                 ferent from the existing tag. Output SAM by default.
  
                   OPTIONS:
  
@@ -300,6 +327,11 @@ COMMANDS AND OPTIONS
                           the  aligned  reference  base.  Indel caller does not
                           support the = bases at the moment.
  
+                 -u      Output uncompressed BAM
+
+                 -b      Output compressed BAM
+
+                 -S      The input is SAM with header lines
  
  
  SAM FORMAT
@@ -327,43 +359,43 @@ SAM FORMAT
         Each bit in the FLAG field is defined as:
  
  
-             +-------+--------------------------------------------------+
-             | Flag  |                   Description                    |
-             +-------+--------------------------------------------------+
-             |0x0001 | the read is paired in sequencing                 |
-             |0x0002 | the read is mapped in a proper pair              |
-             |0x0004 | the query sequence itself is unmapped            |
-             |0x0008 | the mate is unmapped                             |
-             |0x0010 | strand of the query (1 for reverse)              |
-             |0x0020 | strand of the mate                               |
-             |0x0040 | the read is the first read in a pair             |
-             |0x0080 | the read is the second read in a pair            |
-             |0x0100 | the alignment is not primary                     |
-             |0x0200 | the read fails platform/vendor quality checks    |
-             |0x0400 | the read is either a PCR or an optical duplicate |
-             +-------+--------------------------------------------------+
+          +-------+-----+--------------------------------------------------+
+          | Flag  | Chr |                   Description                    |
+          +-------+-----+--------------------------------------------------+
+          |0x0001 |  p  | the read is paired in sequencing                 |
+          |0x0002 |  P  | the read is mapped in a proper pair              |
+          |0x0004 |  u  | the query sequence itself is unmapped            |
+          |0x0008 |  U  | the mate is unmapped                             |
+          |0x0010 |  r  | strand of the query (1 for reverse)              |
+          |0x0020 |  R  | strand of the mate                               |
+          |0x0040 |  1  | the read is the first read in a pair             |
+          |0x0080 |  2  | the read is the second read in a pair            |
+          |0x0100 |  s  | the alignment is not primary                     |
+          |0x0200 |  f  | the read fails platform/vendor quality checks    |
+          |0x0400 |  d  | the read is either a PCR or an optical duplicate |
+          +-------+-----+--------------------------------------------------+
  
  LIMITATIONS
         o Unaligned   words  used  in  bam_import.c,  bam_endian.h,  bam.c  and
           bam_aux.c.
  
-       o CIGAR operation P is not properly handled at the moment.
-
         o In merging, the input files are required to have the same  number  of
           reference  sequences.  The  requirement  can be relaxed. In addition,
           merging does not reconstruct the header  dictionaries  automatically.
           Endusers  have  to  provide  the  correct header. Picard is better at
           merging.
  
-       o Samtools' rmdup does not work for single-end data and does not remove
-         duplicates across chromosomes. Picard is better.
+       o Samtools paired-end rmdup does not  work  for  unpaired  reads  (e.g.
+         orphan  reads  or ends mapped to different chromosomes). If this is a
+         concern, please use Picard's MarkDuplicates, which correctly  handles
+         these cases, although it is a little slower.
  
  
  AUTHOR
         Heng  Li from the Sanger Institute wrote the C version of samtools. Bob
         Handsaker from the Broad Institute implemented the BGZF library and Jue
         Ruan  from  Beijing  Genomics Institute wrote the RAZF library. Various
-       people in the 1000Genomes Project contributed to the SAM format  speci-
+       people in the 1000 Genomes Project contributed to the SAM format speci-
         fication.
  
  
@@ -372,4 +404,4 @@ SEE ALSO
  
  
  
-samtools-0.1.7                 10 November 2009                    samtools(1)
+samtools-0.1.8                   11 July 2010                      samtools(1)
diff --git a/win32/xcurses.h b/win32/xcurses.h

new file mode 100644 (file)

index 0000000..6f3ce19
--- /dev/null
+++ b/win32/xcurses.h
@@ -0,0 +1,1377 @@
+/* Public Domain Curses */
+
+/* $Id: curses.h,v 1.295 2008/07/15 17:13:25 wmcbrine Exp $ */
+
+/*----------------------------------------------------------------------*
+ *                              PDCurses                                *
+ *----------------------------------------------------------------------*/
+
+#ifndef __PDCURSES__
+#define __PDCURSES__ 1
+
+/*man-start**************************************************************
+
+PDCurses definitions list:  (Only define those needed)
+
+    XCURSES         True if compiling for X11.
+    PDC_RGB         True if you want to use RGB color definitions
+                    (Red = 1, Green = 2, Blue = 4) instead of BGR.
+    PDC_WIDE        True if building wide-character support.
+    PDC_DLL_BUILD   True if building a Win32 DLL.
+    NCURSES_MOUSE_VERSION   Use the ncurses mouse API instead
+                            of PDCurses' traditional mouse API.
+
+PDCurses portable platform definitions list:
+
+    PDC_BUILD       Defines API build version.
+    PDCURSES        Enables access to PDCurses-only routines.
+    XOPEN           Always true.
+    SYSVcurses      True if you are compiling for SYSV portability.
+    BSDcurses       True if you are compiling for BSD portability.
+
+**man-end****************************************************************/
+
+#define PDC_BUILD 3401
+#define PDCURSES        1      /* PDCurses-only routines */
+#define XOPEN           1      /* X/Open Curses routines */
+#define SYSVcurses      1      /* System V Curses routines */
+#define BSDcurses       1      /* BSD Curses routines */
+#define CHTYPE_LONG     1      /* size of chtype; long */
+
+/*----------------------------------------------------------------------*/
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>             /* Required by X/Open usage below */
+
+#ifdef PDC_WIDE
+# include <wchar.h>
+#endif
+
+#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS)
+extern "C"
+{
+# define bool _bool
+#endif
+
+/*----------------------------------------------------------------------
+ *
+ *  PDCurses Manifest Constants
+ *
+ */
+
+#ifndef FALSE
+# define FALSE 0
+#endif
+#ifndef TRUE
+# define TRUE 1
+#endif
+#ifndef NULL
+# define NULL (void *)0
+#endif
+#ifndef ERR
+# define ERR (-1)
+#endif
+#ifndef OK
+# define OK 0
+#endif
+
+/*----------------------------------------------------------------------
+ *
+ *  PDCurses Type Declarations
+ *
+ */
+
+typedef unsigned char bool;    /* PDCurses Boolean type */
+
+#ifdef CHTYPE_LONG
+# if _LP64
+typedef unsigned int chtype;
+# else
+typedef unsigned long chtype;  /* 16-bit attr + 16-bit char */
+# endif
+#else
+typedef unsigned short chtype; /* 8-bit attr + 8-bit char */
+#endif
+
+#ifdef PDC_WIDE
+typedef chtype cchar_t;
+#endif
+
+typedef chtype attr_t;
+
+/*----------------------------------------------------------------------
+ *
+ *  PDCurses Mouse Interface -- SYSVR4, with extensions
+ *
+ */
+
+typedef struct
+{
+    int x;           /* absolute column, 0 based, measured in characters */
+    int y;           /* absolute row, 0 based, measured in characters */
+    short button[3]; /* state of each button */
+    int changes;     /* flags indicating what has changed with the mouse */
+} MOUSE_STATUS;
+
+#define BUTTON_RELEASED         0x0000
+#define BUTTON_PRESSED          0x0001
+#define BUTTON_CLICKED          0x0002
+#define BUTTON_DOUBLE_CLICKED   0x0003
+#define BUTTON_TRIPLE_CLICKED   0x0004
+#define BUTTON_MOVED            0x0005  /* PDCurses */
+#define WHEEL_SCROLLED          0x0006  /* PDCurses */
+#define BUTTON_ACTION_MASK      0x0007  /* PDCurses */
+
+#define PDC_BUTTON_SHIFT        0x0008  /* PDCurses */
+#define PDC_BUTTON_CONTROL      0x0010  /* PDCurses */
+#define PDC_BUTTON_ALT          0x0020  /* PDCurses */
+#define BUTTON_MODIFIER_MASK    0x0038  /* PDCurses */
+
+#define MOUSE_X_POS             (Mouse_status.x)
+#define MOUSE_Y_POS             (Mouse_status.y)
+
+/*
+ * Bits associated with the .changes field:
+ *   3         2         1         0
+ * 210987654321098765432109876543210
+ *                                 1 <- button 1 has changed
+ *                                10 <- button 2 has changed
+ *                               100 <- button 3 has changed
+ *                              1000 <- mouse has moved
+ *                             10000 <- mouse position report
+ *                            100000 <- mouse wheel up
+ *                           1000000 <- mouse wheel down
+ */
+
+#define PDC_MOUSE_MOVED         0x0008
+#define PDC_MOUSE_POSITION      0x0010
+#define PDC_MOUSE_WHEEL_UP      0x0020
+#define PDC_MOUSE_WHEEL_DOWN    0x0040
+
+#define A_BUTTON_CHANGED        (Mouse_status.changes & 7)
+#define MOUSE_MOVED             (Mouse_status.changes & PDC_MOUSE_MOVED)
+#define MOUSE_POS_REPORT        (Mouse_status.changes & PDC_MOUSE_POSITION)
+#define BUTTON_CHANGED(x)       (Mouse_status.changes & (1 << ((x) - 1)))
+#define BUTTON_STATUS(x)        (Mouse_status.button[(x) - 1])
+#define MOUSE_WHEEL_UP          (Mouse_status.changes & PDC_MOUSE_WHEEL_UP)
+#define MOUSE_WHEEL_DOWN        (Mouse_status.changes & PDC_MOUSE_WHEEL_DOWN)
+
+/* mouse bit-masks */
+
+#define BUTTON1_RELEASED        0x00000001L
+#define BUTTON1_PRESSED         0x00000002L
+#define BUTTON1_CLICKED         0x00000004L
+#define BUTTON1_DOUBLE_CLICKED  0x00000008L
+#define BUTTON1_TRIPLE_CLICKED  0x00000010L
+#define BUTTON1_MOVED           0x00000010L /* PDCurses */
+
+#define BUTTON2_RELEASED        0x00000020L
+#define BUTTON2_PRESSED         0x00000040L
+#define BUTTON2_CLICKED         0x00000080L
+#define BUTTON2_DOUBLE_CLICKED  0x00000100L
+#define BUTTON2_TRIPLE_CLICKED  0x00000200L
+#define BUTTON2_MOVED           0x00000200L /* PDCurses */
+
+#define BUTTON3_RELEASED        0x00000400L
+#define BUTTON3_PRESSED         0x00000800L
+#define BUTTON3_CLICKED         0x00001000L
+#define BUTTON3_DOUBLE_CLICKED  0x00002000L
+#define BUTTON3_TRIPLE_CLICKED  0x00004000L
+#define BUTTON3_MOVED           0x00004000L /* PDCurses */
+
+/* For the ncurses-compatible functions only, BUTTON4_PRESSED and 
+   BUTTON5_PRESSED are returned for mouse scroll wheel up and down; 
+   otherwise PDCurses doesn't support buttons 4 and 5 */
+
+#define BUTTON4_RELEASED        0x00008000L
+#define BUTTON4_PRESSED         0x00010000L
+#define BUTTON4_CLICKED         0x00020000L
+#define BUTTON4_DOUBLE_CLICKED  0x00040000L
+#define BUTTON4_TRIPLE_CLICKED  0x00080000L
+
+#define BUTTON5_RELEASED        0x00100000L
+#define BUTTON5_PRESSED         0x00200000L
+#define BUTTON5_CLICKED         0x00400000L
+#define BUTTON5_DOUBLE_CLICKED  0x00800000L
+#define BUTTON5_TRIPLE_CLICKED  0x01000000L
+
+#define MOUSE_WHEEL_SCROLL      0x02000000L /* PDCurses */
+#define BUTTON_MODIFIER_SHIFT   0x04000000L /* PDCurses */
+#define BUTTON_MODIFIER_CONTROL 0x08000000L /* PDCurses */
+#define BUTTON_MODIFIER_ALT     0x10000000L /* PDCurses */
+
+#define ALL_MOUSE_EVENTS        0x1fffffffL
+#define REPORT_MOUSE_POSITION   0x20000000L
+
+/* ncurses mouse interface */
+
+typedef unsigned long mmask_t;
+
+typedef struct
+{
+        short id;       /* unused, always 0 */
+        int x, y, z;    /* x, y same as MOUSE_STATUS; z unused */
+        mmask_t bstate; /* equivalent to changes + button[], but
+                           in the same format as used for mousemask() */
+} MEVENT;
+
+#ifdef NCURSES_MOUSE_VERSION
+# define BUTTON_SHIFT   BUTTON_MODIFIER_SHIFT
+# define BUTTON_CONTROL BUTTON_MODIFIER_CONTROL
+# define BUTTON_CTRL    BUTTON_MODIFIER_CONTROL
+# define BUTTON_ALT     BUTTON_MODIFIER_ALT
+#else
+# define BUTTON_SHIFT   PDC_BUTTON_SHIFT
+# define BUTTON_CONTROL PDC_BUTTON_CONTROL
+# define BUTTON_ALT     PDC_BUTTON_ALT
+#endif
+
+/*----------------------------------------------------------------------
+ *
+ *  PDCurses Structure Definitions
+ *
+ */
+
+typedef struct _win       /* definition of a window */
+{
+    int   _cury;          /* current pseudo-cursor */
+    int   _curx;
+    int   _maxy;          /* max window coordinates */
+    int   _maxx;
+    int   _begy;          /* origin on screen */
+    int   _begx;
+    int   _flags;         /* window properties */
+    chtype _attrs;        /* standard attributes and colors */
+    chtype _bkgd;         /* background, normally blank */
+    bool  _clear;         /* causes clear at next refresh */
+    bool  _leaveit;       /* leaves cursor where it is */
+    bool  _scroll;        /* allows window scrolling */
+    bool  _nodelay;       /* input character wait flag */
+    bool  _immed;         /* immediate update flag */
+    bool  _sync;          /* synchronise window ancestors */
+    bool  _use_keypad;    /* flags keypad key mode active */
+    chtype **_y;          /* pointer to line pointer array */
+    int   *_firstch;      /* first changed character in line */
+    int   *_lastch;       /* last changed character in line */
+    int   _tmarg;         /* top of scrolling region */
+    int   _bmarg;         /* bottom of scrolling region */
+    int   _delayms;       /* milliseconds of delay for getch() */
+    int   _parx, _pary;   /* coords relative to parent (0,0) */
+    struct _win *_parent; /* subwin's pointer to parent win */
+} WINDOW;
+
+/* Avoid using the SCREEN struct directly -- use the corresponding 
+   functions if possible. This struct may eventually be made private. */
+
+typedef struct
+{
+    bool  alive;          /* if initscr() called, and not endwin() */
+    bool  autocr;         /* if cr -> lf */
+    bool  cbreak;         /* if terminal unbuffered */
+    bool  echo;           /* if terminal echo */
+    bool  raw_inp;        /* raw input mode (v. cooked input) */
+    bool  raw_out;        /* raw output mode (7 v. 8 bits) */
+    bool  audible;        /* FALSE if the bell is visual */
+    bool  mono;           /* TRUE if current screen is mono */
+    bool  resized;        /* TRUE if TERM has been resized */
+    bool  orig_attr;      /* TRUE if we have the original colors */
+    short orig_fore;      /* original screen foreground color */
+    short orig_back;      /* original screen background color */
+    int   cursrow;        /* position of physical cursor */
+    int   curscol;        /* position of physical cursor */
+    int   visibility;     /* visibility of cursor */
+    int   orig_cursor;    /* original cursor size */
+    int   lines;          /* new value for LINES */
+    int   cols;           /* new value for COLS */
+    unsigned long _trap_mbe;       /* trap these mouse button events */
+    unsigned long _map_mbe_to_key; /* map mouse buttons to slk */
+    int   mouse_wait;              /* time to wait (in ms) for a
+                                      button release after a press, in 
+                                      order to count it as a click */
+    int   slklines;                /* lines in use by slk_init() */
+    WINDOW *slk_winptr;            /* window for slk */
+    int   linesrippedoff;          /* lines ripped off via ripoffline() */
+    int   linesrippedoffontop;     /* lines ripped off on 
+                                      top via ripoffline() */
+    int   delaytenths;             /* tenths of a second that getch()
+                                      blocks for */
+    bool  _preserve;               /* TRUE if screen background
+                                      to be preserved */
+    int   _restore;                /* specifies if screen background
+                                      to be restored, and how */
+    bool  save_key_modifiers;      /* TRUE if key modifiers are saved
+                                      with each key press */
+    bool  return_key_modifiers;    /* TRUE if modifier keys are
+                                      returned as "real" keys */
+    bool  key_code;                /* TRUE if last key is a special key;
+                                      used internally by get_wch() */
+#ifdef XCURSES
+    int   XcurscrSize;    /* size of Xcurscr shared memory block */
+    bool  sb_on;
+    int   sb_viewport_y;
+    int   sb_viewport_x;
+    int   sb_total_y;
+    int   sb_total_x;
+    int   sb_cur_y;
+    int   sb_cur_x;
+#endif
+    short line_color;     /* color of line attributes - default -1 */
+} SCREEN;
+
+/*----------------------------------------------------------------------
+ *
+ *  PDCurses External Variables
+ *
+ */
+
+#ifdef PDC_DLL_BUILD
+# ifdef CURSES_LIBRARY
+#  define PDCEX __declspec(dllexport) extern
+# else
+#  define PDCEX __declspec(dllimport)
+# endif
+#else
+# define PDCEX extern
+#endif
+
+PDCEX  int          LINES;        /* terminal height */
+PDCEX  int          COLS;         /* terminal width */
+PDCEX  WINDOW       *stdscr;      /* the default screen window */
+PDCEX  WINDOW       *curscr;      /* the current screen image */
+PDCEX  SCREEN       *SP;          /* curses variables */
+PDCEX  MOUSE_STATUS Mouse_status;
+PDCEX  int          COLORS;
+PDCEX  int          COLOR_PAIRS;
+PDCEX  int          TABSIZE;
+PDCEX  chtype       acs_map[];    /* alternate character set map */
+PDCEX  char         ttytype[];    /* terminal name/description */
+
+/*man-start**************************************************************
+
+PDCurses Text Attributes
+========================
+
+Originally, PDCurses used a short (16 bits) for its chtype. To include 
+color, a number of things had to be sacrificed from the strict Unix and 
+System V support. The main problem was fitting all character attributes 
+and color into an unsigned char (all 8 bits!).
+
+Today, PDCurses by default uses a long (32 bits) for its chtype, as in 
+System V. The short chtype is still available, by undefining CHTYPE_LONG 
+and rebuilding the library.
+
+The following is the structure of a win->_attrs chtype:
+
+short form:
+
+-------------------------------------------------
+|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
+-------------------------------------------------
+  color number |  attrs |   character eg 'a'
+
+The available non-color attributes are bold, reverse and blink. Others 
+have no effect. The high order char is an index into an array of 
+physical colors (defined in color.c) -- 32 foreground/background color 
+pairs (5 bits) plus 3 bits for other attributes.
+
+long form:
+
+----------------------------------------------------------------------------
+|31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|..| 3| 2| 1| 0|
+----------------------------------------------------------------------------
+      color number      |     modifiers         |      character eg 'a'
+
+The available non-color attributes are bold, underline, invisible, 
+right-line, left-line, protect, reverse and blink. 256 color pairs (8 
+bits), 8 bits for other attributes, and 16 bits for character data.
+
+**man-end****************************************************************/
+
+/*** Video attribute macros ***/
+
+#define A_NORMAL      (chtype)0
+
+#ifdef CHTYPE_LONG
+# define A_ALTCHARSET (chtype)0x00010000
+# define A_RIGHTLINE  (chtype)0x00020000
+# define A_LEFTLINE   (chtype)0x00040000
+# define A_INVIS      (chtype)0x00080000
+# define A_UNDERLINE  (chtype)0x00100000
+# define A_REVERSE    (chtype)0x00200000
+# define A_BLINK      (chtype)0x00400000
+# define A_BOLD       (chtype)0x00800000
+
+# define A_ATTRIBUTES (chtype)0xffff0000
+# define A_CHARTEXT   (chtype)0x0000ffff
+# define A_COLOR      (chtype)0xff000000
+
+# define A_ITALIC     A_INVIS
+# define A_PROTECT    (A_UNDERLINE | A_LEFTLINE | A_RIGHTLINE)
+
+# define PDC_ATTR_SHIFT  19
+# define PDC_COLOR_SHIFT 24
+#else
+# define A_BOLD       (chtype)0x0100  /* X/Open */
+# define A_REVERSE    (chtype)0x0200  /* X/Open */
+# define A_BLINK      (chtype)0x0400  /* X/Open */
+
+# define A_ATTRIBUTES (chtype)0xff00  /* X/Open */
+# define A_CHARTEXT   (chtype)0x00ff  /* X/Open */
+# define A_COLOR      (chtype)0xf800  /* System V */
+
+# define A_ALTCHARSET A_NORMAL        /* X/Open */
+# define A_PROTECT    A_NORMAL        /* X/Open */
+# define A_UNDERLINE  A_NORMAL        /* X/Open */
+
+# define A_LEFTLINE   A_NORMAL
+# define A_RIGHTLINE  A_NORMAL
+# define A_ITALIC     A_NORMAL
+# define A_INVIS      A_NORMAL
+
+# define PDC_ATTR_SHIFT   8
+# define PDC_COLOR_SHIFT 11
+#endif
+
+#define A_STANDOUT    (A_REVERSE | A_BOLD) /* X/Open */
+#define A_DIM         A_NORMAL
+
+#define CHR_MSK       A_CHARTEXT           /* Obsolete */
+#define ATR_MSK       A_ATTRIBUTES         /* Obsolete */
+#define ATR_NRM       A_NORMAL             /* Obsolete */
+
+/* For use with attr_t -- X/Open says, "these shall be distinct", so 
+   this is a non-conforming implementation. */
+
+#define WA_ALTCHARSET A_ALTCHARSET
+#define WA_BLINK      A_BLINK
+#define WA_BOLD       A_BOLD
+#define WA_DIM        A_DIM
+#define WA_INVIS      A_INVIS
+#define WA_LEFT       A_LEFTLINE
+#define WA_PROTECT    A_PROTECT
+#define WA_REVERSE    A_REVERSE
+#define WA_RIGHT      A_RIGHTLINE
+#define WA_STANDOUT   A_STANDOUT
+#define WA_UNDERLINE  A_UNDERLINE
+
+#define WA_HORIZONTAL A_NORMAL
+#define WA_LOW        A_NORMAL
+#define WA_TOP        A_NORMAL
+#define WA_VERTICAL   A_NORMAL
+
+/*** Alternate character set macros ***/
+
+/* 'w' = 32-bit chtype; acs_map[] index | A_ALTCHARSET
+   'n' = 16-bit chtype; it gets the fallback set because no bit is 
+         available for A_ALTCHARSET */
+
+#ifdef CHTYPE_LONG
+# define ACS_PICK(w, n) ((chtype)w | A_ALTCHARSET)
+#else
+# define ACS_PICK(w, n) ((chtype)n)
+#endif
+
+/* VT100-compatible symbols -- box chars */
+
+#define ACS_ULCORNER  ACS_PICK('l', '+')
+#define ACS_LLCORNER  ACS_PICK('m', '+')
+#define ACS_URCORNER  ACS_PICK('k', '+')
+#define ACS_LRCORNER  ACS_PICK('j', '+')
+#define ACS_RTEE      ACS_PICK('u', '+')
+#define ACS_LTEE      ACS_PICK('t', '+')
+#define ACS_BTEE      ACS_PICK('v', '+')
+#define ACS_TTEE      ACS_PICK('w', '+')
+#define ACS_HLINE     ACS_PICK('q', '-')
+#define ACS_VLINE     ACS_PICK('x', '|')
+#define ACS_PLUS      ACS_PICK('n', '+')
+
+/* VT100-compatible symbols -- other */
+
+#define ACS_S1        ACS_PICK('o', '-')
+#define ACS_S9        ACS_PICK('s', '_')
+#define ACS_DIAMOND   ACS_PICK('`', '+')
+#define ACS_CKBOARD   ACS_PICK('a', ':')
+#define ACS_DEGREE    ACS_PICK('f', '\'')
+#define ACS_PLMINUS   ACS_PICK('g', '#')
+#define ACS_BULLET    ACS_PICK('~', 'o')
+
+/* Teletype 5410v1 symbols -- these are defined in SysV curses, but
+   are not well-supported by most terminals. Stick to VT100 characters
+   for optimum portability. */
+
+#define ACS_LARROW    ACS_PICK(',', '<')
+#define ACS_RARROW    ACS_PICK('+', '>')
+#define ACS_DARROW    ACS_PICK('.', 'v')
+#define ACS_UARROW    ACS_PICK('-', '^')
+#define ACS_BOARD     ACS_PICK('h', '#')
+#define ACS_LANTERN   ACS_PICK('i', '*')
+#define ACS_BLOCK     ACS_PICK('0', '#')
+
+/* That goes double for these -- undocumented SysV symbols. Don't use
+   them. */
+
+#define ACS_S3        ACS_PICK('p', '-')
+#define ACS_S7        ACS_PICK('r', '-')
+#define ACS_LEQUAL    ACS_PICK('y', '<')
+#define ACS_GEQUAL    ACS_PICK('z', '>')
+#define ACS_PI        ACS_PICK('{', 'n')
+#define ACS_NEQUAL    ACS_PICK('|', '+')
+#define ACS_STERLING  ACS_PICK('}', 'L')
+
+/* Box char aliases */
+
+#define ACS_BSSB      ACS_ULCORNER
+#define ACS_SSBB      ACS_LLCORNER
+#define ACS_BBSS      ACS_URCORNER
+#define ACS_SBBS      ACS_LRCORNER
+#define ACS_SBSS      ACS_RTEE
+#define ACS_SSSB      ACS_LTEE
+#define ACS_SSBS      ACS_BTEE
+#define ACS_BSSS      ACS_TTEE
+#define ACS_BSBS      ACS_HLINE
+#define ACS_SBSB      ACS_VLINE
+#define ACS_SSSS      ACS_PLUS
+
+/* cchar_t aliases */
+
+#ifdef PDC_WIDE
+# define WACS_ULCORNER (&(acs_map['l']))
+# define WACS_LLCORNER (&(acs_map['m']))
+# define WACS_URCORNER (&(acs_map['k']))
+# define WACS_LRCORNER (&(acs_map['j']))
+# define WACS_RTEE     (&(acs_map['u']))
+# define WACS_LTEE     (&(acs_map['t']))
+# define WACS_BTEE     (&(acs_map['v']))
+# define WACS_TTEE     (&(acs_map['w']))
+# define WACS_HLINE    (&(acs_map['q']))
+# define WACS_VLINE    (&(acs_map['x']))
+# define WACS_PLUS     (&(acs_map['n']))
+
+# define WACS_S1       (&(acs_map['o']))
+# define WACS_S9       (&(acs_map['s']))
+# define WACS_DIAMOND  (&(acs_map['`']))
+# define WACS_CKBOARD  (&(acs_map['a']))
+# define WACS_DEGREE   (&(acs_map['f']))
+# define WACS_PLMINUS  (&(acs_map['g']))
+# define WACS_BULLET   (&(acs_map['~']))
+
+# define WACS_LARROW   (&(acs_map[',']))
+# define WACS_RARROW   (&(acs_map['+']))
+# define WACS_DARROW   (&(acs_map['.']))
+# define WACS_UARROW   (&(acs_map['-']))
+# define WACS_BOARD    (&(acs_map['h']))
+# define WACS_LANTERN  (&(acs_map['i']))
+# define WACS_BLOCK    (&(acs_map['0']))
+
+# define WACS_S3       (&(acs_map['p']))
+# define WACS_S7       (&(acs_map['r']))
+# define WACS_LEQUAL   (&(acs_map['y']))
+# define WACS_GEQUAL   (&(acs_map['z']))
+# define WACS_PI       (&(acs_map['{']))
+# define WACS_NEQUAL   (&(acs_map['|']))
+# define WACS_STERLING (&(acs_map['}']))
+
+# define WACS_BSSB     WACS_ULCORNER
+# define WACS_SSBB     WACS_LLCORNER
+# define WACS_BBSS     WACS_URCORNER
+# define WACS_SBBS     WACS_LRCORNER
+# define WACS_SBSS     WACS_RTEE
+# define WACS_SSSB     WACS_LTEE
+# define WACS_SSBS     WACS_BTEE
+# define WACS_BSSS     WACS_TTEE
+# define WACS_BSBS     WACS_HLINE
+# define WACS_SBSB     WACS_VLINE
+# define WACS_SSSS     WACS_PLUS
+#endif
+
+/*** Color macros ***/
+
+#define COLOR_BLACK   0
+
+#ifdef PDC_RGB        /* RGB */
+# define COLOR_RED    1
+# define COLOR_GREEN  2
+# define COLOR_BLUE   4
+#else                 /* BGR */
+# define COLOR_BLUE   1
+# define COLOR_GREEN  2
+# define COLOR_RED    4
+#endif
+
+#define COLOR_CYAN    (COLOR_BLUE | COLOR_GREEN)
+#define COLOR_MAGENTA (COLOR_RED | COLOR_BLUE)
+#define COLOR_YELLOW  (COLOR_RED | COLOR_GREEN)
+
+#define COLOR_WHITE   7
+
+/*----------------------------------------------------------------------
+ *
+ *  Function and Keypad Key Definitions.
+ *  Many are just for compatibility.
+ *
+ */
+
+#define KEY_CODE_YES  0x100  /* If get_wch() gives a key code */
+
+#define KEY_BREAK     0x101  /* Not on PC KBD */
+#define KEY_DOWN      0x102  /* Down arrow key */
+#define KEY_UP        0x103  /* Up arrow key */
+#define KEY_LEFT      0x104  /* Left arrow key */
+#define KEY_RIGHT     0x105  /* Right arrow key */
+#define KEY_HOME      0x106  /* home key */
+#define KEY_BACKSPACE 0x107  /* not on pc */
+#define KEY_F0        0x108  /* function keys; 64 reserved */
+
+#define KEY_DL        0x148  /* delete line */
+#define KEY_IL        0x149  /* insert line */
+#define KEY_DC        0x14a  /* delete character */
+#define KEY_IC        0x14b  /* insert char or enter ins mode */
+#define KEY_EIC       0x14c  /* exit insert char mode */
+#define KEY_CLEAR     0x14d  /* clear screen */
+#define KEY_EOS       0x14e  /* clear to end of screen */
+#define KEY_EOL       0x14f  /* clear to end of line */
+#define KEY_SF        0x150  /* scroll 1 line forward */
+#define KEY_SR        0x151  /* scroll 1 line back (reverse) */
+#define KEY_NPAGE     0x152  /* next page */
+#define KEY_PPAGE     0x153  /* previous page */
+#define KEY_STAB      0x154  /* set tab */
+#define KEY_CTAB      0x155  /* clear tab */
+#define KEY_CATAB     0x156  /* clear all tabs */
+#define KEY_ENTER     0x157  /* enter or send (unreliable) */
+#define KEY_SRESET    0x158  /* soft/reset (partial/unreliable) */
+#define KEY_RESET     0x159  /* reset/hard reset (unreliable) */
+#define KEY_PRINT     0x15a  /* print/copy */
+#define KEY_LL        0x15b  /* home down/bottom (lower left) */
+#define KEY_ABORT     0x15c  /* abort/terminate key (any) */
+#define KEY_SHELP     0x15d  /* short help */
+#define KEY_LHELP     0x15e  /* long help */
+#define KEY_BTAB      0x15f  /* Back tab key */
+#define KEY_BEG       0x160  /* beg(inning) key */
+#define KEY_CANCEL    0x161  /* cancel key */
+#define KEY_CLOSE     0x162  /* close key */
+#define KEY_COMMAND   0x163  /* cmd (command) key */
+#define KEY_COPY      0x164  /* copy key */
+#define KEY_CREATE    0x165  /* create key */
+#define KEY_END       0x166  /* end key */
+#define KEY_EXIT      0x167  /* exit key */
+#define KEY_FIND      0x168  /* find key */
+#define KEY_HELP      0x169  /* help key */
+#define KEY_MARK      0x16a  /* mark key */
+#define KEY_MESSAGE   0x16b  /* message key */
+#define KEY_MOVE      0x16c  /* move key */
+#define KEY_NEXT      0x16d  /* next object key */
+#define KEY_OPEN      0x16e  /* open key */
+#define KEY_OPTIONS   0x16f  /* options key */
+#define KEY_PREVIOUS  0x170  /* previous object key */
+#define KEY_REDO      0x171  /* redo key */
+#define KEY_REFERENCE 0x172  /* ref(erence) key */
+#define KEY_REFRESH   0x173  /* refresh key */
+#define KEY_REPLACE   0x174  /* replace key */
+#define KEY_RESTART   0x175  /* restart key */
+#define KEY_RESUME    0x176  /* resume key */
+#define KEY_SAVE      0x177  /* save key */
+#define KEY_SBEG      0x178  /* shifted beginning key */
+#define KEY_SCANCEL   0x179  /* shifted cancel key */
+#define KEY_SCOMMAND  0x17a  /* shifted command key */
+#define KEY_SCOPY     0x17b  /* shifted copy key */
+#define KEY_SCREATE   0x17c  /* shifted create key */
+#define KEY_SDC       0x17d  /* shifted delete char key */
+#define KEY_SDL       0x17e  /* shifted delete line key */
+#define KEY_SELECT    0x17f  /* select key */
+#define KEY_SEND      0x180  /* shifted end key */
+#define KEY_SEOL      0x181  /* shifted clear line key */
+#define KEY_SEXIT     0x182  /* shifted exit key */
+#define KEY_SFIND     0x183  /* shifted find key */
+#define KEY_SHOME     0x184  /* shifted home key */
+#define KEY_SIC       0x185  /* shifted insert char key */
+
+#define KEY_SLEFT     0x187  /* shifted left arrow key */
+#define KEY_SMESSAGE  0x188  /* shifted message key */
+#define KEY_SMOVE     0x189  /* shifted move key */
+#define KEY_SNEXT     0x18a  /* shifted next key */
+#define KEY_SOPTIONS  0x18b  /* shifted options key */
+#define KEY_SPREVIOUS 0x18c  /* shifted prev key */
+#define KEY_SPRINT    0x18d  /* shifted print key */
+#define KEY_SREDO     0x18e  /* shifted redo key */
+#define KEY_SREPLACE  0x18f  /* shifted replace key */
+#define KEY_SRIGHT    0x190  /* shifted right arrow */
+#define KEY_SRSUME    0x191  /* shifted resume key */
+#define KEY_SSAVE     0x192  /* shifted save key */
+#define KEY_SSUSPEND  0x193  /* shifted suspend key */
+#define KEY_SUNDO     0x194  /* shifted undo key */
+#define KEY_SUSPEND   0x195  /* suspend key */
+#define KEY_UNDO      0x196  /* undo key */
+
+/* PDCurses-specific key definitions -- PC only */
+
+#define ALT_0         0x197
+#define ALT_1         0x198
+#define ALT_2         0x199
+#define ALT_3         0x19a
+#define ALT_4         0x19b
+#define ALT_5         0x19c
+#define ALT_6         0x19d
+#define ALT_7         0x19e
+#define ALT_8         0x19f
+#define ALT_9         0x1a0
+#define ALT_A         0x1a1
+#define ALT_B         0x1a2
+#define ALT_C         0x1a3
+#define ALT_D         0x1a4
+#define ALT_E         0x1a5
+#define ALT_F         0x1a6
+#define ALT_G         0x1a7
+#define ALT_H         0x1a8
+#define ALT_I         0x1a9
+#define ALT_J         0x1aa
+#define ALT_K         0x1ab
+#define ALT_L         0x1ac
+#define ALT_M         0x1ad
+#define ALT_N         0x1ae
+#define ALT_O         0x1af
+#define ALT_P         0x1b0
+#define ALT_Q         0x1b1
+#define ALT_R         0x1b2
+#define ALT_S         0x1b3
+#define ALT_T         0x1b4
+#define ALT_U         0x1b5
+#define ALT_V         0x1b6
+#define ALT_W         0x1b7
+#define ALT_X         0x1b8
+#define ALT_Y         0x1b9
+#define ALT_Z         0x1ba
+
+#define CTL_LEFT      0x1bb  /* Control-Left-Arrow */
+#define CTL_RIGHT     0x1bc
+#define CTL_PGUP      0x1bd
+#define CTL_PGDN      0x1be
+#define CTL_HOME      0x1bf
+#define CTL_END       0x1c0
+
+#define KEY_A1        0x1c1  /* upper left on Virtual keypad */
+#define KEY_A2        0x1c2  /* upper middle on Virt. keypad */
+#define KEY_A3        0x1c3  /* upper right on Vir. keypad */
+#define KEY_B1        0x1c4  /* middle left on Virt. keypad */
+#define KEY_B2        0x1c5  /* center on Virt. keypad */
+#define KEY_B3        0x1c6  /* middle right on Vir. keypad */
+#define KEY_C1        0x1c7  /* lower left on Virt. keypad */
+#define KEY_C2        0x1c8  /* lower middle on Virt. keypad */
+#define KEY_C3        0x1c9  /* lower right on Vir. keypad */
+
+#define PADSLASH      0x1ca  /* slash on keypad */
+#define PADENTER      0x1cb  /* enter on keypad */
+#define CTL_PADENTER  0x1cc  /* ctl-enter on keypad */
+#define ALT_PADENTER  0x1cd  /* alt-enter on keypad */
+#define PADSTOP       0x1ce  /* stop on keypad */
+#define PADSTAR       0x1cf  /* star on keypad */
+#define PADMINUS      0x1d0  /* minus on keypad */
+#define PADPLUS       0x1d1  /* plus on keypad */
+#define CTL_PADSTOP   0x1d2  /* ctl-stop on keypad */
+#define CTL_PADCENTER 0x1d3  /* ctl-center on keypad */
+#define CTL_PADPLUS   0x1d4  /* ctl-plus on keypad */
+#define CTL_PADMINUS  0x1d5  /* ctl-minus on keypad */
+#define CTL_PADSLASH  0x1d6  /* ctl-slash on keypad */
+#define CTL_PADSTAR   0x1d7  /* ctl-star on keypad */
+#define ALT_PADPLUS   0x1d8  /* alt-plus on keypad */
+#define ALT_PADMINUS  0x1d9  /* alt-minus on keypad */
+#define ALT_PADSLASH  0x1da  /* alt-slash on keypad */
+#define ALT_PADSTAR   0x1db  /* alt-star on keypad */
+#define ALT_PADSTOP   0x1dc  /* alt-stop on keypad */
+#define CTL_INS       0x1dd  /* ctl-insert */
+#define ALT_DEL       0x1de  /* alt-delete */
+#define ALT_INS       0x1df  /* alt-insert */
+#define CTL_UP        0x1e0  /* ctl-up arrow */
+#define CTL_DOWN      0x1e1  /* ctl-down arrow */
+#define CTL_TAB       0x1e2  /* ctl-tab */
+#define ALT_TAB       0x1e3
+#define ALT_MINUS     0x1e4
+#define ALT_EQUAL     0x1e5
+#define ALT_HOME      0x1e6
+#define ALT_PGUP      0x1e7
+#define ALT_PGDN      0x1e8
+#define ALT_END       0x1e9
+#define ALT_UP        0x1ea  /* alt-up arrow */
+#define ALT_DOWN      0x1eb  /* alt-down arrow */
+#define ALT_RIGHT     0x1ec  /* alt-right arrow */
+#define ALT_LEFT      0x1ed  /* alt-left arrow */
+#define ALT_ENTER     0x1ee  /* alt-enter */
+#define ALT_ESC       0x1ef  /* alt-escape */
+#define ALT_BQUOTE    0x1f0  /* alt-back quote */
+#define ALT_LBRACKET  0x1f1  /* alt-left bracket */
+#define ALT_RBRACKET  0x1f2  /* alt-right bracket */
+#define ALT_SEMICOLON 0x1f3  /* alt-semi-colon */
+#define ALT_FQUOTE    0x1f4  /* alt-forward quote */
+#define ALT_COMMA     0x1f5  /* alt-comma */
+#define ALT_STOP      0x1f6  /* alt-stop */
+#define ALT_FSLASH    0x1f7  /* alt-forward slash */
+#define ALT_BKSP      0x1f8  /* alt-backspace */
+#define CTL_BKSP      0x1f9  /* ctl-backspace */
+#define PAD0          0x1fa  /* keypad 0 */
+
+#define CTL_PAD0      0x1fb  /* ctl-keypad 0 */
+#define CTL_PAD1      0x1fc
+#define CTL_PAD2      0x1fd
+#define CTL_PAD3      0x1fe
+#define CTL_PAD4      0x1ff
+#define CTL_PAD5      0x200
+#define CTL_PAD6      0x201
+#define CTL_PAD7      0x202
+#define CTL_PAD8      0x203
+#define CTL_PAD9      0x204
+
+#define ALT_PAD0      0x205  /* alt-keypad 0 */
+#define ALT_PAD1      0x206
+#define ALT_PAD2      0x207
+#define ALT_PAD3      0x208
+#define ALT_PAD4      0x209
+#define ALT_PAD5      0x20a
+#define ALT_PAD6      0x20b
+#define ALT_PAD7      0x20c
+#define ALT_PAD8      0x20d
+#define ALT_PAD9      0x20e
+
+#define CTL_DEL       0x20f  /* ctl-delete */
+#define ALT_BSLASH    0x210  /* alt-back slash */
+#define CTL_ENTER     0x211  /* ctl-enter */
+
+#define SHF_PADENTER  0x212  /* shift-enter on keypad */
+#define SHF_PADSLASH  0x213  /* shift-slash on keypad */
+#define SHF_PADSTAR   0x214  /* shift-star  on keypad */
+#define SHF_PADPLUS   0x215  /* shift-plus  on keypad */
+#define SHF_PADMINUS  0x216  /* shift-minus on keypad */
+#define SHF_UP        0x217  /* shift-up on keypad */
+#define SHF_DOWN      0x218  /* shift-down on keypad */
+#define SHF_IC        0x219  /* shift-insert on keypad */
+#define SHF_DC        0x21a  /* shift-delete on keypad */
+
+#define KEY_MOUSE     0x21b  /* "mouse" key */
+#define KEY_SHIFT_L   0x21c  /* Left-shift */
+#define KEY_SHIFT_R   0x21d  /* Right-shift */
+#define KEY_CONTROL_L 0x21e  /* Left-control */
+#define KEY_CONTROL_R 0x21f  /* Right-control */
+#define KEY_ALT_L     0x220  /* Left-alt */
+#define KEY_ALT_R     0x221  /* Right-alt */
+#define KEY_RESIZE    0x222  /* Window resize */
+#define KEY_SUP       0x223  /* Shifted up arrow */
+#define KEY_SDOWN     0x224  /* Shifted down arrow */
+
+#define KEY_MIN       KEY_BREAK      /* Minimum curses key value */
+#define KEY_MAX       KEY_SDOWN      /* Maximum curses key value */
+
+#define KEY_F(n)      (KEY_F0 + (n)) /* key code n places after KEY_F0 */
+
+/*----------------------------------------------------------------------
+ *
+ *  PDCurses Function Declarations
+ *
+ */
+
+/* Standard */
+
+int     addch(const chtype);
+int     addchnstr(const chtype *, int);
+int     addchstr(const chtype *);
+int     addnstr(const char *, int);
+int     addstr(const char *);
+int     attroff(chtype);
+int     attron(chtype);
+int     attrset(chtype);
+int     attr_get(attr_t *, short *, void *);
+int     attr_off(attr_t, void *);
+int     attr_on(attr_t, void *);
+int     attr_set(attr_t, short, void *);
+int     baudrate(void);
+int     beep(void);
+int     bkgd(chtype);
+void    bkgdset(chtype);
+int     border(chtype, chtype, chtype, chtype, chtype, chtype, chtype, chtype);
+int     box(WINDOW *, chtype, chtype);
+bool    can_change_color(void);
+int     cbreak(void); 
+int     chgat(int, attr_t, short, const void *);
+int     clearok(WINDOW *, bool);
+int     clear(void);
+int     clrtobot(void);
+int     clrtoeol(void);
+int     color_content(short, short *, short *, short *);
+int     color_set(short, void *);
+int     copywin(const WINDOW *, WINDOW *, int, int, int, int, int, int, int);
+int     curs_set(int);
+int     def_prog_mode(void);
+int     def_shell_mode(void);
+int     delay_output(int);
+int     delch(void);
+int     deleteln(void);
+void    delscreen(SCREEN *); 
+int     delwin(WINDOW *);
+WINDOW *derwin(WINDOW *, int, int, int, int);
+int     doupdate(void);
+WINDOW *dupwin(WINDOW *);
+int     echochar(const chtype);
+int     echo(void);
+int     endwin(void);
+char    erasechar(void);
+int     erase(void);
+void    filter(void);
+int     flash(void);
+int     flushinp(void);
+chtype  getbkgd(WINDOW *);
+int     getnstr(char *, int);
+int     getstr(char *);
+WINDOW *getwin(FILE *);
+int     halfdelay(int);
+bool    has_colors(void);
+bool    has_ic(void);
+bool    has_il(void);
+int     hline(chtype, int);
+void    idcok(WINDOW *, bool);
+int     idlok(WINDOW *, bool);
+void    immedok(WINDOW *, bool);
+int     inchnstr(chtype *, int);
+int     inchstr(chtype *);
+chtype  inch(void);
+int     init_color(short, short, short, short);
+int     init_pair(short, short, short);
+WINDOW *initscr(void);
+int     innstr(char *, int);
+int     insch(chtype);
+int     insdelln(int);
+int     insertln(void);
+int     insnstr(const char *, int);
+int     insstr(const char *);
+int     instr(char *);
+int     intrflush(WINDOW *, bool);
+bool    isendwin(void);
+bool    is_linetouched(WINDOW *, int);
+bool    is_wintouched(WINDOW *);
+char   *keyname(int);
+int     keypad(WINDOW *, bool);
+char    killchar(void);
+int     leaveok(WINDOW *, bool);
+char   *longname(void);
+int     meta(WINDOW *, bool);
+int     move(int, int);
+int     mvaddch(int, int, const chtype);
+int     mvaddchnstr(int, int, const chtype *, int);
+int     mvaddchstr(int, int, const chtype *);
+int     mvaddnstr(int, int, const char *, int);
+int     mvaddstr(int, int, const char *);
+int     mvchgat(int, int, int, attr_t, short, const void *);
+int     mvcur(int, int, int, int);
+int     mvdelch(int, int);
+int     mvderwin(WINDOW *, int, int);
+int     mvgetch(int, int);
+int     mvgetnstr(int, int, char *, int);
+int     mvgetstr(int, int, char *);
+int     mvhline(int, int, chtype, int);
+chtype  mvinch(int, int);
+int     mvinchnstr(int, int, chtype *, int);
+int     mvinchstr(int, int, chtype *);
+int     mvinnstr(int, int, char *, int);
+int     mvinsch(int, int, chtype);
+int     mvinsnstr(int, int, const char *, int);
+int     mvinsstr(int, int, const char *);
+int     mvinstr(int, int, char *);
+int     mvprintw(int, int, const char *, ...);
+int     mvscanw(int, int, const char *, ...);
+int     mvvline(int, int, chtype, int);
+int     mvwaddchnstr(WINDOW *, int, int, const chtype *, int);
+int     mvwaddchstr(WINDOW *, int, int, const chtype *);
+int     mvwaddch(WINDOW *, int, int, const chtype);
+int     mvwaddnstr(WINDOW *, int, int, const char *, int);
+int     mvwaddstr(WINDOW *, int, int, const char *);
+int     mvwchgat(WINDOW *, int, int, int, attr_t, short, const void *);
+int     mvwdelch(WINDOW *, int, int);
+int     mvwgetch(WINDOW *, int, int);
+int     mvwgetnstr(WINDOW *, int, int, char *, int);
+int     mvwgetstr(WINDOW *, int, int, char *);
+int     mvwhline(WINDOW *, int, int, chtype, int);
+int     mvwinchnstr(WINDOW *, int, int, chtype *, int);
+int     mvwinchstr(WINDOW *, int, int, chtype *);
+chtype  mvwinch(WINDOW *, int, int);
+int     mvwinnstr(WINDOW *, int, int, char *, int);
+int     mvwinsch(WINDOW *, int, int, chtype);
+int     mvwinsnstr(WINDOW *, int, int, const char *, int);
+int     mvwinsstr(WINDOW *, int, int, const char *);
+int     mvwinstr(WINDOW *, int, int, char *);
+int     mvwin(WINDOW *, int, int);
+int     mvwprintw(WINDOW *, int, int, const char *, ...);
+int     mvwscanw(WINDOW *, int, int, const char *, ...);
+int     mvwvline(WINDOW *, int, int, chtype, int);
+int     napms(int);
+WINDOW *newpad(int, int);
+SCREEN *newterm(const char *, FILE *, FILE *);
+WINDOW *newwin(int, int, int, int);
+int     nl(void);
+int     nocbreak(void);
+int     nodelay(WINDOW *, bool);
+int     noecho(void);
+int     nonl(void);
+void    noqiflush(void);
+int     noraw(void);
+int     notimeout(WINDOW *, bool);
+int     overlay(const WINDOW *, WINDOW *);
+int     overwrite(const WINDOW *, WINDOW *);
+int     pair_content(short, short *, short *);
+int     pechochar(WINDOW *, chtype);
+int     pnoutrefresh(WINDOW *, int, int, int, int, int, int);
+int     prefresh(WINDOW *, int, int, int, int, int, int);
+int     printw(const char *, ...);
+int     putwin(WINDOW *, FILE *);
+void    qiflush(void);
+int     raw(void);
+int     redrawwin(WINDOW *);
+int     refresh(void);
+int     reset_prog_mode(void);
+int     reset_shell_mode(void);
+int     resetty(void);
+int     ripoffline(int, int (*)(WINDOW *, int));
+int     savetty(void);
+int     scanw(const char *, ...);
+int     scr_dump(const char *);
+int     scr_init(const char *);
+int     scr_restore(const char *);
+int     scr_set(const char *);
+int     scrl(int);
+int     scroll(WINDOW *);
+int     scrollok(WINDOW *, bool);
+SCREEN *set_term(SCREEN *);
+int     setscrreg(int, int);
+int     slk_attroff(const chtype);
+int     slk_attr_off(const attr_t, void *);
+int     slk_attron(const chtype);
+int     slk_attr_on(const attr_t, void *);
+int     slk_attrset(const chtype);
+int     slk_attr_set(const attr_t, short, void *);
+int     slk_clear(void);
+int     slk_color(short);
+int     slk_init(int);
+char   *slk_label(int);
+int     slk_noutrefresh(void);
+int     slk_refresh(void);
+int     slk_restore(void);
+int     slk_set(int, const char *, int);
+int     slk_touch(void);
+int     standend(void);
+int     standout(void);
+int     start_color(void);
+WINDOW *subpad(WINDOW *, int, int, int, int);
+WINDOW *subwin(WINDOW *, int, int, int, int);
+int     syncok(WINDOW *, bool);
+chtype  termattrs(void);
+attr_t  term_attrs(void);
+char   *termname(void);
+void    timeout(int);
+int     touchline(WINDOW *, int, int);
+int     touchwin(WINDOW *);
+int     typeahead(int);
+int     untouchwin(WINDOW *);
+void    use_env(bool);
+int     vidattr(chtype);
+int     vid_attr(attr_t, short, void *);
+int     vidputs(chtype, int (*)(int));
+int     vid_puts(attr_t, short, void *, int (*)(int));
+int     vline(chtype, int);
+int     vw_printw(WINDOW *, const char *, va_list);
+int     vwprintw(WINDOW *, const char *, va_list);
+int     vw_scanw(WINDOW *, const char *, va_list);
+int     vwscanw(WINDOW *, const char *, va_list);
+int     waddchnstr(WINDOW *, const chtype *, int);
+int     waddchstr(WINDOW *, const chtype *);
+int     waddch(WINDOW *, const chtype);
+int     waddnstr(WINDOW *, const char *, int);
+int     waddstr(WINDOW *, const char *);
+int     wattroff(WINDOW *, chtype);
+int     wattron(WINDOW *, chtype);
+int     wattrset(WINDOW *, chtype);
+int     wattr_get(WINDOW *, attr_t *, short *, void *);
+int     wattr_off(WINDOW *, attr_t, void *);
+int     wattr_on(WINDOW *, attr_t, void *);
+int     wattr_set(WINDOW *, attr_t, short, void *);
+void    wbkgdset(WINDOW *, chtype);
+int     wbkgd(WINDOW *, chtype);
+int     wborder(WINDOW *, chtype, chtype, chtype, chtype,
+                chtype, chtype, chtype, chtype);
+int     wchgat(WINDOW *, int, attr_t, short, const void *);
+int     wclear(WINDOW *);
+int     wclrtobot(WINDOW *);
+int     wclrtoeol(WINDOW *);
+int     wcolor_set(WINDOW *, short, void *);
+void    wcursyncup(WINDOW *);
+int     wdelch(WINDOW *);
+int     wdeleteln(WINDOW *);
+int     wechochar(WINDOW *, const chtype);
+int     werase(WINDOW *);
+int     wgetch(WINDOW *);
+int     wgetnstr(WINDOW *, char *, int);
+int     wgetstr(WINDOW *, char *);
+int     whline(WINDOW *, chtype, int);
+int     winchnstr(WINDOW *, chtype *, int);
+int     winchstr(WINDOW *, chtype *);
+chtype  winch(WINDOW *);
+int     winnstr(WINDOW *, char *, int);
+int     winsch(WINDOW *, chtype);
+int     winsdelln(WINDOW *, int);
+int     winsertln(WINDOW *);
+int     winsnstr(WINDOW *, const char *, int);
+int     winsstr(WINDOW *, const char *);
+int     winstr(WINDOW *, char *);
+int     wmove(WINDOW *, int, int);
+int     wnoutrefresh(WINDOW *);
+int     wprintw(WINDOW *, const char *, ...);
+int     wredrawln(WINDOW *, int, int);
+int     wrefresh(WINDOW *);
+int     wscanw(WINDOW *, const char *, ...);
+int     wscrl(WINDOW *, int);
+int     wsetscrreg(WINDOW *, int, int);
+int     wstandend(WINDOW *);
+int     wstandout(WINDOW *);
+void    wsyncdown(WINDOW *);
+void    wsyncup(WINDOW *);
+void    wtimeout(WINDOW *, int);
+int     wtouchln(WINDOW *, int, int, int);
+int     wvline(WINDOW *, chtype, int);
+
+/* Wide-character functions */
+
+#ifdef PDC_WIDE
+int     addnwstr(const wchar_t *, int);
+int     addwstr(const wchar_t *);
+int     add_wch(const cchar_t *);
+int     add_wchnstr(const cchar_t *, int);
+int     add_wchstr(const cchar_t *);
+int     border_set(const cchar_t *, const cchar_t *, const cchar_t *, 
+                   const cchar_t *, const cchar_t *, const cchar_t *, 
+                   const cchar_t *, const cchar_t *);
+int     box_set(WINDOW *, const cchar_t *, const cchar_t *);
+int     echo_wchar(const cchar_t *);
+int     erasewchar(wchar_t *);
+int     getbkgrnd(cchar_t *);
+int     getcchar(const cchar_t *, wchar_t *, attr_t *, short *, void *);
+int     getn_wstr(wint_t *, int);
+int     get_wch(wint_t *);
+int     get_wstr(wint_t *);
+int     hline_set(const cchar_t *, int);
+int     innwstr(wchar_t *, int);
+int     ins_nwstr(const wchar_t *, int);
+int     ins_wch(const cchar_t *);
+int     ins_wstr(const wchar_t *);
+int     inwstr(wchar_t *);
+int     in_wch(cchar_t *);
+int     in_wchnstr(cchar_t *, int);
+int     in_wchstr(cchar_t *);
+char   *key_name(wchar_t);
+int     killwchar(wchar_t *);
+int     mvaddnwstr(int, int, const wchar_t *, int);
+int     mvaddwstr(int, int, const wchar_t *);
+int     mvadd_wch(int, int, const cchar_t *);
+int     mvadd_wchnstr(int, int, const cchar_t *, int);
+int     mvadd_wchstr(int, int, const cchar_t *);
+int     mvgetn_wstr(int, int, wint_t *, int);
+int     mvget_wch(int, int, wint_t *);
+int     mvget_wstr(int, int, wint_t *);
+int     mvhline_set(int, int, const cchar_t *, int);
+int     mvinnwstr(int, int, wchar_t *, int);
+int     mvins_nwstr(int, int, const wchar_t *, int);
+int     mvins_wch(int, int, const cchar_t *);
+int     mvins_wstr(int, int, const wchar_t *);
+int     mvinwstr(int, int, wchar_t *);
+int     mvin_wch(int, int, cchar_t *);
+int     mvin_wchnstr(int, int, cchar_t *, int);
+int     mvin_wchstr(int, int, cchar_t *);
+int     mvvline_set(int, int, const cchar_t *, int);
+int     mvwaddnwstr(WINDOW *, int, int, const wchar_t *, int);
+int     mvwaddwstr(WINDOW *, int, int, const wchar_t *);
+int     mvwadd_wch(WINDOW *, int, int, const cchar_t *);
+int     mvwadd_wchnstr(WINDOW *, int, int, const cchar_t *, int);
+int     mvwadd_wchstr(WINDOW *, int, int, const cchar_t *);
+int     mvwgetn_wstr(WINDOW *, int, int, wint_t *, int);
+int     mvwget_wch(WINDOW *, int, int, wint_t *);
+int     mvwget_wstr(WINDOW *, int, int, wint_t *);
+int     mvwhline_set(WINDOW *, int, int, const cchar_t *, int);
+int     mvwinnwstr(WINDOW *, int, int, wchar_t *, int);
+int     mvwins_nwstr(WINDOW *, int, int, const wchar_t *, int);
+int     mvwins_wch(WINDOW *, int, int, const cchar_t *);
+int     mvwins_wstr(WINDOW *, int, int, const wchar_t *);
+int     mvwin_wch(WINDOW *, int, int, cchar_t *);
+int     mvwin_wchnstr(WINDOW *, int, int, cchar_t *, int);
+int     mvwin_wchstr(WINDOW *, int, int, cchar_t *);
+int     mvwinwstr(WINDOW *, int, int, wchar_t *);
+int     mvwvline_set(WINDOW *, int, int, const cchar_t *, int);
+int     pecho_wchar(WINDOW *, const cchar_t*);
+int     setcchar(cchar_t*, const wchar_t*, const attr_t, short, const void*);
+int     slk_wset(int, const wchar_t *, int);
+int     unget_wch(const wchar_t);
+int     vline_set(const cchar_t *, int);
+int     waddnwstr(WINDOW *, const wchar_t *, int);
+int     waddwstr(WINDOW *, const wchar_t *);
+int     wadd_wch(WINDOW *, const cchar_t *);
+int     wadd_wchnstr(WINDOW *, const cchar_t *, int);
+int     wadd_wchstr(WINDOW *, const cchar_t *);
+int     wbkgrnd(WINDOW *, const cchar_t *);
+void    wbkgrndset(WINDOW *, const cchar_t *);
+int     wborder_set(WINDOW *, const cchar_t *, const cchar_t *,
+                    const cchar_t *, const cchar_t *, const cchar_t *, 
+                    const cchar_t *, const cchar_t *, const cchar_t *);
+int     wecho_wchar(WINDOW *, const cchar_t *);
+int     wgetbkgrnd(WINDOW *, cchar_t *);
+int     wgetn_wstr(WINDOW *, wint_t *, int);
+int     wget_wch(WINDOW *, wint_t *);
+int     wget_wstr(WINDOW *, wint_t *);
+int     whline_set(WINDOW *, const cchar_t *, int);
+int     winnwstr(WINDOW *, wchar_t *, int);
+int     wins_nwstr(WINDOW *, const wchar_t *, int);
+int     wins_wch(WINDOW *, const cchar_t *);
+int     wins_wstr(WINDOW *, const wchar_t *);
+int     winwstr(WINDOW *, wchar_t *);
+int     win_wch(WINDOW *, cchar_t *);
+int     win_wchnstr(WINDOW *, cchar_t *, int);
+int     win_wchstr(WINDOW *, cchar_t *);
+wchar_t *wunctrl(cchar_t *);
+int     wvline_set(WINDOW *, const cchar_t *, int);
+#endif
+
+/* Quasi-standard */
+
+chtype  getattrs(WINDOW *);
+int     getbegx(WINDOW *);
+int     getbegy(WINDOW *);
+int     getmaxx(WINDOW *);
+int     getmaxy(WINDOW *);
+int     getparx(WINDOW *);
+int     getpary(WINDOW *);
+int     getcurx(WINDOW *);
+int     getcury(WINDOW *);
+void    traceoff(void);
+void    traceon(void);
+char   *unctrl(chtype);
+
+int     crmode(void);
+int     nocrmode(void);
+int     draino(int);
+int     resetterm(void);
+int     fixterm(void);
+int     saveterm(void);
+int     setsyx(int, int);
+
+int     mouse_set(unsigned long);
+int     mouse_on(unsigned long);
+int     mouse_off(unsigned long);
+int     request_mouse_pos(void);
+int     map_button(unsigned long);
+void    wmouse_position(WINDOW *, int *, int *);
+unsigned long getmouse(void);
+unsigned long getbmap(void);
+
+/* ncurses */
+
+int     assume_default_colors(int, int);
+const char *curses_version(void);
+bool    has_key(int);
+int     use_default_colors(void);
+int     wresize(WINDOW *, int, int);
+
+int     mouseinterval(int);
+mmask_t mousemask(mmask_t, mmask_t *);
+bool    mouse_trafo(int *, int *, bool);
+int     nc_getmouse(MEVENT *);
+int     ungetmouse(MEVENT *);
+bool    wenclose(const WINDOW *, int, int);
+bool    wmouse_trafo(const WINDOW *, int *, int *, bool);
+
+/* PDCurses */
+
+int     addrawch(chtype);
+int     insrawch(chtype);
+bool    is_termresized(void);
+int     mvaddrawch(int, int, chtype);
+int     mvdeleteln(int, int);
+int     mvinsertln(int, int);
+int     mvinsrawch(int, int, chtype);
+int     mvwaddrawch(WINDOW *, int, int, chtype);
+int     mvwdeleteln(WINDOW *, int, int);
+int     mvwinsertln(WINDOW *, int, int);
+int     mvwinsrawch(WINDOW *, int, int, chtype);
+int     raw_output(bool);
+int     resize_term(int, int);
+WINDOW *resize_window(WINDOW *, int, int);
+int     waddrawch(WINDOW *, chtype);
+int     winsrawch(WINDOW *, chtype);
+char    wordchar(void);
+
+#ifdef PDC_WIDE
+wchar_t *slk_wlabel(int);
+#endif
+
+void    PDC_debug(const char *, ...);
+int     PDC_ungetch(int);
+int     PDC_set_blink(bool);
+int     PDC_set_line_color(short);
+void    PDC_set_title(const char *);
+
+int     PDC_clearclipboard(void);
+int     PDC_freeclipboard(char *);
+int     PDC_getclipboard(char **, long *);
+int     PDC_setclipboard(const char *, long);
+
+unsigned long PDC_get_input_fd(void);
+unsigned long PDC_get_key_modifiers(void);
+int     PDC_return_key_modifiers(bool);
+int     PDC_save_key_modifiers(bool);
+
+#ifdef XCURSES
+WINDOW *Xinitscr(int, char **);
+void    XCursesExit(void);
+int     sb_init(void);
+int     sb_set_horz(int, int, int);
+int     sb_set_vert(int, int, int);
+int     sb_get_horz(int *, int *, int *);
+int     sb_get_vert(int *, int *, int *);
+int     sb_refresh(void);
+#endif
+
+/*** Functions defined as macros ***/
+
+/* getch() and ungetch() conflict with some DOS libraries */
+
+#define getch()            wgetch(stdscr)
+#define ungetch(ch)        PDC_ungetch(ch)
+
+#define COLOR_PAIR(n)      (((chtype)(n) << PDC_COLOR_SHIFT) & A_COLOR)
+#define PAIR_NUMBER(n)     (((n) & A_COLOR) >> PDC_COLOR_SHIFT)
+
+/* These will _only_ work as macros */
+
+#define getbegyx(w, y, x)  (y = getbegy(w), x = getbegx(w))
+#define getmaxyx(w, y, x)  (y = getmaxy(w), x = getmaxx(w))
+#define getparyx(w, y, x)  (y = getpary(w), x = getparx(w))
+#define getyx(w, y, x)     (y = getcury(w), x = getcurx(w))
+
+#define getsyx(y, x)       { if (curscr->_leaveit) (y)=(x)=-1; \
+                             else getyx(curscr,(y),(x)); }
+
+#ifdef NCURSES_MOUSE_VERSION
+# define getmouse(x) nc_getmouse(x)
+#endif
+
+/* return codes from PDC_getclipboard() and PDC_setclipboard() calls */
+
+#define PDC_CLIP_SUCCESS         0
+#define PDC_CLIP_ACCESS_ERROR    1
+#define PDC_CLIP_EMPTY           2
+#define PDC_CLIP_MEMORY_ERROR    3
+
+/* PDCurses key modifier masks */
+
+#define PDC_KEY_MODIFIER_SHIFT   1
+#define PDC_KEY_MODIFIER_CONTROL 2
+#define PDC_KEY_MODIFIER_ALT     4
+#define PDC_KEY_MODIFIER_NUMLOCK 8
+
+#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS)
+# undef bool
+}
+#endif
+
+#endif  /* __PDCURSES__ */
diff --git a/win32/zconf.h b/win32/zconf.h

new file mode 100644 (file)

index 0000000..03a9431
--- /dev/null
+++ b/win32/zconf.h
@@ -0,0 +1,332 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#ifndef ZCONF_H
+#define ZCONF_H
+
+/*
+ * If you *really* need a unique prefix for all types and library functions,
+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ */
+#ifdef Z_PREFIX
+#  define deflateInit_          z_deflateInit_
+#  define deflate               z_deflate
+#  define deflateEnd            z_deflateEnd
+#  define inflateInit_          z_inflateInit_
+#  define inflate               z_inflate
+#  define inflateEnd            z_inflateEnd
+#  define deflateInit2_         z_deflateInit2_
+#  define deflateSetDictionary  z_deflateSetDictionary
+#  define deflateCopy           z_deflateCopy
+#  define deflateReset          z_deflateReset
+#  define deflateParams         z_deflateParams
+#  define deflateBound          z_deflateBound
+#  define deflatePrime          z_deflatePrime
+#  define inflateInit2_         z_inflateInit2_
+#  define inflateSetDictionary  z_inflateSetDictionary
+#  define inflateSync           z_inflateSync
+#  define inflateSyncPoint      z_inflateSyncPoint
+#  define inflateCopy           z_inflateCopy
+#  define inflateReset          z_inflateReset
+#  define inflateBack           z_inflateBack
+#  define inflateBackEnd        z_inflateBackEnd
+#  define compress              z_compress
+#  define compress2             z_compress2
+#  define compressBound         z_compressBound
+#  define uncompress            z_uncompress
+#  define adler32               z_adler32
+#  define crc32                 z_crc32
+#  define get_crc_table         z_get_crc_table
+#  define zError                z_zError
+
+#  define alloc_func            z_alloc_func
+#  define free_func             z_free_func
+#  define in_func               z_in_func
+#  define out_func              z_out_func
+#  define Byte                  z_Byte
+#  define uInt                  z_uInt
+#  define uLong                 z_uLong
+#  define Bytef                 z_Bytef
+#  define charf                 z_charf
+#  define intf                  z_intf
+#  define uIntf                 z_uIntf
+#  define uLongf                z_uLongf
+#  define voidpf                z_voidpf
+#  define voidp                 z_voidp
+#endif
+
+#if defined(__MSDOS__) && !defined(MSDOS)
+#  define MSDOS
+#endif
+#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2)
+#  define OS2
+#endif
+#if defined(_WINDOWS) && !defined(WINDOWS)
+#  define WINDOWS
+#endif
+#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__)
+#  ifndef WIN32
+#    define WIN32
+#  endif
+#endif
+#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32)
+#  if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__)
+#    ifndef SYS16BIT
+#      define SYS16BIT
+#    endif
+#  endif
+#endif
+
+/*
+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
+ * than 64k bytes at a time (needed on systems with 16-bit int).
+ */
+#ifdef SYS16BIT
+#  define MAXSEG_64K
+#endif
+#ifdef MSDOS
+#  define UNALIGNED_OK
+#endif
+
+#ifdef __STDC_VERSION__
+#  ifndef STDC
+#    define STDC
+#  endif
+#  if __STDC_VERSION__ >= 199901L
+#    ifndef STDC99
+#      define STDC99
+#    endif
+#  endif
+#endif
+#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__))
+#  define STDC
+#endif
+
+#if defined(__OS400__) && !defined(STDC)    /* iSeries (formerly AS/400). */
+#  define STDC
+#endif
+
+#ifndef STDC
+#  ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
+#    define const       /* note: need a more gentle solution here */
+#  endif
+#endif
+
+/* Some Mac compilers merge all .h files incorrectly: */
+#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__)
+#  define NO_DUMMY_DECL
+#endif
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+#  ifdef MAXSEG_64K
+#    define MAX_MEM_LEVEL 8
+#  else
+#    define MAX_MEM_LEVEL 9
+#  endif
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MAX_WBITS
+#  define MAX_WBITS   15 /* 32K LZ77 window */
+#endif
+
+/* The memory requirements for deflate are (in bytes):
+            (1 << (windowBits+2)) +  (1 << (memLevel+9))
+ that is: 128K for windowBits=15  +  128K for memLevel = 8  (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+     make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+   The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus a few kilobytes
+ for small objects.
+*/
+
+                        /* Type declarations */
+
+#ifndef OF /* function prototypes */
+#  ifdef STDC
+#    define OF(args)  args
+#  else
+#    define OF(args)  ()
+#  endif
+#endif
+
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h.  If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#ifdef SYS16BIT
+#  if defined(M_I86SM) || defined(M_I86MM)
+     /* MSC small or medium model */
+#    define SMALL_MEDIUM
+#    ifdef _MSC_VER
+#      define FAR _far
+#    else
+#      define FAR far
+#    endif
+#  endif
+#  if (defined(__SMALL__) || defined(__MEDIUM__))
+     /* Turbo C small or medium model */
+#    define SMALL_MEDIUM
+#    ifdef __BORLANDC__
+#      define FAR _far
+#    else
+#      define FAR far
+#    endif
+#  endif
+#endif
+
+#if defined(WINDOWS) || defined(WIN32)
+   /* If building or using zlib as a DLL, define ZLIB_DLL.
+    * This is not mandatory, but it offers a little performance increase.
+    */
+#  ifdef ZLIB_DLL
+#    if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
+#      ifdef ZLIB_INTERNAL
+#        define ZEXTERN extern __declspec(dllexport)
+#      else
+#        define ZEXTERN extern __declspec(dllimport)
+#      endif
+#    endif
+#  endif  /* ZLIB_DLL */
+   /* If building or using zlib with the WINAPI/WINAPIV calling convention,
+    * define ZLIB_WINAPI.
+    * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+    */
+#  ifdef ZLIB_WINAPI
+#    ifdef FAR
+#      undef FAR
+#    endif
+#    include <windows.h>
+     /* No need for _export, use ZLIB.DEF instead. */
+     /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+#    define ZEXPORT WINAPI
+#    ifdef WIN32
+#      define ZEXPORTVA WINAPIV
+#    else
+#      define ZEXPORTVA FAR CDECL
+#    endif
+#  endif
+#endif
+
+#if defined (__BEOS__)
+#  ifdef ZLIB_DLL
+#    ifdef ZLIB_INTERNAL
+#      define ZEXPORT   __declspec(dllexport)
+#      define ZEXPORTVA __declspec(dllexport)
+#    else
+#      define ZEXPORT   __declspec(dllimport)
+#      define ZEXPORTVA __declspec(dllimport)
+#    endif
+#  endif
+#endif
+
+#ifndef ZEXTERN
+#  define ZEXTERN extern
+#endif
+#ifndef ZEXPORT
+#  define ZEXPORT
+#endif
+#ifndef ZEXPORTVA
+#  define ZEXPORTVA
+#endif
+
+#ifndef FAR
+#  define FAR
+#endif
+
+#if !defined(__MACTYPES__)
+typedef unsigned char  Byte;  /* 8 bits */
+#endif
+typedef unsigned int   uInt;  /* 16 bits or more */
+typedef unsigned long  uLong; /* 32 bits or more */
+
+#ifdef SMALL_MEDIUM
+   /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */
+#  define Bytef Byte FAR
+#else
+   typedef Byte  FAR Bytef;
+#endif
+typedef char  FAR charf;
+typedef int   FAR intf;
+typedef uInt  FAR uIntf;
+typedef uLong FAR uLongf;
+
+#ifdef STDC
+   typedef void const *voidpc;
+   typedef void FAR   *voidpf;
+   typedef void       *voidp;
+#else
+   typedef Byte const *voidpc;
+   typedef Byte FAR   *voidpf;
+   typedef Byte       *voidp;
+#endif
+
+#if 0           /* HAVE_UNISTD_H -- this line is updated by ./configure */
+#  include <sys/types.h> /* for off_t */
+#  include <unistd.h>    /* for SEEK_* and off_t */
+#  ifdef VMS
+#    include <unixio.h>   /* for off_t */
+#  endif
+#  define z_off_t off_t
+#endif
+#ifndef SEEK_SET
+#  define SEEK_SET        0       /* Seek from beginning of file.  */
+#  define SEEK_CUR        1       /* Seek from current position.  */
+#  define SEEK_END        2       /* Set file pointer to EOF plus "offset" */
+#endif
+#ifndef z_off_t
+#  define z_off_t long
+#endif
+
+#if defined(__OS400__)
+#  define NO_vsnprintf
+#endif
+
+#if defined(__MVS__)
+#  define NO_vsnprintf
+#  ifdef FAR
+#    undef FAR
+#  endif
+#endif
+
+/* MVS linker does not support external names larger than 8 bytes */
+#if defined(__MVS__)
+#   pragma map(deflateInit_,"DEIN")
+#   pragma map(deflateInit2_,"DEIN2")
+#   pragma map(deflateEnd,"DEEND")
+#   pragma map(deflateBound,"DEBND")
+#   pragma map(inflateInit_,"ININ")
+#   pragma map(inflateInit2_,"ININ2")
+#   pragma map(inflateEnd,"INEND")
+#   pragma map(inflateSync,"INSY")
+#   pragma map(inflateSetDictionary,"INSEDI")
+#   pragma map(compressBound,"CMBND")
+#   pragma map(inflate_table,"INTABL")
+#   pragma map(inflate_fast,"INFA")
+#   pragma map(inflate_copyright,"INCOPY")
+#endif
+
+#endif /* ZCONF_H */
diff --git a/win32/zlib.h b/win32/zlib.h

new file mode 100644 (file)

index 0000000..0228179
--- /dev/null
+++ b/win32/zlib.h
@@ -0,0 +1,1357 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.2.3, July 18th, 2005
+
+  Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+  (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+#ifndef ZLIB_H
+#define ZLIB_H
+
+#include "zconf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZLIB_VERSION "1.2.3"
+#define ZLIB_VERNUM 0x1230
+
+/*
+     The 'zlib' compression library provides in-memory compression and
+  decompression functions, including integrity checks of the uncompressed
+  data.  This version of the library supports only one compression method
+  (deflation) but other algorithms will be added later and will have the same
+  stream interface.
+
+     Compression can be done in a single step if the buffers are large
+  enough (for example if an input file is mmap'ed), or can be done by
+  repeated calls of the compression function.  In the latter case, the
+  application must provide more input and/or consume the output
+  (providing more output space) before each call.
+
+     The compressed data format used by default by the in-memory functions is
+  the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+  around a deflate stream, which is itself documented in RFC 1951.
+
+     The library also supports reading and writing files in gzip (.gz) format
+  with an interface similar to that of stdio using the functions that start
+  with "gz".  The gzip format is different from the zlib format.  gzip is a
+  gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+
+     This library can optionally read and write gzip streams in memory as well.
+
+     The zlib format was designed to be compact and fast for use in memory
+  and on communications channels.  The gzip format was designed for single-
+  file compression on file systems, has a larger header than zlib to maintain
+  directory information, and uses a different, slower check method than zlib.
+
+     The library does not install any signal handler. The decoder checks
+  the consistency of the compressed data, so the library should never
+  crash even in case of corrupted input.
+*/
+
+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
+typedef void   (*free_func)  OF((voidpf opaque, voidpf address));
+
+struct internal_state;
+
+typedef struct z_stream_s {
+    Bytef    *next_in;  /* next input byte */
+    uInt     avail_in;  /* number of bytes available at next_in */
+    uLong    total_in;  /* total nb of input bytes read so far */
+
+    Bytef    *next_out; /* next output byte should be put there */
+    uInt     avail_out; /* remaining free space at next_out */
+    uLong    total_out; /* total nb of bytes output so far */
+
+    char     *msg;      /* last error message, NULL if no error */
+    struct internal_state FAR *state; /* not visible by applications */
+
+    alloc_func zalloc;  /* used to allocate the internal state */
+    free_func  zfree;   /* used to free the internal state */
+    voidpf     opaque;  /* private data object passed to zalloc and zfree */
+
+    int     data_type;  /* best guess about the data type: binary or text */
+    uLong   adler;      /* adler32 value of the uncompressed data */
+    uLong   reserved;   /* reserved for future use */
+} z_stream;
+
+typedef z_stream FAR *z_streamp;
+
+/*
+     gzip header information passed to and from zlib routines.  See RFC 1952
+  for more details on the meanings of these fields.
+*/
+typedef struct gz_header_s {
+    int     text;       /* true if compressed data believed to be text */
+    uLong   time;       /* modification time */
+    int     xflags;     /* extra flags (not used when writing a gzip file) */
+    int     os;         /* operating system */
+    Bytef   *extra;     /* pointer to extra field or Z_NULL if none */
+    uInt    extra_len;  /* extra field length (valid if extra != Z_NULL) */
+    uInt    extra_max;  /* space at extra (only when reading header) */
+    Bytef   *name;      /* pointer to zero-terminated file name or Z_NULL */
+    uInt    name_max;   /* space at name (only when reading header) */
+    Bytef   *comment;   /* pointer to zero-terminated comment or Z_NULL */
+    uInt    comm_max;   /* space at comment (only when reading header) */
+    int     hcrc;       /* true if there was or will be a header crc */
+    int     done;       /* true when done reading gzip header (not used
+                           when writing a gzip file) */
+} gz_header;
+
+typedef gz_header FAR *gz_headerp;
+
+/*
+   The application must update next_in and avail_in when avail_in has
+   dropped to zero. It must update next_out and avail_out when avail_out
+   has dropped to zero. The application must initialize zalloc, zfree and
+   opaque before calling the init function. All other fields are set by the
+   compression library and must not be updated by the application.
+
+   The opaque value provided by the application will be passed as the first
+   parameter for calls of zalloc and zfree. This can be useful for custom
+   memory management. The compression library attaches no meaning to the
+   opaque value.
+
+   zalloc must return Z_NULL if there is not enough memory for the object.
+   If zlib is used in a multi-threaded application, zalloc and zfree must be
+   thread safe.
+
+   On 16-bit systems, the functions zalloc and zfree must be able to allocate
+   exactly 65536 bytes, but will not be required to allocate more than this
+   if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
+   pointers returned by zalloc for objects of exactly 65536 bytes *must*
+   have their offset normalized to zero. The default allocation function
+   provided by this library ensures this (see zutil.c). To reduce memory
+   requirements and avoid any allocation of 64K objects, at the expense of
+   compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
+
+   The fields total_in and total_out can be used for statistics or
+   progress reports. After compression, total_in holds the total size of
+   the uncompressed data and may be saved for use in the decompressor
+   (particularly if the decompressor wants to decompress everything in
+   a single step).
+*/
+
+                        /* constants */
+
+#define Z_NO_FLUSH      0
+#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */
+#define Z_SYNC_FLUSH    2
+#define Z_FULL_FLUSH    3
+#define Z_FINISH        4
+#define Z_BLOCK         5
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK            0
+#define Z_STREAM_END    1
+#define Z_NEED_DICT     2
+#define Z_ERRNO        (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR   (-3)
+#define Z_MEM_ERROR    (-4)
+#define Z_BUF_ERROR    (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative
+ * values are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION         0
+#define Z_BEST_SPEED             1
+#define Z_BEST_COMPRESSION       9
+#define Z_DEFAULT_COMPRESSION  (-1)
+/* compression levels */
+
+#define Z_FILTERED            1
+#define Z_HUFFMAN_ONLY        2
+#define Z_RLE                 3
+#define Z_FIXED               4
+#define Z_DEFAULT_STRATEGY    0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY   0
+#define Z_TEXT     1
+#define Z_ASCII    Z_TEXT   /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN  2
+/* Possible values of the data_type field (though see inflate()) */
+
+#define Z_DEFLATED   8
+/* The deflate compression method (the only one supported in this version) */
+
+#define Z_NULL  0  /* for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+
+                        /* basic functions */
+
+ZEXTERN const char * ZEXPORT zlibVersion OF((void));
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+   If the first character differs, the library code actually used is
+   not compatible with the zlib.h header file used by the application.
+   This check is automatically made by deflateInit and inflateInit.
+ */
+
+/*
+ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
+
+     Initializes the internal stream state for compression. The fields
+   zalloc, zfree and opaque must be initialized before by the caller.
+   If zalloc and zfree are set to Z_NULL, deflateInit updates them to
+   use default allocation functions.
+
+     The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+   1 gives best speed, 9 gives best compression, 0 gives no compression at
+   all (the input data is simply copied a block at a time).
+   Z_DEFAULT_COMPRESSION requests a default compromise between speed and
+   compression (currently equivalent to level 6).
+
+     deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if level is not a valid compression level,
+   Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+   with the version assumed by the caller (ZLIB_VERSION).
+   msg is set to null if there is no error message.  deflateInit does not
+   perform any compression: this will be done by deflate().
+*/
+
+
+ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
+/*
+    deflate compresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full. It may introduce some
+  output latency (reading input without producing any output) except when
+  forced to flush.
+
+    The detailed semantics are as follows. deflate performs one or both of the
+  following actions:
+
+  - Compress more input starting at next_in and update next_in and avail_in
+    accordingly. If not all input can be processed (because there is not
+    enough room in the output buffer), next_in and avail_in are updated and
+    processing will resume at this point for the next call of deflate().
+
+  - Provide more output starting at next_out and update next_out and avail_out
+    accordingly. This action is forced if the parameter flush is non zero.
+    Forcing flush frequently degrades the compression ratio, so this parameter
+    should be set only when necessary (in interactive applications).
+    Some output may be provided even if flush is not set.
+
+  Before the call of deflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming
+  more output, and updating avail_in or avail_out accordingly; avail_out
+  should never be zero before the call. The application can consume the
+  compressed output when it wants, for example when the output buffer is full
+  (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
+  and with zero avail_out, it must be called again after making room in the
+  output buffer because there might be more output pending.
+
+    Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+  decide how much data to accumulate before producing output, in order to
+  maximize compression.
+
+    If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+  flushed to the output buffer and the output is aligned on a byte boundary, so
+  that the decompressor can get all input data available so far. (In particular
+  avail_in is zero after the call if enough output space has been provided
+  before the call.)  Flushing may degrade compression for some compression
+  algorithms and so it should be used only when necessary.
+
+    If flush is set to Z_FULL_FLUSH, all output is flushed as with
+  Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+  restart from this point if previous compressed data has been damaged or if
+  random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+  compression.
+
+    If deflate returns with avail_out == 0, this function must be called again
+  with the same value of the flush parameter and more output space (updated
+  avail_out), until the flush is complete (deflate returns with non-zero
+  avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+  avail_out is greater than six to avoid repeated flush markers due to
+  avail_out == 0 on return.
+
+    If the parameter flush is set to Z_FINISH, pending input is processed,
+  pending output is flushed and deflate returns with Z_STREAM_END if there
+  was enough output space; if deflate returns with Z_OK, this function must be
+  called again with Z_FINISH and more output space (updated avail_out) but no
+  more input data, until it returns with Z_STREAM_END or an error. After
+  deflate has returned Z_STREAM_END, the only possible operations on the
+  stream are deflateReset or deflateEnd.
+
+    Z_FINISH can be used immediately after deflateInit if all the compression
+  is to be done in a single step. In this case, avail_out must be at least
+  the value returned by deflateBound (see below). If deflate does not return
+  Z_STREAM_END, then it must be called again as described above.
+
+    deflate() sets strm->adler to the adler32 checksum of all input read
+  so far (that is, total_in bytes).
+
+    deflate() may update strm->data_type if it can make a good guess about
+  the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
+  binary. This field is only for information purposes and does not affect
+  the compression algorithm in any manner.
+
+    deflate() returns Z_OK if some progress has been made (more input
+  processed or more output produced), Z_STREAM_END if all input has been
+  consumed and all output has been produced (only when flush is set to
+  Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+  if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
+  (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
+  fatal, and deflate() can be called again with more input and more output
+  space to continue compressing.
+*/
+
+
+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any
+   pending output.
+
+     deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+   stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+   prematurely (some input or output was discarded). In the error case,
+   msg may be set but then points to a static string (which must not be
+   deallocated).
+*/
+
+
+/*
+ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
+
+     Initializes the internal stream state for decompression. The fields
+   next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+   the caller. If next_in is not Z_NULL and avail_in is large enough (the exact
+   value depends on the compression method), inflateInit determines the
+   compression method from the zlib header and allocates all data structures
+   accordingly; otherwise the allocation will be deferred to the first call of
+   inflate.  If zalloc and zfree are set to Z_NULL, inflateInit updates them to
+   use default allocation functions.
+
+     inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+   version assumed by the caller.  msg is set to null if there is no error
+   message. inflateInit does not perform any decompression apart from reading
+   the zlib header if present: this will be done by inflate().  (So next_in and
+   avail_in may be modified, but next_out and avail_out are unchanged.)
+*/
+
+
+ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
+/*
+    inflate decompresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full. It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
+
+  The detailed semantics are as follows. inflate performs one or both of the
+  following actions:
+
+  - Decompress more input starting at next_in and update next_in and avail_in
+    accordingly. If not all input can be processed (because there is not
+    enough room in the output buffer), next_in is updated and processing
+    will resume at this point for the next call of inflate().
+
+  - Provide more output starting at next_out and update next_out and avail_out
+    accordingly.  inflate() provides as much output as possible, until there
+    is no more input data or no more space in the output buffer (see below
+    about the flush parameter).
+
+  Before the call of inflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming
+  more output, and updating the next_* and avail_* values accordingly.
+  The application can consume the uncompressed output when it wants, for
+  example when the output buffer is full (avail_out == 0), or after each
+  call of inflate(). If inflate returns Z_OK and with zero avail_out, it
+  must be called again after making room in the output buffer because there
+  might be more output pending.
+
+    The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
+  Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
+  output as possible to the output buffer. Z_BLOCK requests that inflate() stop
+  if and when it gets to the next deflate block boundary. When decoding the
+  zlib or gzip format, this will cause inflate() to return immediately after
+  the header and before the first block. When doing a raw inflate, inflate()
+  will go ahead and process the first block, and will return when it gets to
+  the end of that block, or when it runs out of data.
+
+    The Z_BLOCK option assists in appending to or combining deflate streams.
+  Also to assist in this, on return inflate() will set strm->data_type to the
+  number of unused bits in the last byte taken from strm->next_in, plus 64
+  if inflate() is currently decoding the last block in the deflate stream,
+  plus 128 if inflate() returned immediately after decoding an end-of-block
+  code or decoding the complete header up to just before the first byte of the
+  deflate stream. The end-of-block will not be indicated until all of the
+  uncompressed data from that block has been written to strm->next_out.  The
+  number of unused bits may in general be greater than seven, except when
+  bit 7 of data_type is set, in which case the number of unused bits will be
+  less than eight.
+
+    inflate() should normally be called until it returns Z_STREAM_END or an
+  error. However if all decompression is to be performed in a single step
+  (a single call of inflate), the parameter flush should be set to
+  Z_FINISH. In this case all pending input is processed and all pending
+  output is flushed; avail_out must be large enough to hold all the
+  uncompressed data. (The size of the uncompressed data may have been saved
+  by the compressor for this purpose.) The next operation on this stream must
+  be inflateEnd to deallocate the decompression state. The use of Z_FINISH
+  is never required, but can be used to inform inflate that a faster approach
+  may be used for the single inflate() call.
+
+     In this implementation, inflate() always flushes as much output as
+  possible to the output buffer, and always uses the faster approach on the
+  first call. So the only effect of the flush parameter in this implementation
+  is on the return value of inflate(), as noted below, or when it returns early
+  because Z_BLOCK is used.
+
+     If a preset dictionary is needed after this call (see inflateSetDictionary
+  below), inflate sets strm->adler to the adler32 checksum of the dictionary
+  chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+  strm->adler to the adler32 checksum of all output produced so far (that is,
+  total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+  below. At the end of the stream, inflate() checks that its computed adler32
+  checksum is equal to that saved by the compressor and returns Z_STREAM_END
+  only if the checksum is correct.
+
+    inflate() will decompress and check either zlib-wrapped or gzip-wrapped
+  deflate data.  The header type is detected automatically.  Any information
+  contained in the gzip header is not retained, so applications that need that
+  information should instead use raw inflate, see inflateInit2() below, or
+  inflateBack() and perform their own processing of the gzip header and
+  trailer.
+
+    inflate() returns Z_OK if some progress has been made (more input processed
+  or more output produced), Z_STREAM_END if the end of the compressed data has
+  been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+  preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+  corrupted (input stream not conforming to the zlib format or incorrect check
+  value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+  if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
+  Z_BUF_ERROR if no progress is possible or if there was not enough room in the
+  output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+  inflate() can be called again with more input and more output space to
+  continue decompressing. If Z_DATA_ERROR is returned, the application may then
+  call inflateSync() to look for a good compression block if a partial recovery
+  of the data is desired.
+*/
+
+
+ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any
+   pending output.
+
+     inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
+   was inconsistent. In the error case, msg may be set but then points to a
+   static string (which must not be deallocated).
+*/
+
+                        /* Advanced functions */
+
+/*
+    The following functions are needed only in some special applications.
+*/
+
+/*
+ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
+                                     int  level,
+                                     int  method,
+                                     int  windowBits,
+                                     int  memLevel,
+                                     int  strategy));
+
+     This is another version of deflateInit with more compression options. The
+   fields next_in, zalloc, zfree and opaque must be initialized before by
+   the caller.
+
+     The method parameter is the compression method. It must be Z_DEFLATED in
+   this version of the library.
+
+     The windowBits parameter is the base two logarithm of the window size
+   (the size of the history buffer). It should be in the range 8..15 for this
+   version of the library. Larger values of this parameter result in better
+   compression at the expense of memory usage. The default value is 15 if
+   deflateInit is used instead.
+
+     windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
+   determines the window size. deflate() will then generate raw deflate data
+   with no zlib header or trailer, and will not compute an adler32 check value.
+
+     windowBits can also be greater than 15 for optional gzip encoding. Add
+   16 to windowBits to write a simple gzip header and trailer around the
+   compressed data instead of a zlib wrapper. The gzip header will have no
+   file name, no extra data, no comment, no modification time (set to zero),
+   no header crc, and the operating system will be set to 255 (unknown).  If a
+   gzip stream is being written, strm->adler is a crc32 instead of an adler32.
+
+     The memLevel parameter specifies how much memory should be allocated
+   for the internal compression state. memLevel=1 uses minimum memory but
+   is slow and reduces compression ratio; memLevel=9 uses maximum memory
+   for optimal speed. The default value is 8. See zconf.h for total memory
+   usage as a function of windowBits and memLevel.
+
+     The strategy parameter is used to tune the compression algorithm. Use the
+   value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+   filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+   string match), or Z_RLE to limit match distances to one (run-length
+   encoding). Filtered data consists mostly of small values with a somewhat
+   random distribution. In this case, the compression algorithm is tuned to
+   compress them better. The effect of Z_FILTERED is to force more Huffman
+   coding and less string matching; it is somewhat intermediate between
+   Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as
+   fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The
+   strategy parameter only affects the compression ratio but not the
+   correctness of the compressed output even if it is not set appropriately.
+   Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+   decoder for special applications.
+
+      deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
+   method). msg is set to null if there is no error message.  deflateInit2 does
+   not perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
+                                             const Bytef *dictionary,
+                                             uInt  dictLength));
+/*
+     Initializes the compression dictionary from the given byte sequence
+   without producing any compressed output. This function must be called
+   immediately after deflateInit, deflateInit2 or deflateReset, before any
+   call of deflate. The compressor and decompressor must use exactly the same
+   dictionary (see inflateSetDictionary).
+
+     The dictionary should consist of strings (byte sequences) that are likely
+   to be encountered later in the data to be compressed, with the most commonly
+   used strings preferably put towards the end of the dictionary. Using a
+   dictionary is most useful when the data to be compressed is short and can be
+   predicted with good accuracy; the data can then be compressed better than
+   with the default empty dictionary.
+
+     Depending on the size of the compression data structures selected by
+   deflateInit or deflateInit2, a part of the dictionary may in effect be
+   discarded, for example if the dictionary is larger than the window size
+   set by deflateInit or deflateInit2. Thus the strings most likely to be
+   useful should be put at the end of the dictionary, not at the front. In
+   addition, the current implementation of deflate will use at most the
+   window size minus 262 bytes of the provided dictionary.
+
+     Upon return of this function, strm->adler is set to the adler32 value
+   of the dictionary; the decompressor may later use this value to determine
+   which dictionary has been used by the compressor. (The adler32 value
+   applies to the whole dictionary even if only a subset of the dictionary is
+   actually used by the compressor.) If a raw deflate was requested, then the
+   adler32 value is not computed and strm->adler is not set.
+
+     deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+   parameter is invalid (such as NULL dictionary) or the stream state is
+   inconsistent (for example if deflate has already been called for this stream
+   or if the compression method is bsort). deflateSetDictionary does not
+   perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
+                                    z_streamp source));
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when several compression strategies will be
+   tried, for example when there are several ways of pre-processing the input
+   data with a filter. The streams that will be discarded should then be freed
+   by calling deflateEnd.  Note that deflateCopy duplicates the internal
+   compression state which can be quite large, so this strategy is slow and
+   can consume lots of memory.
+
+     deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being NULL). msg is left unchanged in both source and
+   destination.
+*/
+
+ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
+/*
+     This function is equivalent to deflateEnd followed by deflateInit,
+   but does not free and reallocate all the internal compression state.
+   The stream will keep the same compression level and any other attributes
+   that may have been set by deflateInit2.
+
+      deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
+                                      int level,
+                                      int strategy));
+/*
+     Dynamically update the compression level and compression strategy.  The
+   interpretation of level and strategy is as in deflateInit2.  This can be
+   used to switch between compression and straight copy of the input data, or
+   to switch to a different kind of input data requiring a different
+   strategy. If the compression level is changed, the input available so far
+   is compressed with the old level (and may be flushed); the new level will
+   take effect only at the next call of deflate().
+
+     Before the call of deflateParams, the stream state must be set as for
+   a call of deflate(), since the currently available input may have to
+   be compressed and flushed. In particular, strm->avail_out must be non-zero.
+
+     deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
+   stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR
+   if strm->avail_out was zero.
+*/
+
+ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
+                                    int good_length,
+                                    int max_lazy,
+                                    int nice_length,
+                                    int max_chain));
+/*
+     Fine tune deflate's internal compression parameters.  This should only be
+   used by someone who understands the algorithm used by zlib's deflate for
+   searching for the best matching string, and even then only by the most
+   fanatic optimizer trying to squeeze out the last compressed bit for their
+   specific input data.  Read the deflate.c source code for the meaning of the
+   max_lazy, good_length, nice_length, and max_chain parameters.
+
+     deflateTune() can be called after deflateInit() or deflateInit2(), and
+   returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+
+ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
+                                       uLong sourceLen));
+/*
+     deflateBound() returns an upper bound on the compressed size after
+   deflation of sourceLen bytes.  It must be called after deflateInit()
+   or deflateInit2().  This would be used to allocate an output buffer
+   for deflation in a single pass, and so would be called before deflate().
+*/
+
+ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
+                                     int bits,
+                                     int value));
+/*
+     deflatePrime() inserts bits in the deflate output stream.  The intent
+  is that this function is used to start off the deflate output with the
+  bits leftover from a previous deflate stream when appending to it.  As such,
+  this function can only be used for raw deflate, and must be used before the
+  first deflate() call after a deflateInit2() or deflateReset().  bits must be
+  less than or equal to 16, and that many of the least significant bits of
+  value will be inserted in the output.
+
+      deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
+                                         gz_headerp head));
+/*
+      deflateSetHeader() provides gzip header information for when a gzip
+   stream is requested by deflateInit2().  deflateSetHeader() may be called
+   after deflateInit2() or deflateReset() and before the first call of
+   deflate().  The text, time, os, extra field, name, and comment information
+   in the provided gz_header structure are written to the gzip header (xflag is
+   ignored -- the extra flags are set according to the compression level).  The
+   caller must assure that, if not Z_NULL, name and comment are terminated with
+   a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+   available there.  If hcrc is true, a gzip header crc is included.  Note that
+   the current versions of the command-line version of gzip (up through version
+   1.3.x) do not support header crc's, and will report that it is a "multi-part
+   gzip file" and give up.
+
+      If deflateSetHeader is not used, the default gzip header has text false,
+   the time set to zero, and os set to 255, with no extra, name, or comment
+   fields.  The gzip header is returned to the default state by deflateReset().
+
+      deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
+                                     int  windowBits));
+
+     This is another version of inflateInit with an extra parameter. The
+   fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+   before by the caller.
+
+     The windowBits parameter is the base two logarithm of the maximum window
+   size (the size of the history buffer).  It should be in the range 8..15 for
+   this version of the library. The default value is 15 if inflateInit is used
+   instead. windowBits must be greater than or equal to the windowBits value
+   provided to deflateInit2() while compressing, or it must be equal to 15 if
+   deflateInit2() was not used. If a compressed stream with a larger window
+   size is given as input, inflate() will return with the error code
+   Z_DATA_ERROR instead of trying to allocate a larger window.
+
+     windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+   determines the window size. inflate() will then process raw deflate data,
+   not looking for a zlib or gzip header, not generating a check value, and not
+   looking for any check values for comparison at the end of the stream. This
+   is for use with other formats that use the deflate compressed data format
+   such as zip.  Those formats provide their own check values. If a custom
+   format is developed using the raw deflate format for compressed data, it is
+   recommended that a check value such as an adler32 or a crc32 be applied to
+   the uncompressed data as is done in the zlib, gzip, and zip formats.  For
+   most applications, the zlib format should be used as is. Note that comments
+   above on the use in deflateInit2() applies to the magnitude of windowBits.
+
+     windowBits can also be greater than 15 for optional gzip decoding. Add
+   32 to windowBits to enable zlib and gzip decoding with automatic header
+   detection, or add 16 to decode only the gzip format (the zlib format will
+   return a Z_DATA_ERROR).  If a gzip stream is being decoded, strm->adler is
+   a crc32 instead of an adler32.
+
+     inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
+   is set to null if there is no error message.  inflateInit2 does not perform
+   any decompression apart from reading the zlib header if present: this will
+   be done by inflate(). (So next_in and avail_in may be modified, but next_out
+   and avail_out are unchanged.)
+*/
+
+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
+                                             const Bytef *dictionary,
+                                             uInt  dictLength));
+/*
+     Initializes the decompression dictionary from the given uncompressed byte
+   sequence. This function must be called immediately after a call of inflate,
+   if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+   can be determined from the adler32 value returned by that call of inflate.
+   The compressor and decompressor must use exactly the same dictionary (see
+   deflateSetDictionary).  For raw inflate, this function can be called
+   immediately after inflateInit2() or inflateReset() and before any call of
+   inflate() to set the dictionary.  The application must insure that the
+   dictionary that was used for compression is provided.
+
+     inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+   parameter is invalid (such as NULL dictionary) or the stream state is
+   inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+   expected one (incorrect adler32 value). inflateSetDictionary does not
+   perform any decompression: this will be done by subsequent calls of
+   inflate().
+*/
+
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+/*
+    Skips invalid compressed data until a full flush point (see above the
+  description of deflate with Z_FULL_FLUSH) can be found, or until all
+  available input is skipped. No output is provided.
+
+    inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR
+  if no more input was provided, Z_DATA_ERROR if no flush point has been found,
+  or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
+  case, the application may save the current value of total_in which
+  indicates where valid compressed data was found. In the error case, the
+  application may repeatedly call inflateSync, providing more input each time,
+  until success or end of the input data.
+*/
+
+ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
+                                    z_streamp source));
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when randomly accessing a large stream.  The
+   first pass through the stream can periodically record the inflate state,
+   allowing restarting inflate at those points when randomly accessing the
+   stream.
+
+     inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being NULL). msg is left unchanged in both source and
+   destination.
+*/
+
+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+/*
+     This function is equivalent to inflateEnd followed by inflateInit,
+   but does not free and reallocate all the internal decompression state.
+   The stream will keep attributes that may have been set by inflateInit2.
+
+      inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
+                                     int bits,
+                                     int value));
+/*
+     This function inserts bits in the inflate input stream.  The intent is
+  that this function is used to start inflating at a bit position in the
+  middle of a byte.  The provided bits will be used before any bytes are used
+  from next_in.  This function should only be used with raw inflate, and
+  should be used before the first inflate() call after inflateInit2() or
+  inflateReset().  bits must be less than or equal to 16, and that many of the
+  least significant bits of value will be inserted in the input.
+
+      inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
+                                         gz_headerp head));
+/*
+      inflateGetHeader() requests that gzip header information be stored in the
+   provided gz_header structure.  inflateGetHeader() may be called after
+   inflateInit2() or inflateReset(), and before the first call of inflate().
+   As inflate() processes the gzip stream, head->done is zero until the header
+   is completed, at which time head->done is set to one.  If a zlib stream is
+   being decoded, then head->done is set to -1 to indicate that there will be
+   no gzip header information forthcoming.  Note that Z_BLOCK can be used to
+   force inflate() to return immediately after header processing is complete
+   and before any actual data is decompressed.
+
+      The text, time, xflags, and os fields are filled in with the gzip header
+   contents.  hcrc is set to true if there is a header CRC.  (The header CRC
+   was valid if done is set to one.)  If extra is not Z_NULL, then extra_max
+   contains the maximum number of bytes to write to extra.  Once done is true,
+   extra_len contains the actual extra field length, and extra contains the
+   extra field, or that field truncated if extra_max is less than extra_len.
+   If name is not Z_NULL, then up to name_max characters are written there,
+   terminated with a zero unless the length is greater than name_max.  If
+   comment is not Z_NULL, then up to comm_max characters are written there,
+   terminated with a zero unless the length is greater than comm_max.  When
+   any of extra, name, or comment are not Z_NULL and the respective field is
+   not present in the header, then that field is set to Z_NULL to signal its
+   absence.  This allows the use of deflateSetHeader() with the returned
+   structure to duplicate the header.  However if those fields are set to
+   allocated memory, then the application will need to save those pointers
+   elsewhere so that they can be eventually freed.
+
+      If inflateGetHeader is not used, then the header information is simply
+   discarded.  The header is always checked for validity, including the header
+   CRC if present.  inflateReset() will reset the process to discard the header
+   information.  The application would need to call inflateGetHeader() again to
+   retrieve the header from the next gzip stream.
+
+      inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
+                                        unsigned char FAR *window));
+
+     Initialize the internal stream state for decompression using inflateBack()
+   calls.  The fields zalloc, zfree and opaque in strm must be initialized
+   before the call.  If zalloc and zfree are Z_NULL, then the default library-
+   derived memory allocation routines are used.  windowBits is the base two
+   logarithm of the window size, in the range 8..15.  window is a caller
+   supplied buffer of that size.  Except for special applications where it is
+   assured that deflate was used with small window sizes, windowBits must be 15
+   and a 32K byte window must be supplied to be able to decompress general
+   deflate streams.
+
+     See inflateBack() for the usage of these routines.
+
+     inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+   the parameters are invalid, Z_MEM_ERROR if the internal state could not
+   be allocated, or Z_VERSION_ERROR if the version of the library does not
+   match the version of the header file.
+*/
+
+typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
+typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+
+ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
+                                    in_func in, void FAR *in_desc,
+                                    out_func out, void FAR *out_desc));
+/*
+     inflateBack() does a raw inflate with a single call using a call-back
+   interface for input and output.  This is more efficient than inflate() for
+   file i/o applications in that it avoids copying between the output and the
+   sliding window by simply making the window itself the output buffer.  This
+   function trusts the application to not change the output buffer passed by
+   the output function, at least until inflateBack() returns.
+
+     inflateBackInit() must be called first to allocate the internal state
+   and to initialize the state with the user-provided window buffer.
+   inflateBack() may then be used multiple times to inflate a complete, raw
+   deflate stream with each call.  inflateBackEnd() is then called to free
+   the allocated state.
+
+     A raw deflate stream is one with no zlib or gzip header or trailer.
+   This routine would normally be used in a utility that reads zip or gzip
+   files and writes out uncompressed files.  The utility would decode the
+   header and process the trailer on its own, hence this routine expects
+   only the raw deflate stream to decompress.  This is different from the
+   normal behavior of inflate(), which expects either a zlib or gzip header and
+   trailer around the deflate stream.
+
+     inflateBack() uses two subroutines supplied by the caller that are then
+   called by inflateBack() for input and output.  inflateBack() calls those
+   routines until it reads a complete deflate stream and writes out all of the
+   uncompressed data, or until it encounters an error.  The function's
+   parameters and return types are defined above in the in_func and out_func
+   typedefs.  inflateBack() will call in(in_desc, &buf) which should return the
+   number of bytes of provided input, and a pointer to that input in buf.  If
+   there is no input available, in() must return zero--buf is ignored in that
+   case--and inflateBack() will return a buffer error.  inflateBack() will call
+   out(out_desc, buf, len) to write the uncompressed data buf[0..len-1].  out()
+   should return zero on success, or non-zero on failure.  If out() returns
+   non-zero, inflateBack() will return with an error.  Neither in() nor out()
+   are permitted to change the contents of the window provided to
+   inflateBackInit(), which is also the buffer that out() uses to write from.
+   The length written by out() will be at most the window size.  Any non-zero
+   amount of input may be provided by in().
+
+     For convenience, inflateBack() can be provided input on the first call by
+   setting strm->next_in and strm->avail_in.  If that input is exhausted, then
+   in() will be called.  Therefore strm->next_in must be initialized before
+   calling inflateBack().  If strm->next_in is Z_NULL, then in() will be called
+   immediately for input.  If strm->next_in is not Z_NULL, then strm->avail_in
+   must also be initialized, and then if strm->avail_in is not zero, input will
+   initially be taken from strm->next_in[0 .. strm->avail_in - 1].
+
+     The in_desc and out_desc parameters of inflateBack() are passed as the
+   first parameter of in() and out() respectively when they are called.  These
+   descriptors can be optionally used to pass any information that the caller-
+   supplied in() and out() functions need to do their job.
+
+     On return, inflateBack() will set strm->next_in and strm->avail_in to
+   pass back any unused input that was provided by the last in() call.  The
+   return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+   if in() or out() returned an error, Z_DATA_ERROR if there was a format
+   error in the deflate stream (in which case strm->msg is set to indicate the
+   nature of the error), or Z_STREAM_ERROR if the stream was not properly
+   initialized.  In the case of Z_BUF_ERROR, an input or output error can be
+   distinguished using strm->next_in which will be Z_NULL only if in() returned
+   an error.  If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to
+   out() returning non-zero.  (in() will always be called before out(), so
+   strm->next_in is assured to be defined if out() returns non-zero.)  Note
+   that inflateBack() cannot return Z_OK.
+*/
+
+ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+/*
+     All memory allocated by inflateBackInit() is freed.
+
+     inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+   state was inconsistent.
+*/
+
+ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+/* Return flags indicating compile-time options.
+
+    Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+     1.0: size of uInt
+     3.2: size of uLong
+     5.4: size of voidpf (pointer)
+     7.6: size of z_off_t
+
+    Compiler, assembler, and debug options:
+     8: DEBUG
+     9: ASMV or ASMINF -- use ASM code
+     10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+     11: 0 (reserved)
+
+    One-time table building (smaller code, but not thread-safe if true):
+     12: BUILDFIXED -- build static block decoding tables when needed
+     13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+     14,15: 0 (reserved)
+
+    Library content (indicates missing functionality):
+     16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+                          deflate code when not needed)
+     17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+                    and decode gzip streams (to avoid linking crc code)
+     18-19: 0 (reserved)
+
+    Operation variations (changes in library functionality):
+     20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+     21: FASTEST -- deflate algorithm with only one, lowest compression level
+     22,23: 0 (reserved)
+
+    The sprintf variant used by gzprintf (zero is best):
+     24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+     25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+     26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+    Remainder:
+     27-31: 0 (reserved)
+ */
+
+
+                        /* utility functions */
+
+/*
+     The following utility functions are implemented on top of the
+   basic stream-oriented functions. To simplify the interface, some
+   default options are assumed (compression level and memory usage,
+   standard memory allocation functions). The source code of these
+   utility functions can easily be modified if you need special options.
+*/
+
+ZEXTERN int ZEXPORT compress OF((Bytef *dest,   uLongf *destLen,
+                                 const Bytef *source, uLong sourceLen));
+/*
+     Compresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer. Upon entry, destLen is the total
+   size of the destination buffer, which must be at least the value returned
+   by compressBound(sourceLen). Upon exit, destLen is the actual size of the
+   compressed buffer.
+     This function can be used to compress a whole file at once if the
+   input file is mmap'ed.
+     compress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer.
+*/
+
+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest,   uLongf *destLen,
+                                  const Bytef *source, uLong sourceLen,
+                                  int level));
+/*
+     Compresses the source buffer into the destination buffer. The level
+   parameter has the same meaning as in deflateInit.  sourceLen is the byte
+   length of the source buffer. Upon entry, destLen is the total size of the
+   destination buffer, which must be at least the value returned by
+   compressBound(sourceLen). Upon exit, destLen is the actual size of the
+   compressed buffer.
+
+     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+   Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+/*
+     compressBound() returns an upper bound on the compressed size after
+   compress() or compress2() on sourceLen bytes.  It would be used before
+   a compress() or compress2() call to allocate the destination buffer.
+*/
+
+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest,   uLongf *destLen,
+                                   const Bytef *source, uLong sourceLen));
+/*
+     Decompresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer. Upon entry, destLen is the total
+   size of the destination buffer, which must be large enough to hold the
+   entire uncompressed data. (The size of the uncompressed data must have
+   been saved previously by the compressor and transmitted to the decompressor
+   by some mechanism outside the scope of this compression library.)
+   Upon exit, destLen is the actual size of the uncompressed buffer.
+     This function can be used to decompress a whole file at once if the
+   input file is mmap'ed.
+
+     uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.
+*/
+
+
+typedef voidp gzFile;
+
+ZEXTERN gzFile ZEXPORT gzopen  OF((const char *path, const char *mode));
+/*
+     Opens a gzip (.gz) file for reading or writing. The mode parameter
+   is as in fopen ("rb" or "wb") but can also include a compression level
+   ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for
+   Huffman only compression as in "wb1h", or 'R' for run-length encoding
+   as in "wb1R". (See the description of deflateInit2 for more information
+   about the strategy parameter.)
+
+     gzopen can be used to read a file which is not in gzip format; in this
+   case gzread will directly read from the file without decompression.
+
+     gzopen returns NULL if the file could not be opened or if there was
+   insufficient memory to allocate the (de)compression state; errno
+   can be checked to distinguish the two cases (if errno is zero, the
+   zlib error is Z_MEM_ERROR).  */
+
+ZEXTERN gzFile ZEXPORT gzdopen  OF((int fd, const char *mode));
+/*
+     gzdopen() associates a gzFile with the file descriptor fd.  File
+   descriptors are obtained from calls like open, dup, creat, pipe or
+   fileno (if the file has been previously opened with fopen).
+   The mode parameter is as in gzopen.
+     The next call of gzclose on the returned gzFile will also close the
+   file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file
+   descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
+     gzdopen returns NULL if there was insufficient memory to allocate
+   the (de)compression state.
+*/
+
+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+/*
+     Dynamically update the compression level or strategy. See the description
+   of deflateInit2 for the meaning of these parameters.
+     gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
+   opened for writing.
+*/
+
+ZEXTERN int ZEXPORT    gzread  OF((gzFile file, voidp buf, unsigned len));
+/*
+     Reads the given number of uncompressed bytes from the compressed file.
+   If the input file was not in gzip format, gzread copies the given number
+   of bytes into the buffer.
+     gzread returns the number of uncompressed bytes actually read (0 for
+   end of file, -1 for error). */
+
+ZEXTERN int ZEXPORT    gzwrite OF((gzFile file,
+                                   voidpc buf, unsigned len));
+/*
+     Writes the given number of uncompressed bytes into the compressed file.
+   gzwrite returns the number of uncompressed bytes actually written
+   (0 in case of error).
+*/
+
+ZEXTERN int ZEXPORTVA   gzprintf OF((gzFile file, const char *format, ...));
+/*
+     Converts, formats, and writes the args to the compressed file under
+   control of the format string, as in fprintf. gzprintf returns the number of
+   uncompressed bytes actually written (0 in case of error).  The number of
+   uncompressed bytes written is limited to 4095. The caller should assure that
+   this limit is not exceeded. If it is exceeded, then gzprintf() will return
+   an error (0) with nothing written. In this case, there may also be a
+   buffer overflow with unpredictable consequences, which is possible only if
+   zlib was compiled with the insecure functions sprintf() or vsprintf()
+   because the secure snprintf() or vsnprintf() functions were not available.
+*/
+
+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+/*
+      Writes the given null-terminated string to the compressed file, excluding
+   the terminating null character.
+      gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+/*
+      Reads bytes from the compressed file until len-1 characters are read, or
+   a newline character is read and transferred to buf, or an end-of-file
+   condition is encountered.  The string is then terminated with a null
+   character.
+      gzgets returns buf, or Z_NULL in case of error.
+*/
+
+ZEXTERN int ZEXPORT    gzputc OF((gzFile file, int c));
+/*
+      Writes c, converted to an unsigned char, into the compressed file.
+   gzputc returns the value that was written, or -1 in case of error.
+*/
+
+ZEXTERN int ZEXPORT    gzgetc OF((gzFile file));
+/*
+      Reads one byte from the compressed file. gzgetc returns this byte
+   or -1 in case of end of file or error.
+*/
+
+ZEXTERN int ZEXPORT    gzungetc OF((int c, gzFile file));
+/*
+      Push one character back onto the stream to be read again later.
+   Only one character of push-back is allowed.  gzungetc() returns the
+   character pushed, or -1 on failure.  gzungetc() will fail if a
+   character has been pushed but not read yet, or if c is -1. The pushed
+   character will be discarded if the stream is repositioned with gzseek()
+   or gzrewind().
+*/
+
+ZEXTERN int ZEXPORT    gzflush OF((gzFile file, int flush));
+/*
+     Flushes all pending output into the compressed file. The parameter
+   flush is as in the deflate() function. The return value is the zlib
+   error number (see function gzerror below). gzflush returns Z_OK if
+   the flush parameter is Z_FINISH and all output could be flushed.
+     gzflush should be called only when strictly necessary because it can
+   degrade compression.
+*/
+
+ZEXTERN z_off_t ZEXPORT    gzseek OF((gzFile file,
+                                      z_off_t offset, int whence));
+/*
+      Sets the starting position for the next gzread or gzwrite on the
+   given compressed file. The offset represents a number of bytes in the
+   uncompressed data stream. The whence parameter is defined as in lseek(2);
+   the value SEEK_END is not supported.
+     If the file is opened for reading, this function is emulated but can be
+   extremely slow. If the file is opened for writing, only forward seeks are
+   supported; gzseek then compresses a sequence of zeroes up to the new
+   starting position.
+
+      gzseek returns the resulting offset location as measured in bytes from
+   the beginning of the uncompressed stream, or -1 in case of error, in
+   particular if the file is opened for writing and the new starting position
+   would be before the current position.
+*/
+
+ZEXTERN int ZEXPORT    gzrewind OF((gzFile file));
+/*
+     Rewinds the given file. This function is supported only for reading.
+
+   gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
+*/
+
+ZEXTERN z_off_t ZEXPORT    gztell OF((gzFile file));
+/*
+     Returns the starting position for the next gzread or gzwrite on the
+   given compressed file. This position represents a number of bytes in the
+   uncompressed data stream.
+
+   gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
+ZEXTERN int ZEXPORT gzeof OF((gzFile file));
+/*
+     Returns 1 when EOF has previously been detected reading the given
+   input stream, otherwise zero.
+*/
+
+ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
+/*
+     Returns 1 if file is being read directly without decompression, otherwise
+   zero.
+*/
+
+ZEXTERN int ZEXPORT    gzclose OF((gzFile file));
+/*
+     Flushes all pending output if necessary, closes the compressed file
+   and deallocates all the (de)compression state. The return value is the zlib
+   error number (see function gzerror below).
+*/
+
+ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
+/*
+     Returns the error message for the last error which occurred on the
+   given compressed file. errnum is set to zlib error number. If an
+   error occurred in the file system and not in the compression library,
+   errnum is set to Z_ERRNO and the application may consult errno
+   to get the exact error code.
+*/
+
+ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
+/*
+     Clears the error and end-of-file flags for file. This is analogous to the
+   clearerr() function in stdio. This is useful for continuing to read a gzip
+   file that is being written concurrently.
+*/
+
+                        /* checksum functions */
+
+/*
+     These functions are not related to compression but are exported
+   anyway because they might be useful in applications using the
+   compression library.
+*/
+
+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+/*
+     Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+   return the updated checksum. If buf is NULL, this function returns
+   the required initial value for the checksum.
+   An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+   much faster. Usage example:
+
+     uLong adler = adler32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       adler = adler32(adler, buffer, length);
+     }
+     if (adler != original_adler) error();
+*/
+
+ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
+                                          z_off_t len2));
+/*
+     Combine two Adler-32 checksums into one.  For two sequences of bytes, seq1
+   and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+   each, adler1 and adler2.  adler32_combine() returns the Adler-32 checksum of
+   seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.
+*/
+
+ZEXTERN uLong ZEXPORT crc32   OF((uLong crc, const Bytef *buf, uInt len));
+/*
+     Update a running CRC-32 with the bytes buf[0..len-1] and return the
+   updated CRC-32. If buf is NULL, this function returns the required initial
+   value for the crc. Pre- and post-conditioning (one's complement) is
+   performed within this function so it shouldn't be done by the application.
+   Usage example:
+
+     uLong crc = crc32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       crc = crc32(crc, buffer, length);
+     }
+     if (crc != original_crc) error();
+*/
+
+ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+
+/*
+     Combine two CRC-32 check values into one.  For two sequences of bytes,
+   seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+   calculated for each, crc1 and crc2.  crc32_combine() returns the CRC-32
+   check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+   len2.
+*/
+
+
+                        /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
+                                     const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
+                                     const char *version, int stream_size));
+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int  level, int  method,
+                                      int windowBits, int memLevel,
+                                      int strategy, const char *version,
+                                      int stream_size));
+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int  windowBits,
+                                      const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
+                                         unsigned char FAR *window,
+                                         const char *version,
+                                         int stream_size));
+#define deflateInit(strm, level) \
+        deflateInit_((strm), (level),       ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit(strm) \
+        inflateInit_((strm),                ZLIB_VERSION, sizeof(z_stream))
+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+        deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+                      (strategy),           ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit2(strm, windowBits) \
+        inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
+#define inflateBackInit(strm, windowBits, window) \
+        inflateBackInit_((strm), (windowBits), (window), \
+        ZLIB_VERSION, sizeof(z_stream))
+
+
+#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL)
+    struct internal_state {int dummy;}; /* hack for buggy compilers */
+#endif
+
+ZEXTERN const char   * ZEXPORT zError           OF((int));
+ZEXTERN int            ZEXPORT inflateSyncPoint OF((z_streamp z));
+ZEXTERN const uLongf * ZEXPORT get_crc_table    OF((void));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZLIB_H */
author	Charles Plessy <plessy@debian.org>
	Tue, 13 Jul 2010 12:02:49 +0000 (21:02 +0900)
committer	Charles Plessy <plessy@debian.org>
	Tue, 13 Jul 2010 12:02:49 +0000 (21:02 +0900)
AUTHORS		patch \| blob \| history
ChangeLog		patch \| blob \| history
Makefile		patch \| blob \| history
Makefile.mingw		patch \| blob \| history
NEWS		patch \| blob \| history
bam.c		patch \| blob \| history
bam.h		patch \| blob \| history
bam_aux.c		patch \| blob \| history
bam_import.c		patch \| blob \| history
bam_index.c		patch \| blob \| history
bam_maqcns.c		patch \| blob \| history
bam_maqcns.h		patch \| blob \| history
bam_md.c		patch \| blob \| history
bam_pileup.c		patch \| blob \| history
bam_plcmd.c		patch \| blob \| history
bam_reheader.c	[new file with mode: 0644]	patch \| blob
bam_sort.c		patch \| blob \| history
bam_tview.c		patch \| blob \| history
bamtk.c		patch \| blob \| history
bgzf.c		patch \| blob \| history
bgzf.h		patch \| blob \| history
examples/bam2bed.c	[new file with mode: 0644]	patch \| blob
examples/toy.fa	[new file with mode: 0644]	patch \| blob
examples/toy.sam	[new file with mode: 0644]	patch \| blob
faidx.c		patch \| blob \| history
knetfile.c		patch \| blob \| history
kstring.h		patch \| blob \| history
misc/Makefile		patch \| blob \| history
misc/export2sam.pl		patch \| blob \| history
misc/sam2vcf.pl		patch \| blob \| history
misc/samtools.pl		patch \| blob \| history
misc/varfilter.py	[new file with mode: 0755]	patch \| blob
misc/wgsim.c		patch \| blob \| history
misc/wgsim_eval.pl		patch \| blob \| history
sam.c		patch \| blob \| history
sam_header.c		patch \| blob \| history
sam_view.c		patch \| blob \| history
samtools.1		patch \| blob \| history
samtools.txt		patch \| blob \| history
win32/xcurses.h	[new file with mode: 0644]	patch \| blob
win32/zconf.h	[new file with mode: 0644]	patch \| blob
win32/zlib.h	[new file with mode: 0644]	patch \| blob