From cb12a866906ec4ac644de0e658679261c82ab098 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 13 Jul 2010 21:02:49 +0900 Subject: [PATCH] Imported Upstream version 0.1.8 --- AUTHORS | 4 + ChangeLog | 518 +++++++++++++++++ Makefile | 38 +- Makefile.mingw | 18 +- NEWS | 79 +++ bam.c | 55 +- bam.h | 107 ++-- bam_aux.c | 2 +- bam_import.c | 26 +- bam_index.c | 216 +++++-- bam_maqcns.c | 33 +- bam_maqcns.h | 5 +- bam_md.c | 34 +- bam_pileup.c | 342 ++++++++--- bam_plcmd.c | 250 ++++++-- bam_reheader.c | 60 ++ bam_sort.c | 4 +- bam_tview.c | 19 +- bamtk.c | 12 +- bgzf.c | 76 +-- bgzf.h | 25 +- examples/bam2bed.c | 51 ++ examples/toy.fa | 2 + examples/toy.sam | 7 + faidx.c | 2 +- knetfile.c | 6 +- kstring.h | 34 ++ misc/Makefile | 10 +- misc/export2sam.pl | 496 +++++++++++++--- misc/sam2vcf.pl | 93 ++- misc/samtools.pl | 108 +++- misc/varfilter.py | 205 +++++++ misc/wgsim.c | 4 +- misc/wgsim_eval.pl | 28 +- sam.c | 1 + sam_header.c | 38 +- sam_view.c | 43 +- samtools.1 | 333 ++++++----- samtools.txt | 376 ++++++------ win32/xcurses.h | 1377 ++++++++++++++++++++++++++++++++++++++++++++ win32/zconf.h | 332 +++++++++++ win32/zlib.h | 1357 +++++++++++++++++++++++++++++++++++++++++++ 42 files changed, 6052 insertions(+), 774 deletions(-) create mode 100644 bam_reheader.c create mode 100644 examples/bam2bed.c create mode 100644 examples/toy.fa create mode 100644 examples/toy.sam create mode 100755 misc/varfilter.py create mode 100644 win32/xcurses.h create mode 100644 win32/zconf.h create mode 100644 win32/zlib.h diff --git a/AUTHORS b/AUTHORS index 435431c..95afabb 100644 --- a/AUTHORS +++ b/AUTHORS @@ -14,3 +14,7 @@ used in `faidx' for indexing RAZF compressed fasta files. Colin Hercus updated novo2sam.pl to support gapped alignment by novoalign. + +Petr Danecek contributed the header parsing library sam_header.c and +sam2vcf.pl script and added knet support to the RAZF library. 
+ diff --git a/ChangeLog b/ChangeLog index 6b1a695..6b0ff6c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,521 @@ +------------------------------------------------------------------------ +r612 | lh3lh3 | 2010-07-11 21:08:56 -0400 (Sun, 11 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +fixed a compiling issue for Windows + +------------------------------------------------------------------------ +r611 | lh3lh3 | 2010-07-11 20:59:15 -0400 (Sun, 11 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_sort.c + +fixed a bug in sorting when output to stdout (by Peter Chines) + +------------------------------------------------------------------------ +r610 | lh3lh3 | 2010-07-09 17:05:10 -0400 (Fri, 09 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/NEWS + M /trunk/samtools/bam_plcmd.c + +change the command line option of pileup + +------------------------------------------------------------------------ +r609 | lh3lh3 | 2010-07-09 00:39:34 -0400 (Fri, 09 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_pileup.c + A /trunk/samtools/examples/toy.fa + A /trunk/samtools/examples/toy.sam + +make pileup work with CIGAR with I/D at the beginning or in the end + +------------------------------------------------------------------------ +r608 | lh3lh3 | 2010-07-08 22:36:12 -0400 (Thu, 08 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_tview.c + + * make tview more friendly + * a temporary remedy for an issue in indel calling + +------------------------------------------------------------------------ +r607 | lh3lh3 | 2010-07-08 14:43:52 -0400 (Thu, 08 Jul 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-r607 + * improved the genotype accuracy for indels + * use the SOAPsnp model for SNP calling by default. 
+ +------------------------------------------------------------------------ +r606 | lh3lh3 | 2010-07-08 01:05:19 -0400 (Thu, 08 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/misc/Makefile + +removed a debugging example + +------------------------------------------------------------------------ +r605 | lh3lh3 | 2010-07-08 01:04:09 -0400 (Thu, 08 Jul 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-.1.7-18 (r605) + * fixed an issue when a deletion and mismatch occur at the same time + and the base quality is higher than 40 (if -I40). + +------------------------------------------------------------------------ +r604 | lh3lh3 | 2010-07-02 19:32:24 -0400 (Fri, 02 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_index.c + M /trunk/samtools/misc/Makefile + +fixed a minor bug in idxstats + +------------------------------------------------------------------------ +r601 | lh3lh3 | 2010-06-16 09:03:59 -0400 (Wed, 16 Jun 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_index.c + +fixed a minor bug in indexing + +------------------------------------------------------------------------ +r600 | lh3lh3 | 2010-06-15 10:17:53 -0400 (Tue, 15 Jun 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam.c + +change printf() to puts in exporting + +------------------------------------------------------------------------ +r599 | lh3lh3 | 2010-06-13 21:41:11 -0400 (Sun, 13 Jun 2010) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + +minor fix. No actual effect. 
+ +------------------------------------------------------------------------ +r598 | lh3lh3 | 2010-06-13 21:32:45 -0400 (Sun, 13 Jun 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + +added Makefile targets to compile shared/dynamic library + +------------------------------------------------------------------------ +r596 | lh3lh3 | 2010-06-13 19:48:07 -0400 (Sun, 13 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-17 (r596) + * also keep the number of coor-less reads in the index file + +------------------------------------------------------------------------ +r595 | lh3lh3 | 2010-06-13 18:54:26 -0400 (Sun, 13 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-16 (r595) + * write additional information to bam index + +------------------------------------------------------------------------ +r594 | lh3lh3 | 2010-06-13 17:29:52 -0400 (Sun, 13 Jun 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_index.c + +fixed a bug for unmapped sequences in indexing + +------------------------------------------------------------------------ +r593 | lh3lh3 | 2010-06-12 18:11:32 -0400 (Sat, 12 Jun 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/samtools.1 + +rename iterf as iter + +------------------------------------------------------------------------ +r592 | lh3lh3 | 2010-06-12 18:02:38 -0400 (Sat, 12 Jun 2010) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-15 (r592) + * fixed a few minor memory leaks in the new pileup code + * improved the functionality of mpileup + 
+------------------------------------------------------------------------ +r591 | lh3lh3 | 2010-06-12 14:09:22 -0400 (Sat, 12 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-14 (r591) + * elementary multi-way pileup. More testing and more functionality to be done. + +------------------------------------------------------------------------ +r590 | lh3lh3 | 2010-06-12 01:00:24 -0400 (Sat, 12 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-13 (r590) + * added mpileup APIs. No compiling errors, but not tested at all. It is late. + +------------------------------------------------------------------------ +r589 | lh3lh3 | 2010-06-11 22:37:09 -0400 (Fri, 11 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-12 (r589) + * added iterator-like APIs for pileup + +------------------------------------------------------------------------ +r588 | lh3lh3 | 2010-06-11 17:41:13 -0400 (Fri, 11 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-11 (r588) + * ported a few improvements from tabix back to samtools + +------------------------------------------------------------------------ +r587 | lh3lh3 | 2010-06-11 17:33:16 -0400 (Fri, 11 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-10 (r587) + * added iterator interface for bam_fetch (ported back from tabix) + +------------------------------------------------------------------------ +r586 | lh3lh3 | 2010-06-11 13:23:53 -0400 (Fri, 11 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + A /trunk/samtools/bam_reheader.c + M /trunk/samtools/bamtk.c + M 
/trunk/samtools/bgzf.c + + * samtools-0.1.7-9 (r586) + * added "reheader" to replace the BAM header + +------------------------------------------------------------------------ +r585 | lh3lh3 | 2010-06-11 12:22:06 -0400 (Fri, 11 Jun 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kstring.h + + * samtools-0.1.7-8 (r585) + * speed up "view" + +------------------------------------------------------------------------ +r584 | lh3lh3 | 2010-06-11 12:00:41 -0400 (Fri, 11 Jun 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/bgzf.h + M /trunk/samtools/kstring.h + M /trunk/samtools/misc/wgsim_eval.pl + + * samtools-0.1.7-7 (r584) + * ported tabix BGZF to samtools + * flush BGZF after writing the BAM header and between alignment boundaries + +------------------------------------------------------------------------ +r583 | petulda | 2010-06-11 11:58:20 -0400 (Fri, 11 Jun 2010) | 1 line +Changed paths: + A /trunk/samtools/misc/varfilter.py + +Initial release on behalf of Aylwyn Scally +------------------------------------------------------------------------ +r561 | petulda | 2010-05-07 08:41:56 -0400 (Fri, 07 May 2010) | 1 line +Changed paths: + M /trunk/samtools/samtools.1 + +Added a note about the indels coordinates +------------------------------------------------------------------------ +r551 | petulda | 2010-04-23 09:42:13 -0400 (Fri, 23 Apr 2010) | 1 line +Changed paths: + M /trunk/samtools/misc/sam2vcf.pl + +Added the possibility to print or not to print the reference allele +------------------------------------------------------------------------ +r546 | petulda | 2010-04-15 04:33:55 -0400 (Thu, 15 Apr 2010) | 1 line +Changed paths: + M /trunk/samtools/sam_header.c + +More descriptive message for space separated tags +------------------------------------------------------------------------ +r545 | petulda | 2010-04-14 11:44:50 
-0400 (Wed, 14 Apr 2010) | 1 line +Changed paths: + M /trunk/samtools/misc/sam2vcf.pl + +Speedup with -i, no need to query the reference all the time +------------------------------------------------------------------------ +r541 | petulda | 2010-03-15 10:03:51 -0400 (Mon, 15 Mar 2010) | 1 line +Changed paths: + M /trunk/samtools/sam_header.c + +Fixed the order of sequences in the header +------------------------------------------------------------------------ +r540 | petulda | 2010-03-04 06:28:35 -0500 (Thu, 04 Mar 2010) | 1 line +Changed paths: + M /trunk/samtools/misc/sam2vcf.pl + +Added possibility to select indels only and fixed a bug in reporting homozygous indels. +------------------------------------------------------------------------ +r539 | jmarshall | 2010-02-27 06:48:17 -0500 (Sat, 27 Feb 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam.c + +Improve the invalid 'BAM\1' magic number error message, and also print it +when no bytes can be read from the alleged BAM file, e.g., in the common +user error case when a SAM file has accidentally been supplied. + +------------------------------------------------------------------------ +r538 | petulda | 2010-02-26 10:51:40 -0500 (Fri, 26 Feb 2010) | 1 line +Changed paths: + M /trunk/samtools/AUTHORS + M /trunk/samtools/bam.h + M /trunk/samtools/bam_import.c + M /trunk/samtools/sam_header.c + +Improved efficiency of header parsing +------------------------------------------------------------------------ +r537 | lh3lh3 | 2010-02-23 21:08:48 -0500 (Tue, 23 Feb 2010) | 3 lines +Changed paths: + M /trunk/samtools/misc/export2sam.pl + +Updated export2sam.pl by Chris Saunders from Illumina. + + +------------------------------------------------------------------------ +r536 | petulda | 2010-02-17 08:32:53 -0500 (Wed, 17 Feb 2010) | 1 line +Changed paths: + M /trunk/samtools/misc/samtools.pl + +Fixed filtering of SNPs near indels. Added min indel and SNP quality filter. 
+------------------------------------------------------------------------ +r535 | petulda | 2010-02-12 04:52:37 -0500 (Fri, 12 Feb 2010) | 1 line +Changed paths: + M /trunk/samtools/misc/sam2vcf.pl + +Print an error for pileups in simple format +------------------------------------------------------------------------ +r534 | lh3lh3 | 2010-02-11 14:01:41 -0500 (Thu, 11 Feb 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + +added a hidden option in pileup to output the base position (for Erin) + +------------------------------------------------------------------------ +r533 | petulda | 2010-02-09 10:12:14 -0500 (Tue, 09 Feb 2010) | 1 line +Changed paths: + M /trunk/samtools/misc/sam2vcf.pl + +Added possibility to specify a custom column title for the data column +------------------------------------------------------------------------ +r532 | petulda | 2010-02-09 09:46:09 -0500 (Tue, 09 Feb 2010) | 1 line +Changed paths: + M /trunk/samtools/bam_plcmd.c + +Added the -d option to limit maximum depth for indels. +------------------------------------------------------------------------ +r531 | petulda | 2010-02-03 07:57:27 -0500 (Wed, 03 Feb 2010) | 1 line +Changed paths: + M /trunk/samtools/misc/sam2vcf.pl + +Added VCF header +------------------------------------------------------------------------ +r530 | lh3lh3 | 2010-02-01 09:13:19 -0500 (Mon, 01 Feb 2010) | 3 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/samtools.pl + M /trunk/samtools/misc/wgsim.c + + * samtools-0.1.7-6 + * fixed a bug in faidx + +------------------------------------------------------------------------ +r529 | jmarshall | 2010-01-11 18:51:49 -0500 (Mon, 11 Jan 2010) | 2 lines +Changed paths: + M /trunk/samtools/faidx.c + +Put the right filename in the error message. 
+ +------------------------------------------------------------------------ +r528 | lh3lh3 | 2009-12-14 11:26:47 -0500 (Mon, 14 Dec 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-5 (r528) + * further add new consensus generation strategy + +------------------------------------------------------------------------ +r527 | petulda | 2009-12-11 12:31:05 -0500 (Fri, 11 Dec 2009) | 1 line +Changed paths: + M /trunk/samtools/knetfile.c + +Fixed a bug in knet_seek +------------------------------------------------------------------------ +r526 | petulda | 2009-12-11 07:51:18 -0500 (Fri, 11 Dec 2009) | 1 line +Changed paths: + M /trunk/samtools/misc/sam2vcf.pl + +Small fix in VCF format: dot for the empty INFO field +------------------------------------------------------------------------ +r525 | petulda | 2009-12-11 04:36:18 -0500 (Fri, 11 Dec 2009) | 1 line +Changed paths: + M /trunk/samtools/sam_header.c + +Allow tabs in the CO header field +------------------------------------------------------------------------ +r524 | jmarshall | 2009-12-10 10:03:58 -0500 (Thu, 10 Dec 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/Makefile.mingw + +Depend on libbam.a rather than the phony target, so that samtools is not +unnecessarily rebuilt every time. Also clean bgzip. + +------------------------------------------------------------------------ +r523 | jmarshall | 2009-12-10 09:45:32 -0500 (Thu, 10 Dec 2009) | 4 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/Makefile.mingw + +Fix a bug in compiling bgzip: this also needs knetfile.o when _USE_KNETFILE +is defined. Also introduce $(KNETFILE_O) which can be set to empty to +facilitate non-knet builds. 
+ +------------------------------------------------------------------------ +r522 | lh3lh3 | 2009-12-01 13:02:36 -0500 (Tue, 01 Dec 2009) | 4 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.7-4 (r522) + * fixed a bug in "view -r" + * added a new option "view -R" to read required read groups from a file + +------------------------------------------------------------------------ +r521 | lh3lh3 | 2009-12-01 10:00:12 -0500 (Tue, 01 Dec 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.7-3 (r521) + * calmd: optionally mask matching bases as N + +------------------------------------------------------------------------ +r520 | lh3lh3 | 2009-12-01 09:37:17 -0500 (Tue, 01 Dec 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/samtools.pl + + * samtools-0.1.7-2 (r520) + * fixed a few issues with compilation in Windows (on behalf of John) + * choose a random base as the consensus (for population genetics studies) + +------------------------------------------------------------------------ +r519 | jmarshall | 2009-11-30 10:53:02 -0500 (Mon, 30 Nov 2009) | 6 lines +Changed paths: + M /trunk/samtools/Makefile + +Put libraries at the end, so they can resolve references from libbam.a +as well, even with old-fashioned linkers. + +Also use libbam.a explicitly rather than "-L. -lbam" to ensure that we get +the freshly built library, not some other libbam.a lying around the system. + +------------------------------------------------------------------------ +r518 | jmarshall | 2009-11-30 08:44:56 -0500 (Mon, 30 Nov 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/misc/Makefile + +Also clean *.exe (for Cygwin users using this makefile). 
+ +------------------------------------------------------------------------ +r517 | jmarshall | 2009-11-30 07:09:04 -0500 (Mon, 30 Nov 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_index.c + +Index files should be opened in binary mode, not text mode. + +------------------------------------------------------------------------ +r516 | lh3lh3 | 2009-11-27 15:18:59 -0500 (Fri, 27 Nov 2009) | 2 lines +Changed paths: + A /trunk/samtools/examples/bam2bed.c + +another example program + +------------------------------------------------------------------------ +r515 | lh3lh3 | 2009-11-27 10:44:56 -0500 (Fri, 27 Nov 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/wgsim_eval.pl + M /trunk/samtools/sam.c + + * samtools-0.1.7-1 (r515) + * report an error when .fai contains duplicated names, instead of segfault + +------------------------------------------------------------------------ +r514 | jmarshall | 2009-11-24 09:45:35 -0500 (Tue, 24 Nov 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam.c + +Format 'c'-encoded auxiliary fields correctly, as *signed* integers. 
+ +------------------------------------------------------------------------ +r513 | lh3lh3 | 2009-11-16 10:13:07 -0500 (Mon, 16 Nov 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.mingw + +Update Makefile.mingw for the same reason + +------------------------------------------------------------------------ +r512 | lh3lh3 | 2009-11-16 10:00:08 -0500 (Mon, 16 Nov 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + +Fixed a bug in compiling razip + +------------------------------------------------------------------------ +r510 | lh3lh3 | 2009-11-10 10:55:41 -0500 (Tue, 10 Nov 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + +Release samtools-0.1.7 (r510) + ------------------------------------------------------------------------ r509 | lh3lh3 | 2009-11-06 09:17:09 -0500 (Fri, 06 Nov 2009) | 3 lines Changed paths: diff --git a/Makefile b/Makefile index f3fb7a0..35d578f 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,10 @@ CC= gcc CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1 +KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \ - bam_sort.o sam_header.o + bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o @@ -30,19 +31,22 @@ all-recur lib-recur clean-recur cleanlocal-recur install-recur: all:$(PROG) +.PHONY:all lib clean cleanlocal +.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur + lib:libbam.a libbam.a:$(LOBJS) $(AR) -cru $@ $(LOBJS) -samtools:lib $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) $(LIBCURSES) -lz -L. 
-lbam +samtools:$(AOBJS) libbam.a + $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -razip:razip.o razf.o knetfile.o - $(CC) $(CFLAGS) -o $@ razf.o razip.o knetfile.o -lz +razip:razip.o razf.o $(KNETFILE_O) + $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz -bgzip:bgzip.o bgzf.o - $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz +bgzip:bgzip.o bgzf.o $(KNETFILE_O) + $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz razip.o:razf.h bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h @@ -62,7 +66,23 @@ sam_header.o:sam_header.h khash.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h + +libbam.1.dylib-local:$(LOBJS) + libtool -dynamic $(LOBJS) -o libbam.1.dylib -lc -lz + +libbam.so.1-local:$(LOBJS) + $(CC) -shared -Wl,-soname,libbam.so -o libbam.so.1 $(LOBJS) -lc -lz + +dylib: + @$(MAKE) cleanlocal; \ + case `uname` in \ + Linux) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.so.1-local;; \ + Darwin) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.1.dylib-local;; \ + *) echo 'Unknown OS';; \ + esac + + cleanlocal: - rm -fr gmon.out *.o a.out *.dSYM razip $(PROG) *~ *.a + rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib clean:cleanlocal-recur diff --git a/Makefile.mingw b/Makefile.mingw index f1ae1be..9df4b9a 100644 --- a/Makefile.mingw +++ b/Makefile.mingw @@ -2,9 +2,10 @@ CC= gcc.exe AR= ar.exe CFLAGS= -g -Wall -O2 DFLAGS= -D_CURSES_LIB=2 -D_USE_KNETFILE +KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bam_sort.o \ - knetfile.o + $(KNETFILE_O) AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o sam_header.o @@ -20,19 +21,22 @@ LIBPATH= all:$(PROG) +.PHONY:all lib clean cleanlocal +.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur + lib:libbam.a libbam.a:$(LOBJS) $(AR) -cru $@ $(LOBJS) -samtools:lib 
$(AOBJS) +samtools:$(AOBJS) libbam.a $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32 -razip:razip.o razf.o knetfile.o - $(CC) $(CFLAGS) -o $@ razf.o razip.o knetfile.o -lz +razip:razip.o razf.o $(KNETFILE_O) + $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz -bgzip:bgzip.o bgzf.o - $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz +bgzip:bgzip.o bgzf.o $(KNETFILE_O) + $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz razip.o:razf.h bam.o:bam.h razf.h bam_endian.h kstring.h @@ -52,4 +56,4 @@ faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h clean: - rm -fr gmon.out *.o *.exe *.dSYM razip $(PROG) *~ *.a + rm -fr gmon.out *.o *.exe *.dSYM razip bgzip $(PROG) *~ *.a diff --git a/NEWS b/NEWS index 8db0996..28d6aaa 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,82 @@ +Beta Release 0.1.8 (11 July, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable functional changes: + + * Added the `reheader' command which replaces a BAM header with a new + header. This command is much faster than replacing header by + BAM->SAM->BAM conversions. + + * Added the `mpileup' command which computes the pileup of multiple + alignments. + + * The `index' command now stores the number of mapped and unmapped + reads in the index file. This information can be retrieved quickly by + the new `idxstats' command. + + * By default, pileup used the SOAPsnp model for SNP calling. This + avoids the floating overflow in the MAQ model which leads to spurious + calls in repetitive regions, although these calls will be immediately + filtered by varFilter. + + * The `tview' command now correctly handles CIGARs like 7I10M and + 10M1P1I10M which cause assertion failure in earlier versions. + + * Tview accepts a region like `=10,000' where `=' stands for the + current sequence name. This saves typing for long sequence names. + + * Added the `-d' option to `pileup' which avoids slow indel calling + in ultradeep regions by subsampling reads locally. 
+ + * Added the `-R' option to `view' which retrieves alignments in read + groups listed in the specified file. + +Performance improvements: + + * The BAM->SAM conversion is up to twice as fast, depending on the + characteristics of the input. + + * Parsing SAM headers with a lot of reference sequences is now much + faster. + + * The number of lseek() calls per query is reduced when the query + region contains no read alignments. + +Bug fixes: + + * Fixed an issue in the indel caller that leads to miscall of indels. + Note that this solution may not work well when the sequencing indel + error rate is higher than the rate of SNPs. + + * Fixed another issue in the indel caller which may lead to incorrect + genotype. + + * Fixed a bug in `sort' when option `-o' is applied. + + * Fixed a bug in `view -r'. + +APIs and other changes: + + * Added iterator interfaces to random access and pileup. The callback + interfaces directly call the iterator interfaces. + + * The BGZF blocks holding the BAM header are independent of alignment + BGZF blocks. Alignment records shorter than 64kB are guaranteed to be + fully contained in one BGZF block. This change is fully compatible + with the old version of samtools/picard. + +Changes in other utilities: + + * Updated export2sam.pl by Chris Saunders. + + * Improved the sam2vcf.pl script. + + * Added a Python version of varfilter.py by Aylwyn Scally. 
+ +(0.1.8: 11 July 2010, r613) + + + Beta Release 0.1.7 (10 November, 2009) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bam.c b/bam.c index ee7642b..94b0aa8 100644 --- a/bam.c +++ b/bam.c @@ -70,6 +70,7 @@ bam_header_t *bam_header_read(bamFile fp) { bam_header_t *header; char buf[4]; + int magic_len; int32_t i = 1, name_len; // check EOF i = bgzf_check_EOF(fp); @@ -80,9 +81,9 @@ bam_header_t *bam_header_read(bamFile fp) } else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n"); // read "BAM1" - if (bam_read(fp, buf, 4) != 4) return 0; - if (strncmp(buf, "BAM\001", 4)) { - fprintf(stderr, "[bam_header_read] wrong header\n"); + magic_len = bam_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { + fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); return 0; } header = bam_header_init(); @@ -140,6 +141,7 @@ int bam_header_write(bamFile fp, const bam_header_t *header) bam_write(fp, &x, 4); } else bam_write(fp, &header->target_len[i], 4); } + bgzf_flush(fp); return 0; } @@ -207,6 +209,7 @@ inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8 x[5] = c->mtid; x[6] = c->mpos; x[7] = c->isize; + bgzf_flush_try(fp, 4 + block_len); if (bam_is_be) { for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); y = block_len; @@ -232,8 +235,8 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kstring_t str; str.l = str.m = 0; str.s = 0; - ksprintf(&str, "%s\t", bam1_qname(b)); - if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag); + kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str); + if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); } else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); else { // BAM_OFSTR for (i = 0; i < 16; ++i) @@ -241,41 +244,43 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kputc(bam_flag2char_table[i], &str); kputc('\t', &str); } - if (c->tid < 0) 
kputs("*\t", &str); - else ksprintf(&str, "%s\t", header->target_name[c->tid]); - ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); + if (c->tid < 0) kputsn("*\t", 2, &str); + else { kputs(header->target_name[c->tid], &str); kputc('\t', &str); } + kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { - for (i = 0; i < c->n_cigar; ++i) - ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); + for (i = 0; i < c->n_cigar; ++i) { + kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); + kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + } } kputc('\t', &str); - if (c->mtid < 0) kputs("*\t", &str); - else if (c->mtid == c->tid) kputs("=\t", &str); - else ksprintf(&str, "%s\t", header->target_name[c->mtid]); - ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); + if (c->mtid < 0) kputsn("*\t", 2, &str); + else if (c->mtid == c->tid) kputsn("=\t", 2, &str); + else { kputs(header->target_name[c->mtid], &str); kputc('\t', &str); } + kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); kputc('\t', &str); if (t[0] == 0xff) kputc('*', &str); else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); - } else ksprintf(&str, "*\t*"); + } else kputsn("*\t*", 3, &str); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; - ksprintf(&str, "\t%c%c:", key[0], key[1]); - if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } - else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } - else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; } - else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } - else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } - else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); 
s += 4; } - else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } + kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); + if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } + else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } + else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } + else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; } + else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } + else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } + else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } - else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } } return str.s; } @@ -288,7 +293,7 @@ char *bam_format1(const bam_header_t *header, const bam1_t *b) void bam_view1(const bam_header_t *header, const bam1_t *b) { char *s = bam_format1(header, b); - printf("%s\n", s); + puts(s); free(s); } diff --git a/bam.h b/bam.h index 291b303..8e26ea6 100644 --- a/bam.h +++ b/bam.h @@ -87,7 +87,7 @@ typedef struct { char **target_name; uint32_t *target_len; void *dict, *hash, *rg2lib; - int l_text; + size_t l_text, n_text; char *text; } bam_header_t; @@ -190,6 +190,8 @@ typedef struct { uint8_t *data; } bam1_t; +typedef struct __bam_iter_t *bam_iter_t; + #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) @@ -272,6 +274,10 @@ extern char bam_nt16_nt4_table[]; extern "C" { #endif + /********************* + * Low-level SAM I/O * + *********************/ + /*! 
@abstract TAM file handler */ typedef struct __tamFile_t *tamFile; @@ -323,6 +329,7 @@ extern "C" { be destroyed in the first place. */ int sam_header_parse(bam_header_t *h); + int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); /*! @abstract Parse @RG lines a update a header struct @@ -336,12 +343,22 @@ extern "C" { #define sam_write1(header, b) bam_view1(header, b) + + /******************************** + * APIs for string dictionaries * + ********************************/ + int bam_strmap_put(void *strmap, const char *rg, const char *lib); const char *bam_strmap_get(const void *strmap, const char *rg); void *bam_strmap_dup(const void*); void *bam_strmap_init(); void bam_strmap_destroy(void *strmap); + + /********************* + * Low-level BAM I/O * + *********************/ + /*! @abstract Initialize a header structure. @return the pointer to the header structure @@ -440,6 +457,11 @@ extern "C" { const char *bam_get_library(bam_header_t *header, const bam1_t *b); + + /*************** + * pileup APIs * + ***************/ + /*! @typedef @abstract Structure for one alignment covering the pileup position. @field b pointer to the alignment @@ -461,11 +483,25 @@ extern "C" { uint32_t is_del:1, is_head:1, is_tail:1; } bam_pileup1_t; - struct __bam_plbuf_t; - /*! 
@abstract pileup buffer */ - typedef struct __bam_plbuf_t bam_plbuf_t; + typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); - void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + struct __bam_plp_t; + typedef struct __bam_plp_t *bam_plp_t; + + bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); + int bam_plp_push(bam_plp_t iter, const bam1_t *b); + const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + void bam_plp_set_mask(bam_plp_t iter, int mask); + void bam_plp_reset(bam_plp_t iter); + void bam_plp_destroy(bam_plp_t iter); + + struct __bam_mplp_t; + typedef struct __bam_mplp_t *bam_mplp_t; + + bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); + void bam_mplp_destroy(bam_mplp_t iter); + int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); /*! @typedef @abstract Type of function to be called by bam_plbuf_push(). @@ -478,44 +514,16 @@ extern "C" { */ typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); - /*! - @abstract Reset a pileup buffer for another pileup process - @param buf the pileup buffer to be reset - */ - void bam_plbuf_reset(bam_plbuf_t *buf); + typedef struct { + bam_plp_t iter; + bam_pileup_f func; + void *data; + } bam_plbuf_t; - /*! - @abstract Initialize a buffer for pileup. - @param func fucntion to be called by bam_pileup_core() - @param data user provided data - @return pointer to the pileup buffer - */ + void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + void bam_plbuf_reset(bam_plbuf_t *buf); bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); - - /*! - @abstract Destroy a pileup buffer. - @param buf pointer to the pileup buffer - */ void bam_plbuf_destroy(bam_plbuf_t *buf); - - /*! - @abstract Push an alignment to the pileup buffer. 
- @param b alignment to be pushed - @param buf pileup buffer - @see bam_plbuf_init() - @return always 0 currently - - @discussion If all the alignments covering a particular site have - been collected, this function will call the user defined function - as is provided to bam_plbuf_init(). The coordinate of the site and - all the alignments will be transferred to the user defined - function as function parameters. - - When all the alignments are pushed to the buffer, this function - needs to be called with b equal to NULL. This will flush the - buffer. A pileup buffer can only be reused when bam_plbuf_reset() - is called. - */ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); @@ -534,6 +542,11 @@ extern "C" { /*! @abstract bam_plbuf_push() equivalent with level calculated. */ int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); + + /********************* + * BAM indexing APIs * + *********************/ + struct __bam_index_t; typedef struct __bam_index_t bam_index_t; @@ -582,6 +595,10 @@ extern "C" { */ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); + bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end); + int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b); + void bam_iter_destroy(bam_iter_t iter); + /*! @abstract Parse a region in the format: "chr2:100,000-200,000". @discussion bam_header_t::hash will be initialized if empty. @@ -594,6 +611,11 @@ extern "C" { */ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + + /************************** + * APIs for optional tags * + **************************/ + /*! 
@abstract Retrieve data of a tag @param b pointer to an alignment struct @@ -617,6 +639,11 @@ extern "C" { void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() + + /***************** + * Miscellaneous * + *****************/ + /*! @abstract Calculate the rightmost coordinate of an alignment on the reference genome. diff --git a/bam_aux.c b/bam_aux.c index 89e99f2..fbcd982 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -115,7 +115,7 @@ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *be *ref_id = kh_value(h, iter); if (i == k) { /* dump the whole sequence */ *begin = 0; *end = 1<<29; free(s); - return -1; + return 0; } for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; *begin = atoi(p); diff --git a/bam_import.c b/bam_import.c index 9d463d1..9d84328 100644 --- a/bam_import.c +++ b/bam_import.c @@ -116,7 +116,7 @@ static bam_header_t *hash2header(const kh_ref_t *hash) bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; - int c, dret, ret; + int c, dret, ret, error = 0; gzFile fp; kstream_t *ks; kstring_t *str; @@ -135,6 +135,10 @@ bam_header_t *sam_header_read2(const char *fn) ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); + if (ret == 0) { + fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s); + error = 1; + } kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); @@ -143,6 +147,7 @@ bam_header_t *sam_header_read2(const char *fn) gzclose(fp); free(str->s); free(str); fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); + if (error) return 0; header = hash2header(hash); kh_destroy(ref, hash); return header; @@ -163,9 +168,24 @@ static inline void parse_error(int64_t n_lines, const char * __restrict msg) } static inline void append_text(bam_header_t *header, kstring_t *str) 
{ - int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null + size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null kroundup32(x); kroundup32(y); - if (x < y) header->text = (char*)realloc(header->text, y); + if (x < y) + { + header->n_text = y; + header->text = (char*)realloc(header->text, y); + if ( !header->text ) + { + fprintf(stderr,"realloc failed to alloc %ld bytes\n", y); + abort(); + } + } + // Sanity check + if ( header->l_text+str->l+1 >= header->n_text ) + { + fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,header->n_text,x,y); + abort(); + } strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. header->l_text += str->l + 1; header->text[header->l_text] = 0; diff --git a/bam_index.c b/bam_index.c index a627884..4152f20 100644 --- a/bam_index.c +++ b/bam_index.c @@ -42,6 +42,8 @@ // 1<<14 is the size of minimum bin. #define BAM_LIDX_SHIFT 14 +#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1 + typedef struct { uint64_t u, v; } pair64_t; @@ -63,6 +65,7 @@ KHASH_MAP_INIT_INT(i, bam_binlist_t) struct __bam_index_t { int32_t n; + uint64_t n_no_coor; // unmapped reads without coordinate khash_t(i) **index; bam_lidx_t *index2; }; @@ -98,8 +101,12 @@ static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); } - for (i = beg + 1; i <= end; ++i) - if (index2->offset[i] == 0) index2->offset[i] = offset; + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } index2->n = end + 1; } @@ -113,7 +120,7 @@ static void merge_chunks(bam_index_t *idx) index = idx->index[i]; for (k = kh_begin(index); k != kh_end(index); ++k) { bam_binlist_t *p; - if 
(!kh_exist(index, k)) continue; + if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue; p = &kh_value(index, k); m = 0; for (l = 1; l < p->n; ++l) { @@ -130,6 +137,17 @@ static void merge_chunks(bam_index_t *idx) #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) } +static void fill_missing(bam_index_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + bam_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + bam_index_t *bam_index_core(bamFile fp) { bam1_t *b; @@ -139,7 +157,7 @@ bam_index_t *bam_index_core(bamFile fp) uint32_t last_bin, save_bin; int32_t last_coor, last_tid, save_tid; bam1_core_t *c; - uint64_t save_off, last_off; + uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor; idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); b = (bam1_t*)calloc(1, sizeof(bam1_t)); @@ -154,7 +172,10 @@ bam_index_t *bam_index_core(bamFile fp) save_bin = save_tid = last_tid = last_bin = 0xffffffffu; save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; + n_mapped = n_unmapped = n_no_coor = off_end = 0; + off_beg = off_end = bam_tell(fp); while ((ret = bam_read1(fp, b)) >= 0) { + if (c->tid < 0) ++n_no_coor; if (last_tid != c->tid) { // change of chromosomes last_tid = c->tid; last_bin = 0xffffffffu; @@ -163,10 +184,17 @@ bam_index_t *bam_index_core(bamFile fp) bam1_qname(b), last_coor, c->pos, c->tid+1); exit(1); } - if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element + off_end = last_off; + 
insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); + n_mapped = n_unmapped = 0; + off_beg = off_end; + } save_off = last_off; save_bin = last_bin = c->bin; save_tid = c->tid; @@ -177,13 +205,23 @@ bam_index_t *bam_index_core(bamFile fp) (unsigned long long)bam_tell(fp), (unsigned long long)last_off); exit(1); } + if (c->flag & BAM_FUNMAP) ++n_unmapped; + else ++n_mapped; last_off = bam_tell(fp); last_coor = b->core.pos; } - if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + if (save_tid >= 0) { + insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); + } merge_chunks(idx); + fill_missing(idx); + if (ret >= 0) + while ((ret = bam_read1(fp, b)) >= 0) ++n_no_coor; if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); free(b->data); free(b); + idx->n_no_coor = n_no_coor; return idx; } @@ -261,6 +299,11 @@ void bam_index_save(const bam_index_t *idx, FILE *fp) bam_swap_endian_8p(&index2->offset[x]); } else fwrite(index2->offset, 8, index2->n, fp); } + { // write the number of reads coor-less records. 
+ uint64_t x = idx->n_no_coor; + if (bam_is_be) bam_swap_endian_8p(&x); + fwrite(&x, 8, 1, fp); + } fflush(fp); } @@ -322,6 +365,8 @@ static bam_index_t *bam_index_load_core(FILE *fp) if (bam_is_be) for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); } + if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0; + if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor); return idx; } @@ -339,13 +384,13 @@ bam_index_t *bam_index_load_local(const char *_fn) } else fn = strdup(_fn); fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); - fp = fopen(fnidx, "r"); + fp = fopen(fnidx, "rb"); if (fp == 0) { // try "{base}.bai" char *s = strstr(fn, "bam"); if (s == fn + strlen(fn) - 3) { strcpy(fnidx, fn); fnidx[strlen(fn)-1] = 'i'; - fp = fopen(fnidx, "r"); + fp = fopen(fnidx, "rb"); } } free(fnidx); free(fn); @@ -375,7 +420,7 @@ static void download_from_remote(const char *url) fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); return; } - if ((fp = fopen(fn, "w")) == 0) { + if ((fp = fopen(fn, "wb")) == 0) { fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); knet_close(fp_remote); return; @@ -425,7 +470,7 @@ int bam_index_build2(const char *fn, const char *_fnidx) fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); } else fnidx = strdup(_fnidx); - fpidx = fopen(fnidx, "w"); + fpidx = fopen(fnidx, "wb"); if (fpidx == 0) { fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); free(fnidx); @@ -446,7 +491,7 @@ int bam_index_build(const char *fn) int bam_index(int argc, char *argv[]) { if (argc < 2) { - fprintf(stderr, "Usage: samtools index []\n"); + fprintf(stderr, "Usage: samtools index [out.index]\n"); return 1; } if (argc >= 3) bam_index_build2(argv[1], argv[2]); @@ -454,11 +499,43 @@ int bam_index(int argc, char *argv[]) return 0; } -#define MAX_BIN 37450 // =(8^6-1)/7+1 +int bam_idxstats(int argc, char *argv[]) 
+{ + bam_index_t *idx; + bam_header_t *header; + bamFile fp; + int i; + if (argc < 2) { + fprintf(stderr, "Usage: samtools idxstats \n"); + return 1; + } + fp = bam_open(argv[1], "r"); + if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; } + header = bam_header_read(fp); + bam_close(fp); + idx = bam_index_load(argv[1]); + if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; } + for (i = 0; i < idx->n; ++i) { + khint_t k; + khash_t(i) *h = idx->index[i]; + printf("%s\t%d", header->target_name[i], header->target_len[i]); + k = kh_get(i, h, BAM_MAX_BIN); + if (k != kh_end(h)) + printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v); + else printf("\t0\t0"); + putchar('\n'); + } + printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor); + bam_header_destroy(header); + bam_index_destroy(idx); + return 0; +} -static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN]) { int i = 0, k; + if (beg >= end) return 0; + if (end >= 1u<<29) end = 1u<<29; --end; list[i++] = 0; for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; @@ -476,8 +553,15 @@ static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) return (rend > beg && rbeg < end); } +struct __bam_iter_t { + int from_first; // read from the first record; no random access + int tid, beg, end, n_off, i, finished; + uint64_t curr_off; + pair64_t *off; +}; + // bam_fetch helper function retrieves -pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off) +bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) { uint16_t *bins; int i, n_bins, n_off; @@ -485,17 +569,34 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e khint_t k; khash_t(i) *index; uint64_t min_off; - - bins = (uint16_t*)calloc(MAX_BIN, 2); + 
bam_iter_t iter = 0; + + if (beg < 0) beg = 0; + if (end < beg) return 0; + // initialize iter + iter = calloc(1, sizeof(struct __bam_iter_t)); + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; + // + bins = (uint16_t*)calloc(BAM_MAX_BIN, 2); n_bins = reg2bins(beg, end, bins); index = idx->index[tid]; - min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + if (idx->index2[tid].n > 0) { + min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4 + int n = beg>>BAM_LIDX_SHIFT; + if (n > idx->index2[tid].n) n = idx->index2[tid].n; + for (i = n - 1; i >= 0; --i) + if (idx->index2[tid].offset[i] != 0) break; + if (i >= 0) min_off = idx->index2[tid].offset[i]; + } + } else min_off = 0; // tabix 0.1.2 may produce such index files for (i = n_off = 0; i < n_bins; ++i) { if ((k = kh_get(i, index, bins[i])) != kh_end(index)) n_off += kh_value(index, k).n; } if (n_off == 0) { - free(bins); return 0; + free(bins); return iter; } off = (pair64_t*)calloc(n_off, 16); for (i = n_off = 0; i < n_bins; ++i) { @@ -534,41 +635,62 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e } bam_destroy1(b); } - *cnt_off = n_off; + iter->n_off = n_off; iter->off = off; + return iter; +} + +pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off) +{ // for pysam compatibility + bam_iter_t iter; + pair64_t *off; + iter = bam_iter_query(idx, tid, beg, end); + off = iter->off; *cnt_off = iter->n_off; + free(iter); return off; } -int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +void bam_iter_destroy(bam_iter_t iter) { - int n_off; - pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off); - if (off == 0) return 0; - { - 
// retrive alignments - uint64_t curr_off; - int i, ret, n_seeks; - n_seeks = 0; i = -1; curr_off = 0; - bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); - for (;;) { - if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk - if (i == n_off - 1) break; // no more chunks - if (i >= 0) assert(curr_off == off[i].v); // otherwise bug - if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek - bam_seek(fp, off[i+1].u, SEEK_SET); - curr_off = bam_tell(fp); - ++n_seeks; - } - ++i; + if (iter) { free(iter->off); free(iter); } +} + +int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) +{ + if (iter->finished) return -1; + if (iter->from_first) { + int ret = bam_read1(fp, b); + if (ret < 0) iter->finished = 1; + return ret; + } + if (iter->off == 0) return -1; + for (;;) { + int ret; + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk + if (iter->i == iter->n_off - 1) break; // no more chunks + if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug + if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek + bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET); + iter->curr_off = bam_tell(fp); } - if ((ret = bam_read1(fp, b)) > 0) { - curr_off = bam_tell(fp); - if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed - else if (is_overlap(beg, end, b)) func(b, data); - } else break; // end of file + ++iter->i; } -// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); - bam_destroy1(b); + if ((ret = bam_read1(fp, b)) > 0) { + iter->curr_off = bam_tell(fp); + if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed + else if (is_overlap(iter->beg, iter->end, b)) return ret; + } else break; // end of file } - free(off); + iter->finished = 1; + return -1; +} + +int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) 
+{ + bam_iter_t iter; + bam1_t *b; + b = bam_init1(); + iter = bam_iter_query(idx, tid, beg, end); + while (bam_iter_read(fp, iter, b) >= 0) func(b, data); + bam_destroy1(b); return 0; } diff --git a/bam_maqcns.c b/bam_maqcns.c index 71c2185..cad63d7 100644 --- a/bam_maqcns.c +++ b/bam_maqcns.c @@ -310,6 +310,7 @@ bam_maqindel_opt_t *bam_maqindel_opt_init() bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); mi->q_indel = 40; mi->r_indel = 0.00015; + mi->r_snp = 0.001; // mi->mm_penalty = 3; mi->indel_err = 4; @@ -406,7 +407,8 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } { // the core part char *ref2, *rs, *inscns = 0; - int k, l, *score, *pscore, max_ins = types[n_types-1]; + int qr_snp, k, l, *score, *pscore, max_ins = types[n_types-1]; + qr_snp = (int)(-4.343 * log(mi->r_snp) + .499); if (max_ins > 0) { // get the consensus of inserted sequences int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); // count occurrences @@ -446,12 +448,18 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c for (i = 0; i < n_types; ++i) { ka_param_t ap = ka_param_blast; ap.band_width = 2 * types[n_types - 1] + 2; + ap.gap_end = 0; // write ref2 for (k = 0, j = left; j <= pos; ++j) ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; if (types[i] <= 0) j += -types[i]; else for (l = 0; l < types[i]; ++l) ref2[k++] = bam_nt16_nt4_table[(int)inscns[i*max_ins + l]]; + if (types[0] < 0) { // mask deleted sequences + int jj, tmp = types[i] >= 0? 
-types[0] : -types[0] + types[i]; + for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j) + ref2[k++] = 4; + } for (; j < right && ref[j]; ++j) ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; if (j < right) right = j; @@ -482,22 +490,27 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c if (op == BAM_CMATCH) { int k; for (k = 0; k < len; ++k) - if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k]; + if (ref2[x+k] != rs[y+k] && ref2[x+k] < 4) + ps += bam1_qual(p->b)[y+k] < qr_snp? bam1_qual(p->b)[y+k] : qr_snp; x += len; y += len; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - if (op == BAM_CINS) ps += mi->q_indel * len; + if (op == BAM_CINS && l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; y += len; } else if (op == BAM_CDEL) { - ps += mi->q_indel * len; + if (l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; x += len; } } pscore[i*n+j] = ps; - /*if (pos == 2618517) { // for debugging only - fprintf(stderr, "pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend); - for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); fprintf(stderr, "\n"); - for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l]], stderr); fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); fputc('\n', stderr); + /*if (1) { // for debugging only + fprintf(stderr, "id=%d, pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, %d, ", + j, pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend, mi->q_indel); + for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); + fprintf(stderr, "\n"); + for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l+tbeg-left]], stderr); + fputc('\n', stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); + fputc('\n', stderr); }*/ free(acigar); } @@ 
-560,7 +573,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c ret->gl[0] = ret->gl[1] = 0; for (j = 0; j < n; ++j) { int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; - //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2); + //fprintf(stderr, "id=%d, %d, %d, %d, %d, %d\n", j, pl[j].b->core.pos+1, types[max1_i], types[max2_i], s1, s2); if (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err; else ret->gl[1] += s2 - s1 < seq_err? s2 - s1 : seq_err; } diff --git a/bam_maqcns.h b/bam_maqcns.h index fa5489d..6cc5355 100644 --- a/bam_maqcns.h +++ b/bam_maqcns.h @@ -16,8 +16,9 @@ typedef struct { } bam_maqcns_t; typedef struct { - int q_indel; - float r_indel; + int q_indel; // indel sequencing error, phred scaled + float r_indel; // indel prior + float r_snp; // snp prior // hidden parameters, unchangeable from command line int mm_penalty, indel_err, ambi_thres; } bam_maqindel_opt_t; diff --git a/bam_md.c b/bam_md.c index 3ca7309..17b0a4a 100644 --- a/bam_md.c +++ b/bam_md.c @@ -6,7 +6,7 @@ #include "sam.h" #include "kstring.h" -void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); @@ -53,6 +53,26 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal) } } ksprintf(str, "%d", u); + // apply max_nm + if (max_nm > 0 && nm >= max_nm) { + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match + seq[z/2] |= (z&1)? 
0x0f : 0xf0; + bam1_qual(b)[z] = 0; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } // update NM old_nm = bam_aux_get(b, "NM"); if (c->flag & BAM_FUNMAP) return; @@ -83,9 +103,14 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal) free(str->s); free(str); } +void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +{ + bam_fillmd1_core(b, ref, is_equal, 0); +} + int bam_fillmd(int argc, char *argv[]) { - int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed; + int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; @@ -94,12 +119,13 @@ int bam_fillmd(int argc, char *argv[]) is_bam_out = is_sam_in = is_uncompressed = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "eubS")) >= 0) { + while ((c = getopt(argc, argv, "eubSn:")) >= 0) { switch (c) { case 'e': is_equal = 1; break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': is_sam_in = 1; break; + case 'n': max_nm = atoi(optarg); break; default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; } } @@ -136,7 +162,7 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", fp->header->target_name[tid]); } - if (ref) bam_fillmd1(b, ref, is_equal); + if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); } samwrite(fpout, b); } diff --git a/bam_pileup.c b/bam_pileup.c index f68f400..3c41a16 100644 --- a/bam_pileup.c +++ b/bam_pileup.c @@ -73,18 +73,28 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) p->qpos = y + (pos - x); if (x == pos && is_restart) p->is_head = 1; if (x + l - 1 == pos) { // come to the end of a match - if (k < c->n_cigar - 1) { // there 
are additional operation(s) + int has_next_match = 0; + unsigned i; + for (i = k + 1; i < c->n_cigar; ++i) { + uint32_t cigar = bam1_cigar(b)[i]; + int opi = cigar&BAM_CIGAR_MASK; + if (opi == BAM_CMATCH) { + has_next_match = 1; + break; + } else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break; + } + if (!has_next_match) p->is_tail = 1; + if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s) uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins - if (op_next == BAM_CDEL || op_next == BAM_CINS) { - if (k + 2 < c->n_cigar) op_next = bam1_cigar(b)[k+2]&BAM_CIGAR_MASK; - else p->is_tail = 1; + else if (op_next == BAM_CPAD && k + 2 < c->n_cigar) { // no working for adjacent padding + cigar = bam1_cigar(b)[k+2]; op_next = cigar&BAM_CIGAR_MASK; + if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del + else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins } - if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP) - p->is_tail = 1; // tail - } else p->is_tail = 1; // this is the last operation; set tail + } } } x += l; y += l; @@ -96,7 +106,8 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) x += l; } else if (op == BAM_CREF_SKIP) x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); + if (is_restart) is_restart ^= (op == BAM_CMATCH); + else is_restart ^= (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); if (x > pos) { if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all break; @@ -108,119 +119,167 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) /* --- END: 
Auxiliary functions */ -struct __bam_plbuf_t { +/******************* + * pileup iterator * + *******************/ + +struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail, *dummy; - bam_pileup_f func; - void *func_data; int32_t tid, pos, max_tid, max_pos; - int max_pu, is_eof; - bam_pileup1_t *pu; - int flag_mask; + int is_eof, flag_mask, max_plp, error; + bam_pileup1_t *plp; + // for the "auto" interface only + bam1_t *b; + bam_plp_auto_f func; + void *data; }; -void bam_plbuf_reset(bam_plbuf_t *buf) +bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) { - lbnode_t *p, *q; - buf->max_tid = buf->max_pos = -1; - buf->tid = buf->pos = 0; - buf->is_eof = 0; - for (p = buf->head; p->next;) { - q = p->next; - mp_free(buf->mp, p); - p = q; + bam_plp_t iter; + iter = calloc(1, sizeof(struct __bam_plp_t)); + iter->mp = mp_init(); + iter->head = iter->tail = mp_alloc(iter->mp); + iter->dummy = mp_alloc(iter->mp); + iter->max_tid = iter->max_pos = -1; + iter->flag_mask = BAM_DEF_MASK; + if (func) { + iter->func = func; + iter->data = data; + iter->b = bam_init1(); } - buf->head = buf->tail; + return iter; } -void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) -{ - if (mask < 0) buf->flag_mask = BAM_DEF_MASK; - else buf->flag_mask = BAM_FUNMAP | mask; -} - -bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +void bam_plp_destroy(bam_plp_t iter) { - bam_plbuf_t *buf; - buf = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t)); - buf->func = func; buf->func_data = data; - buf->mp = mp_init(); - buf->head = buf->tail = mp_alloc(buf->mp); - buf->dummy = mp_alloc(buf->mp); - buf->max_tid = buf->max_pos = -1; - buf->flag_mask = BAM_DEF_MASK; - return buf; + mp_free(iter->mp, iter->dummy); + mp_free(iter->mp, iter->head); + if (iter->mp->cnt != 0) + fprintf(stderr, "[bam_plp_destroy] memory leak: %d. 
Continue anyway.\n", iter->mp->cnt); + mp_destroy(iter->mp); + if (iter->b) bam_destroy1(iter->b); + free(iter->plp); + free(iter); } -void bam_plbuf_destroy(bam_plbuf_t *buf) +const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) { - mp_free(buf->mp, buf->dummy); - mp_free(buf->mp, buf->head); - if (buf->mp->cnt != 0) - fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", buf->mp->cnt); - mp_destroy(buf->mp); - free(buf->pu); - free(buf); + if (iter->error) { *_n_plp = -1; return 0; } + *_n_plp = 0; + if (iter->is_eof && iter->head->next == 0) return 0; + while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { + int n_plp = 0; + lbnode_t *p, *q; + // write iter->plp at iter->pos + iter->dummy->next = iter->head; + for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove + q->next = p->next; mp_free(iter->mp, p); p = q; + } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup + if (n_plp == iter->max_plp) { // then double the capacity + iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; + iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); + } + iter->plp[n_plp].b = &p->b; + if (resolve_cigar(iter->plp + n_plp, iter->pos)) ++n_plp; // skip the read if we are looking at ref-skip + } + } + iter->head = iter->dummy->next; // dummy->next may be changed + *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; + // update iter->tid and iter->pos + if (iter->head->next) { + if (iter->tid > iter->head->b.core.tid) { + fprintf(stderr, "[%s] unsorted input. 
Pileup aborts.\n", __func__); + iter->error = 1; + *_n_plp = -1; + return 0; + } + } + if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence + iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference + } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid + iter->pos = iter->head->beg; // jump to the next position + } else ++iter->pos; // scan contiguously + // return + if (n_plp) return iter->plp; + if (iter->is_eof && iter->head->next == 0) break; + } + return 0; } -int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +int bam_plp_push(bam_plp_t iter, const bam1_t *b) { - if (b) { // fill buffer + if (iter->error) return -1; + if (b) { if (b->core.tid < 0) return 0; - if (b->core.flag & buf->flag_mask) return 0; - bam_copy1(&buf->tail->b, b); - buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); - if (b->core.tid < buf->max_tid) { + if (b->core.flag & iter->flag_mask) return 0; + bam_copy1(&iter->tail->b, b); + iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b)); + if (b->core.tid < iter->max_tid) { fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); + iter->error = 1; return -1; } - if ((b->core.tid == buf->max_tid) && (buf->tail->beg < buf->max_pos)) { + if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n"); + iter->error = 1; return -1; } - buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg; - if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { - buf->tail->next = mp_alloc(buf->mp); - buf->tail = buf->tail->next; - } - } else buf->is_eof = 1; - while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) { - int n_pu = 0; - lbnode_t *p, *q; - buf->dummy->next = buf->head; - for (p = buf->head, q = 
buf->dummy; p->next; q = p, p = p->next) { - if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list - q->next = p->next; mp_free(buf->mp, p); p = q; - } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup - if (n_pu == buf->max_pu) { // then double the capacity - buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256; - buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu); - } - buf->pu[n_pu].b = &p->b; - if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP - } + iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; + if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { + iter->tail->next = mp_alloc(iter->mp); + iter->tail = iter->tail->next; } - buf->head = buf->dummy->next; // dummy->next may be changed - if (n_pu) { // then call user defined function - buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data); - } - // update tid and pos - if (buf->head->next) { - if (buf->tid > buf->head->b.core.tid) { - fprintf(stderr, "[bam_plbuf_push] unsorted input. 
Pileup aborts.\n"); - return 1; + } else iter->is_eof = 1; + return 0; +} + +const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + const bam_pileup1_t *plp; + if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + else { + *_n_plp = 0; + if (iter->is_eof) return 0; + while (iter->func(iter->data, iter->b) >= 0) { + if (bam_plp_push(iter, iter->b) < 0) { + *_n_plp = -1; + return 0; } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; } - if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence - buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference - } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid - buf->pos = buf->head->beg; // jump to the next position - } else ++buf->pos; // scan contiguously - if (buf->is_eof && buf->head->next == 0) break; + bam_plp_push(iter, 0); + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + return 0; } - return 0; } +void bam_plp_reset(bam_plp_t iter) +{ + lbnode_t *p, *q; + iter->max_tid = iter->max_pos = -1; + iter->tid = iter->pos = 0; + iter->is_eof = 0; + for (p = iter->head; p->next;) { + q = p->next; + mp_free(iter->mp, p); + p = q; + } + iter->head = iter->tail; +} + +void bam_plp_set_mask(bam_plp_t iter, int mask) +{ + iter->flag_mask = mask < 0? 
BAM_DEF_MASK : (BAM_FUNMAP | mask); +} + +/***************** + * callback APIs * + *****************/ + int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) { bam_plbuf_t *buf; @@ -236,3 +295,102 @@ int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) bam_destroy1(b); return 0; } + +void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) +{ + bam_plp_set_mask(buf->iter, mask); +} + +void bam_plbuf_reset(bam_plbuf_t *buf) +{ + bam_plp_reset(buf->iter); +} + +bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +{ + bam_plbuf_t *buf; + buf = calloc(1, sizeof(bam_plbuf_t)); + buf->iter = bam_plp_init(0, 0); + buf->func = func; + buf->data = data; + return buf; +} + +void bam_plbuf_destroy(bam_plbuf_t *buf) +{ + bam_plp_destroy(buf->iter); + free(buf); +} + +int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +{ + int ret, n_plp, tid, pos; + const bam_pileup1_t *plp; + ret = bam_plp_push(buf->iter, b); + if (ret < 0) return ret; + while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) + buf->func(tid, pos, n_plp, plp, buf->data); + return 0; +} + +/*********** + * mpileup * + ***********/ + +struct __bam_mplp_t { + int n; + uint64_t min, *pos; + bam_plp_t *iter; + int *n_plp; + const bam_pileup1_t **plp; +}; + +bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) +{ + int i; + bam_mplp_t iter; + iter = calloc(1, sizeof(struct __bam_mplp_t)); + iter->pos = calloc(n, 8); + iter->n_plp = calloc(n, sizeof(int)); + iter->plp = calloc(n, sizeof(void*)); + iter->iter = calloc(n, sizeof(void*)); + iter->n = n; + iter->min = (uint64_t)-1; + for (i = 0; i < n; ++i) { + iter->iter[i] = bam_plp_init(func, data[i]); + iter->pos[i] = iter->min; + } + return iter; +} + +void bam_mplp_destroy(bam_mplp_t iter) +{ + int i; + for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); + free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); + free(iter); +} + +int bam_mplp_auto(bam_mplp_t iter, 
int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +{ + int i, ret = 0; + uint64_t new_min = (uint64_t)-1; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { + int tid, pos; + iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); + iter->pos[i] = (uint64_t)tid<<32 | pos; + } + if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; + } + iter->min = new_min; + if (new_min == (uint64_t)-1) return 0; + *_tid = new_min>>32; *_pos = (uint32_t)new_min; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { + n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; + ++ret; + } else n_plp[i] = 0, plp[i] = 0; + } + return ret; +} diff --git a/bam_plcmd.c b/bam_plcmd.c index ba787a9..6804795 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -18,6 +18,10 @@ KHASH_MAP_INIT_INT64(64, indel_list_t) #define BAM_PLF_GLF 0x08 #define BAM_PLF_VAR_ONLY 0x10 #define BAM_PLF_2ND 0x20 +#define BAM_PLF_RANBASE 0x40 +#define BAM_PLF_1STBASE 0x80 +#define BAM_PLF_ALLBASE 0x100 +#define BAM_PLF_READPOS 0x200 typedef struct { bam_header_t *h; @@ -28,6 +32,7 @@ typedef struct { uint32_t format; int tid, len, last_pos; int mask; + int max_depth; // for indel calling, ignore reads with the depth too high. 0 for unlimited char *ref; glfFile fp_glf; // for glf output only } pu_data_t; @@ -121,10 +126,11 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, g3->offset = pos - d->last_pos; d->last_pos = pos; glf3_write1(d->fp_glf, g3); - if (pos < d->len) { + if (pos < d->len) { + int m = (!d->max_depth || d->max_depth>n) ? 
n : d->max_depth; if (proposed_indels) - r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); - else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); } if (r) { // then write indel line int het = 3 * n, min; @@ -152,11 +158,37 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, return 0; } +static void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) +{ + if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); + if (!p->is_del) { + int j, rb, c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + rb = (ref && pos < ref_len)? ref[pos] : 'N'; + if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + putchar(c); + if (p->indel > 0) { + printf("+%d", p->indel); + for (j = 1; j <= p->indel; ++j) { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } else if (p->indel < 0) { + printf("%d", p->indel); + for (j = 1; j <= -p->indel; ++j) { + c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; + putchar(bam1_strand(p->b)? 
tolower(c) : toupper(c)); + } + } + } else putchar('*'); + if (p->is_tail) putchar('$'); +} + static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) { pu_data_t *d = (pu_data_t*)data; bam_maqindel_ret_t *r = 0; - int i, j, rb, rms_mapq = -1, *proposed_indels = 0; + int i, rb, rms_mapq = -1, *proposed_indels = 0; uint64_t rms_aux; uint32_t cns = 0; @@ -171,7 +203,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p // update d->ref if necessary if (d->fai && (int)tid != d->tid) { free(d->ref); - d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->ref = faidx_fetch_seq(d->fai, d->h->target_name[tid], 0, 0x7fffffff, &d->len); d->tid = tid; } rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; @@ -182,12 +214,31 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p if (i == n) return 0; } // call the consensus and indel - if (d->format & BAM_PLF_CNS) // call consensus - cns = bam_maqcns_call(n, pu, d->c); - if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels - if (proposed_indels) // the first element gives the size of the array - r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); - else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + if (d->format & BAM_PLF_CNS) { // call consensus + if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE)) { // use a random base or the 1st base as the consensus call + const bam_pileup1_t *p = (d->format & BAM_PLF_1STBASE)? pu : pu + (int)(drand48() * n); + int q = bam1_qual(p->b)[p->qpos]; + int mapQ = p->b->core.qual < d->c->cap_mapQ? 
p->b->core.qual : d->c->cap_mapQ; + uint32_t b = bam1_seqi(bam1_seq(p->b), p->qpos); + cns = b<<28 | 0xf<<24 | mapQ<<16 | q<<8; + } else if (d->format & BAM_PLF_ALLBASE) { // collapse all bases + uint64_t rmsQ = 0; + uint32_t b = 0; + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + int q = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; + b |= bam1_seqi(bam1_seq(p->b), p->qpos); + rmsQ += q * q; + } + rmsQ = (uint64_t)(sqrt((double)rmsQ / n) + .499); + cns = b<<28 | 0xf<<24 | rmsQ<<16 | 60<<8; + } else cns = bam_maqcns_call(n, pu, d->c); + } + if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels + int m = (!d->max_depth || d->max_depth>n) ? n : d->max_depth; + if (proposed_indels) // the first element gives the size of the array + r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); } // when only variant sites are asked for, test if the site is a variant if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) { @@ -218,27 +269,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p const bam_pileup1_t *p = pu + i; int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; rms_aux += tmp * tmp; - if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); - if (!p->is_del) { - int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; - if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; - else c = bam1_strand(p->b)? tolower(c) : toupper(c); - putchar(c); - if (p->indel > 0) { - printf("+%d", p->indel); - for (j = 1; j <= p->indel; ++j) { - c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } - } else if (p->indel < 0) { - printf("%d", p->indel); - for (j = 1; j <= -p->indel; ++j) { - c = (d->ref && (int)pos+j < d->len)? 
d->ref[pos+j] : 'N'; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } - } - } else putchar('*'); - if (p->is_tail) putchar('$'); + pileup_seq(p, pos, d->len, d->ref); } // finalize rms_mapq rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); @@ -275,6 +306,15 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p putchar(c); } } + // print read position + if (d->format & BAM_PLF_READPOS) { + putchar('\t'); + for (i = 0; i < n; ++i) { + int x = pu[i].qpos; + int l = pu[i].b->core.l_qseq; + printf("%d,", x < l/2? x+1 : -((l-1)-x+1)); + } + } putchar('\n'); // print the indel line if r has been calculated. This only happens if: // a) -c or -i are flagged, AND b) the reference sequence is available @@ -298,29 +338,40 @@ int bam_pileup(int argc, char *argv[]) int c, is_SAM = 0; char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); + d->max_depth = 0; d->tid = -1; d->mask = BAM_DEF_MASK; d->c = bam_maqcns_init(); + d->c->is_soap = 1; // change the default model d->ido = bam_maqindel_opt_init(); - while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2a")) >= 0) { + while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PA")) >= 0) { switch (c) { case 'a': d->c->is_soap = 1; break; + case 'A': d->c->is_soap = 0; break; case 's': d->format |= BAM_PLF_SIMPLE; break; case 't': fn_list = strdup(optarg); break; case 'l': fn_pos = strdup(optarg); break; case 'f': fn_fa = strdup(optarg); break; case 'T': d->c->theta = atof(optarg); break; case 'N': d->c->n_hap = atoi(optarg); break; - case 'r': d->c->het_rate = atof(optarg); break; + case 'r': d->c->het_rate = atof(optarg); d->ido->r_snp = d->c->het_rate; break; case 'M': d->c->cap_mapQ = atoi(optarg); break; + case 'd': d->max_depth = atoi(optarg); break; case 'c': d->format |= BAM_PLF_CNS; break; case 'i': d->format |= BAM_PLF_INDEL_ONLY; break; case 'v': d->format |= BAM_PLF_VAR_ONLY; break; case 'm': d->mask = 
strtol(optarg, 0, 0); break; case 'g': d->format |= BAM_PLF_GLF; break; case '2': d->format |= BAM_PLF_2ND; break; + case 'P': d->format |= BAM_PLF_READPOS; break; case 'I': d->ido->q_indel = atoi(optarg); break; case 'G': d->ido->r_indel = atof(optarg); break; case 'S': is_SAM = 1; break; + case 'R': + if (strcmp(optarg, "random") == 0) d->format |= BAM_PLF_RANBASE; + else if (strcmp(optarg, "first") == 0) d->format |= BAM_PLF_1STBASE; + else if (strcmp(optarg, "all") == 0) d->format |= BAM_PLF_ALLBASE; + else fprintf(stderr, "[bam_pileup] unrecognized -R\n"); + break; default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; } } @@ -330,15 +381,16 @@ int bam_pileup(int argc, char *argv[]) fprintf(stderr, "Usage: samtools pileup [options] |\n\n"); fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); fprintf(stderr, " -S the input is in SAM\n"); - fprintf(stderr, " -a use the SOAPsnp model for SNP calling\n"); + fprintf(stderr, " -A use the MAQ model for SNP calling\n"); fprintf(stderr, " -2 output the 2nd best call and quality\n"); fprintf(stderr, " -i only show lines/consensus with indels\n"); fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask); fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ); + fprintf(stderr, " -d INT limit maximum depth for indels [unlimited]\n"); fprintf(stderr, " -t FILE list of reference sequences (force -S)\n"); fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); - fprintf(stderr, " -c output the maq consensus sequence\n"); + fprintf(stderr, " -c output the SOAPsnp consensus sequence\n"); fprintf(stderr, " -v print variants only (for -c)\n"); fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n"); fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta); @@ -350,6 +402,7 @@ int bam_pileup(int argc, char 
*argv[]) free(fn_list); free(fn_fa); free(d); return 1; } + if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE|BAM_PLF_ALLBASE)) d->format |= BAM_PLF_CNS; if (fn_fa) d->fai = fai_load(fn_fa); if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling if (d->format & BAM_PLF_GLF) { // for glf output @@ -390,3 +443,128 @@ int bam_pileup(int argc, char *argv[]) free(d->ido); free(d->ref); free(d); return 0; } + +/*********** + * mpileup * + ***********/ + +typedef struct { + char *reg; + faidx_t *fai; +} mplp_conf_t; + +typedef struct { + bamFile fp; + bam_iter_t iter; +} mplp_aux_t; + +static int mplp_func(void *data, bam1_t *b) +{ + mplp_aux_t *ma = (mplp_aux_t*)data; + if (ma->iter) return bam_iter_read(ma->fp, ma->iter, b); + return bam_read1(ma->fp, b); +} + +static int mpileup(mplp_conf_t *conf, int n, char **fn) +{ + mplp_aux_t **data; + int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid; + const bam_pileup1_t **plp; + bam_mplp_t iter; + bam_header_t *h = 0; + char *ref; + // allocate + data = calloc(n, sizeof(void*)); + plp = calloc(n, sizeof(void*)); + n_plp = calloc(n, sizeof(int*)); + // read the header and initialize data + for (i = 0; i < n; ++i) { + bam_header_t *h_tmp; + data[i] = calloc(1, sizeof(mplp_aux_t)); + data[i]->fp = bam_open(fn[i], "r"); + h_tmp = bam_header_read(data[i]->fp); + if (conf->reg) { + int beg, end; + bam_index_t *idx; + idx = bam_index_load(fn[i]); + if (idx == 0) { + fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); + exit(1); + } + if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { + fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); + exit(1); + } + if (i == 0) beg0 = beg, end0 = end; + data[i]->iter = bam_iter_query(idx, tid, beg, end); + bam_index_destroy(idx); + } + if (i == 0) h = h_tmp; + else { + // FIXME: to check consistency + bam_header_destroy(h_tmp); + } + } + // mpileup + ref_tid = -1; 
ref = 0; + iter = bam_mplp_init(n, mplp_func, (void**)data); + while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { + if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested + if (tid != ref_tid) { + free(ref); + if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); + ref_tid = tid; + } + printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + int j; + printf("\t%d\t", n_plp[i]); + if (n_plp[i] == 0) printf("*\t*"); + else { + for (j = 0; j < n_plp[i]; ++j) + pileup_seq(plp[i] + j, pos, ref_len, ref); + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = bam1_qual(p->b)[p->qpos] + 33; + if (c > 126) c = 126; + putchar(c); + } + } + } + putchar('\n'); + } + bam_mplp_destroy(iter); + bam_header_destroy(h); + for (i = 0; i < n; ++i) { + bam_close(data[i]->fp); + if (data[i]->iter) bam_iter_destroy(data[i]->iter); + free(data[i]); + } + free(data); free(plp); free(ref); free(n_plp); + return 0; +} + +int bam_mpileup(int argc, char *argv[]) +{ + int c; + mplp_conf_t mplp; + memset(&mplp, 0, sizeof(mplp_conf_t)); + while ((c = getopt(argc, argv, "f:r:")) >= 0) { + switch (c) { + case 'f': + mplp.fai = fai_load(optarg); + if (mplp.fai == 0) return 1; + break; + case 'r': mplp.reg = strdup(optarg); + } + } + if (argc == 1) { + fprintf(stderr, "Usage: samtools mpileup [-r reg] [-f in.fa] in1.bam [in2.bam [...]]\n"); + return 1; + } + mpileup(&mplp, argc - optind, argv + optind); + free(mplp.reg); + if (mplp.fai) fai_destroy(mplp.fai); + return 0; +} diff --git a/bam_reheader.c b/bam_reheader.c new file mode 100644 index 0000000..bae97c7 --- /dev/null +++ b/bam_reheader.c @@ -0,0 +1,60 @@ +#include +#include +#include "bgzf.h" +#include "bam.h" + +#define BUF_SIZE 0x10000 + +int bam_reheader(BGZF *in, const bam_header_t *h, int fd) +{ + BGZF *fp; + bam_header_t *old; + int len; + uint8_t *buf; + if 
(in->open_mode != 'r') return -1; + buf = malloc(BUF_SIZE); + old = bam_header_read(in); + fp = bgzf_fdopen(fd, "w"); + bam_header_write(fp, h); + if (in->block_offset < in->block_length) { + bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); + bgzf_flush(fp); + } +#ifdef _USE_KNETFILE + while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) +#else + while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) +#endif + fwrite(buf, 1, len, fp->x.fpw); + free(buf); + fp->block_offset = in->block_offset = 0; + bgzf_close(fp); + return 0; +} + +int main_reheader(int argc, char *argv[]) +{ + bam_header_t *h; + BGZF *in; + if (argc != 3) { + fprintf(stderr, "Usage: samtools reheader \n"); + return 1; + } + { // read the header + tamFile fph = sam_open(argv[1]); + if (fph == 0) { + fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); + return 1; + } + h = sam_header_read(fph); + sam_close(fph); + } + in = strcmp(argv[2], "-")? 
bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); + return 1; + } + bam_reheader(in, h, fileno(stdout)); + bgzf_close(in); + return 0; +} diff --git a/bam_sort.c b/bam_sort.c index 9884f3d..12b1b54 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -294,7 +294,7 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size mem += ret; ++k; if (mem >= max_mem) { - sort_blocks(n++, k, buf, prefix, header, is_stdout); + sort_blocks(n++, k, buf, prefix, header, 0); mem = 0; k = 0; } } @@ -304,7 +304,7 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size else { // then merge char **fns, *fnout; fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); - sort_blocks(n++, k, buf, prefix, header, is_stdout); + sort_blocks(n++, k, buf, prefix, header, 0); fnout = (char*)calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); diff --git a/bam_tview.c b/bam_tview.c index 4c121e7..7b326fc 100644 --- a/bam_tview.c +++ b/bam_tview.c @@ -280,7 +280,7 @@ int tv_draw_aln(tview_t *tv, int tid, int pos) static void tv_win_goto(tview_t *tv, int *tid, int *pos) { - char str[256]; + char str[256], *p; int i, l = 0; wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); mvwprintw(tv->wgoto, 1, 2, "Goto: "); @@ -291,10 +291,18 @@ static void tv_win_goto(tview_t *tv, int *tid, int *pos) --l; } else if (c == KEY_ENTER || c == '\012' || c == '\015') { int _tid = -1, _beg, _end; - bam_parse_region(tv->header, str, &_tid, &_beg, &_end); - if (_tid >= 0) { - *tid = _tid; *pos = _beg; - return; + if (str[0] == '=') { + _beg = strtol(str+1, &p, 10); + if (_beg > 0) { + *pos = _beg; + return; + } + } else { + bam_parse_region(tv->header, str, &_tid, &_beg, &_end); + if (_tid >= 0) { + *tid = _tid; *pos = _beg; + return; + } } } else if (isgraph(c)) { if (l < TV_MAX_GOTO) str[l++] = c; @@ -351,6 
+359,7 @@ void tv_loop(tview_t *tv) case '?': tv_win_help(tv); break; case '\033': case 'q': goto end_loop; + case '/': case 'g': tv_win_goto(tv, &tid, &pos); break; case 'm': tv->color_for = TV_COLOR_MAPQ; break; case 'b': tv->color_for = TV_COLOR_BASEQ; break; diff --git a/bamtk.c b/bamtk.c index 48ac76b..94c4d3f 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,11 +9,12 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.7 (r510)" +#define PACKAGE_VERSION "0.1.8 (r613)" #endif int bam_taf2baf(int argc, char *argv[]); int bam_pileup(int argc, char *argv[]); +int bam_mpileup(int argc, char *argv[]); int bam_merge(int argc, char *argv[]); int bam_index(int argc, char *argv[]); int bam_sort(int argc, char *argv[]); @@ -22,9 +23,10 @@ int bam_mating(int argc, char *argv[]); int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); int bam_fillmd(int argc, char *argv[]); - +int bam_idxstats(int argc, char *argv[]); int main_samview(int argc, char *argv[]); int main_import(int argc, char *argv[]); +int main_reheader(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); @@ -78,17 +80,20 @@ static int usage() fprintf(stderr, "Command: view SAM<->BAM conversion\n"); fprintf(stderr, " sort sort alignment file\n"); fprintf(stderr, " pileup generate pileup output\n"); + fprintf(stderr, " mpileup multi-way pileup\n"); fprintf(stderr, " faidx index/extract FASTA\n"); #if _CURSES_LIB != 0 fprintf(stderr, " tview text alignment viewer\n"); #endif fprintf(stderr, " index index alignment\n"); + fprintf(stderr, " idxstats BAM index stats (r595 or later)\n"); fprintf(stderr, " fixmate fix mate information\n"); fprintf(stderr, " glfview print GLFv3 file\n"); fprintf(stderr, " flagstat simple stats\n"); fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); fprintf(stderr, " merge merge sorted alignments\n"); fprintf(stderr, " rmdup remove PCR duplicates\n"); + fprintf(stderr, " reheader 
replace BAM header\n"); fprintf(stderr, "\n"); return 1; } @@ -106,9 +111,11 @@ int main(int argc, char *argv[]) if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); + else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1); else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); + else if (strcmp(argv[1], "idxstats") == 0) return bam_idxstats(argc-1, argv+1); else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); @@ -117,6 +124,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); + else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); #if _CURSES_LIB != 0 else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); #endif diff --git a/bgzf.c b/bgzf.c index 59f902f..a6923da 100644 --- a/bgzf.c +++ b/bgzf.c @@ -203,9 +203,7 @@ bgzf_open(const char* __restrict path, const char* __restrict mode) if (fd == -1) return 0; fp = open_write(fd, strstr(mode, "u")? 
1 : 0); } - if (fp != NULL) { - fp->owned_file = 1; - } + if (fp != NULL) fp->owned_file = 1; return fp; } @@ -429,20 +427,19 @@ static void cache_block(BGZF *fp, int size) memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); } -static int -read_block(BGZF* fp) +bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; - int size = 0; + int count, size = 0; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); if (load_block_from_cache(fp, block_address)) return 0; - int count = knet_read(fp->x.fpr, header, sizeof(header)); + count = knet_read(fp->x.fpr, header, sizeof(header)); #else int64_t block_address = ftello(fp->file); if (load_block_from_cache(fp, block_address)) return 0; - int count = fread(header, 1, sizeof(header), fp->file); + count = fread(header, 1, sizeof(header), fp->file); #endif if (count == 0) { fp->block_length = 0; @@ -472,9 +469,7 @@ read_block(BGZF* fp) } size += count; count = inflate_block(fp, block_length); - if (count < 0) { - return -1; - } + if (count < 0) return -1; if (fp->block_length != 0) { // Do not reset offset if this read follows a seek. 
fp->block_offset = 0; @@ -501,7 +496,7 @@ bgzf_read(BGZF* fp, void* data, int length) while (bytes_read < length) { int available = fp->block_length - fp->block_offset; if (available <= 0) { - if (read_block(fp) != 0) { + if (bgzf_read_block(fp) != 0) { return -1; } available = fp->block_length - fp->block_offset; @@ -528,19 +523,16 @@ bgzf_read(BGZF* fp, void* data, int length) return bytes_read; } -static -int -flush_block(BGZF* fp) +int bgzf_flush(BGZF* fp) { while (fp->block_offset > 0) { - int block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) { - return -1; - } + int count, block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; #ifdef _USE_KNETFILE - int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); #else - int count = fwrite(fp->compressed_block, 1, block_length, fp->file); + count = fwrite(fp->compressed_block, 1, block_length, fp->file); #endif if (count != block_length) { report_error(fp, "write failed"); @@ -551,17 +543,22 @@ flush_block(BGZF* fp) return 0; } -int -bgzf_write(BGZF* fp, const void* data, int length) +int bgzf_flush_try(BGZF *fp, int size) +{ + if (fp->block_offset + size > fp->uncompressed_block_size) + return bgzf_flush(fp); + return -1; +} + +int bgzf_write(BGZF* fp, const void* data, int length) { if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; } - if (fp->uncompressed_block == NULL) { + if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); - } const bgzf_byte_t* input = data; int block_length = fp->uncompressed_block_size; @@ -574,7 +571,7 @@ bgzf_write(BGZF* fp, const void* data, int length) input += copy_length; bytes_written += copy_length; if (fp->block_offset == block_length) { - if (flush_block(fp) != 0) { + if (bgzf_flush(fp) != 0) { break; } } @@ -582,13 +579,10 @@ bgzf_write(BGZF* fp, 
const void* data, int length) return bytes_written; } -int -bgzf_close(BGZF* fp) +int bgzf_close(BGZF* fp) { if (fp->open_mode == 'w') { - if (flush_block(fp) != 0) { - return -1; - } + if (bgzf_flush(fp) != 0) return -1; { // add an empty block int count, block_length = deflate_block(fp, 0); #ifdef _USE_KNETFILE @@ -613,9 +607,7 @@ bgzf_close(BGZF* fp) else ret = knet_close(fp->x.fpr); if (ret != 0) return -1; #else - if (fclose(fp->file) != 0) { - return -1; - } + if (fclose(fp->file) != 0) return -1; #endif } free(fp->uncompressed_block); @@ -625,12 +617,6 @@ bgzf_close(BGZF* fp) return 0; } -int64_t -bgzf_tell(BGZF* fp) -{ - return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); -} - void bgzf_set_cache_size(BGZF *fp, int cache_size) { if (fp) fp->cache_size = cache_size; @@ -655,9 +641,11 @@ int bgzf_check_EOF(BGZF *fp) return (memcmp(magic, buf, 28) == 0)? 1 : 0; } -int64_t -bgzf_seek(BGZF* fp, int64_t pos, int where) +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { + int block_offset; + int64_t block_address; + if (fp->open_mode != 'r') { report_error(fp, "file not open for read"); return -1; @@ -666,8 +654,8 @@ bgzf_seek(BGZF* fp, int64_t pos, int where) report_error(fp, "unimplemented seek option"); return -1; } - int block_offset = pos & 0xFFFF; - int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; + block_offset = pos & 0xFFFF; + block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; #ifdef _USE_KNETFILE if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { #else diff --git a/bgzf.h b/bgzf.h index 91b3317..099ae9a 100644 --- a/bgzf.h +++ b/bgzf.h @@ -106,7 +106,7 @@ int bgzf_write(BGZF* fp, const void* data, int length); * Return value is non-negative on success. * Returns -1 on error. 
*/ -int64_t bgzf_tell(BGZF* fp); +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) /* * Set the file to read from the location specified by pos, which must @@ -126,9 +126,32 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); void bgzf_set_cache_size(BGZF *fp, int cache_size); int bgzf_check_EOF(BGZF *fp); +int bgzf_read_block(BGZF* fp); +int bgzf_flush(BGZF* fp); +int bgzf_flush_try(BGZF *fp, int size); #ifdef __cplusplus } #endif +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + #endif diff --git a/examples/bam2bed.c b/examples/bam2bed.c new file mode 100644 index 0000000..bb937d1 --- /dev/null +++ b/examples/bam2bed.c @@ -0,0 +1,51 @@ +#include +#include "sam.h" +static int fetch_func(const bam1_t *b, void *data) +{ + samfile_t *fp = (samfile_t*)data; + uint32_t *cigar = bam1_cigar(b); + const bam1_core_t *c = &b->core; + int i, l; + if (b->core.tid < 0) return 0; + for (i = l = 0; i < c->n_cigar; ++i) { + int op = cigar[i]&0xf; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) + l += cigar[i]>>4; + } + printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid], + c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? 
'-' : '+'); + return 0; +} +int main(int argc, char *argv[]) +{ + samfile_t *fp; + if (argc == 1) { + fprintf(stderr, "Usage: bam2bed [region]\n"); + return 1; + } + if ((fp = samopen(argv[1], "rb", 0)) == 0) { + fprintf(stderr, "bam2bed: Fail to open BAM file %s\n", argv[1]); + return 1; + } + if (argc == 2) { /* if a region is not specified */ + bam1_t *b = bam_init1(); + while (samread(fp, b) >= 0) fetch_func(b, fp); + bam_destroy1(b); + } else { + int ref, beg, end; + bam_index_t *idx; + if ((idx = bam_index_load(argv[1])) == 0) { + fprintf(stderr, "bam2bed: BAM indexing file is not available.\n"); + return 1; + } + bam_parse_region(fp->header, argv[2], &ref, &beg, &end); + if (ref < 0) { + fprintf(stderr, "bam2bed: Invalid region %s\n", argv[2]); + return 1; + } + bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func); + bam_index_destroy(idx); + } + samclose(fp); + return 0; +} diff --git a/examples/toy.fa b/examples/toy.fa new file mode 100644 index 0000000..38312c1 --- /dev/null +++ b/examples/toy.fa @@ -0,0 +1,2 @@ +>ref +AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT diff --git a/examples/toy.sam b/examples/toy.sam new file mode 100644 index 0000000..baf7388 --- /dev/null +++ b/examples/toy.sam @@ -0,0 +1,7 @@ +@SQ SN:ref LN:45 +r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG * +r002 0 ref 9 30 1S2I6M1P1I4M2I * 0 0 AAAAGATAAGGATAAA * +r003 0 ref 9 30 5H6M * 0 0 AGCTAA * +r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * +r003 16 ref 29 30 6H5M * 0 0 TAGGC * +r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * \ No newline at end of file diff --git a/faidx.c b/faidx.c index 811bdf8..dbd8b3e 100644 --- a/faidx.c +++ b/faidx.c @@ -197,7 +197,7 @@ int fai_build(const char *fn) sprintf(str, "%s.fai", fn); rz = razf_open(fn, "r"); if (rz == 0) { - fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",str); + fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); free(str); return -1; } diff --git a/knetfile.c b/knetfile.c index 994babb..e1be4d6 
100644 --- a/knetfile.c +++ b/knetfile.c @@ -38,9 +38,7 @@ #include #include -#ifdef _WIN32 -#include -#else +#ifndef _WIN32 #include #include #include @@ -566,7 +564,7 @@ off_t knet_seek(knetFile *fp, int64_t off, int whence) else if (whence==SEEK_SET) fp->offset = off; fp->is_ready = 0; - return fp->offset; + return 0; } errno = EINVAL; fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); diff --git a/kstring.h b/kstring.h index f4e5a99..925117a 100644 --- a/kstring.h +++ b/kstring.h @@ -58,6 +58,40 @@ static inline int kputc(int c, kstring_t *s) return c; } +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + static inline int *ksplit(kstring_t *s, int delimiter, int *n) { int max = 0, *offsets = 0; diff --git a/misc/Makefile b/misc/Makefile index 4404ccc..2d7139d 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -1,6 +1,6 @@ CC= gcc CXX= g++ -CFLAGS= -g -Wall -O2 -m64 #-arch ppc +CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= @@ -27,6 +27,9 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: +afs2:afs2.o + $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. 
-lbam + wgsim:wgsim.o $(CC) $(CFLAGS) -o $@ wgsim.o -lm @@ -48,7 +51,10 @@ maq2sam-long:maq2sam.c md5fa.o:md5.h md5fa.c $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c +afs2.o:afs2.c ../bam.h + $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c + cleanlocal: - rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a + rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a clean:cleanlocal-recur diff --git a/misc/export2sam.pl b/misc/export2sam.pl index 8e3e280..a2a436c 100755 --- a/misc/export2sam.pl +++ b/misc/export2sam.pl @@ -1,107 +1,461 @@ -#!/usr/bin/perl -w - +#!/usr/bin/env perl +# +# +# Script to convert GERALD export files to SAM format. +# +# +# +########## License: +# +# The MIT License +# +# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd. +# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +# +# +########## ChangeLog: +# +# Version: 2.0.0 (15FEB2010) +# Script updated by Illumina in conjunction with CASAVA 1.7.0 release. +# Major changes are as follows: +# - The CIGAR string has been updated to include all gaps from ELANDv2 alignments. +# - The ELAND single read alignment score is always stored in the optional "SM" field +# and the ELAND paired read alignment score is stored in the optional "AS" field +# when it exists. +# - The MAPQ value is set to the higher of the two alignment scores, but no greater +# than 254, i.e. min(254,max(SM,AS)) +# - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's +# expected orientation and insert size criteria. +# - The default quality score translation is set for export files which contain +# Phred+64 quality values. An option, "--qlogodds", has been added to +# translate quality values from the Solexa+64 format used in export files prior +# to Pipeline 1.3 +# - The export match descriptor is now reverse-complemented when necessary such that +# it always corresponds to the forward strand of the reference, to be consistent +# with other information in the SAM record. It is now written to the optional +# 'XD' field (rather than 'MD') to acknowledge its minor differences from the +# samtools match descriptor (see additional detail below). +# - An option, "--nofilter", has been added to include reads which have failed +# primary analysis quality filtration. Such reads will have the corresponding +# SAM flag bit (0x0200) set. +# - Labels in the export 'contig' field are preserved by setting RNAME to +# "$export_chromosome/$export_contig" when the contig label exists. +# +# # Contact: lh3 # Version: 0.1.2 (03JAN2009) +# +# +# +########## Known Conversion Limitations: +# +# - Export records for reads that map to a position < 1 (allowed in export format), are converted +# to unmapped reads in the SAM record. +# - Export records contain the reserved chromosome names: "NM" and "QC".
"NM" indicates that the +# aligner could not map the read to the reference sequence set, and "QC" means that the +# aligner did not attempt to map the read due to some technical limitation. Both of these +# alignment types are collapsed to the single unmapped alignment state in the SAM record. +# - The export match descriptor is slightly different than the samtools match descriptor. For +# this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the +# export match descriptor differs from the samtools version in two respects: (1) indels +# are explicitly closed with the '$' character and (2) insertions must be enumerated in +# the match descriptor. For example a 35-base read with a two-base insertion is described +# as: 20^2$14 +# +# +# + +my $version = "2.0.0"; use strict; use warnings; -use Getopt::Std; + +use File::Spec qw(splitpath); +use Getopt::Long; +use List::Util qw(min max); + + +use constant { + EXPORT_INDEX => 6, + EXPORT_READNO => 7, + EXPORT_READ => 8, + EXPORT_QUAL => 9, + EXPORT_CHROM => 10, + EXPORT_CONTIG => 11, + EXPORT_POS => 12, + EXPORT_STRAND => 13, + EXPORT_MD => 14, + EXPORT_SEMAP => 15, + EXPORT_PEMAP => 16, + EXPORT_PASSFILT => 21, +}; + + +use constant { + SAM_QNAME => 0, + SAM_FLAG => 1, + SAM_RNAME => 2, + SAM_POS => 3, + SAM_MAPQ => 4, + SAM_CIGAR => 5, + SAM_MRNM => 6, + SAM_MPOS => 7, + SAM_ISIZE => 8, + SAM_SEQ => 9, + SAM_QUAL => 10, +}; + + +# function prototypes for Richard's code +sub match_desc_to_cigar($); +sub match_desc_frag_length($); +sub reverse_compl_match_descriptor($); +sub write_header($;$;$); + &export2sam; exit; + + + sub export2sam { + + my $cmdline = $0 . " " . join(" ",@ARGV); + my $arg_count = scalar @ARGV; + my @spval = File::Spec->splitpath($0); + my $progname = $spval[2]; + + my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. 
"solexa") quality values + my $is_nofilter = 0; + my $read1file; + my $read2file; + my $print_version = 0; + my $help = 0; + + my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, + "nofilter" => \$is_nofilter, + "read1=s" => \$read1file, + "read2=s" => \$read2file, + "version" => \$print_version, + "help" => \$help ); + + my $usage = <= 2); - die("export2sam.pl []\n") if (@ARGV == 0); - open($fh1, $ARGV[0]) || die; + + open($fh1, $read1file) or die("\nERROR: Can't open read1 export file: $read1file\n\n"); + $is_paired = defined $read2file; if ($is_paired) { - open($fh2, $ARGV[1]) || die; + open($fh2, $read2file) or die("\nERROR: Can't open read2 export file: $read2file\n\n"); } - # conversion table + # quality value conversion table my @conv_table; - for (-64..64) { - $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499)); + if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3): + for (-64..64) { + $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499); + } + } else { # convert from phred+64 quality values (pipeline v1.3+): + for (-64..-1) { + $conv_table[$_+64] = undef; + } + for (0..64) { + $conv_table[$_+64] = int(33 + $_); + } } + # write the header + print write_header( $progname, $version, $cmdline ); # core loop + my $export_line_count = 0; while (<$fh1>) { - my (@s1, @s2); - &export2sam_aux($_, \@s1, \@conv_table, $is_paired); - if ($is_paired) { - $_ = <$fh2>; - &export2sam_aux($_, \@s2, \@conv_table, $is_paired); - if (@s1 && @s2) { # then set mate coordinate - my $isize = 0; - if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize - my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3]; - my $x2 = ($s2[1] & 0x10)? $s2[3] + length($s2[9]) : $s2[3]; - $isize = $x2 - $x1; - } - # update mate coordinate - if ($s2[2] ne '*') { - @s1[6..8] = (($s2[2] eq $s1[2])? 
"=" : $s2[2], $s2[3], $isize); - $s1[1] |= 0x20 if ($s2[1] & 0x10); - } else { - $s1[1] |= 0x8; - } - if ($s1[2] ne '*') { - @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize); - $s2[1] |= 0x20 if ($s1[1] & 0x10); - } else { - $s2[1] |= 0x8; - } - } - } - print join("\t", @s1), "\n" if (@s1); - print join("\t", @s2), "\n" if (@s2 && $is_paired); + $export_line_count++; + my (@s1, @s2); + &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); + if ($is_paired) { + my $read2line = <$fh2>; + if(not $read2line){ + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); + } + &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); + + if (@s1 && @s2) { # then set mate coordinate + if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ + die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); + } + + my $isize = 0; + if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize + my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; + my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; + $isize = $x2 - $x1; + } + + foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ + my ($sa,$sb,$is) = @{$_}; + if ($sb->[SAM_RNAME] ne '*') { + $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? 
"=" : $sb->[SAM_RNAME]; + $sa->[SAM_MPOS] = $sb->[SAM_POS]; + $sa->[SAM_ISIZE] = $is; + $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); + } else { + $sa->[SAM_FLAG] |= 0x8; + } + } + } + } + print join("\t", @s1), "\n" if (@s1); + print join("\t", @s2), "\n" if (@s2 && $is_paired); } close($fh1); - close($fh2) if ($is_paired); + if($is_paired) { + while(my $read2line = <$fh2>){ + $export_line_count++; + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); + } + close($fh2); + } } sub export2sam_aux { - my ($line, $s, $ct, $is_paired) = @_; + my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; chomp($line); my @t = split("\t", $line); @$s = (); - return if ($t[21] ne 'Y'); + my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); + return if(not ($isPassFilt or $is_nofilter)); # read name - $s->[0] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; + $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; # initial flag (will be updated later) - $s->[1] = 0; - $s->[1] |= 1 | 1<<(5 + $t[7]) if ($is_paired); + $s->[SAM_FLAG] = 0; + if($is_paired) { + if($t[EXPORT_READNO] != $read_no){ + die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . 
" on line: $line_no\n\n"); + } + $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); + } + $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); + # read & quality - $s->[9] = $t[8]; $s->[10] = $t[9]; - if ($t[13] eq 'R') { # then reverse the sequence and quality - $s->[9] = reverse($t[8]); - $s->[9] =~ tr/ACGTacgt/TGCAtgca/; - $s->[10] = reverse($t[9]); + my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); + if ($is_export_rev) { # then reverse the sequence and quality + $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); + $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; + $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); + } else { + $s->[SAM_SEQ] = $t[EXPORT_READ]; + $s->[SAM_QUAL] = $t[EXPORT_QUAL]; } - $s->[10] =~ s/(.)/$ct->[ord($1)]/eg; # change coding - # cigar - $s->[5] = length($s->[9]) . "M"; + my @convqual = (); + foreach (unpack('C*', $s->[SAM_QUAL])){ + my $val=$ct->[$_]; + if(not defined $val){ + my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; + if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } + die($msg . "\n"); + } + push @convqual,$val; + } + + $s->[SAM_QUAL] = pack('C*',@convqual); # change coding + + # coor my $has_coor = 0; - $s->[2] = "*"; - if ($t[10] eq 'NM' || $t[10] eq 'QC') { - $s->[1] |= 0x4; # unmapped - } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) { - $s->[1] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? - push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") + $s->[SAM_RNAME] = "*"; + if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') { + $s->[SAM_FLAG] |= 0x4; # unmapped + } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { + $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? + push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") + } elsif ($t[EXPORT_POS] < 1) { + $s->[SAM_FLAG] |= 0x4; # unmapped + } else { + $s->[SAM_RNAME] = $t[EXPORT_CHROM]; + $s->[SAM_RNAME] .= "/" . 
$t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); + $has_coor = 1; + } + $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; + +# print STDERR "t[14] = " . $t[14] . "\n"; + my $matchDesc = ''; + $s->[SAM_CIGAR] = "*"; + if($has_coor){ + $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; + + if($matchDesc =~ /\^/){ + # construct CIGAR string using Richard's function + $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing + } else { + $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; + } + } + +# print STDERR "cigar_string = $cigar_string\n"; + + $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); + if($has_coor){ + my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; + my $pemap = 0; + if($is_paired) { + $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; + + # set `proper pair' bit if non-blank, non-zero PE alignment score: + $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); + } + $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); } else { - $s->[2] = $t[10]; - $has_coor = 1; + $s->[SAM_MAPQ] = 0; } - $s->[3] = $has_coor? $t[12] : 0; - $s->[1] |= 0x10 if ($has_coor && $t[13] eq 'R'); - # mapQ (TODO: should I choose the larger between $t[15] and $t[16]?) - $s->[4] = 0; - $s->[4] = $t[15] if ($t[15] ne ''); - $s->[4] = $t[16] if ($t[16] ne '' && $s->[4] < $t[16]); # mate coordinate - $s->[6] = '*'; $s->[7] = $s->[8] = 0; + $s->[SAM_MRNM] = '*'; + $s->[SAM_MPOS] = 0; + $s->[SAM_ISIZE] = 0; # aux - push(@$s, "BC:Z:$t[6]") if ($t[6]); - push(@$s, "MD:Z:$t[14]") if ($has_coor); - push(@$s, "SM:i:$t[15]") if ($is_paired && $has_coor); + push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); + if($has_coor){ + # The export match descriptor differs slightly from the samtools match descriptor. 
+ # In order for the converted SAM files to be as compliant as possible, + # we put the export match descriptor in optional field 'XD' rather than 'MD': + push(@$s, "XD:Z:$matchDesc"); + push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); + push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); + } +} + + + +# +# the following code is taken from Richard Shaw's sorted2sam.pl file +# +sub reverse_compl_match_descriptor($) +{ +# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; + my ($match_desc) = @_; + my $rev_compl_match_desc = reverse($match_desc); + $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; + + # Unreverse the digits of numbers. + $rev_compl_match_desc = join('', + map {($_ =~ /\d+/) + ? join('', reverse(split('', $_))) + : $_} split(/(\d+)/, + $rev_compl_match_desc)); + + return $rev_compl_match_desc; +} + + + +sub match_desc_to_cigar($) +{ + my ($match_desc) = @_; + + my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); + my $cigar_str = ''; + my $cigar_del_ch = 'D'; + my $cigar_ins_ch = 'I'; + my $cigar_match_ch = 'M'; + + foreach my $match_desc_part (@match_desc_parts) { + next if (!$match_desc_part); + + if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { + # Deletion + $cigar_str .= (length($1) . $cigar_del_ch); + } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { + # Insertion + $cigar_str .= ($1 . $cigar_ins_ch); + } else { + $cigar_str .= (match_desc_frag_length($match_desc_part) + . $cigar_match_ch); + } + } + + return $cigar_str; +} + + +#------------------------------------------------------------------------------ + +sub match_desc_frag_length($) + { + my ($match_desc_str) = @_; + my $len = 0; + + my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); + + foreach my $match_desc_field (@match_desc_fields) { + next if ($match_desc_field eq ''); + + $len += (($match_desc_field =~ /(\d+)/) + ? 
$1 : length($match_desc_field)); + } + + return $len; +} + + +# argument holds the command line +sub write_header($;$;$) +{ + my ($progname,$version,$cl) = @_; + my $complete_header = ""; + $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; + + return $complete_header; } diff --git a/misc/sam2vcf.pl b/misc/sam2vcf.pl index ede7bd8..afaf91e 100755 --- a/misc/sam2vcf.pl +++ b/misc/sam2vcf.pl @@ -1,9 +1,9 @@ #!/usr/bin/perl -w # -# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2 - +# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 +# # Contact: pd3@sanger -# Version: 2009-10-08 +# Version: 2010-04-23 use strict; use warnings; @@ -23,8 +23,12 @@ sub error die "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n", "Options:\n", - " -r, -refseq The reference sequence, required when indels are present.\n", - " -h, -?, --help This help message.\n", + " -h, -?, --help This help message.\n", + " -i, --indels-only Ignore SNPs.\n", + " -r, --refseq The reference sequence, required when indels are present.\n", + " -R, --keep-ref Print reference alleles as well.\n", + " -s, --snps-only Ignore indels.\n", + " -t, --column-title The column title.\n", "\n"; } @@ -38,7 +42,11 @@ sub parse_params while (my $arg=shift(@ARGV)) { + if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; } if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; } + if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; } + if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; } + if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); @@ -59,13 +67,14 @@ sub iupac_to_gtype ); if ( !exists($iupac{$base}) ) { - if ( $ref eq $base ) { return ('.','0|0'); } - return ($base,'1|1'); + if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); } + if ( $ref eq $base ) { return ('.','0/0'); } + return ($base,'1/1'); } my $gt = $iupac{$base}; - if ( $$gt[0] eq $ref ) { return ($$gt[1],'0|1'); } - elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0|1'); } - return ("$$gt[0],$$gt[1]",'1|2'); + if ( $$gt[0] eq $ref ) { return ($$gt[1],'0/1'); } + elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); } + return ("$$gt[0],$$gt[1]",'1/2'); } @@ -97,53 +106,96 @@ sub do_pileup_to_vcf my $fh_out = $$opts{fh_out}; my ($prev_chr,$prev_pos,$prev_ref); my $refseq; + my $ignore_indels = $$opts{snps_only} ? 1 : 0; + my $ignore_snps = $$opts{indels_only} ? 1 : 0; + my $keep_ref = $$opts{keep_ref} ? 1 : 0; + my $title = exists($$opts{title}) ? $$opts{title} : 'data'; + + print $fh_out + qq[##fileformat=VCFv3.3\n], + qq[##INFO=DP,1,Integer,"Total Depth"\n], + qq[##FORMAT=GT,1,String,"Genotype"\n], + qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n], + qq[##FORMAT=DP,1,Integer,"Read Depth"\n], + qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n] + ; while (my $line=<$fh_in>) { chomp($line); - my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,@items) = split(/\t/,$line); + my (@items) = split(/\t/,$line); + if ( scalar @items<8 ) + { + error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n"); + } + my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items; + $ref = uc($ref); + $cons = uc($cons); my ($alt,$gt); if ( $ref eq '*' ) { # An indel is involved. 
- if ($chr ne $prev_chr || $pos ne $prev_pos) + if ( $ignore_indels ) + { + $prev_ref = $ref; + $prev_pos = $pos; + $prev_chr = $chr; + next; + } + + if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos) { if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); } if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); } $ref = $refseq->get_base($chr,$pos); + $ref = uc($ref); } else { $ref = $prev_ref; } - # One of the alleles can be a reference and it can come in arbitrary order + # One of the alleles can be a reference and it can come in arbitrary order. In some + # cases */* can be encountered. In such a case, look in the additional columns. my ($al1,$al2) = split(m{/},$cons); + if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; } my $alt1 = parse_indel($al1); my $alt2 = parse_indel($al2); if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); } - if ( $alt1 && $alt2 && $alt1 eq $alt2 ) { $alt2=''; } if ( !$alt1 ) { $alt=$alt2; - $gt='0|1'; + $gt='0/1'; } elsif ( !$alt2 ) { $alt=$alt1; - $gt='0|1'; + $gt='0/1'; } - else + elsif ( $alt1 eq $alt2 ) + { + $alt="$alt1"; + $gt='1/1'; + } + else { $alt="$alt1,$alt2"; - $gt='1|2'; + $gt='1/2'; } } else { + if ( $ignore_snps || (!$keep_ref && $ref eq $cons) ) + { + $prev_ref = $ref; + $prev_pos = $pos; + $prev_chr = $chr; + next; + } + # SNP ($alt,$gt) = iupac_to_gtype($ref,$cons); } - print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\t\tGT:GQ:DP\t$gt:$cons_qual:$depth\n"; + print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n"; $prev_ref = $ref; $prev_pos = $pos; @@ -167,7 +219,8 @@ use Carp; sub Fasta::new { my ($class,@args) = @_; - my $self = @args ? 
{@args} : {}; + my $self = {@args}; + bless $self, ref($class) || $class; if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); } $$self{chr} = undef; $$self{from} = undef; diff --git a/misc/samtools.pl b/misc/samtools.pl index 320e8aa..9f48b8f 100755 --- a/misc/samtools.pl +++ b/misc/samtools.pl @@ -11,7 +11,7 @@ my $version = '0.3.3'; my $command = shift(@ARGV); my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, - unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr); + unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; @@ -46,10 +46,12 @@ sub showALEN { # G close to a high-quality indel (SNP only) # Q low RMS mapping quality (SNP only) # g close to another indel with higher quality (indel only) +# s low SNP quality (SNP only) +# i low indel quality (indel only) sub varFilter { - my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef); - getopts('pq:d:D:l:Q:w:W:N:G:', \%opts); + my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, S=>'', i=>''); + getopts('pq:d:D:l:Q:w:W:N:G:S:i:', \%opts); die(qq/ Usage: samtools.pl varFilter [options] @@ -57,6 +59,8 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] -q INT minimum RMS mapping quality for gaps [$opts{q}] -d INT minimum read depth [$opts{d}] -D INT maximum read depth [$opts{D}] + -S INT minimum SNP quality [$opts{S}] + -i INT minimum indel quality [$opts{i}] -G INT min indel score for nearby SNP filtering [$opts{G}] -w INT SNP within INT bp around a gap to be filtered [$opts{w}] @@ -80,7 +84,8 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] next if (uc($t[2]) eq uc($t[3]) || $t[3] eq '*/*'); # skip non-var sites # clear the out-of-range elements while (@staging) { - last if ($staging[0][2] eq $t[0] && $staging[0][3] + $max_dist >= 
$t[1]); + # Still on the same chromosome and the first element's window still affects this position? + last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]); varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much } my ($flt, $score) = (0, -1); @@ -90,14 +95,32 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] } elsif ($t[7] > $opts{D}) { $flt = 3; } + if ($t[2] eq '*') { # an indel + if ($opts{i} && $opts{i}>$t[5]) { $flt = 8; } + } + elsif ($opts{S} && $opts{S}>$t[5]) { $flt = 7; } # SNP + # site dependent filters + my $len=0; if ($flt == 0) { if ($t[2] eq '*') { # an indel + + # If deletion, remember the length of the deletion + my ($a,$b) = split(m{/},$t[3]); + my $alen = length($a) - 1; + my $blen = length($b) - 1; + if ( $alen>$blen ) + { + if ( substr($a,0,1) eq '-' ) { $len=$alen; } + } + elsif ( substr($b,0,1) eq '-' ) { $len=$blen; } + $flt = 1 if ($t[6] < $opts{q}); # filtering SNPs if ($t[5] >= $opts{G}) { for my $x (@staging) { - next if ($x->[0] >= 0 || $x->[3] + $ow < $t[1]); + # Is it a SNP and is it outside the SNP filter window? 
+ next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]); $x->[1] = 5 if ($x->[1] == 0); } } @@ -107,7 +130,8 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] $score += $opts{s} * $t[11] if ($t[9] ne '*'); # check the staging list for indel filtering for my $x (@staging) { - next if ($x->[0] < 0 || $x->[3] + $ol < $t[1]); + # Is it a SNP and is it outside the gap filter window + next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]); if ($x->[0] < $score) { $x->[1] = 6; } else { @@ -119,17 +143,17 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] # check adjacent SNPs my $k = 1; for my $x (@staging) { - ++$k if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5)); + ++$k if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5)); } # filtering is necessary if ($k > $opts{N}) { $flt = 4; for my $x (@staging) { - $x->[1] = 4 if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && $x->[1] == 0); + $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0); } } else { # then check gap filter for my $x (@staging) { - next if ($x->[0] < 0 || $x->[3] + $ow < $t[1]); + next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]); if ($x->[0] >= $opts{G}) { $flt = 5; last; } @@ -137,7 +161,7 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] } } } - push(@staging, [$score, $flt, @t]); + push(@staging, [$score, $flt, $len, @t]); } # output the last few elements in the staging list while (@staging) { @@ -148,9 +172,9 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] sub varFilter_aux { my ($first, $is_print) = @_; if ($first->[1] == 0) { - print join("\t", @$first[2 .. @$first-1]), "\n"; + print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { - print STDERR join("\t", substr("UQdDWGgX", $first->[1], 1), @$first[2 .. 
@$first-1]), "\n"; + print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; } } @@ -226,6 +250,49 @@ sub p2q_print_str { } } +# +# sam2fq +# + +sub sam2fq { + my %opts = (n=>20, p=>''); + getopts('n:p:', \%opts); + die("Usage: samtools.pl sam2fq [-n 20] [-p ] \n") if (@ARGV == 0 && -t STDIN); + if ($opts{p} && $opts{n} > 1) { + my $pre = $opts{p}; + my @fh; + for (0 .. $opts{n}-1) { + open($fh[$_], sprintf("| gzip > $pre.%.3d.fq.gz", $_)) || die; + } + my $i = 0; + while (<>) { + next if (/^@/); + chomp; + my @t = split("\t"); + next if ($t[9] eq '*'); + my ($name, $seq, $qual); + if ($t[1] & 16) { # reverse strand + $seq = reverse($t[9]); + $qual = reverse($t[10]); + $seq =~ tr/ACGTacgt/TGCAtgca/; + } else { + ($seq, $qual) = @t[9,10]; + } + $name = $t[0]; + $name .= "/1" if ($t[1] & 0x40); + $name .= "/2" if ($t[1] & 0x80); + print {$fh[$i]} "\@$name\n$seq\n"; + if ($qual ne '*') { + print {$fh[$i]} "+\n$qual\n"; + } + $i = 0 if (++$i == $opts{n}); + } + close($fh[$_]) for (0 .. 
$opts{n}-1); + } else { + die("To be implemented.\n"); + } +} + # # sra2hdr # @@ -285,10 +352,11 @@ sub sra2hdr { sub unique { my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3); - getopts('Qf:q:r:a:b:', \%opts); + getopts('Qf:q:r:a:b:m', \%opts); die("Usage: samtools.pl unique [-f $opts{f}] \n") if (@ARGV == 0 && -t STDIN); my $last = ''; my $recal_Q = !defined($opts{Q}); + my $multi_only = defined($opts{m}); my @a; while (<>) { my $score = -1; @@ -306,16 +374,16 @@ sub unique { } $score = 1 if ($score < 1); if ($t[0] ne $last) { - &unique_aux(\@a, $opts{f}, $recal_Q) if (@a); + &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); $last = $t[0]; } push(@a, [$score, \@t]); } - &unique_aux(\@a, $opts{f}, $recal_Q) if (@a); + &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); } sub unique_aux { - my ($a, $fac, $is_recal) = @_; + my ($a, $fac, $is_recal, $multi_only) = @_; my ($max, $max2, $max_i) = (0, 0, -1); for (my $i = 0; $i < @$a; ++$i) { if ($a->[$i][0] > $max) { @@ -325,9 +393,11 @@ sub unique_aux { } } if ($is_recal) { - my $q = int($fac * ($max - $max2) / $max + .499); - $q = 250 if ($q > 250); - $a->[$max_i][1][4] = $q < 250? $q : 250; + if (!$multi_only || @$a > 1) { + my $q = int($fac * ($max - $max2) / $max + .499); + $q = 250 if ($q > 250); + $a->[$max_i][1][4] = $q < 250? $q : 250; + } } print join("\t", @{$a->[$max_i][1]}); @$a = (); diff --git a/misc/varfilter.py b/misc/varfilter.py new file mode 100755 index 0000000..03ce395 --- /dev/null +++ b/misc/varfilter.py @@ -0,0 +1,205 @@ +#!/software/bin/python + +# Author: lh3, converted to python and modified to add -C option by Aylwyn Scally +# +# About: +# varfilter.py is a port of Heng's samtools.pl varFilter script into +# python, with an additional -C INT option. This option sets a minimum +# consensus score, above which the script will output a pileup line +# wherever it _could have_ called a variant, even if none is actually +# called (i.e. hom-ref positions). 
This is important if you want to +# subsequently merge the calls with those for another individual to get a +# synoptic view of calls at each site. Without this option, and in all +# other respects, it behaves like samtools.pl varFilter. +# +# Aylwyn Scally as6@sanger.ac.uk + + +# Filtration code: +# +# C low CNS quality (hom-ref only) +# d low depth +# D high depth +# W too many SNPs in a window (SNP only) +# G close to a high-quality indel (SNP only) +# Q low RMS mapping quality (SNP only) +# g close to another indel with higher quality (indel only) +# s low SNP quality (SNP only) +# i low indel quality (indel only) + + +import sys +import getopt + +def usage(): + print '''usage: varfilter.py [options] [cns-pileup] + +Options: -Q INT minimum RMS mapping quality for SNPs + -q INT minimum RMS mapping quality for gaps + -d INT minimum read depth + -D INT maximum read depth + -S INT minimum SNP quality + -i INT minimum indel quality + -C INT minimum consensus quality for hom-ref sites + + -G INT min indel score for nearby SNP filtering + -w INT SNP within INT bp around a gap to be filtered + + -W INT window size for filtering dense SNPs + -N INT max number of SNPs in a window + + -l INT window size for filtering adjacent gaps + + -p print filtered variants''' + +def varFilter_aux(first, is_print): + try: + if first[1] == 0: + sys.stdout.write("\t".join(first[4:]) + "\n") + elif is_print: + sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n") + except IOError: + sys.exit() + +mindepth = 3 +maxdepth = 100 +gapgapwin = 30 +minsnpmapq = 25 +mingapmapq = 10 +minindelscore = 25 +scorefactor = 100 +snpgapwin = 10 +densesnpwin = 10 +densesnps = 2 +printfilt = False +minsnpq = 0 +minindelq = 0 +mincnsq = 0 + +try: + options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:', []) +except getopt.GetoptError: + usage() + sys.exit(2) +for (oflag, oarg) in options: + if oflag == '-d': mindepth = int(oarg) + if oflag == '-D': maxdepth = 
int(oarg) + if oflag == '-l': gapgapwin = int(oarg) + if oflag == '-Q': minsnpmapq = int(oarg) + if oflag == '-q': mingapmapq = int(oarg) + if oflag == '-G': minindelscore = int(oarg) + if oflag == '-s': scorefactor = int(oarg) + if oflag == '-w': snpgapwin = int(oarg) + if oflag == '-W': densesnpwin = int(oarg) + if oflag == '-C': mincnsq = int(oarg) + if oflag == '-N': densesnps = int(oarg) + if oflag == '-p': printfilt = True + if oflag == '-S': minsnpq = int(oarg) + if oflag == '-i': minindelq = int(oarg) + +if len(args) < 1: + inp = sys.stdin +else: + inp = open(args[0]) + +# calculate the window size +max_dist = max(gapgapwin, snpgapwin, densesnpwin) + +staging = [] +for t in (line.strip().split() for line in inp): + (flt, score) = (0, -1) + # non-var sites + if t[3] == '*/*': + continue + is_snp = t[2].upper() != t[3].upper() + if not (is_snp or mincnsq): + continue + # clear the out-of-range elements + while staging: + # Still on the same chromosome and the first element's window still affects this position? + if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]): + break + varFilter_aux(staging.pop(0), printfilt) + + # first a simple filter + if int(t[7]) < mindepth: + flt = 2 + elif int(t[7]) > maxdepth: + flt = 3 + if t[2] == '*': # an indel + if minindelq and minindelq > int(t[5]): + flt = 8 + elif is_snp: + if minsnpq and minsnpq> int(t[5]): + flt = 7 + else: + if mincnsq and mincnsq > int(t[4]): + flt = 9 + + # site dependent filters + dlen = 0 + if flt == 0: + if t[2] == '*': # an indel + # If deletion, remember the length of the deletion + (a,b) = t[3].split('/') + alen = len(a) - 1 + blen = len(b) - 1 + if alen>blen: + if a[0] == '-': dlen=alen + elif b[0] == '-': dlen=blen + + if int(t[6]) < mingapmapq: + flt = 1 + # filtering SNPs + if int(t[5]) >= minindelscore: + for x in (y for y in staging if y[3]): + # Is it a SNP and is it outside the SNP filter window? 
+ if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]): + continue + if x[1] == 0: + x[1] = 5 + + # calculate the filtering score (different from indel quality) + score = int(t[5]) + if t[8] != '*': + score += scorefactor * int(t[10]) + if t[9] != '*': + score += scorefactor * int(t[11]) + # check the staging list for indel filtering + for x in (y for y in staging if y[3]): + # Is it a SNP and is it outside the gap filter window + if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]): + continue + if x[0] < score: + x[1] = 6 + else: + flt = 6 + break + else: # a SNP or hom-ref + if int(t[6]) < minsnpmapq: + flt = 1 + # check adjacent SNPs + k = 1 + for x in (y for y in staging if y[3]): + if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5): + k += 1 + + # filtering is necessary + if k > densesnps: + flt = 4 + for x in (y for y in staging if y[3]): + if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0: + x[1] = 4 + else: # then check gap filter + for x in (y for y in staging if y[3]): + if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]): + continue + if x[0] >= minindelscore: + flt = 5 + break + + staging.append([score, flt, dlen, is_snp] + t) + +# output the last few elements in the staging list +while staging: + varFilter_aux(staging.pop(0), printfilt) diff --git a/misc/wgsim.c b/misc/wgsim.c index 1522eee..7b5f095 100644 --- a/misc/wgsim.c +++ b/misc/wgsim.c @@ -238,7 +238,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq c[0] = nst_nt4_table[(int)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; - if ((c[1] & mutmsk) != NOCHANGE || (c[1] & mutmsk) != NOCHANGE) { + if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { printf("%s\t%d\t", name, i+1); if (c[1] == c[2]) { // hom if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution @@ -304,7 +304,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int 
is_hap, uint64_t N, tmp_seq[1] = (uint8_t*)calloc(l+2, 1); size[0] = size_l; size[1] = size_r; - Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; tot_len = n_ref = 0; while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { diff --git a/misc/wgsim_eval.pl b/misc/wgsim_eval.pl index 01038f1..f919a06 100755 --- a/misc/wgsim_eval.pl +++ b/misc/wgsim_eval.pl @@ -12,9 +12,9 @@ exit; sub wgsim_eval { my %opts = (g=>5); - getopts('pcg:', \%opts); - die("Usage: wgsim_eval.pl [-pc] [-g $opts{g}] \n") if (@ARGV == 0 && -t STDIN); - my (@c0, @c1); + getopts('pcag:', \%opts); + die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] \n") if (@ARGV == 0 && -t STDIN); + my (@c0, @c1, %fnfp); my ($max_q, $flag) = (0, 0); my $gap = $opts{g}; $flag |= 1 if (defined $opts{p}); @@ -66,14 +66,26 @@ sub wgsim_eval { } ++$c0[$q]; ++$c1[$q] unless ($is_correct); + @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]}); + ++$fnfp{$t[4]}[0]; + ++$fnfp{$t[4]}[1] unless ($is_correct); print STDERR $line if (($flag&1) && !$is_correct && $q > 0); } # print my ($cc0, $cc1) = (0, 0); - for (my $i = $max_q; $i >= 0; --$i) { - $c0[$i] = 0 unless (defined $c0[$i]); - $c1[$i] = 0 unless (defined $c1[$i]); - $cc0 += $c0[$i]; $cc1 += $c1[$i]; - printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0); + if (!defined($opts{a})) { + for (my $i = $max_q; $i >= 0; --$i) { + $c0[$i] = 0 unless (defined $c0[$i]); + $c1[$i] = 0 unless (defined $c1[$i]); + $cc0 += $c0[$i]; $cc1 += $c1[$i]; + printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0); + } + } else { + for (reverse(sort {$a<=>$b} (keys %fnfp))) { + next if ($_ == 0); + $cc0 += $fnfp{$_}[0]; + $cc1 += $fnfp{$_}[1]; + print join("\t", $_, $cc0, $cc1), "\n"; + } } } diff --git a/sam.c b/sam.c index ad4325b..ecdee02 100644 --- a/sam.c +++ b/sam.c @@ -55,6 +55,7 @@ samfile_t *samopen(const char *fn, const char 
*mode, const void *aux) if (aux) { // check if aux is present bam_header_t *textheader = fp->header; fp->header = sam_header_read2((const char*)aux); + if (fp->header == 0) goto open_err_ret; append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } diff --git a/sam_header.c b/sam_header.c index a119c02..05d75de 100644 --- a/sam_header.c +++ b/sam_header.c @@ -10,6 +10,7 @@ KHASH_MAP_INIT_STR(str, const char *) struct _HeaderList { + struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. struct _HeaderList *next; void *data; }; @@ -58,6 +59,34 @@ static void debug(const char *format, ...) va_end(ap); } +#if 0 +// Replaced by list_append_to_end +static list_t *list_prepend(list_t *root, void *data) +{ + list_t *l = malloc(sizeof(list_t)); + l->next = root; + l->data = data; + return l; +} +#endif + +// Relies on the root->last being correct. Do not use with the other list_* +// routines unless they are fixed to modify root->last as well. 
+static list_t *list_append_to_end(list_t *root, void *data) +{ + list_t *l = malloc(sizeof(list_t)); + l->last = l; + l->next = NULL; + l->data = data; + + if ( !root ) + return l; + + root->last->next = l; + root->last = l; + return root; +} + static list_t *list_append(list_t *root, void *data) { list_t *l = root; @@ -322,7 +351,7 @@ static HeaderLine *sam_header_line_parse(const char *headerLine) while (*to && *to!='\t') to++; if ( to-from != 2 ) { - debug("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine); + debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); return 0; } @@ -345,7 +374,11 @@ static HeaderLine *sam_header_line_parse(const char *headerLine) while (*to && *to!='\t') to++; if ( !required_tags[itype] && !optional_tags[itype] ) + { + // CO is a special case, it can contain anything, including tabs + if ( *to ) { to++; continue; } tag = new_tag(" ",from,to-1); + } else tag = new_tag(from,from+3,to-1); @@ -539,7 +572,8 @@ void *sam_header_parse2(const char *headerText) { hline = sam_header_line_parse(buf); if ( hline && sam_header_line_validate(hline) ) - hlines = list_append(hlines, hline); + // With too many (~250,000) reference sequences the header parsing was too slow with list_append. 
+ hlines = list_append_to_end(hlines, hline); else { if (hline) sam_header_line_free(hline); diff --git a/sam_view.c b/sam_view.c index 06dd01a..3b10e2e 100644 --- a/sam_view.c +++ b/sam_view.c @@ -6,7 +6,12 @@ #include "sam_header.h" #include "sam.h" #include "faidx.h" +#include "khash.h" +KHASH_SET_INIT_STR(rg) +typedef khash_t(rg) *rghash_t; + +rghash_t g_rghash = 0; static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; static char *g_library, *g_rg; static int g_sol2sanger_tbl[128]; @@ -32,9 +37,15 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) { if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; - if (g_rg) { + if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); - if (s && strcmp(g_rg, (char*)(s + 1)) == 0) return 0; + if (s) { + if (g_rg) return (strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1; + if (g_rghash) { + khint_t k = kh_get(rg, g_rghash, (char*)(s + 1)); + return (k != kh_end(g_rghash))? 
0 : 1; + } + } } if (g_library) { const char *p = bam_get_library((bam_header_t*)h, b); @@ -58,11 +69,11 @@ int main_samview(int argc, char *argv[]) int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0; int of_type = BAM_OFDEC, is_long_help = 0; samfile_t *in = 0, *out = 0; - char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:C")) >= 0) { + while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { switch (c) { case 'C': slx2sngr = 1; break; case 'S': is_bamin = 0; break; @@ -77,6 +88,7 @@ int main_samview(int argc, char *argv[]) case 'u': is_uncompressed = 1; break; case 'l': g_library = strdup(optarg); break; case 'r': g_rg = strdup(optarg); break; + case 'R': fn_rg = strdup(optarg); break; case 'x': of_type = BAM_OFHEX; break; case 'X': of_type = BAM_OFSTR; break; case '?': is_long_help = 1; break; @@ -94,7 +106,19 @@ int main_samview(int argc, char *argv[]) if (is_bamin) strcat(in_mode, "b"); if (is_header) strcat(out_mode, "h"); if (is_uncompressed) strcat(out_mode, "u"); - if (argc == optind) return usage(is_long_help); + if (argc == optind) return usage(is_long_help); // potential memory leak... + + // read the list of read groups + if (fn_rg) { + FILE *fp_rg; + char buf[1024]; + int ret; + g_rghash = kh_init(rg); + fp_rg = fopen(fn_rg, "r"); + while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but bear me... + kh_put(rg, g_rghash, strdup(buf), &ret); // we'd better check duplicates... 
+ fclose(fp_rg); + } // generate the fn_list if necessary if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref); @@ -147,7 +171,13 @@ int main_samview(int argc, char *argv[]) view_end: // close files, free and return - free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); + free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); + if (g_rghash) { + khint_t k; + for (k = 0; k < kh_end(g_rghash); ++k) + if (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k)); + kh_destroy(rg, g_rghash); + } samclose(in); samclose(out); return ret; @@ -167,6 +197,7 @@ static int usage(int is_long_help) fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -R FILE list of read groups to be outputted [null]\n"); fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); fprintf(stderr, " -q INT minimum mapping quality [0]\n"); diff --git a/samtools.1 b/samtools.1 index 31375f3..d79d176 100644 --- a/samtools.1 +++ b/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "10 November 2009" "samtools-0.1.7" "Bioinformatics tools" +.TH samtools 1 "11 July 2010" "samtools-0.1.8" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -10,6 +10,8 @@ samtools sort aln.bam aln.sorted .PP samtools index aln.sorted.bam .PP +samtools idxstats aln.sorted.bam +.PP samtools view aln.sorted.bam chr2:20,100,000-20,200,000 .PP samtools merge out.bam in1.bam in2.bam in3.bam @@ -18,6 +20,8 @@ samtools faidx ref.fasta .PP samtools pileup -f ref.fasta aln.sorted.bam .PP +samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam +.PP samtools tview aln.sorted.bam ref.fasta .SH DESCRIPTION @@ -42,81 +46,9 @@ entire alignment file unless it is asked to do so. 
.SH COMMANDS AND OPTIONS .TP 10 -.B import -samtools import - -Since 0.1.4, this command is an alias of: - -samtools view -bt -o - -.TP -.B sort -samtools sort [-n] [-m maxMem] - -Sort alignments by leftmost coordinates. File -.I .bam -will be created. This command may also create temporary files -.I .%d.bam -when the whole alignment cannot be fitted into memory (controlled by -option -m). - -.B OPTIONS: -.RS -.TP 8 -.B -n -Sort by read names rather than by chromosomal coordinates -.TP -.B -m INT -Approximately the maximum required memory. [500000000] -.RE - -.TP -.B merge -samtools merge [-h inh.sam] [-n] [...] - -Merge multiple sorted alignments. -The header reference lists of all the input BAM files, and the @SQ headers of -.IR inh.sam , -if any, must all refer to the same set of reference sequences. -The header reference list and (unless overridden by -.BR -h ) -`@' headers of -.I in1.bam -will be copied to -.IR out.bam , -and the headers of other files will be ignored. - -.B OPTIONS: -.RS -.TP 8 -.B -h FILE -Use the lines of -.I FILE -as `@' headers to be copied to -.IR out.bam , -replacing any header lines that would otherwise be copied from -.IR in1.bam . -.RI ( FILE -is actually in SAM format, though any alignment records it may contain -are ignored.) -.TP -.B -n -The input alignments are sorted by read names rather than by chromosomal -coordinates -.RE - -.TP -.B index -samtools index - -Index sorted alignment for fast random access. Index file -.I .bai -will be created. - -.TP .B view samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F -skipFlag] [-q minMapQ] [-l library] [-r readGroup] | [region1 [...]] +skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] | [region1 [...]] Extract/print all or sub alignments in SAM or BAM format. 
If no region is specified, all the alignments will be printed; otherwise only @@ -178,22 +110,21 @@ Only output reads in library STR [null] .TP .B -r STR Only output reads in read group STR [null] +.TP +.B -R FILE +Output reads in read groups listed in +.I FILE +[null] .RE .TP -.B faidx -samtools faidx [region1 [...]] +.B tview +samtools tview [ref.fasta] -Index reference sequence in the FASTA format or extract subsequence from -indexed reference sequence. If no region is specified, -.B faidx -will index the file and create -.I .fai -on the disk. If regions are speficified, the subsequences will be -retrieved and printed to stdout in the FASTA format. The input file can -be compressed in the -.B RAZF -format. +Text alignment viewer (based on the ncurses library). In the viewer, +press `?' for help and press `g' to check the alignment start from a +region in the format like `chr10:10,000,000' or `=10,000,000' when +viewing the same reference sequence. .TP .B pileup @@ -232,33 +163,40 @@ covering reads, the first alllele, the second allele, # reads supporting the first allele, # reads supporting the second allele and # reads containing indels different from the top two alleles. +The position of indels is offset by -1. + .B OPTIONS: .RS - .TP 10 .B -s Print the mapping quality as the last column. This option makes the output easier to parse, although this format is not space efficient. - .TP .B -S The input file is in SAM. - .TP .B -i Only output pileup lines containing indels. - .TP .B -f FILE The reference sequence in the FASTA format. Index file .I FILE.fai will be created if absent. - .TP .B -M INT Cap mapping quality at INT [60] - +.TP +.B -m INT +Filter reads with flag containing bits in +.I +INT +[1796] +.TP +.B -d INT +Use the first +.I NUM +reads in the pileup for indel calling for speed up. Zero for unlimited. [0] .TP .B -t FILE List of reference names ane sequence lengths, in the format described @@ -267,7 +205,6 @@ for the command. 
If this option is present, samtools assumes the input .I is in SAM format; otherwise it assumes in BAM format. - .TP .B -l FILE List of sites at which pileup is output. This file is space @@ -278,10 +215,9 @@ recommended to use option together with .B -l as in the default format we may not know the mapping quality. - .TP .B -c -Call the consensus sequence using MAQ consensus model. Options +Call the consensus sequence using SOAPsnp consensus model. Options .B -T, .B -N, .B -I @@ -292,38 +228,147 @@ are only effective when or .B -g is in use. - .TP .B -g Generate genotype likelihood in the binary GLFv3 format. This option suppresses -c, -i and -s. - .TP .B -T FLOAT The theta parameter (error dependency coefficient) in the maq consensus calling model [0.85] - .TP .B -N INT Number of haplotypes in the sample (>=2) [2] - .TP .B -r FLOAT Expected fraction of differences between a pair of haplotypes [0.001] - .TP .B -I INT Phred probability of an indel in sequencing/prep. [40] +.RE + +.TP +.B mpileup +samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]] +Generate pileup for multiple BAM files. Consensus calling is not +implemented. + +.B OPTIONS: +.RS +.TP 8 +.B -r STR +Only generate pileup in region +.I STR +[all sites] +.TP +.B -f FILE +The reference file [null] .RE .TP -.B tview -samtools tview [ref.fasta] +.B reheader +samtools reheader -Text alignment viewer (based on the ncurses library). In the viewer, -press `?' for help and press `g' to check the alignment start from a -region in the format like `chr10:10,000,000'. +Replace the header in +.I in.bam +with the header in +.I in.header.sam. +This command is much faster than replacing the header with a +BAM->SAM->BAM conversion. + +.TP +.B sort +samtools sort [-no] [-m maxMem] + +Sort alignments by leftmost coordinates. File +.I .bam +will be created. This command may also create temporary files +.I .%d.bam +when the whole alignment cannot be fitted into memory (controlled by +option -m). 
+ +.B OPTIONS: +.RS +.TP 8 +.B -o +Output the final alignment to the standard output. +.TP +.B -n +Sort by read names rather than by chromosomal coordinates +.TP +.B -m INT +Approximately the maximum required memory. [500000000] +.RE + +.TP +.B merge +samtools merge [-h inh.sam] [-nr] [...] + +Merge multiple sorted alignments. +The header reference lists of all the input BAM files, and the @SQ headers of +.IR inh.sam , +if any, must all refer to the same set of reference sequences. +The header reference list and (unless overridden by +.BR -h ) +`@' headers of +.I in1.bam +will be copied to +.IR out.bam , +and the headers of other files will be ignored. + +.B OPTIONS: +.RS +.TP 8 +.B -h FILE +Use the lines of +.I FILE +as `@' headers to be copied to +.IR out.bam , +replacing any header lines that would otherwise be copied from +.IR in1.bam . +.RI ( FILE +is actually in SAM format, though any alignment records it may contain +are ignored.) +.TP +.B -r +Attach an RG tag to each alignment. The tag value is inferred from file names. +.TP +.B -n +The input alignments are sorted by read names rather than by chromosomal +coordinates +.RE + +.TP +.B index +samtools index + +Index sorted alignment for fast random access. Index file +.I .bai +will be created. + +.TP +.B idxstats +samtools idxstats + +Retrieve and print stats in the index file. The output is TAB delimited +with each line consisting of reference sequence name, sequence length, # +mapped reads and # unmapped reads. + +.TP +.B faidx +samtools faidx [region1 [...]] + +Index reference sequence in the FASTA format or extract subsequence from +indexed reference sequence. If no region is specified, +.B faidx +will index the file and create +.I .fai +on the disk. If regions are specified, the subsequences will be +retrieved and printed to stdout in the FASTA format. The input file can +be compressed in the +.B RAZF +format. .TP .B fixmate @@ -334,28 +379,34 @@ name-sorted alignment. 
.TP .B rmdup -samtools rmdup +samtools rmdup [-sS] Remove potential PCR duplicates: if multiple read pairs have identical external coordinates, only retain the pair with highest mapping quality. -This command +In the paired-end mode, this command .B ONLY -works with FR orientation and requires ISIZE is correctly set. - -.TP -.B rmdupse -samtools rmdupse +works with FR orientation and requires that ISIZE is correctly set. It does +not work for unpaired reads (e.g. two ends mapped to different +chromosomes or orphan reads). -Remove potential duplicates for single-ended reads. This command will -treat all reads as single-ended even if they are paired in fact. +.B OPTIONS: +.RS +.TP 8 +.B -s +Remove duplicates for single-end reads. By default, the command works for +paired-end reads only. +.TP 8 +.B -S +Treat paired-end reads and single-end reads. +.RE .TP -.B fillmd -samtools fillmd [-e] +.B calmd +samtools calmd [-eubS] Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is different from the existing -tag. +tag. Output SAM by default. .B OPTIONS: .RS @@ -363,7 +414,15 @@ tag. .B -e Convert a the read base to = if it is identical to the aligned reference base. Indel caller does not support the = bases at the moment. - +.TP +.B -u +Output uncompressed BAM +.TP +.B -b +Output compressed BAM +.TP +.B -S +The input is SAM with header lines .RE .SH SAM FORMAT @@ -396,21 +455,21 @@ Each bit in the FLAG field is defined as: .TS center box; -cb | cb -l | l . -Flag Description +cb | cb | cb +l | c | l . 
+Flag Chr Description _ -0x0001 the read is paired in sequencing -0x0002 the read is mapped in a proper pair -0x0004 the query sequence itself is unmapped -0x0008 the mate is unmapped -0x0010 strand of the query (1 for reverse) -0x0020 strand of the mate -0x0040 the read is the first read in a pair -0x0080 the read is the second read in a pair -0x0100 the alignment is not primary -0x0200 the read fails platform/vendor quality checks -0x0400 the read is either a PCR or an optical duplicate +0x0001 p the read is paired in sequencing +0x0002 P the read is mapped in a proper pair +0x0004 u the query sequence itself is unmapped +0x0008 U the mate is unmapped +0x0010 r strand of the query (1 for reverse) +0x0020 R strand of the mate +0x0040 1 the read is the first read in a pair +0x0080 2 the read is the second read in a pair +0x0100 s the alignment is not primary +0x0200 f the read fails platform/vendor quality checks +0x0400 d the read is either a PCR or an optical duplicate .TE .SH LIMITATIONS @@ -418,23 +477,23 @@ _ .IP o 2 Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. .IP o 2 -CIGAR operation P is not properly handled at the moment. -.IP o 2 In merging, the input files are required to have the same number of reference sequences. The requirement can be relaxed. In addition, merging does not reconstruct the header dictionaries automatically. Endusers have to provide the correct header. Picard is better at merging. .IP o 2 -Samtools' rmdup does not work for single-end data and does not remove -duplicates across chromosomes. Picard is better. +Samtools paired-end rmdup does not work for unpaired reads (e.g. orphan +reads or ends mapped to different chromosomes). If this is a concern, +please use Picard's MarkDuplicates, which correctly handles these cases, +although it is a little slower. .SH AUTHOR .PP Heng Li from the Sanger Institute wrote the C version of samtools. 
Bob Handsaker from the Broad Institute implemented the BGZF library and Jue Ruan from Beijing Genomics Institute wrote the RAZF library. Various -people in the 1000Genomes Project contributed to the SAM format +people in the 1000 Genomes Project contributed to the SAM format specification. .SH SEE ALSO diff --git a/samtools.txt b/samtools.txt index feec238..20e6c15 100644 --- a/samtools.txt +++ b/samtools.txt @@ -12,6 +12,8 @@ SYNOPSIS samtools index aln.sorted.bam + samtools idxstats aln.sorted.bam + samtools view aln.sorted.bam chr2:20,100,000-20,200,000 samtools merge out.bam in1.bam in2.bam in3.bam @@ -20,6 +22,8 @@ SYNOPSIS samtools pileup -f ref.fasta aln.sorted.bam + samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam + samtools tview aln.sorted.bam ref.fasta @@ -43,70 +47,19 @@ DESCRIPTION COMMANDS AND OPTIONS - import samtools import - - Since 0.1.4, this command is an alias of: - - samtools view -bt -o - - - sort samtools sort [-n] [-m maxMem] - - Sort alignments by leftmost coordinates. File .bam will be created. This command may also create tempo- - rary files .%d.bam when the whole alignment can- - not be fitted into memory (controlled by option -m). - - OPTIONS: - - -n Sort by read names rather than by chromosomal coordi- - nates - - -m INT Approximately the maximum required memory. - [500000000] - - - merge samtools merge [-h inh.sam] [-n] - [...] - - Merge multiple sorted alignments. The header reference lists - of all the input BAM files, and the @SQ headers of inh.sam, - if any, must all refer to the same set of reference - sequences. The header reference list and (unless overridden - by -h) `@' headers of in1.bam will be copied to out.bam, and - the headers of other files will be ignored. - - OPTIONS: - - -h FILE Use the lines of FILE as `@' headers to be copied to - out.bam, replacing any header lines that would other- - wise be copied from in1.bam. 
(FILE is actually in - SAM format, though any alignment records it may con- - tain are ignored.) - - -n The input alignments are sorted by read names rather - than by chromosomal coordinates - - - index samtools index - - Index sorted alignment for fast random access. Index file - .bai will be created. - - view samtools view [-bhuHS] [-t in.refList] [-o output] [-f - reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r read- - Group] | [region1 [...]] + reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r read- + Group] [-R rgFile] | [region1 [...]] - Extract/print all or sub alignments in SAM or BAM format. If - no region is specified, all the alignments will be printed; - otherwise only alignments overlapping the specified regions - will be output. An alignment may be given multiple times if + Extract/print all or sub alignments in SAM or BAM format. If + no region is specified, all the alignments will be printed; + otherwise only alignments overlapping the specified regions + will be output. An alignment may be given multiple times if it is overlapping several regions. A region can be presented, - for example, in the following format: `chr2' (the whole - chr2), `chr2:1000000' (region starting from 1,000,000bp) or - `chr2:1,000,000-2,000,000' (region between 1,000,000 and - 2,000,000bp including the end points). The coordinate is + for example, in the following format: `chr2' (the whole + chr2), `chr2:1000000' (region starting from 1,000,000bp) or + `chr2:1,000,000-2,000,000' (region between 1,000,000 and + 2,000,000bp including the end points). The coordinate is 1-based. OPTIONS: @@ -114,27 +67,27 @@ COMMANDS AND OPTIONS -b Output in the BAM format. -u Output uncompressed BAM. This option saves time spent - on compression/decomprssion and is thus preferred + on compression/decomprssion and is thus preferred when the output is piped to another samtools command. -h Include the header in the output. -H Output the header only. - -S Input is in SAM. 
If @SQ header lines are absent, the + -S Input is in SAM. If @SQ header lines are absent, the `-t' option is required. - -t FILE This file is TAB-delimited. Each line must contain - the reference name and the length of the reference, - one line for each distinct reference; additional - fields are ignored. This file also defines the order - of the reference sequences in sorting. If you run - `samtools faidx ', the resultant index file - .fai can be used as this file. + -t FILE This file is TAB-delimited. Each line must contain + the reference name and the length of the reference, + one line for each distinct reference; additional + fields are ignored. This file also defines the order + of the reference sequences in sorting. If you run + `samtools faidx ', the resultant index file + .fai can be used as this file. -o FILE Output file [stdout] - -f INT Only output alignments with all bits in INT present + -f INT Only output alignments with all bits in INT present in the FLAG field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0] @@ -146,125 +99,197 @@ COMMANDS AND OPTIONS -r STR Only output reads in read group STR [null] + -R FILE Output reads in read groups listed in FILE [null] - faidx samtools faidx [region1 [...]] - Index reference sequence in the FASTA format or extract sub- - sequence from indexed reference sequence. If no region is - specified, faidx will index the file and create - .fai on the disk. If regions are speficified, the - subsequences will be retrieved and printed to stdout in the - FASTA format. The input file can be compressed in the RAZF - format. + tview samtools tview [ref.fasta] + + Text alignment viewer (based on the ncurses library). In the + viewer, press `?' for help and press `g' to check the align- + ment start from a region in the format like + `chr10:10,000,000' or `=10,000,000' when viewing the same + reference sequence. 
- pileup samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l - in.site_list] [-iscgS2] [-T theta] [-N nHap] [-r + pileup samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l + in.site_list] [-iscgS2] [-T theta] [-N nHap] [-r pairDiffRate] | - Print the alignment in the pileup format. In the pileup for- - mat, each line represents a genomic position, consisting of + Print the alignment in the pileup format. In the pileup for- + mat, each line represents a genomic position, consisting of chromosome name, coordinate, reference base, read bases, read - qualities and alignment mapping qualities. Information on + qualities and alignment mapping qualities. Information on match, mismatch, indel, strand, mapping quality and start and - end of a read are all encoded at the read base column. At - this column, a dot stands for a match to the reference base - on the forward strand, a comma for a match on the reverse - strand, `ACGTN' for a mismatch on the forward strand and - `acgtn' for a mismatch on the reverse strand. A pattern - `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion - between this reference position and the next reference posi- - tion. The length of the insertion is given by the integer in - the pattern, followed by the inserted sequence. Similarly, a + end of a read are all encoded at the read base column. At + this column, a dot stands for a match to the reference base + on the forward strand, a comma for a match on the reverse + strand, `ACGTN' for a mismatch on the forward strand and + `acgtn' for a mismatch on the reverse strand. A pattern + `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion + between this reference position and the next reference posi- + tion. The length of the insertion is given by the integer in + the pattern, followed by the inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' represents a deletion from the - reference. The deleted bases will be presented as `*' in the - following lines. 
Also at the read base column, a symbol `^' - marks the start of a read segment which is a contiguous sub- - sequence on the read separated by `N/S/H' CIGAR operations. - The ASCII of the character following `^' minus 33 gives the - mapping quality. A symbol `$' marks the end of a read seg- + reference. The deleted bases will be presented as `*' in the + following lines. Also at the read base column, a symbol `^' + marks the start of a read segment which is a contiguous sub- + sequence on the read separated by `N/S/H' CIGAR operations. + The ASCII of the character following `^' minus 33 gives the + mapping quality. A symbol `$' marks the end of a read seg- ment. - If option -c is applied, the consensus base, Phred-scaled - consensus quality, SNP quality (i.e. the Phred-scaled proba- + If option -c is applied, the consensus base, Phred-scaled + consensus quality, SNP quality (i.e. the Phred-scaled proba- bility of the consensus being identical to the reference) and - root mean square (RMS) mapping quality of the reads covering - the site will be inserted between the `reference base' and - the `read bases' columns. An indel occupies an additional - line. Each indel line consists of chromosome name, coordi- - nate, a star, the genotype, consensus quality, SNP quality, + root mean square (RMS) mapping quality of the reads covering + the site will be inserted between the `reference base' and + the `read bases' columns. An indel occupies an additional + line. Each indel line consists of chromosome name, coordi- + nate, a star, the genotype, consensus quality, SNP quality, RMS mapping quality, # covering reads, the first alllele, the - second allele, # reads supporting the first allele, # reads - supporting the second allele and # reads containing indels + second allele, # reads supporting the first allele, # reads + supporting the second allele and # reads containing indels different from the top two alleles. - OPTIONS: + The position of indels is offset by -1. 
+ OPTIONS: - -s Print the mapping quality as the last column. This - option makes the output easier to parse, although + -s Print the mapping quality as the last column. This + option makes the output easier to parse, although this format is not space efficient. - -S The input file is in SAM. - -i Only output pileup lines containing indels. - - -f FILE The reference sequence in the FASTA format. Index + -f FILE The reference sequence in the FASTA format. Index file FILE.fai will be created if absent. - -M INT Cap mapping quality at INT [60] + -m INT Filter reads with flag containing bits in INT + [1796] - -t FILE List of reference names ane sequence lengths, in - the format described for the import command. If - this option is present, samtools assumes the input + -d INT Use the first INT reads in the pileup for indel + calling for speed up. Zero for unlimited. [0] + + -t FILE List of reference names and sequence lengths, in + the format described for the import command. If + this option is present, samtools assumes the input is in SAM format; otherwise it assumes in BAM format. - -l FILE List of sites at which pileup is output. This file - is space delimited. The first two columns are - required to be chromosome and 1-based coordinate. - Additional columns are ignored. It is recommended + -l FILE List of sites at which pileup is output. This file + is space delimited. The first two columns are + required to be chromosome and 1-based coordinate. + Additional columns are ignored. It is recommended to use option -s together with -l as in the default format we may not know the mapping quality. - -c Call the consensus sequence using MAQ consensus + -c Call the consensus sequence using SOAPsnp consensus model. Options -T, -N, -I and -r are only effective when -c or -g is in use. - -g Generate genotype likelihood in the binary GLFv3 + -g Generate genotype likelihood in the binary GLFv3 format. This option suppresses -c, -i and -s.
- - -T FLOAT The theta parameter (error dependency coefficient) + -T FLOAT The theta parameter (error dependency coefficient) in the maq consensus calling model [0.85] - -N INT Number of haplotypes in the sample (>=2) [2] - - -r FLOAT Expected fraction of differences between a pair of + -r FLOAT Expected fraction of differences between a pair of haplotypes [0.001] - - -I INT Phred probability of an indel in sequencing/prep. + -I INT Phred probability of an indel in sequencing/prep. [40] + mpileup samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]] - tview samtools tview [ref.fasta] + Generate pileup for multiple BAM files. Consensus calling is + not implemented. + + OPTIONS: + + -r STR Only generate pileup in region STR [all sites] + + -f FILE The reference file [null] + + + reheader samtools reheader + + Replace the header in in.bam with the header in + in.header.sam. This command is much faster than replacing + the header with a BAM->SAM->BAM conversion. + + + sort samtools sort [-no] [-m maxMem] + + Sort alignments by leftmost coordinates. File .bam will be created. This command may also create tempo- + rary files .%d.bam when the whole alignment can- + not be fitted into memory (controlled by option -m). + + OPTIONS: + + -o Output the final alignment to the standard output. + + -n Sort by read names rather than by chromosomal coordi- + nates + + -m INT Approximately the maximum required memory. + [500000000] + + + merge samtools merge [-h inh.sam] [-nr] + [...] + + Merge multiple sorted alignments. The header reference lists + of all the input BAM files, and the @SQ headers of inh.sam, + if any, must all refer to the same set of reference + sequences. The header reference list and (unless overridden + by -h) `@' headers of in1.bam will be copied to out.bam, and + the headers of other files will be ignored. 
+ + OPTIONS: + + -h FILE Use the lines of FILE as `@' headers to be copied to + out.bam, replacing any header lines that would other- + wise be copied from in1.bam. (FILE is actually in + SAM format, though any alignment records it may con- + tain are ignored.) + + -r Attach an RG tag to each alignment. The tag value is + inferred from file names. + + -n The input alignments are sorted by read names rather + than by chromosomal coordinates + + + index samtools index + + Index sorted alignment for fast random access. Index file + .bai will be created. + + + idxstats samtools idxstats - Text alignment viewer (based on the ncurses library). In the - viewer, press `?' for help and press `g' to check the align- - ment start from a region in the format like - `chr10:10,000,000'. + Retrieve and print stats in the index file. The output is TAB + delimited with each line consisting of reference sequence + name, sequence length, # mapped reads and # unmapped reads. + + + faidx samtools faidx [region1 [...]] + + Index reference sequence in the FASTA format or extract sub- + sequence from indexed reference sequence. If no region is + specified, faidx will index the file and create + .fai on the disk. If regions are specified, the + subsequences will be retrieved and printed to stdout in the + FASTA format. The input file can be compressed in the RAZF + format. fixmate samtools fixmate @@ -273,26 +298,28 @@ COMMANDS AND OPTIONS name-sorted alignment. - rmdup samtools rmdup + rmdup samtools rmdup [-sS] - Remove potential PCR duplicates: if multiple read pairs have - identical external coordinates, only retain the pair with - highest mapping quality. This command ONLY works with FR - orientation and requires ISIZE is correctly set. + Remove potential PCR duplicates: if multiple read pairs have + identical external coordinates, only retain the pair with + highest mapping quality.
In the paired-end mode, this com- + mand ONLY works with FR orientation and requires ISIZE is + correctly set. It does not work for unpaired reads (e.g. two + ends mapped to different chromosomes or orphan reads). + OPTIONS: - rmdupse samtools rmdupse + -s Remove duplicate for single-end reads. By default, + the command works for paired-end reads only. - Remove potential duplicates for single-ended reads. This com- - mand will treat all reads as single-ended even if they are - paired in fact. + -S Treat paired-end reads as single-end reads. - fillmd samtools fillmd [-e] + calmd samtools calmd [-eubS] Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is dif- - ferent from the existing tag. + ferent from the existing tag. Output SAM by default. OPTIONS: @@ -300,6 +327,11 @@ COMMANDS AND OPTIONS the aligned reference base. Indel caller does not support the = bases at the moment. + -u Output uncompressed BAM + + -b Output compressed BAM + + -S The input is SAM with header lines SAM FORMAT @@ -327,43 +359,43 @@ SAM FORMAT Each bit in the FLAG field is defined as: - +-------+--------------------------------------------------+ - | Flag | Description | - +-------+--------------------------------------------------+ - |0x0001 | the read is paired in sequencing | - |0x0002 | the read is mapped in a proper pair | - |0x0004 | the query sequence itself is unmapped | - |0x0008 | the mate is unmapped | - |0x0010 | strand of the query (1 for reverse) | - |0x0020 | strand of the mate | - |0x0040 | the read is the first read in a pair | - |0x0080 | the read is the second read in a pair | - |0x0100 | the alignment is not primary | - |0x0200 | the read fails platform/vendor quality checks | - |0x0400 | the read is either a PCR or an optical duplicate | - +-------+--------------------------------------------------+ + +-------+-----+--------------------------------------------------+ + | Flag | Chr | Description | +
+-------+-----+--------------------------------------------------+ + |0x0001 | p | the read is paired in sequencing | + |0x0002 | P | the read is mapped in a proper pair | + |0x0004 | u | the query sequence itself is unmapped | + |0x0008 | U | the mate is unmapped | + |0x0010 | r | strand of the query (1 for reverse) | + |0x0020 | R | strand of the mate | + |0x0040 | 1 | the read is the first read in a pair | + |0x0080 | 2 | the read is the second read in a pair | + |0x0100 | s | the alignment is not primary | + |0x0200 | f | the read fails platform/vendor quality checks | + |0x0400 | d | the read is either a PCR or an optical duplicate | + +-------+-----+--------------------------------------------------+ LIMITATIONS o Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. - o CIGAR operation P is not properly handled at the moment. - o In merging, the input files are required to have the same number of reference sequences. The requirement can be relaxed. In addition, merging does not reconstruct the header dictionaries automatically. Endusers have to provide the correct header. Picard is better at merging. - o Samtools' rmdup does not work for single-end data and does not remove - duplicates across chromosomes. Picard is better. + o Samtools paired-end rmdup does not work for unpaired reads (e.g. + orphan reads or ends mapped to different chromosomes). If this is a + concern, please use Picard's MarkDuplicate which correctly handles + these cases, although a little slower. AUTHOR Heng Li from the Sanger Institute wrote the C version of samtools. Bob Handsaker from the Broad Institute implemented the BGZF library and Jue Ruan from Beijing Genomics Institute wrote the RAZF library. Various - people in the 1000Genomes Project contributed to the SAM format speci- + people in the 1000 Genomes Project contributed to the SAM format speci- fication. 
@@ -372,4 +404,4 @@ SEE ALSO -samtools-0.1.7 10 November 2009 samtools(1) +samtools-0.1.8 11 July 2010 samtools(1) diff --git a/win32/xcurses.h b/win32/xcurses.h new file mode 100644 index 0000000..6f3ce19 --- /dev/null +++ b/win32/xcurses.h @@ -0,0 +1,1377 @@ +/* Public Domain Curses */ + +/* $Id: curses.h,v 1.295 2008/07/15 17:13:25 wmcbrine Exp $ */ + +/*----------------------------------------------------------------------* + * PDCurses * + *----------------------------------------------------------------------*/ + +#ifndef __PDCURSES__ +#define __PDCURSES__ 1 + +/*man-start************************************************************** + +PDCurses definitions list: (Only define those needed) + + XCURSES True if compiling for X11. + PDC_RGB True if you want to use RGB color definitions + (Red = 1, Green = 2, Blue = 4) instead of BGR. + PDC_WIDE True if building wide-character support. + PDC_DLL_BUILD True if building a Win32 DLL. + NCURSES_MOUSE_VERSION Use the ncurses mouse API instead + of PDCurses' traditional mouse API. + +PDCurses portable platform definitions list: + + PDC_BUILD Defines API build version. + PDCURSES Enables access to PDCurses-only routines. + XOPEN Always true. + SYSVcurses True if you are compiling for SYSV portability. + BSDcurses True if you are compiling for BSD portability. 
+ +**man-end****************************************************************/ + +#define PDC_BUILD 3401 +#define PDCURSES 1 /* PDCurses-only routines */ +#define XOPEN 1 /* X/Open Curses routines */ +#define SYSVcurses 1 /* System V Curses routines */ +#define BSDcurses 1 /* BSD Curses routines */ +#define CHTYPE_LONG 1 /* size of chtype; long */ + +/*----------------------------------------------------------------------*/ + +#include +#include +#include /* Required by X/Open usage below */ + +#ifdef PDC_WIDE +# include +#endif + +#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) +extern "C" +{ +# define bool _bool +#endif + +/*---------------------------------------------------------------------- + * + * PDCurses Manifest Constants + * + */ + +#ifndef FALSE +# define FALSE 0 +#endif +#ifndef TRUE +# define TRUE 1 +#endif +#ifndef NULL +# define NULL (void *)0 +#endif +#ifndef ERR +# define ERR (-1) +#endif +#ifndef OK +# define OK 0 +#endif + +/*---------------------------------------------------------------------- + * + * PDCurses Type Declarations + * + */ + +typedef unsigned char bool; /* PDCurses Boolean type */ + +#ifdef CHTYPE_LONG +# if _LP64 +typedef unsigned int chtype; +# else +typedef unsigned long chtype; /* 16-bit attr + 16-bit char */ +# endif +#else +typedef unsigned short chtype; /* 8-bit attr + 8-bit char */ +#endif + +#ifdef PDC_WIDE +typedef chtype cchar_t; +#endif + +typedef chtype attr_t; + +/*---------------------------------------------------------------------- + * + * PDCurses Mouse Interface -- SYSVR4, with extensions + * + */ + +typedef struct +{ + int x; /* absolute column, 0 based, measured in characters */ + int y; /* absolute row, 0 based, measured in characters */ + short button[3]; /* state of each button */ + int changes; /* flags indicating what has changed with the mouse */ +} MOUSE_STATUS; + +#define BUTTON_RELEASED 0x0000 +#define BUTTON_PRESSED 0x0001 +#define BUTTON_CLICKED 0x0002 +#define 
BUTTON_DOUBLE_CLICKED 0x0003 +#define BUTTON_TRIPLE_CLICKED 0x0004 +#define BUTTON_MOVED 0x0005 /* PDCurses */ +#define WHEEL_SCROLLED 0x0006 /* PDCurses */ +#define BUTTON_ACTION_MASK 0x0007 /* PDCurses */ + +#define PDC_BUTTON_SHIFT 0x0008 /* PDCurses */ +#define PDC_BUTTON_CONTROL 0x0010 /* PDCurses */ +#define PDC_BUTTON_ALT 0x0020 /* PDCurses */ +#define BUTTON_MODIFIER_MASK 0x0038 /* PDCurses */ + +#define MOUSE_X_POS (Mouse_status.x) +#define MOUSE_Y_POS (Mouse_status.y) + +/* + * Bits associated with the .changes field: + * 3 2 1 0 + * 210987654321098765432109876543210 + * 1 <- button 1 has changed + * 10 <- button 2 has changed + * 100 <- button 3 has changed + * 1000 <- mouse has moved + * 10000 <- mouse position report + * 100000 <- mouse wheel up + * 1000000 <- mouse wheel down + */ + +#define PDC_MOUSE_MOVED 0x0008 +#define PDC_MOUSE_POSITION 0x0010 +#define PDC_MOUSE_WHEEL_UP 0x0020 +#define PDC_MOUSE_WHEEL_DOWN 0x0040 + +#define A_BUTTON_CHANGED (Mouse_status.changes & 7) +#define MOUSE_MOVED (Mouse_status.changes & PDC_MOUSE_MOVED) +#define MOUSE_POS_REPORT (Mouse_status.changes & PDC_MOUSE_POSITION) +#define BUTTON_CHANGED(x) (Mouse_status.changes & (1 << ((x) - 1))) +#define BUTTON_STATUS(x) (Mouse_status.button[(x) - 1]) +#define MOUSE_WHEEL_UP (Mouse_status.changes & PDC_MOUSE_WHEEL_UP) +#define MOUSE_WHEEL_DOWN (Mouse_status.changes & PDC_MOUSE_WHEEL_DOWN) + +/* mouse bit-masks */ + +#define BUTTON1_RELEASED 0x00000001L +#define BUTTON1_PRESSED 0x00000002L +#define BUTTON1_CLICKED 0x00000004L +#define BUTTON1_DOUBLE_CLICKED 0x00000008L +#define BUTTON1_TRIPLE_CLICKED 0x00000010L +#define BUTTON1_MOVED 0x00000010L /* PDCurses */ + +#define BUTTON2_RELEASED 0x00000020L +#define BUTTON2_PRESSED 0x00000040L +#define BUTTON2_CLICKED 0x00000080L +#define BUTTON2_DOUBLE_CLICKED 0x00000100L +#define BUTTON2_TRIPLE_CLICKED 0x00000200L +#define BUTTON2_MOVED 0x00000200L /* PDCurses */ + +#define BUTTON3_RELEASED 0x00000400L +#define BUTTON3_PRESSED 
0x00000800L +#define BUTTON3_CLICKED 0x00001000L +#define BUTTON3_DOUBLE_CLICKED 0x00002000L +#define BUTTON3_TRIPLE_CLICKED 0x00004000L +#define BUTTON3_MOVED 0x00004000L /* PDCurses */ + +/* For the ncurses-compatible functions only, BUTTON4_PRESSED and + BUTTON5_PRESSED are returned for mouse scroll wheel up and down; + otherwise PDCurses doesn't support buttons 4 and 5 */ + +#define BUTTON4_RELEASED 0x00008000L +#define BUTTON4_PRESSED 0x00010000L +#define BUTTON4_CLICKED 0x00020000L +#define BUTTON4_DOUBLE_CLICKED 0x00040000L +#define BUTTON4_TRIPLE_CLICKED 0x00080000L + +#define BUTTON5_RELEASED 0x00100000L +#define BUTTON5_PRESSED 0x00200000L +#define BUTTON5_CLICKED 0x00400000L +#define BUTTON5_DOUBLE_CLICKED 0x00800000L +#define BUTTON5_TRIPLE_CLICKED 0x01000000L + +#define MOUSE_WHEEL_SCROLL 0x02000000L /* PDCurses */ +#define BUTTON_MODIFIER_SHIFT 0x04000000L /* PDCurses */ +#define BUTTON_MODIFIER_CONTROL 0x08000000L /* PDCurses */ +#define BUTTON_MODIFIER_ALT 0x10000000L /* PDCurses */ + +#define ALL_MOUSE_EVENTS 0x1fffffffL +#define REPORT_MOUSE_POSITION 0x20000000L + +/* ncurses mouse interface */ + +typedef unsigned long mmask_t; + +typedef struct +{ + short id; /* unused, always 0 */ + int x, y, z; /* x, y same as MOUSE_STATUS; z unused */ + mmask_t bstate; /* equivalent to changes + button[], but + in the same format as used for mousemask() */ +} MEVENT; + +#ifdef NCURSES_MOUSE_VERSION +# define BUTTON_SHIFT BUTTON_MODIFIER_SHIFT +# define BUTTON_CONTROL BUTTON_MODIFIER_CONTROL +# define BUTTON_CTRL BUTTON_MODIFIER_CONTROL +# define BUTTON_ALT BUTTON_MODIFIER_ALT +#else +# define BUTTON_SHIFT PDC_BUTTON_SHIFT +# define BUTTON_CONTROL PDC_BUTTON_CONTROL +# define BUTTON_ALT PDC_BUTTON_ALT +#endif + +/*---------------------------------------------------------------------- + * + * PDCurses Structure Definitions + * + */ + +typedef struct _win /* definition of a window */ +{ + int _cury; /* current pseudo-cursor */ + int _curx; + int _maxy; /* max 
window coordinates */ + int _maxx; + int _begy; /* origin on screen */ + int _begx; + int _flags; /* window properties */ + chtype _attrs; /* standard attributes and colors */ + chtype _bkgd; /* background, normally blank */ + bool _clear; /* causes clear at next refresh */ + bool _leaveit; /* leaves cursor where it is */ + bool _scroll; /* allows window scrolling */ + bool _nodelay; /* input character wait flag */ + bool _immed; /* immediate update flag */ + bool _sync; /* synchronise window ancestors */ + bool _use_keypad; /* flags keypad key mode active */ + chtype **_y; /* pointer to line pointer array */ + int *_firstch; /* first changed character in line */ + int *_lastch; /* last changed character in line */ + int _tmarg; /* top of scrolling region */ + int _bmarg; /* bottom of scrolling region */ + int _delayms; /* milliseconds of delay for getch() */ + int _parx, _pary; /* coords relative to parent (0,0) */ + struct _win *_parent; /* subwin's pointer to parent win */ +} WINDOW; + +/* Avoid using the SCREEN struct directly -- use the corresponding + functions if possible. This struct may eventually be made private. */ + +typedef struct +{ + bool alive; /* if initscr() called, and not endwin() */ + bool autocr; /* if cr -> lf */ + bool cbreak; /* if terminal unbuffered */ + bool echo; /* if terminal echo */ + bool raw_inp; /* raw input mode (v. cooked input) */ + bool raw_out; /* raw output mode (7 v. 
8 bits) */ + bool audible; /* FALSE if the bell is visual */ + bool mono; /* TRUE if current screen is mono */ + bool resized; /* TRUE if TERM has been resized */ + bool orig_attr; /* TRUE if we have the original colors */ + short orig_fore; /* original screen foreground color */ + short orig_back; /* original screen background color */ + int cursrow; /* position of physical cursor */ + int curscol; /* position of physical cursor */ + int visibility; /* visibility of cursor */ + int orig_cursor; /* original cursor size */ + int lines; /* new value for LINES */ + int cols; /* new value for COLS */ + unsigned long _trap_mbe; /* trap these mouse button events */ + unsigned long _map_mbe_to_key; /* map mouse buttons to slk */ + int mouse_wait; /* time to wait (in ms) for a + button release after a press, in + order to count it as a click */ + int slklines; /* lines in use by slk_init() */ + WINDOW *slk_winptr; /* window for slk */ + int linesrippedoff; /* lines ripped off via ripoffline() */ + int linesrippedoffontop; /* lines ripped off on + top via ripoffline() */ + int delaytenths; /* 1/10ths second to wait block + getch() for */ + bool _preserve; /* TRUE if screen background + to be preserved */ + int _restore; /* specifies if screen background + to be restored, and how */ + bool save_key_modifiers; /* TRUE if each key modifiers saved + with each key press */ + bool return_key_modifiers; /* TRUE if modifier keys are + returned as "real" keys */ + bool key_code; /* TRUE if last key is a special key; + used internally by get_wch() */ +#ifdef XCURSES + int XcurscrSize; /* size of Xcurscr shared memory block */ + bool sb_on; + int sb_viewport_y; + int sb_viewport_x; + int sb_total_y; + int sb_total_x; + int sb_cur_y; + int sb_cur_x; +#endif + short line_color; /* color of line attributes - default -1 */ +} SCREEN; + +/*---------------------------------------------------------------------- + * + * PDCurses External Variables + * + */ + +#ifdef PDC_DLL_BUILD +# ifdef
CURSES_LIBRARY +# define PDCEX __declspec(dllexport) extern +# else +# define PDCEX __declspec(dllimport) +# endif +#else +# define PDCEX extern +#endif + +PDCEX int LINES; /* terminal height */ +PDCEX int COLS; /* terminal width */ +PDCEX WINDOW *stdscr; /* the default screen window */ +PDCEX WINDOW *curscr; /* the current screen image */ +PDCEX SCREEN *SP; /* curses variables */ +PDCEX MOUSE_STATUS Mouse_status; +PDCEX int COLORS; +PDCEX int COLOR_PAIRS; +PDCEX int TABSIZE; +PDCEX chtype acs_map[]; /* alternate character set map */ +PDCEX char ttytype[]; /* terminal name/description */ + +/*man-start************************************************************** + +PDCurses Text Attributes +======================== + +Originally, PDCurses used a short (16 bits) for its chtype. To include +color, a number of things had to be sacrificed from the strict Unix and +System V support. The main problem was fitting all character attributes +and color into an unsigned char (all 8 bits!). + +Today, PDCurses by default uses a long (32 bits) for its chtype, as in +System V. The short chtype is still available, by undefining CHTYPE_LONG +and rebuilding the library. + +The following is the structure of a win->_attrs chtype: + +short form: + +------------------------------------------------- +|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| +------------------------------------------------- + color number | attrs | character eg 'a' + +The available non-color attributes are bold, reverse and blink. Others +have no effect. The high order char is an index into an array of +physical colors (defined in color.c) -- 32 foreground/background color +pairs (5 bits) plus 3 bits for other attributes. 
+ +long form: + +---------------------------------------------------------------------------- +|31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|..| 3| 2| 1| 0| +---------------------------------------------------------------------------- + color number | modifiers | character eg 'a' + +The available non-color attributes are bold, underline, invisible, +right-line, left-line, protect, reverse and blink. 256 color pairs (8 +bits), 8 bits for other attributes, and 16 bits for character data. + +**man-end****************************************************************/ + +/*** Video attribute macros ***/ + +#define A_NORMAL (chtype)0 + +#ifdef CHTYPE_LONG +# define A_ALTCHARSET (chtype)0x00010000 +# define A_RIGHTLINE (chtype)0x00020000 +# define A_LEFTLINE (chtype)0x00040000 +# define A_INVIS (chtype)0x00080000 +# define A_UNDERLINE (chtype)0x00100000 +# define A_REVERSE (chtype)0x00200000 +# define A_BLINK (chtype)0x00400000 +# define A_BOLD (chtype)0x00800000 + +# define A_ATTRIBUTES (chtype)0xffff0000 +# define A_CHARTEXT (chtype)0x0000ffff +# define A_COLOR (chtype)0xff000000 + +# define A_ITALIC A_INVIS +# define A_PROTECT (A_UNDERLINE | A_LEFTLINE | A_RIGHTLINE) + +# define PDC_ATTR_SHIFT 19 +# define PDC_COLOR_SHIFT 24 +#else +# define A_BOLD (chtype)0x0100 /* X/Open */ +# define A_REVERSE (chtype)0x0200 /* X/Open */ +# define A_BLINK (chtype)0x0400 /* X/Open */ + +# define A_ATTRIBUTES (chtype)0xff00 /* X/Open */ +# define A_CHARTEXT (chtype)0x00ff /* X/Open */ +# define A_COLOR (chtype)0xf800 /* System V */ + +# define A_ALTCHARSET A_NORMAL /* X/Open */ +# define A_PROTECT A_NORMAL /* X/Open */ +# define A_UNDERLINE A_NORMAL /* X/Open */ + +# define A_LEFTLINE A_NORMAL +# define A_RIGHTLINE A_NORMAL +# define A_ITALIC A_NORMAL +# define A_INVIS A_NORMAL + +# define PDC_ATTR_SHIFT 8 +# define PDC_COLOR_SHIFT 11 +#endif + +#define A_STANDOUT (A_REVERSE | A_BOLD) /* X/Open */ +#define A_DIM A_NORMAL + +#define CHR_MSK A_CHARTEXT /* Obsolete */ 
+#define ATR_MSK A_ATTRIBUTES /* Obsolete */ +#define ATR_NRM A_NORMAL /* Obsolete */ + +/* For use with attr_t -- X/Open says, "these shall be distinct", so + this is a non-conforming implementation. */ + +#define WA_ALTCHARSET A_ALTCHARSET +#define WA_BLINK A_BLINK +#define WA_BOLD A_BOLD +#define WA_DIM A_DIM +#define WA_INVIS A_INVIS +#define WA_LEFT A_LEFTLINE +#define WA_PROTECT A_PROTECT +#define WA_REVERSE A_REVERSE +#define WA_RIGHT A_RIGHTLINE +#define WA_STANDOUT A_STANDOUT +#define WA_UNDERLINE A_UNDERLINE + +#define WA_HORIZONTAL A_NORMAL +#define WA_LOW A_NORMAL +#define WA_TOP A_NORMAL +#define WA_VERTICAL A_NORMAL + +/*** Alternate character set macros ***/ + +/* 'w' = 32-bit chtype; acs_map[] index | A_ALTCHARSET + 'n' = 16-bit chtype; it gets the fallback set because no bit is + available for A_ALTCHARSET */ + +#ifdef CHTYPE_LONG +# define ACS_PICK(w, n) ((chtype)w | A_ALTCHARSET) +#else +# define ACS_PICK(w, n) ((chtype)n) +#endif + +/* VT100-compatible symbols -- box chars */ + +#define ACS_ULCORNER ACS_PICK('l', '+') +#define ACS_LLCORNER ACS_PICK('m', '+') +#define ACS_URCORNER ACS_PICK('k', '+') +#define ACS_LRCORNER ACS_PICK('j', '+') +#define ACS_RTEE ACS_PICK('u', '+') +#define ACS_LTEE ACS_PICK('t', '+') +#define ACS_BTEE ACS_PICK('v', '+') +#define ACS_TTEE ACS_PICK('w', '+') +#define ACS_HLINE ACS_PICK('q', '-') +#define ACS_VLINE ACS_PICK('x', '|') +#define ACS_PLUS ACS_PICK('n', '+') + +/* VT100-compatible symbols -- other */ + +#define ACS_S1 ACS_PICK('o', '-') +#define ACS_S9 ACS_PICK('s', '_') +#define ACS_DIAMOND ACS_PICK('`', '+') +#define ACS_CKBOARD ACS_PICK('a', ':') +#define ACS_DEGREE ACS_PICK('f', '\'') +#define ACS_PLMINUS ACS_PICK('g', '#') +#define ACS_BULLET ACS_PICK('~', 'o') + +/* Teletype 5410v1 symbols -- these are defined in SysV curses, but + are not well-supported by most terminals. Stick to VT100 characters + for optimum portability. 
*/ + +#define ACS_LARROW ACS_PICK(',', '<') +#define ACS_RARROW ACS_PICK('+', '>') +#define ACS_DARROW ACS_PICK('.', 'v') +#define ACS_UARROW ACS_PICK('-', '^') +#define ACS_BOARD ACS_PICK('h', '#') +#define ACS_LANTERN ACS_PICK('i', '*') +#define ACS_BLOCK ACS_PICK('0', '#') + +/* That goes double for these -- undocumented SysV symbols. Don't use + them. */ + +#define ACS_S3 ACS_PICK('p', '-') +#define ACS_S7 ACS_PICK('r', '-') +#define ACS_LEQUAL ACS_PICK('y', '<') +#define ACS_GEQUAL ACS_PICK('z', '>') +#define ACS_PI ACS_PICK('{', 'n') +#define ACS_NEQUAL ACS_PICK('|', '+') +#define ACS_STERLING ACS_PICK('}', 'L') + +/* Box char aliases */ + +#define ACS_BSSB ACS_ULCORNER +#define ACS_SSBB ACS_LLCORNER +#define ACS_BBSS ACS_URCORNER +#define ACS_SBBS ACS_LRCORNER +#define ACS_SBSS ACS_RTEE +#define ACS_SSSB ACS_LTEE +#define ACS_SSBS ACS_BTEE +#define ACS_BSSS ACS_TTEE +#define ACS_BSBS ACS_HLINE +#define ACS_SBSB ACS_VLINE +#define ACS_SSSS ACS_PLUS + +/* cchar_t aliases */ + +#ifdef PDC_WIDE +# define WACS_ULCORNER (&(acs_map['l'])) +# define WACS_LLCORNER (&(acs_map['m'])) +# define WACS_URCORNER (&(acs_map['k'])) +# define WACS_LRCORNER (&(acs_map['j'])) +# define WACS_RTEE (&(acs_map['u'])) +# define WACS_LTEE (&(acs_map['t'])) +# define WACS_BTEE (&(acs_map['v'])) +# define WACS_TTEE (&(acs_map['w'])) +# define WACS_HLINE (&(acs_map['q'])) +# define WACS_VLINE (&(acs_map['x'])) +# define WACS_PLUS (&(acs_map['n'])) + +# define WACS_S1 (&(acs_map['o'])) +# define WACS_S9 (&(acs_map['s'])) +# define WACS_DIAMOND (&(acs_map['`'])) +# define WACS_CKBOARD (&(acs_map['a'])) +# define WACS_DEGREE (&(acs_map['f'])) +# define WACS_PLMINUS (&(acs_map['g'])) +# define WACS_BULLET (&(acs_map['~'])) + +# define WACS_LARROW (&(acs_map[','])) +# define WACS_RARROW (&(acs_map['+'])) +# define WACS_DARROW (&(acs_map['.'])) +# define WACS_UARROW (&(acs_map['-'])) +# define WACS_BOARD (&(acs_map['h'])) +# define WACS_LANTERN (&(acs_map['i'])) +# define WACS_BLOCK 
(&(acs_map['0'])) + +# define WACS_S3 (&(acs_map['p'])) +# define WACS_S7 (&(acs_map['r'])) +# define WACS_LEQUAL (&(acs_map['y'])) +# define WACS_GEQUAL (&(acs_map['z'])) +# define WACS_PI (&(acs_map['{'])) +# define WACS_NEQUAL (&(acs_map['|'])) +# define WACS_STERLING (&(acs_map['}'])) + +# define WACS_BSSB WACS_ULCORNER +# define WACS_SSBB WACS_LLCORNER +# define WACS_BBSS WACS_URCORNER +# define WACS_SBBS WACS_LRCORNER +# define WACS_SBSS WACS_RTEE +# define WACS_SSSB WACS_LTEE +# define WACS_SSBS WACS_BTEE +# define WACS_BSSS WACS_TTEE +# define WACS_BSBS WACS_HLINE +# define WACS_SBSB WACS_VLINE +# define WACS_SSSS WACS_PLUS +#endif + +/*** Color macros ***/ + +#define COLOR_BLACK 0 + +#ifdef PDC_RGB /* RGB */ +# define COLOR_RED 1 +# define COLOR_GREEN 2 +# define COLOR_BLUE 4 +#else /* BGR */ +# define COLOR_BLUE 1 +# define COLOR_GREEN 2 +# define COLOR_RED 4 +#endif + +#define COLOR_CYAN (COLOR_BLUE | COLOR_GREEN) +#define COLOR_MAGENTA (COLOR_RED | COLOR_BLUE) +#define COLOR_YELLOW (COLOR_RED | COLOR_GREEN) + +#define COLOR_WHITE 7 + +/*---------------------------------------------------------------------- + * + * Function and Keypad Key Definitions. + * Many are just for compatibility. 
+ * + */ + +#define KEY_CODE_YES 0x100 /* If get_wch() gives a key code */ + +#define KEY_BREAK 0x101 /* Not on PC KBD */ +#define KEY_DOWN 0x102 /* Down arrow key */ +#define KEY_UP 0x103 /* Up arrow key */ +#define KEY_LEFT 0x104 /* Left arrow key */ +#define KEY_RIGHT 0x105 /* Right arrow key */ +#define KEY_HOME 0x106 /* home key */ +#define KEY_BACKSPACE 0x107 /* not on pc */ +#define KEY_F0 0x108 /* function keys; 64 reserved */ + +#define KEY_DL 0x148 /* delete line */ +#define KEY_IL 0x149 /* insert line */ +#define KEY_DC 0x14a /* delete character */ +#define KEY_IC 0x14b /* insert char or enter ins mode */ +#define KEY_EIC 0x14c /* exit insert char mode */ +#define KEY_CLEAR 0x14d /* clear screen */ +#define KEY_EOS 0x14e /* clear to end of screen */ +#define KEY_EOL 0x14f /* clear to end of line */ +#define KEY_SF 0x150 /* scroll 1 line forward */ +#define KEY_SR 0x151 /* scroll 1 line back (reverse) */ +#define KEY_NPAGE 0x152 /* next page */ +#define KEY_PPAGE 0x153 /* previous page */ +#define KEY_STAB 0x154 /* set tab */ +#define KEY_CTAB 0x155 /* clear tab */ +#define KEY_CATAB 0x156 /* clear all tabs */ +#define KEY_ENTER 0x157 /* enter or send (unreliable) */ +#define KEY_SRESET 0x158 /* soft/reset (partial/unreliable) */ +#define KEY_RESET 0x159 /* reset/hard reset (unreliable) */ +#define KEY_PRINT 0x15a /* print/copy */ +#define KEY_LL 0x15b /* home down/bottom (lower left) */ +#define KEY_ABORT 0x15c /* abort/terminate key (any) */ +#define KEY_SHELP 0x15d /* short help */ +#define KEY_LHELP 0x15e /* long help */ +#define KEY_BTAB 0x15f /* Back tab key */ +#define KEY_BEG 0x160 /* beg(inning) key */ +#define KEY_CANCEL 0x161 /* cancel key */ +#define KEY_CLOSE 0x162 /* close key */ +#define KEY_COMMAND 0x163 /* cmd (command) key */ +#define KEY_COPY 0x164 /* copy key */ +#define KEY_CREATE 0x165 /* create key */ +#define KEY_END 0x166 /* end key */ +#define KEY_EXIT 0x167 /* exit key */ +#define KEY_FIND 0x168 /* find key */ +#define KEY_HELP 
0x169 /* help key */ +#define KEY_MARK 0x16a /* mark key */ +#define KEY_MESSAGE 0x16b /* message key */ +#define KEY_MOVE 0x16c /* move key */ +#define KEY_NEXT 0x16d /* next object key */ +#define KEY_OPEN 0x16e /* open key */ +#define KEY_OPTIONS 0x16f /* options key */ +#define KEY_PREVIOUS 0x170 /* previous object key */ +#define KEY_REDO 0x171 /* redo key */ +#define KEY_REFERENCE 0x172 /* ref(erence) key */ +#define KEY_REFRESH 0x173 /* refresh key */ +#define KEY_REPLACE 0x174 /* replace key */ +#define KEY_RESTART 0x175 /* restart key */ +#define KEY_RESUME 0x176 /* resume key */ +#define KEY_SAVE 0x177 /* save key */ +#define KEY_SBEG 0x178 /* shifted beginning key */ +#define KEY_SCANCEL 0x179 /* shifted cancel key */ +#define KEY_SCOMMAND 0x17a /* shifted command key */ +#define KEY_SCOPY 0x17b /* shifted copy key */ +#define KEY_SCREATE 0x17c /* shifted create key */ +#define KEY_SDC 0x17d /* shifted delete char key */ +#define KEY_SDL 0x17e /* shifted delete line key */ +#define KEY_SELECT 0x17f /* select key */ +#define KEY_SEND 0x180 /* shifted end key */ +#define KEY_SEOL 0x181 /* shifted clear line key */ +#define KEY_SEXIT 0x182 /* shifted exit key */ +#define KEY_SFIND 0x183 /* shifted find key */ +#define KEY_SHOME 0x184 /* shifted home key */ +#define KEY_SIC 0x185 /* shifted input key */ + +#define KEY_SLEFT 0x187 /* shifted left arrow key */ +#define KEY_SMESSAGE 0x188 /* shifted message key */ +#define KEY_SMOVE 0x189 /* shifted move key */ +#define KEY_SNEXT 0x18a /* shifted next key */ +#define KEY_SOPTIONS 0x18b /* shifted options key */ +#define KEY_SPREVIOUS 0x18c /* shifted prev key */ +#define KEY_SPRINT 0x18d /* shifted print key */ +#define KEY_SREDO 0x18e /* shifted redo key */ +#define KEY_SREPLACE 0x18f /* shifted replace key */ +#define KEY_SRIGHT 0x190 /* shifted right arrow */ +#define KEY_SRSUME 0x191 /* shifted resume key */ +#define KEY_SSAVE 0x192 /* shifted save key */ +#define KEY_SSUSPEND 0x193 /* shifted suspend key 
*/ +#define KEY_SUNDO 0x194 /* shifted undo key */ +#define KEY_SUSPEND 0x195 /* suspend key */ +#define KEY_UNDO 0x196 /* undo key */ + +/* PDCurses-specific key definitions -- PC only */ + +#define ALT_0 0x197 +#define ALT_1 0x198 +#define ALT_2 0x199 +#define ALT_3 0x19a +#define ALT_4 0x19b +#define ALT_5 0x19c +#define ALT_6 0x19d +#define ALT_7 0x19e +#define ALT_8 0x19f +#define ALT_9 0x1a0 +#define ALT_A 0x1a1 +#define ALT_B 0x1a2 +#define ALT_C 0x1a3 +#define ALT_D 0x1a4 +#define ALT_E 0x1a5 +#define ALT_F 0x1a6 +#define ALT_G 0x1a7 +#define ALT_H 0x1a8 +#define ALT_I 0x1a9 +#define ALT_J 0x1aa +#define ALT_K 0x1ab +#define ALT_L 0x1ac +#define ALT_M 0x1ad +#define ALT_N 0x1ae +#define ALT_O 0x1af +#define ALT_P 0x1b0 +#define ALT_Q 0x1b1 +#define ALT_R 0x1b2 +#define ALT_S 0x1b3 +#define ALT_T 0x1b4 +#define ALT_U 0x1b5 +#define ALT_V 0x1b6 +#define ALT_W 0x1b7 +#define ALT_X 0x1b8 +#define ALT_Y 0x1b9 +#define ALT_Z 0x1ba + +#define CTL_LEFT 0x1bb /* Control-Left-Arrow */ +#define CTL_RIGHT 0x1bc +#define CTL_PGUP 0x1bd +#define CTL_PGDN 0x1be +#define CTL_HOME 0x1bf +#define CTL_END 0x1c0 + +#define KEY_A1 0x1c1 /* upper left on Virtual keypad */ +#define KEY_A2 0x1c2 /* upper middle on Virt. keypad */ +#define KEY_A3 0x1c3 /* upper right on Vir. keypad */ +#define KEY_B1 0x1c4 /* middle left on Virt. keypad */ +#define KEY_B2 0x1c5 /* center on Virt. keypad */ +#define KEY_B3 0x1c6 /* middle right on Vir. keypad */ +#define KEY_C1 0x1c7 /* lower left on Virt. keypad */ +#define KEY_C2 0x1c8 /* lower middle on Virt. keypad */ +#define KEY_C3 0x1c9 /* lower right on Vir. 
keypad */ + +#define PADSLASH 0x1ca /* slash on keypad */ +#define PADENTER 0x1cb /* enter on keypad */ +#define CTL_PADENTER 0x1cc /* ctl-enter on keypad */ +#define ALT_PADENTER 0x1cd /* alt-enter on keypad */ +#define PADSTOP 0x1ce /* stop on keypad */ +#define PADSTAR 0x1cf /* star on keypad */ +#define PADMINUS 0x1d0 /* minus on keypad */ +#define PADPLUS 0x1d1 /* plus on keypad */ +#define CTL_PADSTOP 0x1d2 /* ctl-stop on keypad */ +#define CTL_PADCENTER 0x1d3 /* ctl-center on keypad */ +#define CTL_PADPLUS 0x1d4 /* ctl-plus on keypad */ +#define CTL_PADMINUS 0x1d5 /* ctl-minus on keypad */ +#define CTL_PADSLASH 0x1d6 /* ctl-slash on keypad */ +#define CTL_PADSTAR 0x1d7 /* ctl-star on keypad */ +#define ALT_PADPLUS 0x1d8 /* alt-plus on keypad */ +#define ALT_PADMINUS 0x1d9 /* alt-minus on keypad */ +#define ALT_PADSLASH 0x1da /* alt-slash on keypad */ +#define ALT_PADSTAR 0x1db /* alt-star on keypad */ +#define ALT_PADSTOP 0x1dc /* alt-stop on keypad */ +#define CTL_INS 0x1dd /* ctl-insert */ +#define ALT_DEL 0x1de /* alt-delete */ +#define ALT_INS 0x1df /* alt-insert */ +#define CTL_UP 0x1e0 /* ctl-up arrow */ +#define CTL_DOWN 0x1e1 /* ctl-down arrow */ +#define CTL_TAB 0x1e2 /* ctl-tab */ +#define ALT_TAB 0x1e3 +#define ALT_MINUS 0x1e4 +#define ALT_EQUAL 0x1e5 +#define ALT_HOME 0x1e6 +#define ALT_PGUP 0x1e7 +#define ALT_PGDN 0x1e8 +#define ALT_END 0x1e9 +#define ALT_UP 0x1ea /* alt-up arrow */ +#define ALT_DOWN 0x1eb /* alt-down arrow */ +#define ALT_RIGHT 0x1ec /* alt-right arrow */ +#define ALT_LEFT 0x1ed /* alt-left arrow */ +#define ALT_ENTER 0x1ee /* alt-enter */ +#define ALT_ESC 0x1ef /* alt-escape */ +#define ALT_BQUOTE 0x1f0 /* alt-back quote */ +#define ALT_LBRACKET 0x1f1 /* alt-left bracket */ +#define ALT_RBRACKET 0x1f2 /* alt-right bracket */ +#define ALT_SEMICOLON 0x1f3 /* alt-semi-colon */ +#define ALT_FQUOTE 0x1f4 /* alt-forward quote */ +#define ALT_COMMA 0x1f5 /* alt-comma */ +#define ALT_STOP 0x1f6 /* alt-stop */ +#define ALT_FSLASH 0x1f7 
/* alt-forward slash */ +#define ALT_BKSP 0x1f8 /* alt-backspace */ +#define CTL_BKSP 0x1f9 /* ctl-backspace */ +#define PAD0 0x1fa /* keypad 0 */ + +#define CTL_PAD0 0x1fb /* ctl-keypad 0 */ +#define CTL_PAD1 0x1fc +#define CTL_PAD2 0x1fd +#define CTL_PAD3 0x1fe +#define CTL_PAD4 0x1ff +#define CTL_PAD5 0x200 +#define CTL_PAD6 0x201 +#define CTL_PAD7 0x202 +#define CTL_PAD8 0x203 +#define CTL_PAD9 0x204 + +#define ALT_PAD0 0x205 /* alt-keypad 0 */ +#define ALT_PAD1 0x206 +#define ALT_PAD2 0x207 +#define ALT_PAD3 0x208 +#define ALT_PAD4 0x209 +#define ALT_PAD5 0x20a +#define ALT_PAD6 0x20b +#define ALT_PAD7 0x20c +#define ALT_PAD8 0x20d +#define ALT_PAD9 0x20e + +#define CTL_DEL 0x20f /* ctl-delete */ +#define ALT_BSLASH 0x210 /* alt-back slash */ +#define CTL_ENTER 0x211 /* ctl-enter */ + +#define SHF_PADENTER 0x212 /* shift-enter on keypad */ +#define SHF_PADSLASH 0x213 /* shift-slash on keypad */ +#define SHF_PADSTAR 0x214 /* shift-star on keypad */ +#define SHF_PADPLUS 0x215 /* shift-plus on keypad */ +#define SHF_PADMINUS 0x216 /* shift-minus on keypad */ +#define SHF_UP 0x217 /* shift-up on keypad */ +#define SHF_DOWN 0x218 /* shift-down on keypad */ +#define SHF_IC 0x219 /* shift-insert on keypad */ +#define SHF_DC 0x21a /* shift-delete on keypad */ + +#define KEY_MOUSE 0x21b /* "mouse" key */ +#define KEY_SHIFT_L 0x21c /* Left-shift */ +#define KEY_SHIFT_R 0x21d /* Right-shift */ +#define KEY_CONTROL_L 0x21e /* Left-control */ +#define KEY_CONTROL_R 0x21f /* Right-control */ +#define KEY_ALT_L 0x220 /* Left-alt */ +#define KEY_ALT_R 0x221 /* Right-alt */ +#define KEY_RESIZE 0x222 /* Window resize */ +#define KEY_SUP 0x223 /* Shifted up arrow */ +#define KEY_SDOWN 0x224 /* Shifted down arrow */ + +#define KEY_MIN KEY_BREAK /* Minimum curses key value */ +#define KEY_MAX KEY_SDOWN /* Maximum curses key */ + +#define KEY_F(n) (KEY_F0 + (n)) + +/*---------------------------------------------------------------------- + * + * PDCurses Function Declarations + * + 
*/ + +/* Standard */ + +int addch(const chtype); +int addchnstr(const chtype *, int); +int addchstr(const chtype *); +int addnstr(const char *, int); +int addstr(const char *); +int attroff(chtype); +int attron(chtype); +int attrset(chtype); +int attr_get(attr_t *, short *, void *); +int attr_off(attr_t, void *); +int attr_on(attr_t, void *); +int attr_set(attr_t, short, void *); +int baudrate(void); +int beep(void); +int bkgd(chtype); +void bkgdset(chtype); +int border(chtype, chtype, chtype, chtype, chtype, chtype, chtype, chtype); +int box(WINDOW *, chtype, chtype); +bool can_change_color(void); +int cbreak(void); +int chgat(int, attr_t, short, const void *); +int clearok(WINDOW *, bool); +int clear(void); +int clrtobot(void); +int clrtoeol(void); +int color_content(short, short *, short *, short *); +int color_set(short, void *); +int copywin(const WINDOW *, WINDOW *, int, int, int, int, int, int, int); +int curs_set(int); +int def_prog_mode(void); +int def_shell_mode(void); +int delay_output(int); +int delch(void); +int deleteln(void); +void delscreen(SCREEN *); +int delwin(WINDOW *); +WINDOW *derwin(WINDOW *, int, int, int, int); +int doupdate(void); +WINDOW *dupwin(WINDOW *); +int echochar(const chtype); +int echo(void); +int endwin(void); +char erasechar(void); +int erase(void); +void filter(void); +int flash(void); +int flushinp(void); +chtype getbkgd(WINDOW *); +int getnstr(char *, int); +int getstr(char *); +WINDOW *getwin(FILE *); +int halfdelay(int); +bool has_colors(void); +bool has_ic(void); +bool has_il(void); +int hline(chtype, int); +void idcok(WINDOW *, bool); +int idlok(WINDOW *, bool); +void immedok(WINDOW *, bool); +int inchnstr(chtype *, int); +int inchstr(chtype *); +chtype inch(void); +int init_color(short, short, short, short); +int init_pair(short, short, short); +WINDOW *initscr(void); +int innstr(char *, int); +int insch(chtype); +int insdelln(int); +int insertln(void); +int insnstr(const char *, int); +int insstr(const char *); +int 
instr(char *); +int intrflush(WINDOW *, bool); +bool isendwin(void); +bool is_linetouched(WINDOW *, int); +bool is_wintouched(WINDOW *); +char *keyname(int); +int keypad(WINDOW *, bool); +char killchar(void); +int leaveok(WINDOW *, bool); +char *longname(void); +int meta(WINDOW *, bool); +int move(int, int); +int mvaddch(int, int, const chtype); +int mvaddchnstr(int, int, const chtype *, int); +int mvaddchstr(int, int, const chtype *); +int mvaddnstr(int, int, const char *, int); +int mvaddstr(int, int, const char *); +int mvchgat(int, int, int, attr_t, short, const void *); +int mvcur(int, int, int, int); +int mvdelch(int, int); +int mvderwin(WINDOW *, int, int); +int mvgetch(int, int); +int mvgetnstr(int, int, char *, int); +int mvgetstr(int, int, char *); +int mvhline(int, int, chtype, int); +chtype mvinch(int, int); +int mvinchnstr(int, int, chtype *, int); +int mvinchstr(int, int, chtype *); +int mvinnstr(int, int, char *, int); +int mvinsch(int, int, chtype); +int mvinsnstr(int, int, const char *, int); +int mvinsstr(int, int, const char *); +int mvinstr(int, int, char *); +int mvprintw(int, int, const char *, ...); +int mvscanw(int, int, const char *, ...); +int mvvline(int, int, chtype, int); +int mvwaddchnstr(WINDOW *, int, int, const chtype *, int); +int mvwaddchstr(WINDOW *, int, int, const chtype *); +int mvwaddch(WINDOW *, int, int, const chtype); +int mvwaddnstr(WINDOW *, int, int, const char *, int); +int mvwaddstr(WINDOW *, int, int, const char *); +int mvwchgat(WINDOW *, int, int, int, attr_t, short, const void *); +int mvwdelch(WINDOW *, int, int); +int mvwgetch(WINDOW *, int, int); +int mvwgetnstr(WINDOW *, int, int, char *, int); +int mvwgetstr(WINDOW *, int, int, char *); +int mvwhline(WINDOW *, int, int, chtype, int); +int mvwinchnstr(WINDOW *, int, int, chtype *, int); +int mvwinchstr(WINDOW *, int, int, chtype *); +chtype mvwinch(WINDOW *, int, int); +int mvwinnstr(WINDOW *, int, int, char *, int); +int mvwinsch(WINDOW *, int, int, chtype); 
+int mvwinsnstr(WINDOW *, int, int, const char *, int); +int mvwinsstr(WINDOW *, int, int, const char *); +int mvwinstr(WINDOW *, int, int, char *); +int mvwin(WINDOW *, int, int); +int mvwprintw(WINDOW *, int, int, const char *, ...); +int mvwscanw(WINDOW *, int, int, const char *, ...); +int mvwvline(WINDOW *, int, int, chtype, int); +int napms(int); +WINDOW *newpad(int, int); +SCREEN *newterm(const char *, FILE *, FILE *); +WINDOW *newwin(int, int, int, int); +int nl(void); +int nocbreak(void); +int nodelay(WINDOW *, bool); +int noecho(void); +int nonl(void); +void noqiflush(void); +int noraw(void); +int notimeout(WINDOW *, bool); +int overlay(const WINDOW *, WINDOW *); +int overwrite(const WINDOW *, WINDOW *); +int pair_content(short, short *, short *); +int pechochar(WINDOW *, chtype); +int pnoutrefresh(WINDOW *, int, int, int, int, int, int); +int prefresh(WINDOW *, int, int, int, int, int, int); +int printw(const char *, ...); +int putwin(WINDOW *, FILE *); +void qiflush(void); +int raw(void); +int redrawwin(WINDOW *); +int refresh(void); +int reset_prog_mode(void); +int reset_shell_mode(void); +int resetty(void); +int ripoffline(int, int (*)(WINDOW *, int)); +int savetty(void); +int scanw(const char *, ...); +int scr_dump(const char *); +int scr_init(const char *); +int scr_restore(const char *); +int scr_set(const char *); +int scrl(int); +int scroll(WINDOW *); +int scrollok(WINDOW *, bool); +SCREEN *set_term(SCREEN *); +int setscrreg(int, int); +int slk_attroff(const chtype); +int slk_attr_off(const attr_t, void *); +int slk_attron(const chtype); +int slk_attr_on(const attr_t, void *); +int slk_attrset(const chtype); +int slk_attr_set(const attr_t, short, void *); +int slk_clear(void); +int slk_color(short); +int slk_init(int); +char *slk_label(int); +int slk_noutrefresh(void); +int slk_refresh(void); +int slk_restore(void); +int slk_set(int, const char *, int); +int slk_touch(void); +int standend(void); +int standout(void); +int start_color(void); 
+WINDOW *subpad(WINDOW *, int, int, int, int); +WINDOW *subwin(WINDOW *, int, int, int, int); +int syncok(WINDOW *, bool); +chtype termattrs(void); +attr_t term_attrs(void); +char *termname(void); +void timeout(int); +int touchline(WINDOW *, int, int); +int touchwin(WINDOW *); +int typeahead(int); +int untouchwin(WINDOW *); +void use_env(bool); +int vidattr(chtype); +int vid_attr(attr_t, short, void *); +int vidputs(chtype, int (*)(int)); +int vid_puts(attr_t, short, void *, int (*)(int)); +int vline(chtype, int); +int vw_printw(WINDOW *, const char *, va_list); +int vwprintw(WINDOW *, const char *, va_list); +int vw_scanw(WINDOW *, const char *, va_list); +int vwscanw(WINDOW *, const char *, va_list); +int waddchnstr(WINDOW *, const chtype *, int); +int waddchstr(WINDOW *, const chtype *); +int waddch(WINDOW *, const chtype); +int waddnstr(WINDOW *, const char *, int); +int waddstr(WINDOW *, const char *); +int wattroff(WINDOW *, chtype); +int wattron(WINDOW *, chtype); +int wattrset(WINDOW *, chtype); +int wattr_get(WINDOW *, attr_t *, short *, void *); +int wattr_off(WINDOW *, attr_t, void *); +int wattr_on(WINDOW *, attr_t, void *); +int wattr_set(WINDOW *, attr_t, short, void *); +void wbkgdset(WINDOW *, chtype); +int wbkgd(WINDOW *, chtype); +int wborder(WINDOW *, chtype, chtype, chtype, chtype, + chtype, chtype, chtype, chtype); +int wchgat(WINDOW *, int, attr_t, short, const void *); +int wclear(WINDOW *); +int wclrtobot(WINDOW *); +int wclrtoeol(WINDOW *); +int wcolor_set(WINDOW *, short, void *); +void wcursyncup(WINDOW *); +int wdelch(WINDOW *); +int wdeleteln(WINDOW *); +int wechochar(WINDOW *, const chtype); +int werase(WINDOW *); +int wgetch(WINDOW *); +int wgetnstr(WINDOW *, char *, int); +int wgetstr(WINDOW *, char *); +int whline(WINDOW *, chtype, int); +int winchnstr(WINDOW *, chtype *, int); +int winchstr(WINDOW *, chtype *); +chtype winch(WINDOW *); +int winnstr(WINDOW *, char *, int); +int winsch(WINDOW *, chtype); +int winsdelln(WINDOW *, 
int); +int winsertln(WINDOW *); +int winsnstr(WINDOW *, const char *, int); +int winsstr(WINDOW *, const char *); +int winstr(WINDOW *, char *); +int wmove(WINDOW *, int, int); +int wnoutrefresh(WINDOW *); +int wprintw(WINDOW *, const char *, ...); +int wredrawln(WINDOW *, int, int); +int wrefresh(WINDOW *); +int wscanw(WINDOW *, const char *, ...); +int wscrl(WINDOW *, int); +int wsetscrreg(WINDOW *, int, int); +int wstandend(WINDOW *); +int wstandout(WINDOW *); +void wsyncdown(WINDOW *); +void wsyncup(WINDOW *); +void wtimeout(WINDOW *, int); +int wtouchln(WINDOW *, int, int, int); +int wvline(WINDOW *, chtype, int); + +/* Wide-character functions */ + +#ifdef PDC_WIDE +int addnwstr(const wchar_t *, int); +int addwstr(const wchar_t *); +int add_wch(const cchar_t *); +int add_wchnstr(const cchar_t *, int); +int add_wchstr(const cchar_t *); +int border_set(const cchar_t *, const cchar_t *, const cchar_t *, + const cchar_t *, const cchar_t *, const cchar_t *, + const cchar_t *, const cchar_t *); +int box_set(WINDOW *, const cchar_t *, const cchar_t *); +int echo_wchar(const cchar_t *); +int erasewchar(wchar_t *); +int getbkgrnd(cchar_t *); +int getcchar(const cchar_t *, wchar_t *, attr_t *, short *, void *); +int getn_wstr(wint_t *, int); +int get_wch(wint_t *); +int get_wstr(wint_t *); +int hline_set(const cchar_t *, int); +int innwstr(wchar_t *, int); +int ins_nwstr(const wchar_t *, int); +int ins_wch(const cchar_t *); +int ins_wstr(const wchar_t *); +int inwstr(wchar_t *); +int in_wch(cchar_t *); +int in_wchnstr(cchar_t *, int); +int in_wchstr(cchar_t *); +char *key_name(wchar_t); +int killwchar(wchar_t *); +int mvaddnwstr(int, int, const wchar_t *, int); +int mvaddwstr(int, int, const wchar_t *); +int mvadd_wch(int, int, const cchar_t *); +int mvadd_wchnstr(int, int, const cchar_t *, int); +int mvadd_wchstr(int, int, const cchar_t *); +int mvgetn_wstr(int, int, wint_t *, int); +int mvget_wch(int, int, wint_t *); +int mvget_wstr(int, int, wint_t *); +int 
mvhline_set(int, int, const cchar_t *, int); +int mvinnwstr(int, int, wchar_t *, int); +int mvins_nwstr(int, int, const wchar_t *, int); +int mvins_wch(int, int, const cchar_t *); +int mvins_wstr(int, int, const wchar_t *); +int mvinwstr(int, int, wchar_t *); +int mvin_wch(int, int, cchar_t *); +int mvin_wchnstr(int, int, cchar_t *, int); +int mvin_wchstr(int, int, cchar_t *); +int mvvline_set(int, int, const cchar_t *, int); +int mvwaddnwstr(WINDOW *, int, int, const wchar_t *, int); +int mvwaddwstr(WINDOW *, int, int, const wchar_t *); +int mvwadd_wch(WINDOW *, int, int, const cchar_t *); +int mvwadd_wchnstr(WINDOW *, int, int, const cchar_t *, int); +int mvwadd_wchstr(WINDOW *, int, int, const cchar_t *); +int mvwgetn_wstr(WINDOW *, int, int, wint_t *, int); +int mvwget_wch(WINDOW *, int, int, wint_t *); +int mvwget_wstr(WINDOW *, int, int, wint_t *); +int mvwhline_set(WINDOW *, int, int, const cchar_t *, int); +int mvwinnwstr(WINDOW *, int, int, wchar_t *, int); +int mvwins_nwstr(WINDOW *, int, int, const wchar_t *, int); +int mvwins_wch(WINDOW *, int, int, const cchar_t *); +int mvwins_wstr(WINDOW *, int, int, const wchar_t *); +int mvwin_wch(WINDOW *, int, int, cchar_t *); +int mvwin_wchnstr(WINDOW *, int, int, cchar_t *, int); +int mvwin_wchstr(WINDOW *, int, int, cchar_t *); +int mvwinwstr(WINDOW *, int, int, wchar_t *); +int mvwvline_set(WINDOW *, int, int, const cchar_t *, int); +int pecho_wchar(WINDOW *, const cchar_t*); +int setcchar(cchar_t*, const wchar_t*, const attr_t, short, const void*); +int slk_wset(int, const wchar_t *, int); +int unget_wch(const wchar_t); +int vline_set(const cchar_t *, int); +int waddnwstr(WINDOW *, const wchar_t *, int); +int waddwstr(WINDOW *, const wchar_t *); +int wadd_wch(WINDOW *, const cchar_t *); +int wadd_wchnstr(WINDOW *, const cchar_t *, int); +int wadd_wchstr(WINDOW *, const cchar_t *); +int wbkgrnd(WINDOW *, const cchar_t *); +void wbkgrndset(WINDOW *, const cchar_t *); +int wborder_set(WINDOW *, const cchar_t *, 
const cchar_t *, + const cchar_t *, const cchar_t *, const cchar_t *, + const cchar_t *, const cchar_t *, const cchar_t *); +int wecho_wchar(WINDOW *, const cchar_t *); +int wgetbkgrnd(WINDOW *, cchar_t *); +int wgetn_wstr(WINDOW *, wint_t *, int); +int wget_wch(WINDOW *, wint_t *); +int wget_wstr(WINDOW *, wint_t *); +int whline_set(WINDOW *, const cchar_t *, int); +int winnwstr(WINDOW *, wchar_t *, int); +int wins_nwstr(WINDOW *, const wchar_t *, int); +int wins_wch(WINDOW *, const cchar_t *); +int wins_wstr(WINDOW *, const wchar_t *); +int winwstr(WINDOW *, wchar_t *); +int win_wch(WINDOW *, cchar_t *); +int win_wchnstr(WINDOW *, cchar_t *, int); +int win_wchstr(WINDOW *, cchar_t *); +wchar_t *wunctrl(cchar_t *); +int wvline_set(WINDOW *, const cchar_t *, int); +#endif + +/* Quasi-standard */ + +chtype getattrs(WINDOW *); +int getbegx(WINDOW *); +int getbegy(WINDOW *); +int getmaxx(WINDOW *); +int getmaxy(WINDOW *); +int getparx(WINDOW *); +int getpary(WINDOW *); +int getcurx(WINDOW *); +int getcury(WINDOW *); +void traceoff(void); +void traceon(void); +char *unctrl(chtype); + +int crmode(void); +int nocrmode(void); +int draino(int); +int resetterm(void); +int fixterm(void); +int saveterm(void); +int setsyx(int, int); + +int mouse_set(unsigned long); +int mouse_on(unsigned long); +int mouse_off(unsigned long); +int request_mouse_pos(void); +int map_button(unsigned long); +void wmouse_position(WINDOW *, int *, int *); +unsigned long getmouse(void); +unsigned long getbmap(void); + +/* ncurses */ + +int assume_default_colors(int, int); +const char *curses_version(void); +bool has_key(int); +int use_default_colors(void); +int wresize(WINDOW *, int, int); + +int mouseinterval(int); +mmask_t mousemask(mmask_t, mmask_t *); +bool mouse_trafo(int *, int *, bool); +int nc_getmouse(MEVENT *); +int ungetmouse(MEVENT *); +bool wenclose(const WINDOW *, int, int); +bool wmouse_trafo(const WINDOW *, int *, int *, bool); + +/* PDCurses */ + +int addrawch(chtype); +int 
insrawch(chtype); +bool is_termresized(void); +int mvaddrawch(int, int, chtype); +int mvdeleteln(int, int); +int mvinsertln(int, int); +int mvinsrawch(int, int, chtype); +int mvwaddrawch(WINDOW *, int, int, chtype); +int mvwdeleteln(WINDOW *, int, int); +int mvwinsertln(WINDOW *, int, int); +int mvwinsrawch(WINDOW *, int, int, chtype); +int raw_output(bool); +int resize_term(int, int); +WINDOW *resize_window(WINDOW *, int, int); +int waddrawch(WINDOW *, chtype); +int winsrawch(WINDOW *, chtype); +char wordchar(void); + +#ifdef PDC_WIDE +wchar_t *slk_wlabel(int); +#endif + +void PDC_debug(const char *, ...); +int PDC_ungetch(int); +int PDC_set_blink(bool); +int PDC_set_line_color(short); +void PDC_set_title(const char *); + +int PDC_clearclipboard(void); +int PDC_freeclipboard(char *); +int PDC_getclipboard(char **, long *); +int PDC_setclipboard(const char *, long); + +unsigned long PDC_get_input_fd(void); +unsigned long PDC_get_key_modifiers(void); +int PDC_return_key_modifiers(bool); +int PDC_save_key_modifiers(bool); + +#ifdef XCURSES +WINDOW *Xinitscr(int, char **); +void XCursesExit(void); +int sb_init(void); +int sb_set_horz(int, int, int); +int sb_set_vert(int, int, int); +int sb_get_horz(int *, int *, int *); +int sb_get_vert(int *, int *, int *); +int sb_refresh(void); +#endif + +/*** Functions defined as macros ***/ + +/* getch() and ungetch() conflict with some DOS libraries */ + +#define getch() wgetch(stdscr) +#define ungetch(ch) PDC_ungetch(ch) + +#define COLOR_PAIR(n) (((chtype)(n) << PDC_COLOR_SHIFT) & A_COLOR) +#define PAIR_NUMBER(n) (((n) & A_COLOR) >> PDC_COLOR_SHIFT) + +/* These will _only_ work as macros */ + +#define getbegyx(w, y, x) (y = getbegy(w), x = getbegx(w)) +#define getmaxyx(w, y, x) (y = getmaxy(w), x = getmaxx(w)) +#define getparyx(w, y, x) (y = getpary(w), x = getparx(w)) +#define getyx(w, y, x) (y = getcury(w), x = getcurx(w)) + +#define getsyx(y, x) { if (curscr->_leaveit) (y)=(x)=-1; \ + else getyx(curscr,(y),(x)); } + +#ifdef 
NCURSES_MOUSE_VERSION +# define getmouse(x) nc_getmouse(x) +#endif + +/* return codes from PDC_getclipboard() and PDC_setclipboard() calls */ + +#define PDC_CLIP_SUCCESS 0 +#define PDC_CLIP_ACCESS_ERROR 1 +#define PDC_CLIP_EMPTY 2 +#define PDC_CLIP_MEMORY_ERROR 3 + +/* PDCurses key modifier masks */ + +#define PDC_KEY_MODIFIER_SHIFT 1 +#define PDC_KEY_MODIFIER_CONTROL 2 +#define PDC_KEY_MODIFIER_ALT 4 +#define PDC_KEY_MODIFIER_NUMLOCK 8 + +#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) +# undef bool +} +#endif + +#endif /* __PDCURSES__ */ diff --git a/win32/zconf.h b/win32/zconf.h new file mode 100644 index 0000000..03a9431 --- /dev/null +++ b/win32/zconf.h @@ -0,0 +1,332 @@ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-2005 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#ifndef ZCONF_H +#define ZCONF_H + +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. 
+ */ +#ifdef Z_PREFIX +# define deflateInit_ z_deflateInit_ +# define deflate z_deflate +# define deflateEnd z_deflateEnd +# define inflateInit_ z_inflateInit_ +# define inflate z_inflate +# define inflateEnd z_inflateEnd +# define deflateInit2_ z_deflateInit2_ +# define deflateSetDictionary z_deflateSetDictionary +# define deflateCopy z_deflateCopy +# define deflateReset z_deflateReset +# define deflateParams z_deflateParams +# define deflateBound z_deflateBound +# define deflatePrime z_deflatePrime +# define inflateInit2_ z_inflateInit2_ +# define inflateSetDictionary z_inflateSetDictionary +# define inflateSync z_inflateSync +# define inflateSyncPoint z_inflateSyncPoint +# define inflateCopy z_inflateCopy +# define inflateReset z_inflateReset +# define inflateBack z_inflateBack +# define inflateBackEnd z_inflateBackEnd +# define compress z_compress +# define compress2 z_compress2 +# define compressBound z_compressBound +# define uncompress z_uncompress +# define adler32 z_adler32 +# define crc32 z_crc32 +# define get_crc_table z_get_crc_table +# define zError z_zError + +# define alloc_func z_alloc_func +# define free_func z_free_func +# define in_func z_in_func +# define out_func z_out_func +# define Byte z_Byte +# define uInt z_uInt +# define uLong z_uLong +# define Bytef z_Bytef +# define charf z_charf +# define intf z_intf +# define uIntf z_uIntf +# define uLongf z_uLongf +# define voidpf z_voidpf +# define voidp z_voidp +#endif + +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif +#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) +# define OS2 +#endif +#if defined(_WINDOWS) && !defined(WINDOWS) +# define WINDOWS +#endif +#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) +# ifndef WIN32 +# define WIN32 +# endif +#endif +#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) +# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) +# ifndef SYS16BIT +# define SYS16BIT +# endif +# endif 
+#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). + */ +#ifdef SYS16BIT +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#ifdef __STDC_VERSION__ +# ifndef STDC +# define STDC +# endif +# if __STDC_VERSION__ >= 199901L +# ifndef STDC99 +# define STDC99 +# endif +# endif +#endif +#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) +# define STDC +#endif +#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) +# define STDC +#endif +#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) +# define STDC +#endif +#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) +# define STDC +#endif + +#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const /* note: need a more gentle solution here */ +# endif +#endif + +/* Some Mac compilers merge all .h files incorrectly: */ +#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__) +# define NO_DUMMY_DECL +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. 
For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus a few kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. + */ +#ifdef SYS16BIT +# if defined(M_I86SM) || defined(M_I86MM) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR _far +# else +# define FAR far +# endif +# endif +# if (defined(__SMALL__) || defined(__MEDIUM__)) + /* Turbo C small or medium model */ +# define SMALL_MEDIUM +# ifdef __BORLANDC__ +# define FAR _far +# else +# define FAR far +# endif +# endif +#endif + +#if defined(WINDOWS) || defined(WIN32) + /* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. + */ +# ifdef ZLIB_DLL +# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) +# ifdef ZLIB_INTERNAL +# define ZEXTERN extern __declspec(dllexport) +# else +# define ZEXTERN extern __declspec(dllimport) +# endif +# endif +# endif /* ZLIB_DLL */ + /* If building or using zlib with the WINAPI/WINAPIV calling convention, + * define ZLIB_WINAPI. + * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. 
+ */ +# ifdef ZLIB_WINAPI +# ifdef FAR +# undef FAR +# endif +# include <windows.h> + /* No need for _export, use ZLIB.DEF instead. */ + /* For complete Windows compatibility, use WINAPI, not __stdcall. */ +# define ZEXPORT WINAPI +# ifdef WIN32 +# define ZEXPORTVA WINAPIV +# else +# define ZEXPORTVA FAR CDECL +# endif +# endif +#endif + +#if defined (__BEOS__) +# ifdef ZLIB_DLL +# ifdef ZLIB_INTERNAL +# define ZEXPORT __declspec(dllexport) +# define ZEXPORTVA __declspec(dllexport) +# else +# define ZEXPORT __declspec(dllimport) +# define ZEXPORTVA __declspec(dllimport) +# endif +# endif +#endif + +#ifndef ZEXTERN +# define ZEXTERN extern +#endif +#ifndef ZEXPORT +# define ZEXPORT +#endif +#ifndef ZEXPORTVA +# define ZEXPORTVA +#endif + +#ifndef FAR +# define FAR +#endif + +#if !defined(__MACTYPES__) +typedef unsigned char Byte; /* 8 bits */ +#endif +typedef unsigned int uInt; /* 16 bits or more */ +typedef unsigned long uLong; /* 32 bits or more */ + +#ifdef SMALL_MEDIUM + /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void const *voidpc; + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte const *voidpc; + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + +#if 0 /* HAVE_UNISTD_H -- this line is updated by ./configure */ +# include <sys/types.h> /* for off_t */ +# include <unistd.h> /* for SEEK_* and off_t */ +# ifdef VMS +# include <unixio.h> /* for off_t */ +# endif +# define z_off_t off_t +#endif +#ifndef SEEK_SET +# define SEEK_SET 0 /* Seek from beginning of file. */ +# define SEEK_CUR 1 /* Seek from current position.
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif +#ifndef z_off_t +# define z_off_t long +#endif + +#if defined(__OS400__) +# define NO_vsnprintf +#endif + +#if defined(__MVS__) +# define NO_vsnprintf +# ifdef FAR +# undef FAR +# endif +#endif + +/* MVS linker does not support external names larger than 8 bytes */ +#if defined(__MVS__) +# pragma map(deflateInit_,"DEIN") +# pragma map(deflateInit2_,"DEIN2") +# pragma map(deflateEnd,"DEEND") +# pragma map(deflateBound,"DEBND") +# pragma map(inflateInit_,"ININ") +# pragma map(inflateInit2_,"ININ2") +# pragma map(inflateEnd,"INEND") +# pragma map(inflateSync,"INSY") +# pragma map(inflateSetDictionary,"INSEDI") +# pragma map(compressBound,"CMBND") +# pragma map(inflate_table,"INTABL") +# pragma map(inflate_fast,"INFA") +# pragma map(inflate_copyright,"INCOPY") +#endif + +#endif /* ZCONF_H */ diff --git a/win32/zlib.h b/win32/zlib.h new file mode 100644 index 0000000..0228179 --- /dev/null +++ b/win32/zlib.h @@ -0,0 +1,1357 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.3, July 18th, 2005 + + Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. 
This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +#ifndef ZLIB_H +#define ZLIB_H + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.2.3" +#define ZLIB_VERNUM 0x1230 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed + data. This version of the library supports only one compression method + (deflation) but other algorithms will be added later and will have the same + stream interface. + + Compression can be done in a single step if the buffers are large + enough (for example if an input file is mmap'ed), or can be done by + repeated calls of the compression function. In the latter case, the + application must provide more input and/or consume the output + (providing more output space) before each call. + + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip streams in memory as well. + + The zlib format was designed to be compact and fast for use in memory + and on communications channels. 
The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never + crash even in case of corrupted input. +*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total nb of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total nb of bytes output so far */ + + char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: binary or text */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. 
+*/ +typedef struct gz_header_s { + int text; /* true if compressed data believed to be text */ + uLong time; /* modification time */ + int xflags; /* extra flags (not used when writing a gzip file) */ + int os; /* operating system */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file) */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has + dropped to zero. It must update next_out and avail_out when avail_out + has dropped to zero. The application must initialize zalloc, zfree and + opaque before calling the init function. All other fields are set by the + compression library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this + if the symbol MAXSEG_64K is defined (see zconf.h). 
WARNING: On MSDOS, + pointers returned by zalloc for objects of exactly 65536 bytes *must* + have their offset normalized to zero. The default allocation function + provided by this library ensures this (see zutil.c). To reduce memory + requirements and avoid any allocation of 64K objects, at the expense of + compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or + progress reports. After compression, total_in holds the total size of + the uncompressed data and may be saved for use in the decompressor + (particularly if the decompressor wants to decompress everything in + a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative + * values are errors, positive values are used for special but normal events. 
+ */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field (though see inflate()) */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is + not compatible with the zlib.h header file used by the application. + This check is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. + If zalloc and zfree are set to Z_NULL, deflateInit updates them to + use default allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at + all (the input data is simply copied a block at a time). + Z_DEFAULT_COMPRESSION requests a default compromise between speed and + compression (currently equivalent to level 6). 
+ + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if level is not a valid compression level, + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). + msg is set to null if there is no error message. deflateInit does not + perform any compression: this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce some + output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). + Some output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating avail_in or avail_out accordingly; avail_out + should never be zero before the call. The application can consume the + compressed output when it wants, for example when the output buffer is full + (avail_out == 0), or after each call of deflate(). 
If deflate returns Z_OK + and with zero avail_out, it must be called again after making room in the + output buffer because there might be more output pending. + + Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to + decide how much data to accumulate before producing output, in order to + maximize compression. + + If the parameter flush is set to Z_SYNC_FLUSH, all pending output is + flushed to the output buffer and the output is aligned on a byte boundary, so + that the decompressor can get all input data available so far. (In particular + avail_in is zero after the call if enough output space has been provided + before the call.) Flushing may degrade compression for some compression + algorithms and so it should be used only when necessary. + + If flush is set to Z_FULL_FLUSH, all output is flushed as with + Z_SYNC_FLUSH, and the compression state is reset so that decompression can + restart from this point if previous compressed data has been damaged or if + random access is desired. Using Z_FULL_FLUSH too often can seriously degrade + compression. + + If deflate returns with avail_out == 0, this function must be called again + with the same value of the flush parameter and more output space (updated + avail_out), until the flush is complete (deflate returns with non-zero + avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that + avail_out is greater than six to avoid repeated flush markers due to + avail_out == 0 on return. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there + was enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the + stream are deflateReset or deflateEnd.
+ + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least + the value returned by deflateBound (see below). If deflate does not return + Z_STREAM_END, then it must be called again as described above. + + deflate() sets strm->adler to the adler32 checksum of all input read + so far (that is, total_in bytes). + + deflate() may update strm->data_type if it can make a good guess about + the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered + binary. This field is only for information purposes and does not affect + the compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible + (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not + fatal, and deflate() can be called again with more input and more output + space to continue compressing. +*/ + + +ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, + msg may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. 
If next_in is not Z_NULL and avail_in is large enough (the exact + value depends on the compression method), inflateInit determines the + compression method from the zlib header and allocates all data structures + accordingly; otherwise the allocation will be deferred to the first call of + inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to + use default allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller. msg is set to null if there is no error + message. inflateInit does not perform any decompression apart from reading + the zlib header if present: this will be done by inflate(). (So next_in and + avail_in may be modified, but next_out and avail_out are unchanged.) +*/ + + +ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing + will resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there + is no more input data or no more space in the output buffer (see below + about the flush parameter). 
+ + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating the next_* and avail_* values accordingly. + The application can consume the uncompressed output when it wants, for + example when the output buffer is full (avail_out == 0), or after each + call of inflate(). If inflate returns Z_OK and with zero avail_out, it + must be called again after making room in the output buffer because there + might be more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, + Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() stop + if and when it gets to the next deflate block boundary. When decoding the + zlib or gzip format, this will cause inflate() to return immediately after + the header and before the first block. When doing a raw inflate, inflate() + will go ahead and process the first block, and will return when it gets to + the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 + if inflate() is currently decoding the last block in the deflate stream, + plus 128 if inflate() returned immediately after decoding an end-of-block + code or decoding the complete header up to just before the first byte of the + deflate stream. The end-of-block will not be indicated until all of the + uncompressed data from that block has been written to strm->next_out. The + number of unused bits may in general be greater than seven, except when + bit 7 of data_type is set, in which case the number of unused bits will be + less than eight. 
+ + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step + (a single call of inflate), the parameter flush should be set to + Z_FINISH. In this case all pending input is processed and all pending + output is flushed; avail_out must be large enough to hold all the + uncompressed data. (The size of the uncompressed data may have been saved + by the compressor for this purpose.) The next operation on this stream must + be inflateEnd to deallocate the decompression state. The use of Z_FINISH + is never required, but can be used to inform inflate that a faster approach + may be used for the single inflate() call. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the only effect of the flush parameter in this implementation + is on the return value of inflate(), as noted below, or when it returns early + because Z_BLOCK is used. + + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the adler32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the adler32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() will decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically. 
Any information + contained in the gzip header is not retained, so applications that need that + information should instead use raw inflate, see inflateInit2() below, or + inflateBack() and perform their own processing of the gzip header and + trailer. + + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may then + call inflateSync() to look for a good compression block if a partial recovery + of the data is desired. +*/ + + +ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. 
The + fields next_in, zalloc, zfree and opaque must be initialized before by + the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute an adler32 check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), + no header crc, and the operating system will be set to 255 (unknown). If a + gzip stream is being written, strm->adler is a crc32 instead of an adler32. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but + is slow and reduces compression ratio; memLevel=9 uses maximum memory + for optimal speed. The default value is 8. See zconf.h for total memory + usage as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match), or Z_RLE to limit match distances to one (run-length + encoding). Filtered data consists mostly of small values with a somewhat + random distribution. 
In this case, the compression algorithm is tuned to + compress them better. The effect of Z_FILTERED is to force more Huffman + coding and less string matching; it is somewhat intermediate between + Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as + Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy + parameter only affects the compression ratio but not the correctness of the + compressed output even if it is not set appropriately. Z_FIXED prevents the + use of dynamic Huffman codes, allowing for a simpler decoder for special + applications. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid + method). msg is set to null if there is no error message. deflateInit2 does + not perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the compression dictionary from the given byte sequence + without producing any compressed output. This function must be called + immediately after deflateInit, deflateInit2 or deflateReset, before any + call of deflate. The compressor and decompressor must use exactly the same + dictionary (see inflateSetDictionary). + + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. Using a + dictionary is most useful when the data to be compressed is short and can be + predicted with good accuracy; the data can then be compressed better than + with the default empty dictionary.
+ + Depending on the size of the compression data structures selected by + deflateInit or deflateInit2, a part of the dictionary may in effect be + discarded, for example if the dictionary is larger than the window size in + deflateInit or deflateInit2. Thus the strings most likely to be useful should be + put at the end of the dictionary, not at the front. In addition, the + current implementation of deflate will use at most the window size minus + 262 bytes of the provided dictionary. + + Upon return of this function, strm->adler is set to the adler32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The adler32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) If a raw deflate was requested, then the + adler32 value is not computed and strm->adler is not set. + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent (for example if deflate has already been called for this stream + or if the compression method is bsort). deflateSetDictionary does not + perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and + can consume lots of memory.
+ + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. + The stream will keep the same compression level and any other attributes + that may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, + int level, + int strategy)); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2. This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different + strategy. If the compression level is changed, the input available so far + is compressed with the old level (and may be flushed); the new level will + take effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to + be compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR + if strm->avail_out was zero. +*/ + +ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain)); +/* + Fine tune deflate's internal compression parameters. 
This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. + */ + +ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, + uLong sourceLen)); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() + or deflateInit2(). This would be used to allocate an output buffer + for deflation in a single pass, and so would be called before deflate(). +*/ + +ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the + bits leftover from a previous deflate stream when appending to it. As such, + this function can only be used for raw deflate, and must be used before the + first deflate() call after a deflateInit2() or deflateReset(). bits must be + less than or equal to 16, and that many of the least significant bits of + value will be inserted in the output. + + deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, + gz_headerp head)); +/* + deflateSetHeader() provides gzip header information for when a gzip + stream is requested by deflateInit2(). deflateSetHeader() may be called + after deflateInit2() or deflateReset() and before the first call of + deflate(). 
The text, time, os, extra field, name, and comment information + in the provided gz_header structure are written to the gzip header (xflag is + ignored -- the extra flags are set according to the compression level). The + caller must assure that, if not Z_NULL, name and comment are terminated with + a zero byte, and that if extra is not Z_NULL, that extra_len bytes are + available there. If hcrc is true, a gzip header crc is included. Note that + the current versions of the command-line version of gzip (up through version + 1.3.x) do not support header crc's, and will report that it is a "multi-part + gzip file" and give up. + + If deflateSetHeader is not used, the default gzip header has text false, + the time set to zero, and os set to 255, with no extra, name, or comment + fields. The gzip header is returned to the default state by deflateReset(). + + deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with an extra parameter. The + fields next_in, avail_in, zalloc, zfree and opaque must be initialized + before by the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library. The default value is 15 if inflateInit is used + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. 
inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an adler32 or a crc32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() applies to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is + a crc32 instead of an adler32. + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg + is set to null if there is no error message. inflateInit2 does not perform + any decompression apart from reading the zlib header if present: this will + be done by inflate(). (So next_in and avail_in may be modified, but next_out + and avail_out are unchanged.) +*/ + +ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the decompression dictionary from the given uncompressed byte + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the adler32 value returned by that call of inflate. 
+ The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called + immediately after inflateInit2() or inflateReset() and before any call of + inflate() to set the dictionary. The application must ensure that the + dictionary that was used for compression is provided. + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect adler32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate(). +*/ + +ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); +/* + Skips invalid compressed data until a full flush point (see above the + description of deflate with Z_FULL_FLUSH) can be found, or until all + available input is skipped. No output is provided. + + inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR + if no more input was provided, Z_DATA_ERROR if no flush point has been found, + or Z_STREAM_ERROR if the stream structure was inconsistent. In the success + case, the application may save the current value of total_in which + indicates where valid compressed data was found. In the error case, the + application may repeatedly call inflateSync, providing more input each time, + until success or end of the input data. +*/ + +ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when randomly accessing a large stream. The + first pass through the stream can periodically record the inflate state, + allowing restarting inflate at those points when randomly accessing the + stream.
+ + inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate all the internal decompression state. + The stream will keep attributes that may have been set by inflateInit2. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, + gz_headerp head)); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. 
Note that Z_BLOCK can be used to + force inflate() to return immediately after header processing is complete + and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When + any of extra, name, or comment are not Z_NULL and the respective field is + not present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. + + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, + unsigned char FAR *window)); + + Initialize the internal stream state for decompression using inflateBack() + calls. 
The fields zalloc, zfree and opaque in strm must be initialized + before the call. If zalloc and zfree are Z_NULL, then the default library- + derived memory allocation routines are used. windowBits is the base two + logarithm of the window size, in the range 8..15. window is a caller + supplied buffer of that size. Except for special applications where it is + assured that deflate was used with small window sizes, windowBits must be 15 + and a 32K byte window must be supplied to be able to decompress general + deflate streams. + + See inflateBack() for the usage of these routines. + + inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of + the parameters are invalid, Z_MEM_ERROR if the internal state could not + be allocated, or Z_VERSION_ERROR if the version of the library does not + match the version of the header file. +*/ + +typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *)); +typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); + +ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, + in_func in, void FAR *in_desc, + out_func out, void FAR *out_desc)); +/* + inflateBack() does a raw inflate with a single call using a call-back + interface for input and output. This is more efficient than inflate() for + file i/o applications in that it avoids copying between the output and the + sliding window by simply making the window itself the output buffer. This + function trusts the application to not change the output buffer passed by + the output function, at least until inflateBack() returns. + + inflateBackInit() must be called first to allocate the internal state + and to initialize the state with the user-provided window buffer. + inflateBack() may then be used multiple times to inflate a complete, raw + deflate stream with each call. inflateBackEnd() is then called to free + the allocated state. + + A raw deflate stream is one with no zlib or gzip header or trailer.
+ This routine would normally be used in a utility that reads zip or gzip + files and writes out uncompressed files. The utility would decode the + header and process the trailer on its own, hence this routine expects + only the raw deflate stream to decompress. This is different from the + normal behavior of inflate(), which expects either a zlib or gzip header and + trailer around the deflate stream. + + inflateBack() uses two subroutines supplied by the caller that are then + called by inflateBack() for input and output. inflateBack() calls those + routines until it reads a complete deflate stream and writes out all of the + uncompressed data, or until it encounters an error. The function's + parameters and return types are defined above in the in_func and out_func + typedefs. inflateBack() will call in(in_desc, &buf) which should return the + number of bytes of provided input, and a pointer to that input in buf. If + there is no input available, in() must return zero--buf is ignored in that + case--and inflateBack() will return a buffer error. inflateBack() will call + out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out() + should return zero on success, or non-zero on failure. If out() returns + non-zero, inflateBack() will return with an error. Neither in() nor out() + are permitted to change the contents of the window provided to + inflateBackInit(), which is also the buffer that out() uses to write from. + The length written by out() will be at most the window size. Any non-zero + amount of input may be provided by in(). + + For convenience, inflateBack() can be provided input on the first call by + setting strm->next_in and strm->avail_in. If that input is exhausted, then + in() will be called. Therefore strm->next_in must be initialized before + calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called + immediately for input. 
If strm->next_in is not Z_NULL, then strm->avail_in + must also be initialized, and then if strm->avail_in is not zero, input will + initially be taken from strm->next_in[0 .. strm->avail_in - 1]. + + The in_desc and out_desc parameters of inflateBack() are passed as the + first parameter of in() and out() respectively when they are called. These + descriptors can be optionally used to pass any information that the caller- + supplied in() and out() functions need to do their job. + + On return, inflateBack() will set strm->next_in and strm->avail_in to + pass back any unused input that was provided by the last in() call. The + return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR + if in() or out() returned an error, Z_DATA_ERROR if there was a format + error in the deflate stream (in which case strm->msg is set to indicate the + nature of the error), or Z_STREAM_ERROR if the stream was not properly + initialized. In the case of Z_BUF_ERROR, an input or output error can be + distinguished using strm->next_in which will be Z_NULL only if in() returned + an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to + out() returning non-zero. (in() will always be called before out(), so + strm->next_in is assured to be defined if out() returns non-zero.) Note + that inflateBack() cannot return Z_OK. +*/ + +ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); +/* + All memory allocated by inflateBackInit() is freed. + + inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream + state was inconsistent. +*/ + +ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); +/* Return flags indicating compile-time options.
+ + Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: + 1.0: size of uInt + 3.2: size of uLong + 5.4: size of voidpf (pointer) + 7.6: size of z_off_t + + Compiler, assembler, and debug options: + 8: DEBUG + 9: ASMV or ASMINF -- use ASM code + 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention + 11: 0 (reserved) + + One-time table building (smaller code, but not thread-safe if true): + 12: BUILDFIXED -- build static block decoding tables when needed + 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed + 14,15: 0 (reserved) + + Library content (indicates missing functionality): + 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking + deflate code when not needed) + 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect + and decode gzip streams (to avoid linking crc code) + 18-19: 0 (reserved) + + Operation variations (changes in library functionality): + 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate + 21: FASTEST -- deflate algorithm with only one, lowest compression level + 22,23: 0 (reserved) + + The sprintf variant used by gzprintf (zero is best): + 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format + 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! + 26: 0 = returns value, 1 = void -- 1 means inferred string length returned + + Remainder: + 27-31: 0 (reserved) + */ + + + /* utility functions */ + +/* + The following utility functions are implemented on top of the + basic stream-oriented functions. To simplify the interface, some + default options are assumed (compression level and memory usage, + standard memory allocation functions). The source code of these + utility functions can easily be modified if you need special options. +*/ + +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Compresses the source buffer into the destination buffer. 
sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be at least the value returned + by compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + This function can be used to compress a whole file at once if the + input file is mmap'ed. + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen, + int level)); +/* + Compresses the source buffer into the destination buffer. The level + parameter has the same meaning as in deflateInit. sourceLen is the byte + length of the source buffer. Upon entry, destLen is the total size of the + destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_BUF_ERROR if there was not enough room in the output buffer, + Z_STREAM_ERROR if the level parameter is invalid. +*/ + +ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); +/* + compressBound() returns an upper bound on the compressed size after + compress() or compress2() on sourceLen bytes. It would be used before + a compress() or compress2() call to allocate the destination buffer. +*/ + +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be large enough to hold the + entire uncompressed data. 
(The size of the uncompressed data must have + been saved previously by the compressor and transmitted to the decompressor + by some mechanism outside the scope of this compression library.) + Upon exit, destLen is the actual size of the uncompressed buffer. + This function can be used to decompress a whole file at once if the + input file is mmap'ed. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. +*/ + + +typedef voidp gzFile; + +ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); +/* + Opens a gzip (.gz) file for reading or writing. The mode parameter + is as in fopen ("rb" or "wb") but can also include a compression level + ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for + Huffman only compression as in "wb1h", or 'R' for run-length encoding + as in "wb1R". (See the description of deflateInit2 for more information + about the strategy parameter.) + + gzopen can be used to read a file which is not in gzip format; in this + case gzread will directly read from the file without decompression. + + gzopen returns NULL if the file could not be opened or if there was + insufficient memory to allocate the (de)compression state; errno + can be checked to distinguish the two cases (if errno is zero, the + zlib error is Z_MEM_ERROR). */ + +ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); +/* + gzdopen() associates a gzFile with the file descriptor fd. File + descriptors are obtained from calls like open, dup, creat, pipe or + fileno (if the file has been previously opened with fopen). + The mode parameter is as in gzopen. + The next call of gzclose on the returned gzFile will also close the + file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file + descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
+ gzdopen returns NULL if there was insufficient memory to allocate + the (de)compression state. +*/ + +ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); +/* + Dynamically update the compression level or strategy. See the description + of deflateInit2 for the meaning of these parameters. + gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not + opened for writing. +*/ + +ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); +/* + Reads the given number of uncompressed bytes from the compressed file. + If the input file was not in gzip format, gzread copies the given number + of bytes into the buffer. + gzread returns the number of uncompressed bytes actually read (0 for + end of file, -1 for error). */ + +ZEXTERN int ZEXPORT gzwrite OF((gzFile file, + voidpc buf, unsigned len)); +/* + Writes the given number of uncompressed bytes into the compressed file. + gzwrite returns the number of uncompressed bytes actually written + (0 in case of error). +*/ + +ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...)); +/* + Converts, formats, and writes the args to the compressed file under + control of the format string, as in fprintf. gzprintf returns the number of + uncompressed bytes actually written (0 in case of error). The number of + uncompressed bytes written is limited to 4095. The caller should assure that + this limit is not exceeded. If it is exceeded, then gzprintf() will + return an error (0) with nothing written. In this case, there may also be a + buffer overflow with unpredictable consequences, which is possible only if + zlib was compiled with the insecure functions sprintf() or vsprintf() + because the secure snprintf() or vsnprintf() functions were not available. +*/ + +ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); +/* + Writes the given null-terminated string to the compressed file, excluding + the terminating null character.
+ gzputs returns the number of characters written, or -1 in case of error. +*/ + +ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); +/* + Reads bytes from the compressed file until len-1 characters are read, or + a newline character is read and transferred to buf, or an end-of-file + condition is encountered. The string is then terminated with a null + character. + gzgets returns buf, or Z_NULL in case of error. +*/ + +ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); +/* + Writes c, converted to an unsigned char, into the compressed file. + gzputc returns the value that was written, or -1 in case of error. +*/ + +ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); +/* + Reads one byte from the compressed file. gzgetc returns this byte + or -1 in case of end of file or error. +*/ + +ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); +/* + Push one character back onto the stream to be read again later. + Only one character of push-back is allowed. gzungetc() returns the + character pushed, or -1 on failure. gzungetc() will fail if a + character has been pushed but not read yet, or if c is -1. The pushed + character will be discarded if the stream is repositioned with gzseek() + or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter + flush is as in the deflate() function. The return value is the zlib + error number (see function gzerror below). gzflush returns Z_OK if + the flush parameter is Z_FINISH and all output could be flushed. + gzflush should be called only when strictly necessary because it can + degrade compression. +*/ + +ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, + z_off_t offset, int whence)); +/* + Sets the starting position for the next gzread or gzwrite on the + given compressed file. The offset represents a number of bytes in the + uncompressed data stream. 
The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); +/* + Rewinds the given file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) +*/ + +ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); +/* + Returns the starting position for the next gzread or gzwrite on the + given compressed file. This position represents a number of bytes in the + uncompressed data stream. + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +ZEXTERN int ZEXPORT gzeof OF((gzFile file)); +/* + Returns 1 when EOF has previously been detected reading the given + input stream, otherwise zero. +*/ + +ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); +/* + Returns 1 if file is being read directly without decompression, otherwise + zero. +*/ + +ZEXTERN int ZEXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file + and deallocates all the (de)compression state. The return value is the zlib + error number (see function gzerror below). +*/ + +ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the + given compressed file. errnum is set to zlib error number. 
If an + error occurred in the file system and not in the compression library, + errnum is set to Z_ERRNO and the application may consult errno + to get the exact error code. +*/ + +ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); +/* + Clears the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. This is useful for continuing to read a gzip + file that is being written concurrently. +*/ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the + compression library. +*/ + +ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. If buf is NULL, this function returns + the required initial value for the checksum. + An Adler-32 checksum is almost as reliable as a CRC32 but can be computed + much faster. Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, + z_off_t len2)); +/* + Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 + and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for + each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of + seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. +*/ + +ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); +/* + Update a running CRC-32 with the bytes buf[0..len-1] and return the + updated CRC-32. If buf is NULL, this function returns the required initial + value for the crc. Pre- and post-conditioning (one's complement) is + performed within this function so it shouldn't be done by the application.
+ Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ + +ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); + +/* + Combine two CRC-32 check values into one. For two sequences of bytes, + seq1 and seq2 with lengths len1 and len2, CRC-32 check values were + calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 + check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and + len2. +*/ + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy, const char *version, + int stream_size)); +ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, + unsigned char FAR *window, + const char *version, + int stream_size)); +#define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) +#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) +#define inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + ZLIB_VERSION, sizeof(z_stream)) + 
+ +#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL) + struct internal_state {int dummy;}; /* hack for buggy compilers */ +#endif + +ZEXTERN const char * ZEXPORT zError OF((int)); +ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z)); +ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); + +#ifdef __cplusplus +} +#endif + +#endif /* ZLIB_H */ -- 2.30.2