+------------------------------------------------------------------------
+r451 | lh3lh3 | 2009-09-02 10:44:48 +0100 (Wed, 02 Sep 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_rmdup.c
+ M /trunk/samtools/bam_rmdupse.c
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.5-34 (r451)
+ * applied the patch by John
+ * improved the help message a little bit
+
+------------------------------------------------------------------------
+r450 | lh3lh3 | 2009-09-02 09:55:55 +0100 (Wed, 02 Sep 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_color.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-33 (r450)
+ * fixed a bug in bam_color.c (on behalf of Nils Homer)
+
+------------------------------------------------------------------------
+r449 | lh3lh3 | 2009-08-29 20:36:41 +0100 (Sat, 29 Aug 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools-0.1.5-32 (r449)
+ * fillmd: fixed a bug in modifying MD/NM tags
+ * in import, give a warning if the read is aligned but there is no CIGAR.
+
+------------------------------------------------------------------------
+r448 | lh3lh3 | 2009-08-19 09:44:28 +0100 (Wed, 19 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/wgsim_eval.pl
+
+ * samtools-0.1.5-31 (r448)
+ * fixed an issue when the last CIGAR is I or D
+
+------------------------------------------------------------------------
+r447 | lh3lh3 | 2009-08-17 09:34:57 +0100 (Mon, 17 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-30 (r447)
+ * fixed a bug in bam_aux_get(): 'A' is not checked
+
+------------------------------------------------------------------------
+r446 | lh3lh3 | 2009-08-17 09:33:17 +0100 (Mon, 17 Aug 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bamtk.c
+
+ *
+
+------------------------------------------------------------------------
+r444 | lh3lh3 | 2009-08-11 10:02:36 +0100 (Tue, 11 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-28 (r444)
+ * bug in "merge -n"
+
+------------------------------------------------------------------------
+r443 | lh3lh3 | 2009-08-11 09:29:11 +0100 (Tue, 11 Aug 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-27 (r443)
+ * SEQ and QUAL can be "*"
+ * parse CIGAR "=" and "X" as "M"
+
+------------------------------------------------------------------------
+r442 | lh3lh3 | 2009-08-07 21:56:38 +0100 (Fri, 07 Aug 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/md5.c
+ M /trunk/samtools/misc/md5.h
+ M /trunk/samtools/misc/md5fa.c
+
+ * samtools-0.1.5-26 (r442)
+ * replace RSA Inc md5.* with ones under permissive lincense
+ * fixed a bug in detecting unsorted bam in pileup
+
+------------------------------------------------------------------------
+r441 | bhandsaker | 2009-08-05 14:41:28 +0100 (Wed, 05 Aug 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bgzf.c
+ M /trunk/samtools/bgzf.h
+ M /trunk/samtools/bgzip.c
+
+Change copyright notices now that MIT has approved open source distribution.
+
+------------------------------------------------------------------------
+r440 | lh3lh3 | 2009-08-05 10:44:24 +0100 (Wed, 05 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_stat.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-25 (r436)
+ * in flagstats, do not report singletons if both ends are unmapped
+
+------------------------------------------------------------------------
+r439 | lh3lh3 | 2009-08-04 22:16:51 +0100 (Tue, 04 Aug 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/maq2sam.c
+
+fixed a SERIOUS bug in setting 0x20 flag
+
+------------------------------------------------------------------------
+r438 | lh3lh3 | 2009-08-04 21:50:43 +0100 (Tue, 04 Aug 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/samtools.pl
+
+fixed two minor bugs (suggested by Tim M Storm)
+
+------------------------------------------------------------------------
+r437 | lh3lh3 | 2009-08-04 09:13:24 +0100 (Tue, 04 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/samtools.pl
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.5-24 (r435)
+ * fixed a typo
+
+------------------------------------------------------------------------
+r434 | lh3lh3 | 2009-08-03 10:40:42 +0100 (Mon, 03 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-23 (r434)
+ * in tview, press 'r' to show read names rather than sequences
+
+------------------------------------------------------------------------
+r433 | lh3lh3 | 2009-08-02 19:13:35 +0100 (Sun, 02 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+
+ * tried to fixed the buggy FTP random access in Windows. FAILED.
+ * anyway, MinGW seems to have problem with "%lld".
+
+------------------------------------------------------------------------
+r432 | lh3lh3 | 2009-08-02 00:32:07 +0100 (Sun, 02 Aug 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/Makefile.mingw
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/faidx.c
+ M /trunk/samtools/razf.c
+ A /trunk/samtools/win32/libcurses.a
+ A /trunk/samtools/win32/xcurses.h
+
+ * samtools-0.1.5-22 (r432)
+ * faidx: fixed compitability issue with _WIN32
+ * razf: fixed potential compitability issue with _WIN32
+ * PDCurses support in Windows
+
+------------------------------------------------------------------------
+r431 | lh3lh3 | 2009-08-01 23:34:54 +0100 (Sat, 01 Aug 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/win32/libz.a
+
+replace the GnuWin32 version of libz.a with my own build with MinGW.
+
+------------------------------------------------------------------------
+r430 | lh3lh3 | 2009-08-01 23:21:07 +0100 (Sat, 01 Aug 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+
+add comments
+
+------------------------------------------------------------------------
+r429 | lh3lh3 | 2009-08-01 22:41:19 +0100 (Sat, 01 Aug 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile.mingw
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.5-21 (r428)
+ * knetfile.c is now compatible with mingw-winsock
+
+------------------------------------------------------------------------
+r428 | lh3lh3 | 2009-08-01 00:39:07 +0100 (Sat, 01 Aug 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile.mingw
+
+simplify MinGW Makefile
+
+------------------------------------------------------------------------
+r427 | lh3lh3 | 2009-08-01 00:30:54 +0100 (Sat, 01 Aug 2009) | 5 lines
+Changed paths:
+ A /trunk/samtools/Makefile.mingw
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ A /trunk/samtools/win32
+ A /trunk/samtools/win32/libz.a
+ A /trunk/samtools/win32/zconf.h
+ A /trunk/samtools/win32/zlib.h
+
+ * samtools-0.1.5-20 (r427)
+ * MinGW support. At least SAM<->BAM conversion is working. Other
+ functionality are not tested at the moment.
+ * zlib headers and Windows version of libz.a are included in win32/
+
+------------------------------------------------------------------------
+r426 | lh3lh3 | 2009-07-31 23:32:09 +0100 (Fri, 31 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.5-19 (r426)
+ * fixed a bug caused by recent modifications. Sorry.
+
+------------------------------------------------------------------------
+r425 | lh3lh3 | 2009-07-31 23:23:51 +0100 (Fri, 31 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bgzf.c
+
+compatible with Windows binary files
+
+------------------------------------------------------------------------
+r424 | lh3lh3 | 2009-07-31 10:19:59 +0100 (Fri, 31 Jul 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_maqcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools-0.1.5-18 (r423)
+ * output additional information in pileup indel lines, for the purepose
+ of debugging at the moment
+ * in tview, optionally allow to treat reference skip as deletion
+
+------------------------------------------------------------------------
+r423 | lh3lh3 | 2009-07-30 22:00:36 +0100 (Thu, 30 Jul 2009) | 2 lines
+Changed paths:
+ A /trunk/samtools/misc/psl2sam.pl
+
+convert BLAT psl to SAM.
+
+------------------------------------------------------------------------
+r422 | lh3lh3 | 2009-07-30 11:24:39 +0100 (Thu, 30 Jul 2009) | 6 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.c
+ M /trunk/samtools/bgzf.h
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.5-17 (r422)
+ * fixed a but in knetfile.c when seek type is not SEEK_SET
+ * write an empty BGZF block to every BGZF file
+ * check BGZF EOF marker in bam_header_read()
+ * update ChangeLog
+
+------------------------------------------------------------------------
+r421 | lh3lh3 | 2009-07-30 10:03:39 +0100 (Thu, 30 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/samtools.pl
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam.h
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.5-16 (r421)
+ * in view and pileup, load header from FASTA index if the input is SAM.
+
+------------------------------------------------------------------------
+r420 | lh3lh3 | 2009-07-29 09:18:55 +0100 (Wed, 29 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/maq2sam.c
+
+do not set "read 1" if reads are not mapped in the PE mode of maq
+
+------------------------------------------------------------------------
+r419 | lh3lh3 | 2009-07-28 09:52:33 +0100 (Tue, 28 Jul 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/samtools.pl
+ M /trunk/samtools/misc/wgsim_eval.pl
+
+ * samtools-0.1.5-15 (r419)
+ * in sam_open(), return NULL when the file cannot be opened.
+ * make wgsim_eval.pl more robust to imperfect SAM
+ * add "unique" command to samtools.pl
+
+------------------------------------------------------------------------
+r418 | lh3lh3 | 2009-07-24 14:04:19 +0100 (Fri, 24 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/misc/wgsim_eval.pl
+
+skip @header lines in SAM
+
+------------------------------------------------------------------------
+r417 | lh3lh3 | 2009-07-24 12:42:38 +0100 (Fri, 24 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.5-14 (r417)
+ * more help in "samtools view" due to the recent changes.
+
+------------------------------------------------------------------------
+r416 | lh3lh3 | 2009-07-24 12:34:30 +0100 (Fri, 24 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam.h
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.5-17 (r416)
+ * support import/export SAM with string tags
+
+------------------------------------------------------------------------
+r415 | lh3lh3 | 2009-07-24 11:39:26 +0100 (Fri, 24 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+ M /trunk/samtools/sam.h
+ M /trunk/samtools/sam_view.c
+
+ * samtools-0.1.5-12 (r415)
+ * FLAG now can be in HEX
+
+------------------------------------------------------------------------
+r414 | lh3lh3 | 2009-07-22 22:03:49 +0100 (Wed, 22 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/kstring.h
+
+fixed a compiling error (thank Ken for fixing it)
+
+------------------------------------------------------------------------
+r412 | lh3lh3 | 2009-07-21 22:19:40 +0100 (Tue, 21 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/kstring.c
+ M /trunk/samtools/kstring.h
+
+Implemented Boyer-Moore search in the kstring library.
+
+------------------------------------------------------------------------
+r409 | lh3lh3 | 2009-07-17 17:10:20 +0100 (Fri, 17 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+
+do not include knetfile.h when _USE_KNETFILE is not defined
+
+------------------------------------------------------------------------
+r408 | lh3lh3 | 2009-07-17 15:29:21 +0100 (Fri, 17 Jul 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.c
+
+ * samtools-0.1.5-11 (r408)
+ * force to overwirte existing MD if it is different from the one calculated
+ from fillmd.
+ * bgzf.c: improved the compatibility with Windows headers
+
+------------------------------------------------------------------------
+r407 | lh3lh3 | 2009-07-17 14:46:56 +0100 (Fri, 17 Jul 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.h
+
+ * samtools-0.1.5-10 (r407)
+ * implemented bam_aux_del() to remove a tag
+ * fillmd: generate the NM tag
+ * fillmd: cmd interface improvement
+
+------------------------------------------------------------------------
+r406 | lh3lh3 | 2009-07-16 23:30:40 +0100 (Thu, 16 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+
+Sorry. The old Makefile is for PDCurses...
+
+------------------------------------------------------------------------
+r405 | lh3lh3 | 2009-07-16 23:30:11 +0100 (Thu, 16 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-9 (r405)
+ * improved the compatibility with PDCurses a little bit
+
+------------------------------------------------------------------------
+r404 | lh3lh3 | 2009-07-16 23:23:52 +0100 (Thu, 16 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-8 (r404)
+ * compatible with PDCurses
+
+------------------------------------------------------------------------
+r403 | lh3lh3 | 2009-07-16 22:39:39 +0100 (Thu, 16 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/kseq.h
+
+ * samtools-0.1.5-7 (r403)
+ * fixed a bug in kseq.h for binary files (text files are fine)
+
+------------------------------------------------------------------------
+r402 | lh3lh3 | 2009-07-16 11:49:53 +0100 (Thu, 16 Jul 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_import.c
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bgzf.c
+
+ * samtools-0.1.5-6 (r402)
+ * fixed compiling error when "-D_USE_NETFILE" is not applied
+ * improve portability to MinGW
+
+------------------------------------------------------------------------
+r398 | lh3lh3 | 2009-07-13 10:21:36 +0100 (Mon, 13 Jul 2009) | 3 lines
+Changed paths:
+ A /trunk/bam-lite/bam.h (from /trunk/samtools/bam.h:395)
+ A /trunk/bam-lite/bam_lite.c (from /trunk/samtools/bam_lite.c:395)
+ D /trunk/samtools/bam_lite.c
+
+ * move bam_lite.c to bam-lite
+ * copy bam.h to bam-lite
+
+------------------------------------------------------------------------
+r395 | lh3lh3 | 2009-07-13 10:12:57 +0100 (Mon, 13 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_lite.c
+ M /trunk/samtools/bam_lpileup.c
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-5 (r395)
+ * added bam_pileup_file() and removed bam_lpileup_file()
+
+------------------------------------------------------------------------
+r394 | lh3lh3 | 2009-07-13 00:35:10 +0100 (Mon, 13 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.5-4 (r394)
+ * http_proxy support in knetfile library (check http_proxy ENV)
+
+------------------------------------------------------------------------
+r393 | lh3lh3 | 2009-07-12 23:57:07 +0100 (Sun, 12 Jul 2009) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/knetfile.c
+ M /trunk/samtools/knetfile.h
+
+ * samtools-0.1.5-3 (r393)
+ * knetfile now supports HTTP (no proxy at the moment)
+ * fixed a potential issue in knetfile on opening ordinary file, although I have
+ not seen the sideeffect so far.
+
+------------------------------------------------------------------------
+r392 | lh3lh3 | 2009-07-12 18:50:55 +0100 (Sun, 12 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/samtools.1
+
+Remove the warning in tview
+
+------------------------------------------------------------------------
+r391 | lh3lh3 | 2009-07-12 18:42:43 +0100 (Sun, 12 Jul 2009) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-2 (r391)
+ * do not show a blank screen when no reads mapped
+
+------------------------------------------------------------------------
+r390 | lh3lh3 | 2009-07-09 14:01:42 +0100 (Thu, 09 Jul 2009) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ A /trunk/samtools/bam_lite.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.5-1 (r390)
+ * removed useless _IOLIB in bam.h. This should cause no change at all.
+ * added bam_lite.c for light-weight BAM reading
+
+------------------------------------------------------------------------
+r385 | lh3lh3 | 2009-07-07 16:53:29 +0100 (Tue, 07 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/knetfile.c
+
+Release samtools-0.1.5c (fixed a bug in piping)
+
+------------------------------------------------------------------------
+r383 | lh3lh3 | 2009-07-07 11:39:55 +0100 (Tue, 07 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/sam.c
+
+Release samtools-0.1.5b (BUG! so embarrassing!)
+
+------------------------------------------------------------------------
+r381 | lh3lh3 | 2009-07-07 11:20:06 +0100 (Tue, 07 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_aux.c
+ M /trunk/samtools/bamtk.c
+
+Release samtools-0.1.5a (for compatibility with Bio::DB::Sam)
+
+------------------------------------------------------------------------
+r373 | lh3lh3 | 2009-07-07 10:26:57 +0100 (Tue, 07 Jul 2009) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+Release samtools-0.1.5
+
------------------------------------------------------------------------
r372 | lh3lh3 | 2009-07-07 09:49:27 +0100 (Tue, 07 Jul 2009) | 3 lines
Changed paths:
CC= gcc
-CXX= g++
CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
-CXXFLAGS= $(CFLAGS)
-DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE #-D_NO_CURSES
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \
bam_sort.o
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o
PROG= samtools
-INCLUDES=
+INCLUDES=
SUBDIRS= . misc
-LIBPATH=
+LIBPATH=
+LIBCURSES= -lcurses # -lXCurses
.SUFFIXES:.c .o
libbam.a:$(LOBJS)
$(AR) -cru $@ $(LOBJS)
-### For the curses library: comment out `-lcurses' if you do not have curses installed
samtools:lib $(AOBJS)
- $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -lcurses -lz -L. -lbam
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) $(LIBCURSES) -lz -L. -lbam
razip:razip.o razf.o
$(CC) $(CFLAGS) -o $@ razf.o razip.o -lz
--- /dev/null
+CC= gcc.exe
+AR= ar.exe
+CFLAGS= -g -Wall -O2
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_CURSES_LIB=2 -D_USE_KNETFILE
+LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
+ bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bam_sort.o \
+ knetfile.o
+AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
+ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
+ bamtk.o
+PROG= samtools
+INCLUDES= -Iwin32
+SUBDIRS= .
+LIBPATH=
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+lib:libbam.a
+
+libbam.a:$(LOBJS)
+ $(AR) -cru $@ $(LOBJS)
+
+samtools:lib $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32
+
+razip:razip.o razf.o
+ $(CC) $(CFLAGS) -o $@ razf.o razip.o -lz
+
+bgzip:bgzip.o bgzf.o
+ $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz
+
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h kstring.h
+sam.o:sam.h bam.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h bam_maqcns.h
+bam_maqcns.o:bam.h ksort.h bam_maqcns.h
+bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
+glf.o:glf.h
+
+faidx.o:faidx.h razf.h khash.h
+faidx_main.o:faidx.h razf.h
+
+clean:
+ rm -fr gmon.out *.o *.exe *.dSYM razip $(PROG) *~ *.a
+Beta Release 0.1.6 (2 September, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * In tview, do not show a blank screen when no reads mapped to the
+ corresponding region.
+
+ * Implemented native HTTP support in the BGZF library. Samtools is now
+ able to directly open a BAM file on HTTP. HTTP proxy is also
+ supported via the "http_proxy" environmental variable.
+
+ * Samtools is now compitable with the MinGW (win32) compiler and the
+ PDCurses library.
+
+ * The calmd (or fillmd) command now calculates the NM tag and replaces
+ MD tags if they are wrong.
+
+ * The view command now recognizes and optionally prints FLAG in HEXs or
+ strings to make a SAM file more friendly to human eyes. This is a
+ samtools-C extension, not implemented in Picard for the time
+ being. Please type `samtools view -?' for more information.
+
+ * BAM files now have an end-of-file (EOF) marker to facilitate
+ truncation detection. A warning will be given if an on-disk BAM file
+ does not have this marker. The warning will be seen on BAM files
+ generated by an older version of samtools. It does NO harm.
+
+ * New key bindings in tview: `r' to show read names and `s' to show
+ reference skip (N operation) as deletions.
+
+ * Fixed a bug in `samtools merge -n'.
+
+ * Samtools merge now optionally copies the header of a user specified
+ SAM file to the resultant BAM output.
+
+ * Samtools pileup/tview works with a CIGAR with the first or the last
+ operation is an indel.
+
+ * Fixed a bug in bam_aux_get().
+
+
+Changes in other utilies:
+
+ * Fixed wrong FLAG in maq2sam.
+
+
+(0.1.6: 2 September 2009, r453)
+
+
+
Beta Release 0.1.5 (7 July, 2009)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include "kstring.h"
int bam_is_be = 0;
+char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
/**************************
* CIGAR related routines *
{
bam_header_t *header;
char buf[4];
- int32_t i, name_len;
+ int32_t i = 1, name_len;
+ // check EOF
+ i = bgzf_check_EOF(fp);
+ if (i < 0) fprintf(stderr, "[bam_header_read] read from pipe; skip EOF checking.\n");
+ else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n");
// read "BAM1"
if (bam_read(fp, buf, 4) != 4) return 0;
if (strncmp(buf, "BAM\001", 4)) {
return bam_write1_core(fp, &b->core, b->data_len, b->data);
}
-char *bam_format1(const bam_header_t *header, const bam1_t *b)
+char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
{
uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
int i;
kstring_t str;
str.l = str.m = 0; str.s = 0;
- ksprintf(&str, "%s\t%d\t", bam1_qname(b), c->flag);
+ ksprintf(&str, "%s\t", bam1_qname(b));
+ if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag);
+ else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
+ else { // BAM_OFSTR
+ for (i = 0; i < 16; ++i)
+ if ((c->flag & 1<<i) && bam_flag2char_table[i])
+ kputc(bam_flag2char_table[i], &str);
+ kputc('\t', &str);
+ }
if (c->tid < 0) kputs("*\t", &str);
else ksprintf(&str, "%s\t", header->target_name[c->tid]);
ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual);
else if (c->mtid == c->tid) kputs("=\t", &str);
else ksprintf(&str, "%s\t", header->target_name[c->mtid]);
ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize);
- for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
- kputc('\t', &str);
- if (t[0] == 0xff) kputc('*', &str);
- else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+ if (c->l_qseq) {
+ for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
+ kputc('\t', &str);
+ if (t[0] == 0xff) kputc('*', &str);
+ else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+ } else ksprintf(&str, "*\t*");
s = bam1_aux(b);
while (s < b->data + b->data_len) {
uint8_t type, key[2];
return str.s;
}
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
+{
+ return bam_format1_core(header, b, BAM_OFDEC);
+}
+
void bam_view1(const bam_header_t *header, const bam1_t *b)
{
char *s = bam_format1(header, b);
#include <string.h>
#include <stdio.h>
-#define _IOLIB 2
-
-#if _IOLIB == 1 && !defined(_NO_RAZF)
-#define BAM_TRUE_OFFSET
-#include "razf.h"
-/*! @abstract BAM file handler */
-typedef RAZF *bamFile;
-#define bam_open(fn, mode) razf_open(fn, mode)
-#define bam_dopen(fd, mode) razf_dopen(fd, mode)
-#define bam_close(fp) razf_close(fp)
-#define bam_read(fp, buf, size) razf_read(fp, buf, size)
-#define bam_write(fp, buf, size) razf_write(fp, buf, size)
-#define bam_tell(fp) razf_tell(fp)
-#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir)
-#elif _IOLIB == 2
+#ifndef BAM_LITE
#define BAM_VIRTUAL_OFFSET16
#include "bgzf.h"
/*! @abstract BAM file handler */
#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
#define bam_tell(fp) bgzf_tell(fp)
#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
-#elif _IOLIB == 3
-#define BAM_VIRTUAL_OFFSET16
-#include "razf.h"
-/*! @abstract BAM file handler */
-typedef RAZF *bamFile;
-#define bam_open(fn, mode) razf_open2(fn, mode)
-#define bam_dopen(fd, mode) razf_dopen2(fd, mode)
-#define bam_close(fp) razf_close(fp)
-#define bam_read(fp, buf, size) razf_read(fp, buf, size)
-#define bam_write(fp, buf, size) razf_write(fp, buf, size)
-#define bam_tell(fp) razf_tell2(fp)
-#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir)
+#else
+#define BAM_TRUE_OFFSET
+#include <zlib.h>
+typedef gzFile bamFile;
+#define bam_open(fn, mode) gzopen(fn, mode)
+#define bam_dopen(fd, mode) gzdopen(fd, mode)
+#define bam_close(fp) gzclose(fp)
+#define bam_read(fp, buf, size) gzread(fp, buf, size)
+/* no bam_write/bam_tell/bam_seek() here */
#endif
/*! @typedef
/*! @abstract optical or PCR duplicate */
#define BAM_FDUP 1024
+#define BAM_OFDEC 0
+#define BAM_OFHEX 1
+#define BAM_OFSTR 2
+
/*! @abstract defautl mask for pileup */
#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
*/
char *bam_format1(const bam_header_t *header, const bam1_t *b);
+ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
+
/*! @typedef
@abstract Structure for one alignment covering the pileup position.
@field b pointer to the alignment
*/
int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
+ int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
+
struct __bam_lplbuf_t;
typedef struct __bam_lplbuf_t bam_lplbuf_t;
/*! @abstract bam_plbuf_push() equivalent with level calculated. */
int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
- /*! @abstract bam_plbuf_file() equivalent with level calculated. */
- int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
-
struct __bam_index_t;
typedef struct __bam_index_t bam_index_t;
char bam_aux2A(const uint8_t *s);
char *bam_aux2Z(const uint8_t *s);
+ int bam_aux_del(bam1_t *b, uint8_t *s);
void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
-
uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
/*!
return bam_aux_get(b, tag);
}
+#define __skip_tag(s) do { \
+ int type = toupper(*(s)); \
+ ++(s); \
+ if (type == 'C' || type == 'A') ++(s); \
+ else if (type == 'S') (s) += 2; \
+ else if (type == 'I' || type == 'F') (s) += 4; \
+ else if (type == 'D') (s) += 8; \
+ else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \
+ } while (0)
+
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
{
uint8_t *s;
int y = tag[0]<<8 | tag[1];
s = bam1_aux(b);
while (s < b->data + b->data_len) {
- int type, x = (int)s[0]<<8 | s[1];
+ int x = (int)s[0]<<8 | s[1];
s += 2;
if (x == y) return s;
- type = toupper(*s); ++s;
- if (type == 'C') ++s;
- else if (type == 'S') s += 2;
- else if (type == 'I' || type == 'F') s += 4;
- else if (type == 'D') s += 8;
- else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
+ __skip_tag(s);
}
return 0;
}
+// s MUST BE returned by bam_aux_get()
+int bam_aux_del(bam1_t *b, uint8_t *s)
+{
+ uint8_t *p, *aux;
+ aux = bam1_aux(b);
+ p = s - 2;
+ __skip_tag(s);
+ memmove(p, s, b->l_aux - (s - aux));
+ b->data_len -= s - p;
+ b->l_aux -= s - p;
+ return 0;
+}
void bam_init_header_hash(bam_header_t *header)
{
cs_i = strlen(cs) - 1 - i;
// get current color
cur_color = cs[cs_i];
- // get previous base
- prev_b = (0 == cs_i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+ // get previous base. Note: must rc adaptor
+ prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
// get current base
cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
}
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
+#ifdef _WIN32
+#include <fcntl.h>
+#endif
#include "kstring.h"
#include "bam.h"
#include "kseq.h"
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
};
+unsigned short bam_char2flag_table[256] = {
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
+ BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
+};
+
char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
struct __tamFile_t {
kstring_t *str;
kh_ref_t *hash;
khiter_t k;
- hash = kh_init(ref);
+ if (fn == 0) return 0;
fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
- assert(fp);
+ if (fp == 0) return 0;
+ hash = kh_init(ref);
ks = ks_init(fp);
str = (kstring_t*)calloc(1, sizeof(kstring_t));
while (ks_getuntil(ks, 0, str, &dret) > 0) {
memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
doff += c->l_qname;
}
- { // flag, tid, pos, qual
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s);
+ { // flag
+ long flag;
+ char *s;
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+ flag = strtol((char*)str->s, &s, 0);
+ if (*s) { // not the end of the string
+ flag = 0;
+ for (s = str->s; *s; ++s)
+ flag |= bam_char2flag_table[(int)*s];
+ }
+ c->flag = flag;
+ }
+ { // tid, pos, qual
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
if (c->tid < 0 && strcmp(str->s, "*")) {
if (header->n_targets == 0) {
for (i = 0, s = str->s; i != c->n_cigar; ++i) {
x = strtol(s, &t, 10);
op = toupper(*t);
- if (op == 'M') op = BAM_CMATCH;
+ if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
else if (op == 'I') op = BAM_CINS;
else if (op == 'D') op = BAM_CDEL;
else if (op == 'N') op = BAM_CREF_SKIP;
if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b)));
doff += c->n_cigar * 4;
- } else c->bin = bam_reg2bin(c->pos, c->pos + 1);
+ } else {
+ if (!(c->flag&BAM_FUNMAP)) {
+ fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
+ c->flag |= BAM_FUNMAP;
+ }
+ c->bin = bam_reg2bin(c->pos, c->pos + 1);
+ }
}
{ // mtid, mpos, isize
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
}
{ // seq and qual
int i;
- uint8_t *p;
+ uint8_t *p = 0;
if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
z += str->l + 1;
- c->l_qseq = strlen(str->s);
- if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
- parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
- p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
- bzero(p, (c->l_qseq+1)/2);
- for (i = 0; i < c->l_qseq; ++i)
- p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);
+ if (strcmp(str->s, "*")) {
+ c->l_qseq = strlen(str->s);
+ if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
+ parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+ p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+ memset(p, 0, (c->l_qseq+1)/2);
+ for (i = 0; i < c->l_qseq; ++i)
+ p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);
+ } else c->l_qseq = 0;
if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
z += str->l + 1;
if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
tamFile sam_open(const char *fn)
{
tamFile fp;
+ gzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb");
+ if (gzfp == 0) return 0;
fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
- fp->fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+ fp->fp = gzfp;
fp->ks = ks_init(fp->fp);
return fp;
}
#include "khash.h"
#include "ksort.h"
#include "bam_endian.h"
+#ifdef _USE_KNETFILE
#include "knetfile.h"
+#endif
/*!
@header
FILE *fp;
char *fnidx, *fn;
- if (strstr(_fn, "ftp://") == _fn) {
+ if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
const char *p;
int l = strlen(_fn);
for (p = _fn + l - 1; p >= _fn; --p)
} else return 0;
}
+#ifdef _USE_KNETFILE
static void download_from_remote(const char *url)
{
const int buf_size = 1 * 1024 * 1024;
uint8_t *buf;
knetFile *fp_remote;
int l;
- if (strstr(url, "ftp://") != url) return;
+ if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
l = strlen(url);
for (fn = (char*)url + l - 1; fn >= url; --fn)
if (*fn == '/') break;
fclose(fp);
knet_close(fp_remote);
}
+#else
+static void download_from_remote(const char *url)
+{
+ return;
+}
+#endif
bam_index_t *bam_index_load(const char *fn)
{
bam_index_t *idx;
idx = bam_index_load_local(fn);
- if (idx == 0 && strstr(fn, "ftp://") == fn) {
+ if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
char *fnidx = calloc(strlen(fn) + 5, 1);
strcat(strcpy(fnidx, fn), ".bai");
fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
{
return bam_plbuf_push(b, tv->plbuf);
}
-
-int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
-{
- bam_lplbuf_t *buf;
- int ret;
- bam1_t *b;
- b = (bam1_t*)calloc(1, sizeof(bam1_t));
- buf = bam_lplbuf_init(func, func_data);
- bam_plbuf_set_mask(buf->plbuf, mask);
- while ((ret = bam_read1(fp, b)) >= 0)
- bam_lplbuf_push(b, buf);
- bam_lplbuf_push(0, buf);
- bam_lplbuf_destroy(buf);
- free(b->data); free(b);
- return 0;
-}
}
if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
if (pscore[i*n+j] > ps) pscore[i*n+j] = ps;
- if (types[i] != 0) score[i*n+j] -= mi->indel_err;
+ //if (types[i] != 0) score[i*n+j] -= mi->indel_err;
//printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j,
// score[i*n+j], pscore[i*n+j]);
}
if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel;
else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel;
}
+ // write cnt_ref and cnt_ambi
+ if (max1_i != 0 && max2_i != 0) {
+ for (j = 0; j < n; ++j) {
+ int diff1 = score[j] - score[max1_i * n + j];
+ int diff2 = score[j] - score[max2_i * n + j];
+ if (diff1 > 0 && diff2 > 0) ++ret->cnt_ref;
+ else if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi;
+ }
+ }
}
free(score); free(pscore); free(ref2); free(inscns);
}
typedef struct {
int indel1, indel2;
- int cnt1, cnt2, cnt_ambi, cnt_anti;
+ int cnt1, cnt2, cnt_anti;
+ int cnt_ref, cnt_ambi;
char *s[2];
//
int gt, gl[2];
#include <string.h>
#include <ctype.h>
#include "faidx.h"
-#include "bam.h"
+#include "sam.h"
#include "kstring.h"
void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
bam1_core_t *c = &b->core;
int i, x, y, u = 0;
kstring_t *str;
- uint8_t *old_md;
+ uint8_t *old_md, *old_nm;
+ int32_t old_nm_i = -1, nm = 0;
- old_md = bam_aux_get(b, "MD");
- if (c->flag & BAM_FUNMAP) return;
- if (old_md && !is_equal) return; // no need to add MD
str = (kstring_t*)calloc(1, sizeof(kstring_t));
for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
int z = y + j;
int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
if (ref[x+j] == 0) break; // out of boundary
- if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) {
+ if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
++u;
} else {
ksprintf(str, "%d", u);
kputc(ref[x+j], str);
- u = 0;
+ u = 0; ++nm;
}
}
if (j < l) break;
}
u = 0;
if (j < l) break;
- x += l;
+ x += l; nm += l;
} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
y += l;
+ if (op == BAM_CINS) nm += l;
} else if (op == BAM_CREF_SKIP) {
x += l;
}
}
ksprintf(str, "%d", u);
+ // update NM
+ old_nm = bam_aux_get(b, "NM");
+ if (c->flag & BAM_FUNMAP) return;
+ if (old_nm) old_nm_i = bam_aux2i(old_nm);
+ if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ else if (nm != old_nm_i) {
+ fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+ bam_aux_del(b, old_nm);
+ bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ }
+ // update MD
+ old_md = bam_aux_get(b, "MD");
if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
else {
int is_diff = 0;
break;
if (i < str->l) is_diff = 1;
} else is_diff = 1;
- if (is_diff)
- fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' != '%s'\n", bam1_qname(b), old_md+1, str->s);
+ if (is_diff) {
+ fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+ bam_aux_del(b, old_md);
+ bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ }
}
free(str->s); free(str);
}
int bam_fillmd(int argc, char *argv[])
{
- int c, is_equal = 0, tid = -2, ret, len;
- bamFile fp, fpout = 0;
- bam_header_t *header;
+ int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed;
+ samfile_t *fp, *fpout = 0;
faidx_t *fai;
- char *ref = 0;
+ char *ref = 0, mode_w[8], mode_r[8];
bam1_t *b;
- while ((c = getopt(argc, argv, "e")) >= 0) {
+ is_bam_out = is_sam_in = is_uncompressed = 0;
+ mode_w[0] = mode_r[0] = 0;
+ strcpy(mode_r, "r"); strcpy(mode_w, "w");
+ while ((c = getopt(argc, argv, "eubS")) >= 0) {
switch (c) {
case 'e': is_equal = 1; break;
+ case 'b': is_bam_out = 1; break;
+ case 'u': is_uncompressed = is_bam_out = 1; break;
+ case 'S': is_sam_in = 1; break;
default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
}
}
+ if (!is_sam_in) strcat(mode_r, "b");
+ if (is_bam_out) strcat(mode_w, "b");
+ else strcat(mode_w, "h");
+ if (is_uncompressed) strcat(mode_w, "u");
if (optind + 1 >= argc) {
- fprintf(stderr, "Usage: bam fillmd [-e] <aln.bam> <ref.fasta>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools fillmd [-eubS] <aln.bam> <ref.fasta>\n\n");
+ fprintf(stderr, "Options: -e change identical bases to '='\n");
+ fprintf(stderr, " -u uncompressed BAM output (for piping)\n");
+ fprintf(stderr, " -b compressed BAM output\n");
+ fprintf(stderr, " -S the input is SAM with header\n\n");
+ return 1;
+ }
+ fp = samopen(argv[optind], mode_r, 0);
+ if (fp == 0) return 1;
+ if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
+ fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
return 1;
}
- fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
- assert(fp);
- header = bam_header_read(fp);
- fpout = bam_dopen(fileno(stdout), "w");
- bam_header_write(fpout, header);
+ fpout = samopen("-", mode_w, fp->header);
fai = fai_load(argv[optind+1]);
b = bam_init1();
- while ((ret = bam_read1(fp, b)) >= 0) {
+ while ((ret = samread(fp, b)) >= 0) {
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
- ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
+ ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
tid = b->core.tid;
}
bam_fillmd1(b, ref, is_equal);
}
- bam_write1(fpout, b);
+ samwrite(fpout, b);
}
bam_destroy1(b);
free(ref);
fai_destroy(fai);
- bam_header_destroy(header);
- bam_close(fp); bam_close(fpout);
+ samclose(fp); samclose(fpout);
return 0;
}
int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
+ if (op_next == BAM_CDEL || op_next == BAM_CINS) {
+ if (k + 2 < c->n_cigar) op_next = bam1_cigar(b)[k+2]&BAM_CIGAR_MASK;
+ else p->is_tail = 1;
+ }
if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
p->is_tail = 1; // tail
} else p->is_tail = 1; // this is the last operation; set tail
if (b->core.flag & buf->flag_mask) return 0;
bam_copy1(&buf->tail->b, b);
buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
- if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) {
- fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n");
- abort();
+ if (b->core.tid < buf->max_tid) {
+ fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+ return -1;
+ }
+ if ((b->core.tid == buf->max_tid) && (buf->tail->beg < buf->max_pos)) {
+ fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+ return -1;
}
buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) {
}
return 0;
}
+
+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
+{
+ bam_plbuf_t *buf;
+ int ret;
+ bam1_t *b;
+ b = bam_init1();
+ buf = bam_plbuf_init(func, func_data);
+ bam_plbuf_set_mask(buf, mask);
+ while ((ret = bam_read1(fp, b)) >= 0)
+ bam_plbuf_push(b, buf);
+ bam_plbuf_push(0, buf);
+ bam_plbuf_destroy(buf);
+ bam_destroy1(b);
+ return 0;
+}
printf("%d\t%d\t", rms_mapq, n);
printf("%s\t%s\t", r->s[0], r->s[1]);
//printf("%d\t%d\t", r->gl[0], r->gl[1]);
- printf("%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_anti);
+ printf("%d\t%d\t%d\t", r->cnt1, r->cnt2, r->cnt_anti);
+ printf("%d\t%d\n", r->cnt_ref, r->cnt_ambi);
bam_maqindel_ret_destroy(r);
}
return 0;
fprintf(stderr, " -i only show lines/consensus with indels\n");
fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask);
fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ);
- fprintf(stderr, " -t FILE list of reference sequences (assume the input is in SAM)\n");
+ fprintf(stderr, " -t FILE list of reference sequences (force -S)\n");
fprintf(stderr, " -l FILE list of sites at which pileup is output\n");
fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n");
fprintf(stderr, " -c output the maq consensus sequence\n");
}
if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)))
fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n");
+ if (fn_fa && is_SAM && fn_list == 0) fn_list = samfaipath(fn_fa);
+
{
samfile_t *fp;
fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0);
{
bamFile in, out;
if (argc < 3) {
- fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
+ fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n\n");
+ fprintf(stderr, "Note: Picard is recommended for this task.\n");
return 1;
}
in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
samfile_t *in, *out;
buffer_t *buf;
if (argc < 3) {
- fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
+ fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n\n");
+ fprintf(stderr, "Note: Picard is recommended for this task.\n");
return 1;
}
buf = calloc(1, sizeof(buffer_t));
static inline int heap_lt(const heap1_t a, const heap1_t b)
{
if (g_is_by_qname) {
- int t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
+ int t;
+ if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0;
+ t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
return (t > 0 || (t == 0 && a.pos > b.pos));
} else return (a.pos > b.pos);
}
KSORT_INIT(heap, heap1_t, heap_lt)
+static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
+{
+ int tempi;
+ char *temps;
+ tempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi;
+ temps = h1->text, h1->text = h2->text, h2->text = temps;
+}
+
/*!
@abstract Merge multiple sorted BAM.
@param is_by_qname whether to sort by query name
@param out output BAM file name
+ @param headers name of SAM file from which to copy '@' header lines,
+ or NULL to copy them from the first file to be merged
@param n number of files to be merged
@param fn names of files to be merged
@discussion Padding information may NOT correctly maintained. This
function is NOT thread safe.
*/
-void bam_merge_core(int by_qname, const char *out, int n, char * const *fn)
+void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn)
{
bamFile fpout, *fp;
heap1_t *heap;
bam_header_t *hout = 0;
+ bam_header_t *hheaders = NULL;
int i, j;
+ if (headers) {
+ tamFile fpheaders = sam_open(headers);
+ if (fpheaders == 0) {
+ fprintf(stderr, "[bam_merge_core] Cannot open file `%s'. Continue anyway.\n", headers);
+ } else {
+ hheaders = sam_header_read(fpheaders);
+ sam_close(fpheaders);
+ }
+ }
+
g_is_by_qname = by_qname;
fp = (bamFile*)calloc(n, sizeof(bamFile));
heap = (heap1_t*)calloc(n, sizeof(heap1_t));
bam_header_t *hin;
assert(fp[i] = bam_open(fn[i], "r"));
hin = bam_header_read(fp[i]);
- if (i == 0) hout = hin;
- else { // validate multiple baf
+ if (i == 0) { // the first SAM
+ hout = hin;
+ if (hheaders) {
+ // If the text headers to be swapped in include any @SQ headers,
+ // check that they are consistent with the existing binary list
+ // of reference information.
+ if (hheaders->n_targets > 0) {
+ if (hout->n_targets != hheaders->n_targets)
+ fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers);
+ for (j = 0; j < hout->n_targets; ++j)
+ if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0)
+ fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers);
+ }
+ swap_header_text(hout, hheaders);
+ bam_header_destroy(hheaders);
+ hheaders = NULL;
+ }
+ } else { // validate multiple baf
if (hout->n_targets != hin->n_targets) {
fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]);
exit(1);
hout->target_name[j], hin->target_name[j], fn[i]);
exit(1);
}
- if (hout->target_len[j] != hin->target_len[j])
- fprintf(stderr, "[bam_merge_core] different target sequence length: %d != %d in file '%s'. Continue.\n",
- hout->target_len[j], hin->target_len[j], fn[i]);
}
bam_header_destroy(hin);
}
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->b;
bam_write1_core(fpout, &b->core, b->data_len, b->data);
- if ((j = bam_read1(fp[heap->i], b)) >= 0)
+ if ((j = bam_read1(fp[heap->i], b)) >= 0) {
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b);
- else if (j == -1) heap->pos = HEAP_EMPTY;
- else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+ } else if (j == -1) {
+ heap->pos = HEAP_EMPTY;
+ free(heap->b->data); free(heap->b);
+ heap->b = 0;
+ } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
ks_heapadjust(heap, 0, n, heap);
}
- for (i = 0; i != n; ++i) {
- bam_close(fp[i]);
- free(heap[i].b->data);
- free(heap[i].b);
- }
+ for (i = 0; i != n; ++i) bam_close(fp[i]);
bam_close(fpout);
free(fp); free(heap);
}
int bam_merge(int argc, char *argv[])
{
int c, is_by_qname = 0;
- while ((c = getopt(argc, argv, "n")) >= 0) {
+ char *fn_headers = NULL;
+
+ while ((c = getopt(argc, argv, "h:n")) >= 0) {
switch (c) {
+ case 'h': fn_headers = strdup(optarg); break;
case 'n': is_by_qname = 1; break;
}
}
if (optind + 2 >= argc) {
- fprintf(stderr, "Usage: samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools merge [-n] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
+ fprintf(stderr, "Options: -n sort by read names\n");
+ fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n");
+ fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
+ fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n");
+ fprintf(stderr, " the header dictionary in merging.\n\n");
return 1;
}
- bam_merge_core(is_by_qname, argv[optind], argc - optind - 1, argv + optind + 1);
+ bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1);
+ free(fn_headers);
return 0;
}
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
- bam_merge_core(is_by_qname, fnout, n, fns);
+ bam_merge_core(is_by_qname, fnout, 0, n, fns);
free(fnout);
for (i = 0; i < n; ++i) {
unlink(fns[i]);
if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \
if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \
if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \
- if ((c)->flag & BAM_FMUNMAP) ++(s)->n_sgltn; \
+ if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \
if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
++(s)->n_pair_map; \
if ((c)->mtid != (c)->tid) { \
-#ifndef _NO_CURSES
+#undef _HAVE_CURSES
+
+#if _CURSES_LIB == 0
+#elif _CURSES_LIB == 1
#include <curses.h>
-#ifdef NCURSES_VERSION
+#ifndef NCURSES_VERSION
+#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled"
+#else
+#define _HAVE_CURSES
+#endif
+#elif _CURSES_LIB == 2
+#include <xcurses.h>
+#define _HAVE_CURSES
+#else
+#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled"
+#endif
+
+#ifdef _HAVE_CURSES
#include <ctype.h>
#include <assert.h>
#include <string.h>
faidx_t *fai;
bam_maqcns_t *bmc;
- int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins;
+ int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name;
char *ref;
} tview_t;
// print referece
rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
for (i = tv->last_pos + 1; i < pos; ++i) {
- if (i%10 == 0) mvprintw(0, tv->ccol, "%-d", i+1);
+ if (i%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", i+1);
c = tv->ref? tv->ref[i - tv->left_pos] : 'N';
mvaddch(1, tv->ccol++, c);
}
- if (pos%10 == 0) mvprintw(0, tv->ccol, "%-d", pos+1);
+ if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1);
// print consensus
call = bam_maqcns_call(n, pl, tv->bmc);
attr = A_UNDERLINE;
c = bam_aux_getCSi(p->b, p->qpos);
// assume that if we found one color, we will be able to get the color error
if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.';
- }
- else {
- c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
- if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+ } else {
+ if (tv->show_name) {
+ char *name = bam1_qname(p->b);
+ c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
+ } else {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+ if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+ }
}
} else c = '*';
} else { // padding
if (j > p->indel) c = '*';
else { // insertion
if (tv->base_for == TV_BASE_NUCL) {
- c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
- if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
- }
- else {
+ if (tv->show_name) {
+ char *name = bam1_qname(p->b);
+ c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
+ } else {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+ if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+ }
+ } else {
c = bam_aux_getCSi(p->b, p->qpos + j);
if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.';
}
clear();
noecho();
cbreak();
-#ifdef NCURSES_VERSION
+ tv->mrow = 24; tv->mcol = 80;
getmaxyx(stdscr, tv->mrow, tv->mcol);
-#else
- tv->mrow = 80; tv->mcol = 40;
-#endif
tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(27, 40, 5, 5);
+ tv->whelp = newwin(29, 40, 5, 5);
tv->color_for = TV_COLOR_MAPQ;
start_color();
init_pair(1, COLOR_BLUE, COLOR_BLACK);
int tv_fetch_func(const bam1_t *b, void *data)
{
tview_t *tv = (tview_t*)data;
+ if (tv->no_skip) {
+ uint32_t *cigar = bam1_cigar(b); // this is cheating...
+ int i;
+ for (i = 0; i <b->core.n_cigar; ++i) {
+ if ((cigar[i]&0xf) == BAM_CREF_SKIP)
+ cigar[i] = cigar[i]>>4<<4 | BAM_CDEL;
+ }
+ }
bam_lplbuf_push(b, tv->lplbuf);
return 0;
}
bam_lplbuf_reset(tv->lplbuf);
bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
bam_lplbuf_push(0, tv->lplbuf);
+
+ while (tv->ccol < tv->mcol) {
+ int pos = tv->last_pos + 1;
+ if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1);
+ mvaddch(1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N');
+ ++tv->last_pos;
+ }
return 0;
}
mvwprintw(win, r++, 2, "c Color for cs color");
mvwprintw(win, r++, 2, "z Color for cs qual");
mvwprintw(win, r++, 2, ". Toggle on/off dot view");
+ mvwprintw(win, r++, 2, "s Toggle on/off ref skip");
+ mvwprintw(win, r++, 2, "r Toggle on/off rd name");
mvwprintw(win, r++, 2, "N Turn on nt view");
mvwprintw(win, r++, 2, "C Turn on cs view");
mvwprintw(win, r++, 2, "i Toggle on/off ins");
tid = tv->curr_tid; pos = tv->left_pos;
while (1) {
int c = getch();
- //if(256 < c) {c = 1 + (c%256);} // Terminal was displaying ctrl-H as 263 via ssh from Mac OS X 10.5 computer
switch (c) {
case '?': tv_win_help(tv); break;
case '\033':
case 'n': tv->color_for = TV_COLOR_NUCL; break;
case 'c': tv->color_for = TV_COLOR_COL; break;
case 'z': tv->color_for = TV_COLOR_COLQ; break;
+ case 's': tv->no_skip = !tv->no_skip; break;
+ case 'r': tv->show_name = !tv->show_name; break;
case KEY_LEFT:
case 'h': --pos; break;
case KEY_RIGHT:
case 'k': ++tv->row_shift; break;
case KEY_BACKSPACE:
case '\177': pos -= tv->mcol; break;
-#ifdef KEY_RESIZE
case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
-#endif
default: continue;
}
if (pos < 0) pos = 0;
tv_destroy(tv);
return 0;
}
-#else // #ifdef NCURSES_VERSION
-#warning "The ncurses library is unavailable; tview is disabled."
+#else // #ifdef _HAVE_CURSES
+#include <stdio.h>
+#warning "No curses library is available; tview is disabled."
int bam_tview_main(int argc, char *argv[])
{
fprintf(stderr, "[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n");
return 1;
}
-#endif
-#endif // #ifndef _NO_CURSES
+#endif // #ifdef _HAVE_CURSES
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
+#include <fcntl.h>
#include "bam.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.5c (r385)"
+#define PACKAGE_VERSION "0.1.6 (r453)"
#endif
int bam_taf2baf(int argc, char *argv[]);
fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n");
fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
fprintf(stderr, "Usage: samtools <command> [options]\n\n");
- fprintf(stderr, "Command: import import from SAM (obsolete; use `view')\n");
- fprintf(stderr, " view export to the text format\n");
+ fprintf(stderr, "Command: view SAM<->BAM conversion\n");
fprintf(stderr, " sort sort alignment file\n");
- fprintf(stderr, " merge merge multiple sorted alignment files\n");
fprintf(stderr, " pileup generate pileup output\n");
fprintf(stderr, " faidx index/extract FASTA\n");
-#ifndef _NO_CURSES
+#if _CURSES_LIB != 0
fprintf(stderr, " tview text alignment viewer\n");
#endif
fprintf(stderr, " index index alignment\n");
fprintf(stderr, " fixmate fix mate information\n");
- fprintf(stderr, " rmdup remove PCR duplicates\n");
fprintf(stderr, " glfview print GLFv3 file\n");
fprintf(stderr, " flagstat simple stats\n");
- fprintf(stderr, " fillmd fill the MD tag and change identical base to =\n");
+ fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n");
+ fprintf(stderr, " merge merge sorted alignments (Picard recommended)\n");
+ fprintf(stderr, " rmdup remove PCR duplicates (Picard recommended)\n");
fprintf(stderr, "\n");
return 1;
}
int main(int argc, char *argv[])
{
+#ifdef _WIN32
+ setmode(fileno(stdout), O_BINARY);
+ setmode(fileno(stdin), O_BINARY);
+#ifdef _USE_KNETFILE
+ knet_win32_init();
+#endif
+#endif
if (argc < 2) return usage();
if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1);
else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1);
else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);
+ else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
-#ifndef _NO_CURSES
+#if _CURSES_LIB != 0
else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
#endif
else {
-/*
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2008 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever.
- * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
- * or functionality.
- */
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
/*
2009-06-29 by lh3: cache recent uncompressed blocks.
} cache_t;
KHASH_MAP_INIT_INT64(cache, cache_t)
+#if defined(_WIN32) || defined(_MSC_VER)
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else
extern off_t ftello(FILE *stream);
extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
-typedef int8_t byte;
+typedef int8_t bgzf_byte_t;
static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
static const int MAX_BLOCK_SIZE = 64 * 1024;
buffer[3] = value >> 24;
}
-inline
+static inline
int
-min(int x, int y)
+bgzf_min(int x, int y)
{
return (x < y) ? x : y;
}
fp->open_mode = 'r';
fp->x.fpr = file;
#else
- int oflag = O_RDONLY;
- int fd = open(path, oflag);
+ int fd, oflag = O_RDONLY;
+#ifdef _WIN32
+ oflag |= O_BINARY;
+#endif
+ fd = open(path, oflag);
if (fd == -1) return 0;
fp = open_read(fd);
#endif
} else if (mode[0] == 'w' || mode[0] == 'W') {
- int oflag = O_WRONLY | O_CREAT | O_TRUNC;
- int fd = open(path, oflag, 0644);
+ int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+ oflag |= O_BINARY;
+#endif
+ fd = open(path, oflag, 0644);
if (fd == -1) return 0;
fp = open_write(fd, strstr(mode, "u")? 1 : 0);
}
// Deflate the block in fp->uncompressed_block into fp->compressed_block.
// Also adds an extra field that stores the compressed block length.
- byte* buffer = fp->compressed_block;
+ bgzf_byte_t* buffer = fp->compressed_block;
int buffer_size = fp->compressed_block_size;
// Init gzip header
static
int
-check_header(const byte* header)
+check_header(const bgzf_byte_t* header)
{
return (header[0] == GZIP_ID1 &&
- header[1] == (byte) GZIP_ID2 &&
+ header[1] == (bgzf_byte_t) GZIP_ID2 &&
header[2] == Z_DEFLATED &&
(header[3] & FLG_FEXTRA) != 0 &&
unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
int
read_block(BGZF* fp)
{
- byte header[BLOCK_HEADER_LENGTH];
+ bgzf_byte_t header[BLOCK_HEADER_LENGTH];
int size = 0;
#ifdef _USE_KNETFILE
int64_t block_address = knet_tell(fp->x.fpr);
return -1;
}
int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
- byte* compressed_block = (byte*) fp->compressed_block;
+ bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
int remaining = block_length - BLOCK_HEADER_LENGTH;
#ifdef _USE_KNETFILE
}
int bytes_read = 0;
- byte* output = data;
+ bgzf_byte_t* output = data;
while (bytes_read < length) {
int available = fp->block_length - fp->block_offset;
if (available <= 0) {
break;
}
}
- int copy_length = min(length-bytes_read, available);
- byte* buffer = fp->uncompressed_block;
+ int copy_length = bgzf_min(length-bytes_read, available);
+ bgzf_byte_t* buffer = fp->uncompressed_block;
memcpy(output, buffer + fp->block_offset, copy_length);
fp->block_offset += copy_length;
output += copy_length;
fp->uncompressed_block = malloc(fp->uncompressed_block_size);
}
- const byte* input = data;
+ const bgzf_byte_t* input = data;
int block_length = fp->uncompressed_block_size;
int bytes_written = 0;
while (bytes_written < length) {
- int copy_length = min(block_length - fp->block_offset, length - bytes_written);
- byte* buffer = fp->uncompressed_block;
+ int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
+ bgzf_byte_t* buffer = fp->uncompressed_block;
memcpy(buffer + fp->block_offset, input, copy_length);
fp->block_offset += copy_length;
input += copy_length;
if (flush_block(fp) != 0) {
return -1;
}
+ { // add an empty block
+ int count, block_length = deflate_block(fp, 0);
+#ifdef _USE_KNETFILE
+ count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
+#else
+ count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+#endif
+ }
#ifdef _USE_KNETFILE
if (fflush(fp->x.fpw) != 0) {
#else
if (fp) fp->cache_size = cache_size;
}
+int bgzf_check_EOF(BGZF *fp)
+{
+ static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
+ uint8_t buf[28];
+ off_t offset;
+#ifdef _USE_KNETFILE
+ offset = knet_tell(fp->x.fpr);
+ if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
+ knet_read(fp->x.fpr, buf, 28);
+ knet_seek(fp->x.fpr, offset, SEEK_SET);
+#else
+ offset = ftello(fp->file);
+ if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
+ fread(buf, 1, 28, fp->file);
+ fseeko(fp->file, offset, SEEK_SET);
+#endif
+ return (memcmp(magic, buf, 28) == 0)? 1 : 0;
+}
+
int64_t
bgzf_seek(BGZF* fp, int64_t pos, int where)
{
fp->block_offset = block_offset;
return 0;
}
-
-/*
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2008 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever.
- * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
- * or functionality.
- */
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
#ifndef __BGZF_H
#define __BGZF_H
*/
void bgzf_set_cache_size(BGZF *fp, int cache_size);
+int bgzf_check_EOF(BGZF *fp);
+
#ifdef __cplusplus
}
#endif
-/*
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2008 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever.
- * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
- * or functionality.
- */
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#ifndef _NO_RAZF
#include "razf.h"
#else
+#ifdef _WIN32
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else
extern off_t ftello(FILE *stream);
extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
#define RAZF FILE
#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
#define razf_open(fn, mode) fopen(fn, mode)
faidx1_t x;
k = kh_get(s, fai->hash, fai->name[i]);
x = kh_value(fai->hash, k);
+#ifdef _WIN32
+ fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
+#else
fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
+#endif
}
}
faidx_t *fai;
char *buf, *p;
int len, line_len, line_blen;
+#ifdef _WIN32
+ long offset;
+#else
long long offset;
+#endif
fai = (faidx_t*)calloc(1, sizeof(faidx_t));
fai->hash = kh_init(s);
buf = (char*)calloc(0x10000, 1);
while (!feof(fp) && fgets(buf, 0x10000, fp)) {
for (p = buf; *p && isgraph(*p); ++p);
*p = 0; ++p;
+#ifdef _WIN32
+ sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
+#else
sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
+#endif
fai_insert_index(fai, buf, len, line_len, line_blen, offset);
}
free(buf);
}
fai = fai_build_core(rz);
razf_close(rz);
- fp = fopen(str, "w");
+ fp = fopen(str, "wb");
if (fp == 0) {
fprintf(stderr, "[fai_build] fail to write FASTA index.\n");
fai_destroy(fai); free(str);
faidx_t *fai;
str = (char*)calloc(strlen(fn) + 5, 1);
sprintf(str, "%s.fai", fn);
- fp = fopen(str, "r");
+ fp = fopen(str, "rb");
if (fp == 0) {
fprintf(stderr, "[fai_load] build FASTA index.\n");
fai_build(fn);
}
fai = fai_read(fp);
fclose(fp);
- fai->rz = razf_open(fn, "r");
+ fai->rz = razf_open(fn, "rb");
free(str);
if (fai->rz == 0) {
fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/* Probably I will not do socket programming in the next few years and
+ therefore I decide to heavily annotate this file, for Linux and
+ Windows as well. -lh3 */
+
#include <time.h>
#include <stdio.h>
-#include <netdb.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
+
+#ifdef _WIN32
+#include <winsock.h>
+#else
+#include <netdb.h>
#include <arpa/inet.h>
#include <sys/socket.h>
+#endif
+
#include "knetfile.h"
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* This function tests if the file handler is ready for reading (or
+ * writing if is_read==0). */
static int socket_wait(int fd, int is_read)
{
fd_set fds, *fdr = 0, *fdw = 0;
return ret;
}
+#ifndef _WIN32
+/* This function does not work with Windows due to the lack of
+ * getaddrinfo() in winsock. It is addapted from an example in "Beej's
+ * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
+static int socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
+
+ int on = 1, fd;
+ struct linger lng = { 0, 0 };
+ struct addrinfo hints, *res;
+ memset(&hints, 0, sizeof(struct addrinfo));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ /* In Unix/Mac, getaddrinfo() is the most convenient way to get
+ * server information. */
+ if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
+ if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+ /* The following two setsockopt() are used by ftplib
+ * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
+ * necessary. */
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+ if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+ if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+ freeaddrinfo(res);
+ return fd;
+}
+#else
+/* MinGW's printf has problem with "%lld" */
+char *uint64tostr(char *buf, uint64_t x)
+{
+ int i, cnt;
+ for (i = 0; x; x /= 10) buf[i++] = '0' + x%10;
+ buf[i] = 0;
+ for (cnt = i, i = 0; i < cnt/2; ++i) {
+ int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
+ }
+ return buf;
+}
+/* In windows, the first thing is to establish the TCP connection. */
+int knet_win32_init()
+{
+ WSADATA wsaData;
+ return WSAStartup(MAKEWORD(2, 2), &wsaData);
+}
+void knet_win32_destroy()
+{
+ WSACleanup();
+}
+/* A slightly modfied version of the following function also works on
+ * Mac (and presummably Linux). However, this function is not stable on
+ * my Mac. It sometimes works fine but sometimes does not. Therefore for
+ * non-Windows OS, I do not use this one. */
+static SOCKET socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); return -1; } while (0)
+
+ int on = 1;
+ SOCKET fd;
+ struct linger lng = { 0, 0 };
+ struct sockaddr_in server;
+ struct hostent *hp = 0;
+ // open socket
+ if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
+ if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+ // get host info
+ if (isalpha(host[0])) hp = gethostbyname(host);
+ else {
+ struct in_addr addr;
+ addr.s_addr = inet_addr(host);
+ hp = gethostbyaddr((char*)&addr, 4, AF_INET);
+ }
+ if (hp == 0) __err_connect("gethost");
+ // connect
+ server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
+ server.sin_family= AF_INET;
+ server.sin_port = htons(atoi(port));
+ if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
+ // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
+ return fd;
+}
+#endif
+
+static off_t my_netread(int fd, void *buf, off_t len)
+{
+ off_t rest = len, curr, l = 0;
+ /* recv() and read() may not read the required length of data with
+ * one call. They have to be called repeatedly. */
+ while (rest) {
+ if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
+ curr = netread(fd, buf + l, rest);
+ /* According to the glibc manual, section 13.2, a zero returned
+ * value indicates end-of-file (EOF), which should mean that
+ * read() will not return zero if EOF has not been met but data
+ * are not immediately available. */
+ if (curr == 0) break;
+ l += curr; rest -= curr;
+ }
+ return l;
+}
+
+/*************************
+ * FTP specific routines *
+ *************************/
+
static int kftp_get_response(knetFile *ftp)
{
unsigned char c;
int n = 0;
char *p;
if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
- while (read(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
+ while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
//fputc(c, stderr);
if (n >= ftp->max_response) {
ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
{
if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
- write(ftp->ctrl_fd, cmd, strlen(cmd));
+ netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
return is_get? kftp_get_response(ftp) : 0;
}
return 0;
}
+
static int kftp_pasv_connect(knetFile *ftp)
{
-#define __err_pasv_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
-
- struct addrinfo hints, *res;
- struct linger lng = { 0, 0 };
- int on = 1;
char host[80], port[10];
-
if (ftp->pasv_port == 0) {
fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
return -1;
}
- memset(&hints, 0, sizeof(struct addrinfo));
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_STREAM;
sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
sprintf(port, "%d", ftp->pasv_port);
- if (getaddrinfo(host, port, &hints, &res) != 0) { perror("getaddrinfo"); return -1; }
- if ((ftp->fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_pasv_connect("socket");
- if (setsockopt(ftp->fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_pasv_connect("setsockopt");
- if (setsockopt(ftp->fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_pasv_connect("setsockopt");
- if (connect(ftp->fd, res->ai_addr, res->ai_addrlen) != 0) __err_pasv_connect("connect");
- freeaddrinfo(res);
+ ftp->fd = socket_connect(host, port);
+ if (ftp->fd == -1) return -1;
return 0;
}
int kftp_connect(knetFile *ftp)
{
-#define __err_connect(func) do { perror(func); return -1; } while (0)
-
- int on = 1;
- { // open socket
- struct addrinfo hints, *res;
- memset(&hints, 0, sizeof(struct addrinfo));
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_STREAM;
- if (getaddrinfo(ftp->host, "21", &hints, &res) != 0) __err_connect("getaddrinfo");
- if ((ftp->ctrl_fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
- if (setsockopt(ftp->ctrl_fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
- if (connect(ftp->ctrl_fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
- freeaddrinfo(res);
- kftp_get_response(ftp);
- }
- { // login
- kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
- kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
- kftp_send_cmd(ftp, "TYPE I\r\n", 1);
- }
+ ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
+ if (ftp->ctrl_fd == -1) return -1;
+ kftp_get_response(ftp);
+ kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
+ kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
+ kftp_send_cmd(ftp, "TYPE I\r\n", 1);
return 0;
}
int kftp_reconnect(knetFile *ftp)
{
- if (ftp->ctrl_fd >= 0) {
- close(ftp->ctrl_fd);
+ if (ftp->ctrl_fd != -1) {
+ netclose(ftp->ctrl_fd);
ftp->ctrl_fd = -1;
}
- close(ftp->fd);
+ netclose(ftp->fd);
return kftp_connect(ftp);
}
fp = calloc(1, sizeof(knetFile));
fp->type = KNF_TYPE_FTP;
fp->fd = -1;
+ /* the Linux/Mac version of socket_connect() also recognizes a port
+ * like "ftp", but the Windows version does not. */
+ fp->port = strdup("21");
fp->host = calloc(l + 1, 1);
if (strchr(mode, 'c')) fp->no_reconnect = 1;
strncpy(fp->host, fn + 6, l);
int kftp_connect_file(knetFile *fp)
{
int ret;
- if (fp->fd >= 0) {
- close(fp->fd);
+ if (fp->fd != -1) {
+ netclose(fp->fd);
if (fp->no_reconnect) kftp_get_response(fp);
}
kftp_pasv_prep(fp);
if (fp->offset) {
char tmp[32];
+#ifndef _WIN32
sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
+#else
+ strcpy(tmp, "REST ");
+ uint64tostr(tmp + 5, fp->offset);
+ strcat(tmp, "\r\n");
+#endif
kftp_send_cmd(fp, tmp, 1);
}
kftp_send_cmd(fp, fp->retr, 0);
ret = kftp_get_response(fp);
if (ret != 150) {
fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
- close(fp->fd);
+ netclose(fp->fd);
fp->fd = -1;
return -1;
}
return 0;
}
+/**************************
+ * HTTP specific routines *
+ **************************/
+
+knetFile *khttp_parse_url(const char *fn, const char *mode)
+{
+ knetFile *fp;
+ char *p, *proxy, *q;
+ int l;
+ if (strstr(fn, "http://") != fn) return 0;
+ // set ->http_host
+ for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+ l = p - fn - 7;
+ fp = calloc(1, sizeof(knetFile));
+ fp->http_host = calloc(l + 1, 1);
+ strncpy(fp->http_host, fn + 7, l);
+ fp->http_host[l] = 0;
+ for (q = fp->http_host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0;
+ // get http_proxy
+ proxy = getenv("http_proxy");
+ // set ->host, ->port and ->path
+ if (proxy == 0) {
+ fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
+ fp->port = strdup(*q? q : "80");
+ fp->path = strdup(*p? p : "/");
+ } else {
+ fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+ for (q = fp->host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0;
+ fp->port = strdup(*q? q : "80");
+ fp->path = strdup(fn);
+ }
+ fp->type = KNF_TYPE_HTTP;
+ fp->ctrl_fd = fp->fd = -1;
+ fp->seek_offset = -1;
+ return fp;
+}
+
+int khttp_connect_file(knetFile *fp)
+{
+ int ret, l = 0;
+ char *buf, *p;
+ if (fp->fd != -1) netclose(fp->fd);
+ fp->fd = socket_connect(fp->host, fp->port);
+ buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
+ l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
+ if (fp->offset)
+ l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
+ l += sprintf(buf + l, "\r\n");
+ netwrite(fp->fd, buf, l);
+ l = 0;
+ while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
+ if (buf[l] == '\n' && l >= 3)
+ if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+ ++l;
+ }
+ buf[l] = 0;
+ if (l < 14) { // prematured header
+ netclose(fp->fd);
+ fp->fd = -1;
+ return -1;
+ }
+ ret = strtol(buf + 8, &p, 0); // HTTP return code
+ if (ret == 200 && fp->offset) { // 200 (complete result); then skip beginning of the file
+ off_t rest = fp->offset;
+ while (rest) {
+ off_t l = rest < 0x10000? rest : 0x10000;
+ rest -= my_netread(fp->fd, buf, l);
+ }
+ } else if (ret != 206 && ret != 200) {
+ free(buf);
+ fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
+ netclose(fp->fd);
+ fp->fd = -1;
+ return -1;
+ }
+ free(buf);
+ fp->is_ready = 1;
+ return 0;
+}
+
+/********************
+ * Generic routines *
+ ********************/
+
knetFile *knet_open(const char *fn, const char *mode)
{
knetFile *fp = 0;
return 0;
}
kftp_connect_file(fp);
- if (fp->fd < 0) {
- knet_close(fp);
- return 0;
- }
- } else {
+ } else if (strstr(fn, "http://") == fn) {
+ fp = khttp_parse_url(fn, mode);
+ if (fp == 0) return 0;
+ khttp_connect_file(fp);
+ } else { // local file
+#ifdef _WIN32
+ /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
+ * be undefined on some systems, although it is defined on my
+ * Mac and the Linux I have tested on. */
+ int fd = open(fn, O_RDONLY | O_BINARY);
+#else
int fd = open(fn, O_RDONLY);
+#endif
if (fd == -1) {
perror("open");
return 0;
fp = (knetFile*)calloc(1, sizeof(knetFile));
fp->type = KNF_TYPE_LOCAL;
fp->fd = fd;
+ fp->ctrl_fd = -1;
+ }
+ if (fp && fp->fd == -1) {
+ knet_close(fp);
+ return 0;
}
return fp;
}
off_t knet_read(knetFile *fp, void *buf, off_t len)
{
off_t l = 0;
- if (fp->fd < 0) return 0;
- if (fp->type == KNF_TYPE_LOCAL) {
- off_t rest = len, curr;
- while (rest) {
- curr = read(fp->fd, buf + l, rest);
- if (curr == 0) break;
- l += curr; rest -= curr;
- }
- fp->offset += l;
- } else {
- off_t rest = len, curr;
+ if (fp->fd == -1) return 0;
+ if (fp->type == KNF_TYPE_FTP) {
if (fp->is_ready == 0) {
if (!fp->no_reconnect) kftp_reconnect(fp);
kftp_connect_file(fp);
- fp->is_ready = 1;
}
+ } else if (fp->type == KNF_TYPE_HTTP) {
+ if (fp->is_ready == 0)
+ khttp_connect_file(fp);
+ }
+ if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
+ off_t rest = len, curr;
while (rest) {
- if (socket_wait(fp->fd, 1) <= 0) break; // socket is not ready for reading
curr = read(fp->fd, buf + l, rest);
- if (curr == 0) break; // FIXME: end of file or bad network? I do not know...
+ if (curr == 0) break;
l += curr; rest -= curr;
}
- fp->offset += l;
- }
+ } else l = my_netread(fp->fd, buf, len);
+ fp->offset += l;
return l;
}
int knet_seek(knetFile *fp, off_t off, int whence)
{
+ if (whence == SEEK_SET && off == fp->offset) return 0;
if (fp->type == KNF_TYPE_LOCAL) {
- if (lseek(fp->fd, off, whence) == -1) {
+ /* Be aware that lseek() returns the offset after seeking,
+ * while fseek() returns zero on success. */
+ off_t offset = lseek(fp->fd, off, whence);
+ if (offset == -1) {
perror("lseek");
return -1;
}
- fp->offset = off;
+ fp->offset = offset;
return 0;
- }
- if (fp->type == KNF_TYPE_FTP) {
+ } else if (fp->type == KNF_TYPE_FTP || fp->type == KNF_TYPE_HTTP) {
if (whence != SEEK_SET) { // FIXME: we can surely allow SEEK_CUR and SEEK_END in future
- fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP. Offset is unchanged.\n");
+ fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP/HTTP. Offset is unchanged.\n");
return -1;
}
fp->offset = off;
int knet_close(knetFile *fp)
{
if (fp == 0) return 0;
- if (fp->ctrl_fd >= 0) close(fp->ctrl_fd);
- if (fp->fd >= 0) close(fp->fd);
- free(fp->response); free(fp->retr); free(fp->host);
+ if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
+ if (fp->fd != -1) {
+ /* On Linux/Mac, netclose() is an alias of close(), but on
+ * Windows, it is an alias of closesocket(). */
+ if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
+ else netclose(fp->fd);
+ }
+ free(fp->host); free(fp->port);
+ free(fp->response); free(fp->retr); // FTP specific
+ free(fp->path); free(fp->http_host); // HTTP specific
free(fp);
return 0;
}
#ifdef KNETFILE_MAIN
int main(void)
{
- char buf[256];
+ char *buf;
knetFile *fp;
-// fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); knet_seek(fp, 2500000000ll, SEEK_SET);
- fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); knet_seek(fp, 2000, SEEK_SET);
-// fp = knet_open("knetfile.c", "r"); knet_seek(fp, 2000, SEEK_SET);
- knet_read(fp, buf, 255);
- buf[255] = 0;
- printf("%s\n", buf);
+ int type = 4, l;
+#ifdef _WIN32
+ knet_win32_init();
+#endif
+ buf = calloc(0x100000, 1);
+ if (type == 0) {
+ fp = knet_open("knetfile.c", "r");
+ knet_seek(fp, 1000, SEEK_SET);
+ } else if (type == 1) { // NCBI FTP, large file
+ fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
+ knet_seek(fp, 2500000000ll, SEEK_SET);
+ l = knet_read(fp, buf, 255);
+ } else if (type == 2) {
+ fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
+ knet_seek(fp, 1000, SEEK_SET);
+ } else if (type == 3) {
+ fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
+ knet_seek(fp, 1000, SEEK_SET);
+ } else if (type == 4) {
+ fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
+ knet_read(fp, buf, 10000);
+ knet_seek(fp, 20000, SEEK_SET);
+ knet_seek(fp, 10000, SEEK_SET);
+ l = knet_read(fp, buf+10000, 10000000) + 10000;
+ }
+ if (type != 4 && type != 1) {
+ knet_read(fp, buf, 255);
+ buf[255] = 0;
+ printf("%s\n", buf);
+ } else write(fileno(stdout), buf, l);
knet_close(fp);
+ free(buf);
return 0;
}
#endif
#include <stdint.h>
#include <fcntl.h>
+#ifndef _WIN32
+#define netread(fd, ptr, len) read(fd, ptr, len)
+#define netwrite(fd, ptr, len) write(fd, ptr, len)
+#define netclose(fd) close(fd)
+#else
+#include <winsock.h>
+#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
+#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
+#define netclose(fd) closesocket(fd)
+#endif
+
// FIXME: currently I/O is unbuffered
#define KNF_TYPE_LOCAL 1
typedef struct knetFile_s {
int type, fd;
int64_t offset;
- char *host;
+ char *host, *port;
// the following are for FTP only
int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
char *response, *retr;
int64_t seek_offset; // for lazy seek
+
+ // the following are for HTTP only
+ char *path, *http_host;
} knetFile;
#define knet_tell(fp) ((fp)->offset)
extern "C" {
#endif
+#ifdef _WIN32
+ int knet_win32_init();
+ void knet_win32_destroy();
+#endif
+
knetFile *knet_open(const char *fn, const char *mode);
/*
/* Contact: Heng Li <lh3@sanger.ac.uk> */
+/*
+ 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
+ */
+
/* Last Modified: 12APR2009 */
#ifndef AC_KSEQ_H
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
- char *buf; \
+ unsigned char *buf; \
int begin, end, is_eof; \
type_t f; \
} kstream_t;
{ \
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
ks->f = f; \
- ks->buf = (char*)malloc(__bufsize); \
+ ks->buf = malloc(__bufsize); \
return ks; \
} \
static inline void ks_destroy(kstream_t *ks) \
#include <stdio.h>
#include <ctype.h>
#include <string.h>
+#include <stdint.h>
#include "kstring.h"
int ksprintf(kstring_t *s, const char *fmt, ...)
return n;
}
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+int *ksBM_prep(const uint8_t *pat, int m)
+{
+ int i, *suff, *prep, *bmGs, *bmBc;
+ prep = calloc(m + 256, 1);
+ bmGs = prep; bmBc = prep + m;
+ { // preBmBc()
+ for (i = 0; i < 256; ++i) bmBc[i] = m;
+ for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
+ }
+ suff = calloc(m, sizeof(int));
+ { // suffixes()
+ int f = 0, g;
+ suff[m - 1] = m;
+ g = m - 1;
+ for (i = m - 2; i >= 0; --i) {
+ if (i > g && suff[i + m - 1 - f] < i - g)
+ suff[i] = suff[i + m - 1 - f];
+ else {
+ if (i < g) g = i;
+ f = i;
+ while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
+ suff[i] = f - g;
+ }
+ }
+ }
+ { // preBmGs()
+ int j = 0;
+ for (i = 0; i < m; ++i) bmGs[i] = m;
+ for (i = m - 1; i >= 0; --i)
+ if (suff[i] == i + 1)
+ for (; j < m - 1 - i; ++j)
+ if (bmGs[j] == m)
+ bmGs[j] = m - 1 - i;
+ for (i = 0; i <= m - 2; ++i)
+ bmGs[m - 1 - suff[i]] = m - 1 - i;
+ }
+ free(suff);
+ return prep;
+}
+
+int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches)
+{
+ int i, j, *prep, *bmGs, *bmBc;
+ int *matches = 0, mm = 0, nm = 0;
+ prep = _prep? _prep : ksBM_prep(pat, m);
+ bmGs = prep; bmBc = prep + m;
+ j = 0;
+ while (j <= n - m) {
+ for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
+ if (i < 0) {
+ if (nm == mm) {
+ mm = mm? mm<<1 : 1;
+ matches = realloc(matches, mm * sizeof(int));
+ }
+ matches[nm++] = j;
+ j += bmGs[0];
+ } else {
+ int max = bmBc[str[i+j]] - m + 1 + i;
+ if (max < bmGs[i]) max = bmGs[i];
+ j += max;
+ }
+ }
+ *n_matches = nm;
+ if (_prep == 0) free(prep);
+ return matches;
+}
+
#ifdef KSTRING_MAIN
#include <stdio.h>
int main()
for (i = 0; i < n; ++i)
printf("field[%d] = '%s'\n", i, s->s + fields[i]);
free(s);
+
+ {
+ static char *str = "abcdefgcdg";
+ static char *pat = "cd";
+ int n, *matches;
+ matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n);
+ printf("%d: \n", n);
+ for (i = 0; i < n; ++i)
+ printf("- %d\n", matches[i]);
+ free(matches);
+ }
return 0;
}
#endif
#include <stdlib.h>
#include <string.h>
+#include <stdint.h>
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
int ksprintf(kstring_t *s, const char *fmt, ...);
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+// calculate the auxiliary array, allocated by calloc()
+int *ksBM_prep(const uint8_t *pat, int m);
+
+/* Search pat in str and returned the list of matches. The size of the
+ * list is returned as n_matches. _prep is the array returned by
+ * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */
+int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches);
+
static inline int kputsn(const char *p, int l, kstring_t *s)
{
if (s->l + l + 1 >= s->m) {
#include <stdlib.h>
#include <assert.h>
-#define PACKAGE_VERSION "0.1.2 (20090521)"
+#define PACKAGE_VERSION "r439"
//#define MAQ_LONGREADS
else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1;
else c = m1->pos&1;
}
- flag |= c;
+ if (c) flag |= 0x20;
}
- if (flag) {
+ if (m1->flag) {
int l = strlen(m1->name);
if (m1->name[l-2] == '/') {
flag |= (m1->name[l-1] == '1')? 0x40 : 0x80;
/*
- **********************************************************************
- ** md5.c **
- ** RSA Data Security, Inc. MD5 Message Digest Algorithm **
- ** Created: 2/17/90 RLR **
- ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version **
- **********************************************************************
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
*/
+/* Brutally hacked by John Walker back from ANSI C to K&R (no
+ prototypes) to maintain the tradition that Netfone will compile
+ with Sun's original "cc". */
+
+#include <string.h>
+#include "md5.h"
+
+#ifndef HIGHFIRST
+#define byteReverse(buf, len) /* Nothing */
+#else
/*
- **********************************************************************
- ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
- ** **
- ** License to copy and use this software is granted provided that **
- ** it is identified as the "RSA Data Security, Inc. MD5 Message **
- ** Digest Algorithm" in all material mentioning or referencing this **
- ** software or this function. **
- ** **
- ** License is also granted to make and use derivative works **
- ** provided that such works are identified as "derived from the RSA **
- ** Data Security, Inc. MD5 Message Digest Algorithm" in all **
- ** material mentioning or referencing the derived work. **
- ** **
- ** RSA Data Security, Inc. makes no representations concerning **
- ** either the merchantability of this software or the suitability **
- ** of this software for any particular purpose. It is provided "as **
- ** is" without express or implied warranty of any kind. **
- ** **
- ** These notices must be retained in any copies of any part of this **
- ** documentation and/or software. **
- **********************************************************************
+ * Note: this code is harmless on little-endian machines.
*/
+void byteReverse(buf, longs)
+ unsigned char *buf; unsigned longs;
+{
+ uint32_t t;
+ do {
+ t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
+ ((unsigned) buf[1] << 8 | buf[0]);
+ *(uint32_t *) buf = t;
+ buf += 4;
+ } while (--longs);
+}
+#endif
+
+void MD5Transform(uint32_t buf[4], uint32_t in[16]);
-#include "md5.h"
-/* forward declaration */
-static void Transform ();
-
-static unsigned char PADDING[64] = {
- 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-/* F, G and H are basic MD5 functions: selection, majority, parity */
-#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
-#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-#define I(x, y, z) ((y) ^ ((x) | (~z)))
-
-/* ROTATE_LEFT rotates x left n bits */
-#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
-
-/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
-/* Rotation is separate from addition to prevent recomputation */
-#define FF(a, b, c, d, x, s, ac) \
- {(a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define GG(a, b, c, d, x, s, ac) \
- {(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define HH(a, b, c, d, x, s, ac) \
- {(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define II(a, b, c, d, x, s, ac) \
- {(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-
-void MD5Init (mdContext)
-MD5_CTX *mdContext;
+/*
+ * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(ctx)
+ struct MD5Context *ctx;
{
- mdContext->i[0] = mdContext->i[1] = (UINT4)0;
-
- /* Load magic initialization constants.
- */
- mdContext->buf[0] = (UINT4)0x67452301;
- mdContext->buf[1] = (UINT4)0xefcdab89;
- mdContext->buf[2] = (UINT4)0x98badcfe;
- mdContext->buf[3] = (UINT4)0x10325476;
+ ctx->buf[0] = 0x67452301;
+ ctx->buf[1] = 0xefcdab89;
+ ctx->buf[2] = 0x98badcfe;
+ ctx->buf[3] = 0x10325476;
+
+ ctx->bits[0] = 0;
+ ctx->bits[1] = 0;
}
-void MD5Update (mdContext, inBuf, inLen)
-MD5_CTX *mdContext;
-unsigned char *inBuf;
-unsigned int inLen;
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(ctx, buf, len)
+ struct MD5Context *ctx; unsigned char *buf; unsigned len;
{
- UINT4 in[16];
- int mdi;
- unsigned int i, ii;
-
- /* compute number of bytes mod 64 */
- mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
-
- /* update number of bits */
- if ((mdContext->i[0] + ((UINT4)inLen << 3)) < mdContext->i[0])
- mdContext->i[1]++;
- mdContext->i[0] += ((UINT4)inLen << 3);
- mdContext->i[1] += ((UINT4)inLen >> 29);
-
- while (inLen--) {
- /* add new character to buffer, increment mdi */
- mdContext->in[mdi++] = *inBuf++;
-
- /* transform if necessary */
- if (mdi == 0x40) {
- for (i = 0, ii = 0; i < 16; i++, ii += 4)
- in[i] = (((UINT4)mdContext->in[ii+3]) << 24) |
- (((UINT4)mdContext->in[ii+2]) << 16) |
- (((UINT4)mdContext->in[ii+1]) << 8) |
- ((UINT4)mdContext->in[ii]);
- Transform (mdContext->buf, in);
- mdi = 0;
+ uint32_t t;
+
+ /* Update bitcount */
+
+ t = ctx->bits[0];
+ if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
+ ctx->bits[1]++; /* Carry from low to high */
+ ctx->bits[1] += len >> 29;
+
+ t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
+
+ /* Handle any leading odd-sized chunks */
+
+ if (t) {
+ unsigned char *p = (unsigned char *) ctx->in + t;
+
+ t = 64 - t;
+ if (len < t) {
+ memcpy(p, buf, len);
+ return;
+ }
+ memcpy(p, buf, t);
+ byteReverse(ctx->in, 16);
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+ buf += t;
+ len -= t;
+ }
+ /* Process data in 64-byte chunks */
+
+ while (len >= 64) {
+ memcpy(ctx->in, buf, 64);
+ byteReverse(ctx->in, 16);
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+ buf += 64;
+ len -= 64;
}
- }
+
+ /* Handle any remaining bytes of data. */
+
+ memcpy(ctx->in, buf, len);
}
-void MD5Final (mdContext)
-MD5_CTX *mdContext;
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void MD5Final(digest, ctx)
+ unsigned char digest[16]; struct MD5Context *ctx;
{
- UINT4 in[16];
- int mdi;
- unsigned int i, ii;
- unsigned int padLen;
-
- /* save number of bits */
- in[14] = mdContext->i[0];
- in[15] = mdContext->i[1];
-
- /* compute number of bytes mod 64 */
- mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
-
- /* pad out to 56 mod 64 */
- padLen = (mdi < 56) ? (56 - mdi) : (120 - mdi);
- MD5Update (mdContext, PADDING, padLen);
-
- /* append length in bits and transform */
- for (i = 0, ii = 0; i < 14; i++, ii += 4)
- in[i] = (((UINT4)mdContext->in[ii+3]) << 24) |
- (((UINT4)mdContext->in[ii+2]) << 16) |
- (((UINT4)mdContext->in[ii+1]) << 8) |
- ((UINT4)mdContext->in[ii]);
- Transform (mdContext->buf, in);
-
- /* store buffer in digest */
- for (i = 0, ii = 0; i < 4; i++, ii += 4) {
- mdContext->digest[ii] = (unsigned char)(mdContext->buf[i] & 0xFF);
- mdContext->digest[ii+1] =
- (unsigned char)((mdContext->buf[i] >> 8) & 0xFF);
- mdContext->digest[ii+2] =
- (unsigned char)((mdContext->buf[i] >> 16) & 0xFF);
- mdContext->digest[ii+3] =
- (unsigned char)((mdContext->buf[i] >> 24) & 0xFF);
- }
+ unsigned count;
+ unsigned char *p;
+
+ /* Compute number of bytes mod 64 */
+ count = (ctx->bits[0] >> 3) & 0x3F;
+
+ /* Set the first char of padding to 0x80. This is safe since there is
+ always at least one byte free */
+ p = ctx->in + count;
+ *p++ = 0x80;
+
+ /* Bytes of padding needed to make 64 bytes */
+ count = 64 - 1 - count;
+
+ /* Pad out to 56 mod 64 */
+ if (count < 8) {
+ /* Two lots of padding: Pad the first block to 64 bytes */
+ memset(p, 0, count);
+ byteReverse(ctx->in, 16);
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+
+ /* Now fill the next block with 56 bytes */
+ memset(ctx->in, 0, 56);
+ } else {
+ /* Pad block to 56 bytes */
+ memset(p, 0, count - 8);
+ }
+ byteReverse(ctx->in, 14);
+
+ /* Append length in bits and transform */
+ ((uint32_t *) ctx->in)[14] = ctx->bits[0];
+ ((uint32_t *) ctx->in)[15] = ctx->bits[1];
+
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+ byteReverse((unsigned char *) ctx->buf, 4);
+ memcpy(digest, ctx->buf, 16);
+ memset(ctx, 0, sizeof(ctx)); /* In case it's sensitive */
}
-/* Basic MD5 step. Transform buf based on in.
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, data, s) \
+ ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x )
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
*/
-static void Transform (buf, in)
-UINT4 *buf;
-UINT4 *in;
+void MD5Transform(buf, in)
+ uint32_t buf[4]; uint32_t in[16];
{
- UINT4 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
-
- /* Round 1 */
-#define S11 7
-#define S12 12
-#define S13 17
-#define S14 22
- FF ( a, b, c, d, in[ 0], S11, 3614090360u); /* 1 */
- FF ( d, a, b, c, in[ 1], S12, 3905402710u); /* 2 */
- FF ( c, d, a, b, in[ 2], S13, 606105819u); /* 3 */
- FF ( b, c, d, a, in[ 3], S14, 3250441966u); /* 4 */
- FF ( a, b, c, d, in[ 4], S11, 4118548399u); /* 5 */
- FF ( d, a, b, c, in[ 5], S12, 1200080426u); /* 6 */
- FF ( c, d, a, b, in[ 6], S13, 2821735955u); /* 7 */
- FF ( b, c, d, a, in[ 7], S14, 4249261313u); /* 8 */
- FF ( a, b, c, d, in[ 8], S11, 1770035416u); /* 9 */
- FF ( d, a, b, c, in[ 9], S12, 2336552879u); /* 10 */
- FF ( c, d, a, b, in[10], S13, 4294925233u); /* 11 */
- FF ( b, c, d, a, in[11], S14, 2304563134u); /* 12 */
- FF ( a, b, c, d, in[12], S11, 1804603682u); /* 13 */
- FF ( d, a, b, c, in[13], S12, 4254626195u); /* 14 */
- FF ( c, d, a, b, in[14], S13, 2792965006u); /* 15 */
- FF ( b, c, d, a, in[15], S14, 1236535329u); /* 16 */
-
- /* Round 2 */
-#define S21 5
-#define S22 9
-#define S23 14
-#define S24 20
- GG ( a, b, c, d, in[ 1], S21, 4129170786u); /* 17 */
- GG ( d, a, b, c, in[ 6], S22, 3225465664u); /* 18 */
- GG ( c, d, a, b, in[11], S23, 643717713u); /* 19 */
- GG ( b, c, d, a, in[ 0], S24, 3921069994u); /* 20 */
- GG ( a, b, c, d, in[ 5], S21, 3593408605u); /* 21 */
- GG ( d, a, b, c, in[10], S22, 38016083u); /* 22 */
- GG ( c, d, a, b, in[15], S23, 3634488961u); /* 23 */
- GG ( b, c, d, a, in[ 4], S24, 3889429448u); /* 24 */
- GG ( a, b, c, d, in[ 9], S21, 568446438u); /* 25 */
- GG ( d, a, b, c, in[14], S22, 3275163606u); /* 26 */
- GG ( c, d, a, b, in[ 3], S23, 4107603335u); /* 27 */
- GG ( b, c, d, a, in[ 8], S24, 1163531501u); /* 28 */
- GG ( a, b, c, d, in[13], S21, 2850285829u); /* 29 */
- GG ( d, a, b, c, in[ 2], S22, 4243563512u); /* 30 */
- GG ( c, d, a, b, in[ 7], S23, 1735328473u); /* 31 */
- GG ( b, c, d, a, in[12], S24, 2368359562u); /* 32 */
-
- /* Round 3 */
-#define S31 4
-#define S32 11
-#define S33 16
-#define S34 23
- HH ( a, b, c, d, in[ 5], S31, 4294588738u); /* 33 */
- HH ( d, a, b, c, in[ 8], S32, 2272392833u); /* 34 */
- HH ( c, d, a, b, in[11], S33, 1839030562u); /* 35 */
- HH ( b, c, d, a, in[14], S34, 4259657740u); /* 36 */
- HH ( a, b, c, d, in[ 1], S31, 2763975236u); /* 37 */
- HH ( d, a, b, c, in[ 4], S32, 1272893353u); /* 38 */
- HH ( c, d, a, b, in[ 7], S33, 4139469664u); /* 39 */
- HH ( b, c, d, a, in[10], S34, 3200236656u); /* 40 */
- HH ( a, b, c, d, in[13], S31, 681279174u); /* 41 */
- HH ( d, a, b, c, in[ 0], S32, 3936430074u); /* 42 */
- HH ( c, d, a, b, in[ 3], S33, 3572445317u); /* 43 */
- HH ( b, c, d, a, in[ 6], S34, 76029189u); /* 44 */
- HH ( a, b, c, d, in[ 9], S31, 3654602809u); /* 45 */
- HH ( d, a, b, c, in[12], S32, 3873151461u); /* 46 */
- HH ( c, d, a, b, in[15], S33, 530742520u); /* 47 */
- HH ( b, c, d, a, in[ 2], S34, 3299628645u); /* 48 */
-
- /* Round 4 */
-#define S41 6
-#define S42 10
-#define S43 15
-#define S44 21
- II ( a, b, c, d, in[ 0], S41, 4096336452u); /* 49 */
- II ( d, a, b, c, in[ 7], S42, 1126891415u); /* 50 */
- II ( c, d, a, b, in[14], S43, 2878612391u); /* 51 */
- II ( b, c, d, a, in[ 5], S44, 4237533241u); /* 52 */
- II ( a, b, c, d, in[12], S41, 1700485571u); /* 53 */
- II ( d, a, b, c, in[ 3], S42, 2399980690u); /* 54 */
- II ( c, d, a, b, in[10], S43, 4293915773u); /* 55 */
- II ( b, c, d, a, in[ 1], S44, 2240044497u); /* 56 */
- II ( a, b, c, d, in[ 8], S41, 1873313359u); /* 57 */
- II ( d, a, b, c, in[15], S42, 4264355552u); /* 58 */
- II ( c, d, a, b, in[ 6], S43, 2734768916u); /* 59 */
- II ( b, c, d, a, in[13], S44, 1309151649u); /* 60 */
- II ( a, b, c, d, in[ 4], S41, 4149444226u); /* 61 */
- II ( d, a, b, c, in[11], S42, 3174756917u); /* 62 */
- II ( c, d, a, b, in[ 2], S43, 718787259u); /* 63 */
- II ( b, c, d, a, in[ 9], S44, 3951481745u); /* 64 */
-
- buf[0] += a;
- buf[1] += b;
- buf[2] += c;
- buf[3] += d;
+ register uint32_t a, b, c, d;
+
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
}
/* lh3: the following code is added by me */
static void md5_one(const char *fn)
{
- unsigned char buf[4096];
+ unsigned char buf[4096], digest[16];
MD5_CTX md5;
int l;
FILE *fp;
MD5Init(&md5);
while ((l = fread(buf, 1, 4096, fp)) > 0)
MD5Update(&md5, buf, l);
- MD5Final(&md5);
+ MD5Final(digest, &md5);
if (fp != stdin) fclose(fp);
for (l = 0; l < 16; ++l)
- printf("%c%c", HEX_STR[md5.digest[l]>>4&0xf], HEX_STR[md5.digest[l]&0xf]);
+ printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
printf(" %s\n", fn);
}
int main(int argc, char *argv[])
/*
- **********************************************************************
- ** md5.h -- Header file for implementation of MD5 **
- ** RSA Data Security, Inc. MD5 Message Digest Algorithm **
- ** Created: 2/17/90 RLR **
- ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version **
- ** Revised (for MD5): RLR 4/27/91 **
- ** -- G modified to have y&~z instead of y&z **
- ** -- FF, GG, HH modified to add in last register done **
- ** -- Access pattern: round 2 works mod 5, round 3 works mod 3 **
- ** -- distinct additive constant for each step **
- ** -- round 4 added, working mod 7 **
- **********************************************************************
- */
+ This file is adapted from a program in this page:
-/*
- **********************************************************************
- ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
- ** **
- ** License to copy and use this software is granted provided that **
- ** it is identified as the "RSA Data Security, Inc. MD5 Message **
- ** Digest Algorithm" in all material mentioning or referencing this **
- ** software or this function. **
- ** **
- ** License is also granted to make and use derivative works **
- ** provided that such works are identified as "derived from the RSA **
- ** Data Security, Inc. MD5 Message Digest Algorithm" in all **
- ** material mentioning or referencing the derived work. **
- ** **
- ** RSA Data Security, Inc. makes no representations concerning **
- ** either the merchantability of this software or the suitability **
- ** of this software for any particular purpose. It is provided "as **
- ** is" without express or implied warranty of any kind. **
- ** **
- ** These notices must be retained in any copies of any part of this **
- ** documentation and/or software. **
- **********************************************************************
+ http://www.fourmilab.ch/md5/
+
+ The original source code does not work on 64-bit machines due to the
+ wrong typedef "uint32". I also added prototypes.
+
+ -lh3
*/
#ifndef MD5_H
#define MD5_H
-#include <stdint.h>
+/* The following tests optimise behaviour on little-endian
+ machines, where there is no need to reverse the byte order
+ of 32 bit words in the MD5 computation. By default,
+ HIGHFIRST is defined, which indicates we're running on a
+ big-endian (most significant byte first) machine, on which
+ the byteReverse function in md5.c must be invoked. However,
+ byteReverse is coded in such a way that it is an identity
+ function when run on a little-endian machine, so calling it
+ on such a platform causes no harm apart from wasting time.
+ If the platform is known to be little-endian, we speed
+ things up by undefining HIGHFIRST, which defines
+ byteReverse as a null macro. Doing things in this manner
+ insures we work on new platforms regardless of their byte
+ order. */
+
+#define HIGHFIRST
+
+#if __LITTLE_ENDIAN__ != 0
+#undef HIGHFIRST
+#endif
-/* typedef a 32 bit type */
-typedef uint32_t UINT4;
+#include <stdint.h>
-/* Data structure for MD5 (Message Digest) computation */
-typedef struct {
- UINT4 i[2]; /* number of _bits_ handled mod 2^64 */
- UINT4 buf[4]; /* scratch buffer */
- unsigned char in[64]; /* input buffer */
- unsigned char digest[16]; /* actual digest after MD5Final call */
-} MD5_CTX;
+struct MD5Context {
+ uint32_t buf[4];
+ uint32_t bits[2];
+ unsigned char in[64];
+};
-#ifdef __cplusplus
-extern "C" {
-#endif
+void MD5Init(struct MD5Context *ctx);
+void MD5Update(struct MD5Context *ctx, unsigned char *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *ctx);
- void MD5Init(MD5_CTX *mdContext);
- void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, unsigned intinLen);
- void MD5Final(MD5_CTX *mdContext);
+/*
+ * This is needed to make RSAREF happy on some MS-DOS compilers.
+ */
+typedef struct MD5Context MD5_CTX;
-#ifdef __cplusplus
-}
-#endif
+/* Define CHECK_HARDWARE_PROPERTIES to have main,c verify
+ byte order and uint32_t settings. */
+#define CHECK_HARDWARE_PROPERTIES
-#endif
+#endif /* !MD5_H */
int l, i, k;
gzFile fp;
kseq_t *seq;
- unsigned char unordered[16];
+ unsigned char unordered[16], digest[16];
for (l = 0; l < 16; ++l) unordered[l] = 0;
fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
}
MD5Init(&md5_one);
MD5Update(&md5_one, (unsigned char*)seq->seq.s, k);
- MD5Final(&md5_one);
+ MD5Final(digest, &md5_one);
for (l = 0; l < 16; ++l) {
- printf("%c%c", HEX_STR[md5_one.digest[l]>>4&0xf], HEX_STR[md5_one.digest[l]&0xf]);
- unordered[l] ^= md5_one.digest[l];
+ printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
+ unordered[l] ^= digest[l];
}
printf(" %s %s\n", fn, seq->name.s);
MD5Update(&md5_all, (unsigned char*)seq->seq.s, k);
}
- MD5Final(&md5_all);
+ MD5Final(digest, &md5_all);
kseq_destroy(seq);
for (l = 0; l < 16; ++l)
- printf("%c%c", HEX_STR[md5_all.digest[l]>>4&0xf], HEX_STR[md5_all.digest[l]&0xf]);
+ printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
printf(" %s >ordered\n", fn);
for (l = 0; l < 16; ++l)
printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]);
--- /dev/null
+#!/usr/bin/perl -w
+
+# Author: lh3
+
+# This script calculates a score using the BLAST scoring
+# system. However, I am not sure how to count gap opens and gap
+# extensions. It seems to me that column 5-8 are not what I am
+# after. This script counts gaps from the last three columns. It does
+# not generate reference skip (N) in the CIGAR as it is not easy to
+# directly tell which gaps correspond to introns.
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+my %opts = (a=>1, b=>3, q=>5, r=>2);
+getopts('a:b:q:r:', \%opts);
+die("Usage: psl2sam.pl [-a $opts{a}] [-b $opts{b}] [-q $opts{q}] [-r $opts{r}] <in.psl>\n") if (@ARGV == 0 && -t STDIN);
+
+my @stack;
+my $last = '';
+my ($a, $b, $q, $r) = ($opts{a}, $opts{b}, $opts{q}, $opts{r});
+while (<>) {
+ next unless (/^\d/);
+ my @t = split;
+ my @s;
+ my $cigar = '';
+ if ($t[8] eq '-') {
+ my $tmp = $t[11];
+ $t[11] = $t[10] - $t[12];
+ $t[12] = $t[10] - $tmp;
+ }
+ @s[0..4] = ($t[9], (($t[8] eq '+')? 0 : 16), $t[13], $t[15]+1, 0);
+ @s[6..10] = ('*', 0, 0, '*', '*');
+ $cigar .= $t[11].'H' if ($t[11]); # 5'-end clipping
+ my @x = split(',', $t[18]);
+ my @y = split(',', $t[19]);
+ my @z = split(',', $t[20]);
+ my ($y0, $z0) = ($y[0], $z[0]);
+ my ($gap_open, $gap_ext) = (0, 0, 0);
+ for (1 .. $t[17]-1) {
+ my $ly = $y[$_] - $y[$_-1] - $x[$_-1];
+ my $lz = $z[$_] - $z[$_-1] - $x[$_-1];
+ if ($ly < $lz) { # del: the reference gap is longer
+ ++$gap_open;
+ $gap_ext += $lz - $ly;
+ $cigar .= ($y[$_] - $y0) . 'M';
+ $cigar .= ($lz - $ly) . 'D';
+ ($y0, $z0) = ($y[$_], $z[$_]);
+ } elsif ($lz < $ly) { # ins: the query gap is longer
+ ++$gap_open;
+ $gap_ext += $ly - $lz;
+ $cigar .= ($z[$_] - $z0) . 'M';
+ $cigar .= ($ly - $lz) . 'I';
+ ($y0, $z0) = ($y[$_], $z[$_]);
+ }
+ }
+ $cigar .= ($t[12] - $y0) . 'M';
+ $cigar .= ($t[10] - $t[12]).'H' if ($t[10] != $t[12]); # 3'-end clipping
+ $s[5] = $cigar;
+ my $score = $a * $t[0] - $b * $t[1] - $q * $gap_open - $r * $gap_ext;
+ $score = 0 if ($score < 0);
+ $s[11] = "AS:i:$score";
+ print join("\t", @s), "\n";
+}
use warnings;
use Getopt::Std;
-my $version = '0.3.2 (r321)';
+my $version = '0.3.3';
&usage if (@ARGV < 1);
my $command = shift(@ARGV);
-my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter);
+my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter,
+ unique=>\&unique, uniqcmp=>\&uniqcmp);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
die(qq/Usage: samtools.pl showALEN <in.sam>\n/) if (@ARGV == 0 && -t STDIN);
while (<>) {
my @t = split;
+ next if (/^\@/ || @t < 11);
my $l = 0;
$_ = $t[5];
- s/(\d+)[SMI]/$l+=$1/eg;
+ s/(\d+)[MI]/$l+=$1/eg;
print join("\t", @t[0..5]), "\t$l\t", join("\t", @t[6..$#t]), "\n";
}
}
sub varFilter {
my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef);
- getopts('pd:D:l:Q:w:W:N:G:', \%opts);
+ getopts('pq:d:D:l:Q:w:W:N:G:', \%opts);
die(qq/
Usage: samtools.pl varFilter [options] <in.cns-pileup>
my @staging; # (indel_filtering_score, flt_tag)
while (<>) {
my @t = split;
- next if ($t[2] eq $t[3] || $t[3] eq '*/*'); # skip non-var sites
+ next if (uc($t[2]) eq uc($t[3]) || $t[3] eq '*/*'); # skip non-var sites
# clear the out-of-range elements
while (@staging) {
last if ($staging[0][2] eq $t[0] && $staging[0][3] + $max_dist >= $t[1]);
}
#
-# varStats
+# unique
#
-sub varStats {
- my %opts = (d=>'', c=>5);
- getopts('d:c:', \%opts);
- die("Usage: samtools.pl varStats [-d dbSNP.snp] [-c $opts{c}] <in.plp.snp>\n") if (@ARGV == 0 && -t STDIN);
- my (@cnt, %hash);
- my $col = $opts{c} - 1;
+sub unique {
+ my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3);
+ getopts('Qf:q:r:a:b:', \%opts);
+ die("Usage: samtools.pl unique [-f $opts{f}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
+ my $last = '';
+ my $recal_Q = !defined($opts{Q});
+ my @a;
while (<>) {
- my @t = split;
- if ($t[2] eq '*') {
+ my $score = -1;
+ print $_ if (/^\@/);
+ $score = $1 if (/AS:i:(\d+)/);
+ my @t = split("\t");
+ next if (@t < 11);
+ if ($score < 0) { # AS tag is unavailable
+ my $cigar = $t[5];
+ my ($mm, $go, $ge) = (0, 0, 0);
+ $cigar =~ s/(\d+)[ID]/++$go,$ge+=$1/eg;
+ $cigar = $t[5];
+ $cigar =~ s/(\d+)M/$mm+=$1/eg;
+ $score = $mm * $opts{a} - $go * $opts{q} - $ge * $opts{r}; # no mismatches...
+ }
+ $score = 1 if ($score < 1);
+ if ($t[0] ne $last) {
+ &unique_aux(\@a, $opts{f}, $recal_Q) if (@a);
+ $last = $t[0];
+ }
+ push(@a, [$score, \@t]);
+ }
+ &unique_aux(\@a, $opts{f}, $recal_Q) if (@a);
+}
+
+sub unique_aux {
+ my ($a, $fac, $is_recal) = @_;
+ my ($max, $max2, $max_i) = (0, 0, -1);
+ for (my $i = 0; $i < @$a; ++$i) {
+ if ($a->[$i][0] > $max) {
+ $max2 = $max; $max = $a->[$i][0]; $max_i = $i;
+ } elsif ($a->[$i][0] > $max2) {
+ $max2 = $a->[$i][0];
+ }
+ }
+ if ($is_recal) {
+ my $q = int($fac * ($max - $max2) / $max + .499);
+ $q = 250 if ($q > 250);
+ $a->[$max_i][1][4] = $q < 250? $q : 250;
+ }
+ print join("\t", @{$a->[$max_i][1]});
+ @$a = ();
+}
+
+#
+# uniqcmp: compare two SAM files
+#
+
+sub uniqcmp {
+ my %opts = (q=>10, s=>100);
+ getopts('pq:s:', \%opts);
+ die("Usage: samtools.pl uniqcmp <in1.sam> <in2.sam>\n") if (@ARGV < 2);
+ my ($fh, %a);
+ warn("[uniqcmp] read the first file...\n");
+ &uniqcmp_aux($ARGV[0], \%a, 0);
+ warn("[uniqcmp] read the second file...\n");
+ &uniqcmp_aux($ARGV[1], \%a, 1);
+ warn("[uniqcmp] stats...\n");
+ my @cnt;
+ $cnt[$_] = 0 for (0..9);
+ for my $x (keys %a) {
+ my $p = $a{$x};
+ my $z;
+ if (defined($p->[0]) && defined($p->[1])) {
+ $z = ($p->[0][0] == $p->[1][0] && $p->[0][1] eq $p->[1][1] && abs($p->[0][2] - $p->[1][2]) < $opts{s})? 0 : 1;
+ if ($p->[0][3] >= $opts{q} && $p->[1][3] >= $opts{q}) {
+ ++$cnt[$z*3+0];
+ } elsif ($p->[0][3] >= $opts{q}) {
+ ++$cnt[$z*3+1];
+ } elsif ($p->[1][3] >= $opts{q}) {
+ ++$cnt[$z*3+2];
+ }
+ print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t",
+ $p->[0][5]-$p->[1][5], "\n" if ($z && defined($opts{p}) && ($p->[0][3] >= $opts{q} || $p->[1][3] >= $opts{q}));
+ } elsif (defined($p->[0])) {
+ ++$cnt[$p->[0][3]>=$opts{q}? 6 : 7];
+ print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t*\t0\t*\t",
+ $p->[0][5], "\n" if (defined($opts{p}) && $p->[0][3] >= $opts{q});
} else {
- my $q = $t[$col];
- $q = 99 if ($q > 99);
- $q = int($q/10);
- my $is_het = ($t[3] =~ /^[ACGT]$/)? 0 : 1;
- ++$cnt[$q][$is_het];
- $hash{$t[0],$t[1]} = $q;
+ print STDERR "$x\t*\t0\t*\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t",
+ -$p->[1][5], "\n" if (defined($opts{p}) && $p->[1][3] >= $opts{q});
+ ++$cnt[$p->[1][3]>=$opts{q}? 8 : 9];
}
}
+ print "Consistent (high, high): $cnt[0]\n";
+ print "Consistent (high, low ): $cnt[1]\n";
+ print "Consistent (low , high): $cnt[2]\n";
+ print "Inconsistent (high, high): $cnt[3]\n";
+ print "Inconsistent (high, low ): $cnt[4]\n";
+ print "Inconsistent (low , high): $cnt[5]\n";
+ print "Second missing (high): $cnt[6]\n";
+ print "Second missing (low ): $cnt[7]\n";
+ print "First missing (high): $cnt[8]\n";
+ print "First missing (low ): $cnt[9]\n";
+}
+
+sub uniqcmp_aux {
+ my ($fn, $a, $which) = @_;
+ my $fh;
+ $fn = "samtools view $fn |" if ($fn =~ /\.bam/);
+ open($fh, $fn) || die;
+ while (<$fh>) {
+ my @t = split;
+ next if (@t < 11);
+# my $l = ($t[5] =~ /^(\d+)S/)? $1 : 0;
+ my $l = 0;
+ my ($x, $nm) = (0, 0);
+ $nm = $1 if (/NM:i:(\d+)/);
+ $_ = $t[5];
+ s/(\d+)[MI]/$x+=$1/eg;
+ @{$a->{$t[0]}[$which]} = (($t[1]&0x10)? 1 : 0, $t[2], $t[3]-$l, $t[4], "$x:$nm", $x - 4 * $nm);
+ }
+ close($fh);
}
#
#!/usr/bin/perl -w
# Contact: lh3
-# Version: 0.1.3
+# Version: 0.1.5
use strict;
use warnings;
exit;
sub wgsim_eval {
- my %opts;
- getopts('pc', \%opts);
- die("Usage: wgsim_eval.pl [-pc] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
+ my %opts = (g=>5);
+ getopts('pcg:', \%opts);
+ die("Usage: wgsim_eval.pl [-pc] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
my (@c0, @c1);
my ($max_q, $flag) = (0, 0);
- my $gap = 5;
+ my $gap = $opts{g};
$flag |= 1 if (defined $opts{p});
$flag |= 2 if (defined $opts{c});
while (<>) {
- my @t = split;
+ next if (/^\@/);
+ my @t = split("\t");
+ next if (@t < 11);
my $line = $_;
my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]);
$max_q = $q if ($q > $max_q);
$_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg;
--$rght;
# correct for soft clipping
- $left -= $1 if (/^(\d+)S/);
- $rght += $1 if (/(\d+)S$/);
+ my ($left0, $rght0) = ($left, $rght);
+ $left -= $1 if (/^(\d+)[SH]/);
+ $rght += $1 if (/(\d+)[SH]$/);
+ $left0 -= $1 if (/(\d+)[SH]$/);
+ $rght0 += $1 if (/^(\d+)[SH]/);
# skip unmapped reads
next if (($t[1]&0x4) || $chr eq '*');
# parse read name and check
} else {
if ($flag & 2) {
if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward
- $is_correct = 0 if (abs($2 - $left) > $gap);
+ $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
} elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse
- $is_correct = 0 if (abs($3 - $rght) > $gap);
+ $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap);
} elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward
- $is_correct = 0 if (abs($3 - $left) > $gap);
+ $is_correct = 0 if (abs($3 - $left) > $gap && abs($3 - $left0) > $gap);
} else { # R3, reverse
- $is_correct = 0 if (abs($2 - $rght) > $gap);
+ $is_correct = 0 if (abs($2 - $rght) > $gap && abs($3 - $rght0) > $gap);
}
} else {
if ($t[1] & 0x10) { # reverse
- $is_correct = 0 if (abs($3 - $rght) > $gap); # in case of indels that are close to the end of a reads
+ $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads
} else {
- $is_correct = 0 if (abs($2 - $left) > $gap);
+ $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
}
}
}
#else
static RAZF* razf_open_w(int fd){
RAZF *rz;
+#ifdef _WIN32
+ setmode(fd, O_BINARY);
+#endif
rz = calloc(1, sizeof(RAZF));
rz->mode = 'w';
rz->filedes = fd;
int n, is_be, ret;
int64_t end;
unsigned char c[] = "RAZF";
+#ifdef _WIN32
+ setmode(fd, O_BINARY);
+#endif
rz = calloc(1, sizeof(RAZF));
rz->mode = 'r';
rz->filedes = fd;
}
RAZF* razf_dopen(int fd, const char *mode){
- if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 1);
- else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd);
+ if(strstr(mode, "r")) return razf_open_r(fd, 1);
+ else if(strstr(mode, "w")) return razf_open_w(fd);
else return NULL;
}
RAZF* razf_dopen2(int fd, const char *mode)
{
- if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 0);
- else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd);
+ if(strstr(mode, "r")) return razf_open_r(fd, 0);
+ else if(strstr(mode, "w")) return razf_open_w(fd);
else return NULL;
}
static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
int fd;
RAZF *rz;
- if(strcasecmp(mode, "r") == 0){
+ if(strstr(mode, "r")){
+#ifdef _WIN32
+ fd = open(filename, O_RDONLY | O_BINARY);
+#else
fd = open(filename, O_RDONLY);
+#endif
rz = razf_open_r(fd, _load_index);
- } else if(strcasecmp(mode, "w") == 0){
+ } else if(strstr(mode, "w")){
+#ifdef _WIN32
+ fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+#else
fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+#endif
rz = razf_open_w(fd);
} else return NULL;
return rz;
#include <string.h>
+#include <unistd.h>
+#include "faidx.h"
#include "sam.h"
#define TYPE_BAM 1
// open file
fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
if (fp->x.tamr == 0) goto open_err_ret;
+ if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2;
+ else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2;
+ else fp->type |= BAM_OFDEC<<2;
// write header
if (strstr(mode, "h")) {
int i;
if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b);
else {
- char *s = bam_format1(fp->header, b);
+ char *s = bam_format1_core(fp->header, b, fp->type>>2&3);
int l = strlen(s);
fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw);
free(s);
bam_destroy1(b);
return 0;
}
+
+char *samfaipath(const char *fn_ref)
+{
+ char *fn_list = 0;
+ if (fn_ref == 0) return 0;
+ fn_list = calloc(strlen(fn_ref) + 5, 1);
+ strcat(strcpy(fn_list, fn_ref), ".fai");
+ if (access(fn_list, R_OK) == -1) { // fn_list is unreadable
+ if (access(fn_ref, R_OK) == -1) {
+ fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
+ } else {
+ fprintf(stderr, "[samfaipath] build FASTA index...\n");
+ if (fai_build(fn_ref) == -1) {
+ fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
+ free(fn_list); fn_list = 0;
+ }
+ }
+ }
+ return fn_list;
+}
/*! @typedef
@abstract SAM/BAM file handler
- @field type type of the handler; bit 1 for BAM and bit 2 for reading
+ @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format
@field bam BAM file handler; valid if (type&1) == 1
@field tamr SAM file handler for reading; valid if type == 2
@field tamw SAM file handler for writing; valid if type == 0
@param fn SAM/BAM file name; "-" is recognized as stdin (for
reading) or stdout (for writing).
- @param mode open mode /[rw](b?)(u?)(h?)/: 'r' for reading, 'w' for
- writing, 'b' for BAM I/O, 'u' for uncompressed BAM output and 'h'
- for outputing header in SAM. If 'b' present, it must immediately
- follow 'r' or 'w'. Valid modes are "r", "w", "wh", "rb", "wb" and
- "wbu" exclusively.
+ @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading,
+ 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output,
+ 'h' for outputing header in SAM, 'x' for HEX flag and 'X' for
+ string flag. If 'b' present, it must immediately follow 'r' or
+ 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX",
+ "rb", "wb" and "wbu" exclusively.
@param aux auxiliary data; if mode[0]=='w', aux points to
- bam_header_t; if strcmp(mode, "rb")==0 and @SQ header lines in SAM
+ bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM
are absent, aux points the file name of the list of the reference;
- aux is not used otherwise.
+ aux is not used otherwise. If @SQ header lines are present in SAM,
+ aux is not used, either.
@return SAM/BAM file handler
*/
*/
int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data);
+ char *samfaipath(const char *fn_ref);
+
#ifdef __cplusplus
}
#endif
#include <stdio.h>
#include <unistd.h>
#include "sam.h"
+#include "faidx.h"
static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;
static char *g_library, *g_rg;
return 0;
}
-static int usage(void);
+static int usage(int is_long_help);
int main_samview(int argc, char *argv[])
{
int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0;
+ int of_type = BAM_OFDEC, is_long_help = 0;
samfile_t *in = 0, *out = 0;
- char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0;
+ char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:")) >= 0) {
+ while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:")) >= 0) {
switch (c) {
case 'S': is_bamin = 0; break;
case 'b': is_bamout = 1; break;
case 'u': is_uncompressed = 1; break;
case 'l': g_library = strdup(optarg); break;
case 'r': g_rg = strdup(optarg); break;
- default: return usage();
+ case 'x': of_type = BAM_OFHEX; break;
+ case 'X': of_type = BAM_OFSTR; break;
+ case '?': is_long_help = 1; break;
+ case 'T': fn_ref = strdup(optarg); is_bamin = 0; break;
+ default: return usage(is_long_help);
}
}
if (is_uncompressed) is_bamout = 1;
if (is_header_only) is_header = 1;
if (is_bamout) strcat(out_mode, "b");
+ else {
+ if (of_type == BAM_OFHEX) strcat(out_mode, "x");
+ else if (of_type == BAM_OFSTR) strcat(out_mode, "X");
+ }
if (is_bamin) strcat(in_mode, "b");
if (is_header) strcat(out_mode, "h");
if (is_uncompressed) strcat(out_mode, "u");
- if (argc == optind) return usage();
+ if (argc == optind) return usage(is_long_help);
+ // generate the fn_list if necessary
+ if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
// open file handlers
if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
fprintf(stderr, "[main_samview] fail to open file for reading.\n");
goto view_end;
}
+ if (in->header == 0) {
+ fprintf(stderr, "[main_samview] fail to read the header.\n");
+ goto view_end;
+ }
if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
fprintf(stderr, "[main_samview] fail to open file for writing.\n");
goto view_end;
view_end:
// close files, free and return
- free(fn_list); free(fn_out); free(g_library); free(g_rg);
+ free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg);
samclose(in);
samclose(out);
return ret;
}
-static int usage()
+static int usage(int is_long_help)
{
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]\n\n");
fprintf(stderr, " -H print header only (no alignments)\n");
fprintf(stderr, " -S input is SAM\n");
fprintf(stderr, " -u uncompressed BAM output (force -b)\n");
+ fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n");
+ fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n");
fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n");
+ fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n");
fprintf(stderr, " -o FILE output file name [stdout]\n");
fprintf(stderr, " -f INT required flag, 0 for unset [0]\n");
fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n");
fprintf(stderr, " -q INT minimum mapping quality [0]\n");
fprintf(stderr, " -l STR only output reads in library STR [null]\n");
fprintf(stderr, " -r STR only output reads in read group STR [null]\n");
- fprintf(stderr, "\n\
-Notes:\n\
+ fprintf(stderr, " -? longer help\n");
+ fprintf(stderr, "\n");
+ if (is_long_help)
+ fprintf(stderr, "Notes:\n\
\n\
1. By default, this command assumes the file on the command line is in\n\
the BAM format and it prints the alignments in SAM. If `-t' is\n\
corresponding sequence length. The `.fai' file generated by `faidx'\n\
can be used here. This file may be empty if reads are unaligned.\n\
\n\
- 2. SAM->BAM conversion: `samtools view -bt ref.fa.fai in.sam.gz'.\n\
+ 2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\
\n\
3. BAM->SAM conversion: `samtools view in.bam'.\n\
\n\
\n\
5. Option `-u' is preferred over `-b' when the output is piped to\n\
another samtools command.\n\
+\n\
+ 6. In a string FLAG, each character represents one bit with\n\
+ p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\n\
+ U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\n\
+ 1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \n\
+ f=0x200 (failure) and d=0x400 (duplicate). Note that `-x' and\n\
+ `-X' are samtools-C specific. Picard and older samtools do not\n\
+ support HEX or string flags.\n\
\n");
return 1;
}
-.TH samtools 1 "6 July 2009" "samtools-0.1.5" "Bioinformatics tools"
+.TH samtools 1 "2 September 2009" "samtools-0.1.6" "Bioinformatics tools"
.SH NAME
.PP
samtools - Utilities for the Sequence Alignment/Map (SAM) format
pipes. Samtools always output warning and error messages to the standard
error output (stderr).
-Samtools is also able to open a BAM (not SAM) file on a remote FTP
-server if the BAM file name starts with `ftp://'. Samtools checks the
-current working directory for the index file and will download the index
-upon absence. Samtools achieves random FTP file access with the `REST'
-ftp command. It does not retrieve the entire alignment file unless it is
-asked to do so.
+Samtools is also able to open a BAM (not SAM) file on a remote FTP or
+HTTP server if the BAM file name starts with `ftp://' or `http://'.
+Samtools checks the current working directory for the index file and
+will download the index upon absence. Samtools does not retrieve the
+entire alignment file unless it is asked to do so.
.SH COMMANDS AND OPTIONS
.TP
.B merge
-samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]
-
-Merge multiple sorted alignments. The header of
-.I <in1.bam>
+samtools merge [-h inh.sam] [-n] <out.bam> <in1.bam> <in2.bam> [...]
+
+Merge multiple sorted alignments.
+The header reference lists of all the input BAM files, and the @SQ headers of
+.IR inh.sam ,
+if any, must all refer to the same set of reference sequences.
+The header reference list and (unless overridden by
+.BR -h )
+`@' headers of
+.I in1.bam
will be copied to
-.I <out.bam>
+.IR out.bam ,
and the headers of other files will be ignored.
.B OPTIONS:
.RS
.TP 8
+.B -h FILE
+Use the lines of
+.I FILE
+as `@' headers to be copied to
+.IR out.bam ,
+replacing any header lines that would otherwise be copied from
+.IR in1.bam .
+.RI ( FILE
+is actually in SAM format, though any alignment records it may contain
+are ignored.)
+.TP
.B -n
The input alignments are sorted by read names rather than by chromosomal
coordinates
Text alignment viewer (based on the ncurses library). In the viewer,
press `?' for help and press `g' to check the alignment start from a
-region in the format like `chr10:10,000,000'. Note that if the region
-showed on the screen contains no mapped reads, a blank screen will be
-seen. This is a known issue and will be improved later.
+region in the format like `chr10:10,000,000'.
.RE
Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c.
.IP o 2
CIGAR operation P is not properly handled at the moment.
+.IP o 2
+In merging, the input files are required to have the same number of
+reference sequences. The requirement can be relaxed. In addition,
+merging does not reconstruct the header dictionaries
+automatically. Endusers have to provide the correct header. Picard is
+better at merging.
+.IP o 2
+Samtools' rmdup does not work for single-end data and does not remove
+duplicates across chromosomes. Picard is better.
.SH AUTHOR
.PP
.SH SEE ALSO
.PP
-Samtools website: http://samtools.sourceforge.net
+Samtools website: <http://samtools.sourceforge.net>
--- /dev/null
+samtools(1) Bioinformatics tools samtools(1)
+
+
+
+NAME
+ samtools - Utilities for the Sequence Alignment/Map (SAM) format
+
+SYNOPSIS
+ samtools view -bt ref_list.txt -o aln.bam aln.sam.gz
+
+ samtools sort aln.bam aln.sorted
+
+ samtools index aln.sorted.bam
+
+ samtools view aln.sorted.bam chr2:20,100,000-20,200,000
+
+ samtools merge out.bam in1.bam in2.bam in3.bam
+
+ samtools faidx ref.fasta
+
+ samtools pileup -f ref.fasta aln.sorted.bam
+
+ samtools tview aln.sorted.bam ref.fasta
+
+
+DESCRIPTION
+ Samtools is a set of utilities that manipulate alignments in the BAM
+ format. It imports from and exports to the SAM (Sequence Alignment/Map)
+ format, does sorting, merging and indexing, and allows to retrieve
+ reads in any regions swiftly.
+
+ Samtools is designed to work on a stream. It regards an input file `-'
+ as the standard input (stdin) and an output file `-' as the standard
+ output (stdout). Several commands can thus be combined with Unix pipes.
+ Samtools always output warning and error messages to the standard error
+ output (stderr).
+
+ Samtools is also able to open a BAM (not SAM) file on a remote FTP or
+ HTTP server if the BAM file name starts with `ftp://' or `http://'.
+ Samtools checks the current working directory for the index file and
+ will download the index upon absence. Samtools does not retrieve the
+ entire alignment file unless it is asked to do so.
+
+
+COMMANDS AND OPTIONS
+ import samtools import <in.ref_list> <in.sam> <out.bam>
+
+ Since 0.1.4, this command is an alias of:
+
+ samtools view -bt <in.ref_list> -o <out.bam> <in.sam>
+
+
+ sort samtools sort [-n] [-m maxMem] <in.bam> <out.prefix>
+
+ Sort alignments by leftmost coordinates. File <out.pre-
+ fix>.bam will be created. This command may also create tempo-
+ rary files <out.prefix>.%d.bam when the whole alignment can-
+ not be fitted into memory (controlled by option -m).
+
+ OPTIONS:
+
+ -n Sort by read names rather than by chromosomal coordi-
+ nates
+
+ -m INT Approximately the maximum required memory.
+ [500000000]
+
+
+ merge samtools merge [-h inh.sam] [-n] <out.bam> <in1.bam>
+ <in2.bam> [...]
+
+ Merge multiple sorted alignments. The header reference lists
+ of all the input BAM files, and the @SQ headers of inh.sam,
+ if any, must all refer to the same set of reference
+ sequences. The header reference list and (unless overridden
+ by -h) `@' headers of in1.bam will be copied to out.bam, and
+ the headers of other files will be ignored.
+
+ OPTIONS:
+
+ -h FILE Use the lines of FILE as `@' headers to be copied to
+ out.bam, replacing any header lines that would other-
+ wise be copied from in1.bam. (FILE is actually in
+ SAM format, though any alignment records it may con-
+ tain are ignored.)
+
+ -n The input alignments are sorted by read names rather
+ than by chromosomal coordinates
+
+
+ index samtools index <aln.bam>
+
+ Index sorted alignment for fast random access. Index file
+ <aln.bam>.bai will be created.
+
+
+ view samtools view [-bhuHS] [-t in.refList] [-o output] [-f
+ reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r read-
+ Group] <in.bam>|<in.sam> [region1 [...]]
+
+ Extract/print all or sub alignments in SAM or BAM format. If
+ no region is specified, all the alignments will be printed;
+ otherwise only alignments overlapping the specified regions
+ will be output. An alignment may be given multiple times if
+ it is overlapping several regions. A region can be presented,
+ for example, in the following format: `chr2', `chr2:1000000'
+ or `chr2:1,000,000-2,000,000'. The coordinate is 1-based.
+
+ OPTIONS:
+
+ -b Output in the BAM format.
+
+ -u Output uncompressed BAM. This option saves time spent
+ on compression/decomprssion and is thus preferred
+ when the output is piped to another samtools command.
+
+ -h Include the header in the output.
+
+ -H Output the header only.
+
+ -S Input is in SAM. If @SQ header lines are absent, the
+ `-t' option is required.
+
+ -t FILE This file is TAB-delimited. Each line must contain
+ the reference name and the length of the reference,
+ one line for each distinct reference; additional
+ fields are ignored. This file also defines the order
+ of the reference sequences in sorting. If you run
+ `samtools faidx <ref.fa>', the resultant index file
+ <ref.fa>.fai can be used as this <in.ref_list> file.
+
+ -o FILE Output file [stdout]
+
+ -f INT Only output alignments with all bits in INT present
+ in the FLAG field. INT can be in hex in the format of
+ /^0x[0-9A-F]+/ [0]
+
+ -F INT Skip alignments with bits present in INT [0]
+
+ -q INT Skip alignments with MAPQ smaller than INT [0]
+
+ -l STR Only output reads in library STR [null]
+
+ -r STR Only output reads in read group STR [null]
+
+
+ faidx samtools faidx <ref.fasta> [region1 [...]]
+
+ Index reference sequence in the FASTA format or extract sub-
+ sequence from indexed reference sequence. If no region is
+ specified, faidx will index the file and create
+ <ref.fasta>.fai on the disk. If regions are speficified, the
+ subsequences will be retrieved and printed to stdout in the
+ FASTA format. The input file can be compressed in the RAZF
+ format.
+
+
+ pileup samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l
+ in.site_list] [-iscgS2] [-T theta] [-N nHap] [-r
+ pairDiffRate] <in.bam>|<in.sam>
+
+ Print the alignment in the pileup format. In the pileup for-
+ mat, each line represents a genomic position, consisting of
+ chromosome name, coordinate, reference base, read bases, read
+ qualities and alignment mapping qualities. Information on
+ match, mismatch, indel, strand, mapping quality and start and
+ end of a read are all encoded at the read base column. At
+ this column, a dot stands for a match to the reference base
+ on the forward strand, a comma for a match on the reverse
+ strand, `ACGTN' for a mismatch on the forward strand and
+ `acgtn' for a mismatch on the reverse strand. A pattern
+ `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
+ between this reference position and the next reference posi-
+ tion. The length of the insertion is given by the integer in
+ the pattern, followed by the inserted sequence. Similarly, a
+ pattern `-[0-9]+[ACGTNacgtn]+' represents a deletion from the
+ reference. The deleted bases will be presented as `*' in the
+ following lines. Also at the read base column, a symbol `^'
+ marks the start of a read segment which is a contiguous sub-
+ sequence on the read separated by `N/S/H' CIGAR operations.
+ The ASCII of the character following `^' minus 33 gives the
+ mapping quality. A symbol `$' marks the end of a read seg-
+ ment.
+
+ If option -c is applied, the consensus base, consensus qual-
+ ity, SNP quality and RMS mapping quality of the reads cover-
+ ing the site will be inserted between the `reference base'
+ and the `read bases' columns. An indel occupies an additional
+ line. Each indel line consists of chromosome name, coordi-
+ nate, a star, the genotype, consensus quality, SNP quality,
+ RMS mapping quality, # covering reads, the first alllele, the
+ second allele, # reads supporting the first allele, # reads
+ supporting the second allele and # reads containing indels
+ different from the top two alleles.
+
+ OPTIONS:
+
+
+ -s Print the mapping quality as the last column. This
+ option makes the output easier to parse, although
+ this format is not space efficient.
+
+
+ -S The input file is in SAM.
+
+
+ -i Only output pileup lines containing indels.
+
+
+ -f FILE The reference sequence in the FASTA format. Index
+ file FILE.fai will be created if absent.
+
+
+ -M INT Cap mapping quality at INT [60]
+
+
+ -t FILE List of reference names ane sequence lengths, in
+ the format described for the import command. If
+ this option is present, samtools assumes the input
+ <in.alignment> is in SAM format; otherwise it
+ assumes in BAM format.
+
+
+ -l FILE List of sites at which pileup is output. This file
+ is space delimited. The first two columns are
+ required to be chromosome and 1-based coordinate.
+ Additional columns are ignored. It is recommended
+ to use option -s together with -l as in the default
+ format we may not know the mapping quality.
+
+
+ -c Call the consensus sequence using MAQ consensus
+ model. Options -T, -N, -I and -r are only effective
+ when -c or -g is in use.
+
+
+ -g Generate genotype likelihood in the binary GLFv3
+ format. This option suppresses -c, -i and -s.
+
+
+ -T FLOAT The theta parameter (error dependency coefficient)
+ in the maq consensus calling model [0.85]
+
+
+ -N INT Number of haplotypes in the sample (>=2) [2]
+
+
+ -r FLOAT Expected fraction of differences between a pair of
+ haplotypes [0.001]
+
+
+ -I INT Phred probability of an indel in sequencing/prep.
+ [40]
+
+
+
+ tview samtools tview <in.sorted.bam> [ref.fasta]
+
+ Text alignment viewer (based on the ncurses library). In the
+ viewer, press `?' for help and press `g' to check the align-
+ ment start from a region in the format like
+ `chr10:10,000,000'.
+
+
+
+ fixmate samtools fixmate <in.nameSrt.bam> <out.bam>
+
+ Fill in mate coordinates, ISIZE and mate related flags from a
+ name-sorted alignment.
+
+
+ rmdup samtools rmdup <input.srt.bam> <out.bam>
+
+ Remove potential PCR duplicates: if multiple read pairs have
+ identical external coordinates, only retain the pair with
+ highest mapping quality. This command ONLY works with FR
+ orientation and requires ISIZE is correctly set.
+
+
+
+ rmdupse samtools rmdupse <input.srt.bam> <out.bam>
+
+ Remove potential duplicates for single-ended reads. This com-
+ mand will treat all reads as single-ended even if they are
+ paired in fact.
+
+
+
+ fillmd samtools fillmd [-e] <aln.bam> <ref.fasta>
+
+ Generate the MD tag. If the MD tag is already present, this
+ command will give a warning if the MD tag generated is dif-
+ ferent from the existing tag.
+
+ OPTIONS:
+
+ -e Convert a the read base to = if it is identical to
+ the aligned reference base. Indel caller does not
+ support the = bases at the moment.
+
+
+
+SAM FORMAT
+ SAM is TAB-delimited. Apart from the header lines, which are started
+ with the `@' symbol, each alignment line consists of:
+
+
+ +----+-------+----------------------------------------------------------+
+ |Col | Field | Description |
+ +----+-------+----------------------------------------------------------+
+ | 1 | QNAME | Query (pair) NAME |
+ | 2 | FLAG | bitwise FLAG |
+ | 3 | RNAME | Reference sequence NAME |
+ | 4 | POS | 1-based leftmost POSition/coordinate of clipped sequence |
+ | 5 | MAPQ | MAPping Quality (Phred-scaled) |
+ | 6 | CIAGR | extended CIGAR string |
+ | 7 | MRNM | Mate Reference sequence NaMe (`=' if same as RNAME) |
+ | 8 | MPOS | 1-based Mate POSistion |
+ | 9 | ISIZE | Inferred insert SIZE |
+ |10 | SEQ | query SEQuence on the same strand as the reference |
+ |11 | QUAL | query QUALity (ASCII-33 gives the Phred base quality) |
+ |12 | OPT | variable OPTional fields in the format TAG:VTYPE:VALUE |
+ +----+-------+----------------------------------------------------------+
+
+ Each bit in the FLAG field is defined as:
+
+
+ +-------+--------------------------------------------------+
+ | Flag | Description |
+ +-------+--------------------------------------------------+
+ |0x0001 | the read is paired in sequencing |
+ |0x0002 | the read is mapped in a proper pair |
+ |0x0004 | the query sequence itself is unmapped |
+ |0x0008 | the mate is unmapped |
+ |0x0010 | strand of the query (1 for reverse) |
+ |0x0020 | strand of the mate |
+ |0x0040 | the read is the first read in a pair |
+ |0x0080 | the read is the second read in a pair |
+ |0x0100 | the alignment is not primary |
+ |0x0200 | the read fails platform/vendor quality checks |
+ |0x0400 | the read is either a PCR or an optical duplicate |
+ +-------+--------------------------------------------------+
+
+LIMITATIONS
+ o Unaligned words used in bam_import.c, bam_endian.h, bam.c and
+ bam_aux.c.
+
+ o CIGAR operation P is not properly handled at the moment.
+
+ o In merging, the input files are required to have the same number of
+ reference sequences. The requirement can be relaxed. In addition,
+ merging does not reconstruct the header dictionaries automatically.
+ Endusers have to provide the correct header. Picard is better at
+ merging.
+
+ o Samtools' rmdup does not work for single-end data and does not remove
+ duplicates across chromosomes. Picard is better.
+
+
+AUTHOR
+ Heng Li from the Sanger Institute wrote the C version of samtools. Bob
+ Handsaker from the Broad Institute implemented the BGZF library and Jue
+ Ruan from Beijing Genomics Institute wrote the RAZF library. Various
+ people in the 1000Genomes Project contributed to the SAM format speci-
+ fication.
+
+
+SEE ALSO
+ Samtools website: <http://samtools.sourceforge.net>
+
+
+
+samtools-0.1.6 2 September 2009 samtools(1)
+++ /dev/null
-digraph {
- faidx[label="faidx.c\n(faidx)"]
- import[label="bam_import.c\n(import)"]
- plcmd[label="bam_plcmd.c\n(pileup)"]
- sort[label="bam_sort.c\n(sort, merge)"]
- index[label="bam_index.c\n(index)"]
- tview[label="bam_tview.c\n(tview)"]
- glf[label="glf.c\n(glfview)"]
- rmdup[label="bam_rmdup.c\n(rmdup)"]
- fixmate[label="bam_mate.c\n(fixmate)"]
- "bam_aux.c" -> {"bam.c", import}
- glf -> {"bam_maqcns.c", plcmd}
- "bgzf.c" -> {"bam.c", glf}
- "bam.c" -> {index, "bam_pileup.c", sort, import, rmdup, fixmate}
- "bam_pileup.c" -> {"bam_lpileup.c", plcmd}
- {"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview
- {import, faidx, "bam_maqcns.c"} -> plcmd
- {tview, plcmd, faidx, sort, import, index, glf, rmdup, fixmate} -> "bamtk.c\n(view)"
-}
\ No newline at end of file