From: Charles Plessy Date: Tue, 8 Sep 2009 10:38:06 +0000 (+0900) Subject: Imported Upstream version 0.1.5c X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=commitdiff_plain;h=b27e00385f41769d03a8cca4dbd71275fc9fa906 Imported Upstream version 0.1.5c --- b27e00385f41769d03a8cca4dbd71275fc9fa906 diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..435431c --- /dev/null +++ b/AUTHORS @@ -0,0 +1,16 @@ +Heng Li from the Sanger Institute wrote most of the initial source code +of SAMtools and various converters. + +Bob Handsaker from the Broad Institute is a major contributor to the +SAM/BAM specification. He designed and implemented the BGZF format, the +underlying indexable compression format for the BAM format. BGZF does +not support arithmetic between file offsets. + +Jue Ruan from the Beijing Genome Institute designed and implemented the +RAZF format, an alternative indexable compression format. RAZF supports +arithmetic between file offsets, at the cost of increased index file +size and the full compatibility with gzip. RAZF is optional and only +used in `faidx' for indexing RAZF compressed fasta files. + +Colin Hercus updated novo2sam.pl to support gapped alignment by +novoalign. diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..82fa2f4 --- /dev/null +++ b/COPYING @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2008-2009 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..3bf82a5 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,2099 @@ +------------------------------------------------------------------------ +r372 | lh3lh3 | 2009-07-07 09:49:27 +0100 (Tue, 07 Jul 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + + * samtools-0.1.4-23 (r372) + * keep header text if "view -t" is used (by Gerton) + +------------------------------------------------------------------------ +r371 | lh3lh3 | 2009-07-07 01:13:32 +0100 (Tue, 07 Jul 2009) | 2 lines +Changed paths: + M /trunk/samtools/samtools.1 + +update documentation + +------------------------------------------------------------------------ +r370 | bhandsaker | 2009-07-02 22:24:34 +0100 (Thu, 02 Jul 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + +Introduced LIBPATH variable so this could be overridden to allow samtools to build correct at the Broad. + +------------------------------------------------------------------------ +r369 | lh3lh3 | 2009-07-02 13:36:53 +0100 (Thu, 02 Jul 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-22 (r369) + * in pileup, optionally print E2 and U2 + * remove the debugging code in bam_aux_get() (Drat!) 
+ +------------------------------------------------------------------------ +r368 | lh3lh3 | 2009-07-02 11:32:26 +0100 (Thu, 02 Jul 2009) | 6 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_rmdup.c + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/faidx.c + M /trunk/samtools/faidx.h + M /trunk/samtools/glf.c + + * samtools-0.1.4-21 (r368) + * propagate errors rather than exit or complain assertion failure. Assertion + should be only used for checking internal bugs, but not for external input + inconsistency. I was just a bit lazy. + * small memory leak may be present on failure, though + +------------------------------------------------------------------------ +r367 | lh3lh3 | 2009-06-30 16:18:42 +0100 (Tue, 30 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +reduce the chance of blocking in FTP connection + +------------------------------------------------------------------------ +r366 | lh3lh3 | 2009-06-30 15:35:21 +0100 (Tue, 30 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +minor changes to knetfile: invalid fd equals -1 rather than 0 + +------------------------------------------------------------------------ +r365 | lh3lh3 | 2009-06-30 14:04:30 +0100 (Tue, 30 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + + * samtools-0.1.4-20 (r365) + * download the BAM index file if it is not found in the current working directory. 
+ +------------------------------------------------------------------------ +r364 | lh3lh3 | 2009-06-30 12:39:07 +0100 (Tue, 30 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/knetfile.c + + * samtools-0.1.4-19 (r364) + * knetfile: report error when the file is not present on FTP + +------------------------------------------------------------------------ +r363 | lh3lh3 | 2009-06-29 23:23:32 +0100 (Mon, 29 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/bgzf.h + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + + * samtools-0.1.4-18 (r363) + * knetfile: do not trigger network communication in FTP seek (lazy seek) + * bgzf: cache recent blocks (disabled by default) + +------------------------------------------------------------------------ +r362 | lh3lh3 | 2009-06-25 21:04:34 +0100 (Thu, 25 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/bgzf.c + +write changelog + +------------------------------------------------------------------------ +r361 | lh3lh3 | 2009-06-25 21:03:10 +0100 (Thu, 25 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-17 (r361) + * if a file is given on FTP, search locally for the BAM index + +------------------------------------------------------------------------ +r360 | lh3lh3 | 2009-06-25 20:44:52 +0100 (Thu, 25 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/bgzf.h + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + + * samtools-0.1.4-16 (r360) + * report more information in index when the input is not sorted + * change the behaviour of knet_seek() such that it returns 0 on success + * support knetfile library in BGZF + 
+------------------------------------------------------------------------ +r359 | lh3lh3 | 2009-06-25 17:10:55 +0100 (Thu, 25 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + +fixed bugs in knetfile.* + +------------------------------------------------------------------------ +r358 | lh3lh3 | 2009-06-25 13:53:19 +0100 (Thu, 25 Jun 2009) | 2 lines +Changed paths: + A /trunk/samtools/knetfile.h + +this is the header file + +------------------------------------------------------------------------ +r357 | lh3lh3 | 2009-06-25 13:52:03 +0100 (Thu, 25 Jun 2009) | 3 lines +Changed paths: + A /trunk/samtools/knetfile.c + + * open a file at FTP + * preliminary version + +------------------------------------------------------------------------ +r354 | lh3lh3 | 2009-06-24 14:02:25 +0100 (Wed, 24 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-15 (r354) + * fixed a memory leak in bam_view1(), although samtools is not using this routine. + +------------------------------------------------------------------------ +r351 | lh3lh3 | 2009-06-18 00:16:26 +0100 (Thu, 18 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/faidx.c + + * samtools-0.1.4-13 (r351) + * make faidx more tolerant to empty lines right before or after > lines + * hope this does not introduce new bugs... 
+ +------------------------------------------------------------------------ +r350 | lh3lh3 | 2009-06-16 14:37:01 +0100 (Tue, 16 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-13 (r350) + * fixed a small memory leak in pileup, caused by recent modifications + +------------------------------------------------------------------------ +r347 | lh3lh3 | 2009-06-13 21:20:49 +0100 (Sat, 13 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-12 (r347) + * added `-S' to pileup, similar to `view -S' + +------------------------------------------------------------------------ +r346 | lh3lh3 | 2009-06-13 17:52:31 +0100 (Sat, 13 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam_view.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.4-11 (r346) + * allow to select a read group at view command-line + +------------------------------------------------------------------------ +r344 | lh3lh3 | 2009-06-13 14:06:24 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/calDepth.c + +added more comments + +------------------------------------------------------------------------ +r343 | lh3lh3 | 2009-06-13 14:01:22 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/calDepth.c + +nothing really + +------------------------------------------------------------------------ +r342 | lh3lh3 | 2009-06-13 13:58:48 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/Makefile + A /trunk/samtools/examples/calDepth.c + +added an example of calculating read depth + +------------------------------------------------------------------------ +r341 | lh3lh3 | 2009-06-13 13:00:08 +0100 (Sat, 13 Jun 2009) | 6 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam.h + M 
/trunk/samtools/bam_aux.c + A /trunk/samtools/bam_color.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam.h + + * samtools-0.1.4-10 (r341) + * only include key APIs in libbam.a + * move color-specific routines to bam_color.c + * update documentations + * remove the support of -q in pileup + +------------------------------------------------------------------------ +r340 | lh3lh3 | 2009-06-13 11:17:14 +0100 (Sat, 13 Jun 2009) | 6 lines +Changed paths: + M /trunk/samtools/INSTALL + M /trunk/samtools/Makefile + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/razf.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-9 (r340) + * added a warning to razf.c if zlib<1.2.2.1 + * fixed a compilation warning + * fixed a segfault caused by @RG parsing + * detect NCURSES in bam_tview.c + +------------------------------------------------------------------------ +r339 | lh3lh3 | 2009-06-13 10:35:19 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/INSTALL + +update INSTALL + +------------------------------------------------------------------------ +r338 | lh3lh3 | 2009-06-13 00:15:24 +0100 (Sat, 13 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kstring.h + M /trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-8 (r338) + * parse the @RG header lines and allow to choose library at the "samtools view" + command line + +------------------------------------------------------------------------ +r337 | lh3lh3 | 2009-06-12 21:25:50 +0100 (Fri, 12 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/bgzf.h + M 
/trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-7 (r337) + * bgzf.c: support mode string "wu": uncompressed output + * "samtools view" support "-u" command-line option + +------------------------------------------------------------------------ +r336 | lh3lh3 | 2009-06-12 17:20:12 +0100 (Fri, 12 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/misc/Makefile + M /trunk/samtools/razf.c + M /trunk/samtools/razf.h + M /trunk/samtools/razip.c + + * no changes to samtools itself + * remove zlib source codes + * make RAZF reading compatible with old version of zlib + * on old version of zlib, writing is not available + +------------------------------------------------------------------------ +r335 | lh3lh3 | 2009-06-12 16:47:33 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + D /trunk/samtools/zlib + +remove zlib for simplification... + +------------------------------------------------------------------------ +r334 | lh3lh3 | 2009-06-12 15:43:36 +0100 (Fri, 12 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-6 (r334) + * do not export bam_aux_get_core() for Bio::DB::Sam because it has already + been implemented in that. 
+ * this version works with the latest Bio::DB::Sam (20090612) + +------------------------------------------------------------------------ +r333 | lh3lh3 | 2009-06-12 15:33:42 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r332 | lh3lh3 | 2009-06-12 15:21:21 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/AUTHORS + M /trunk/samtools/Makefile + M /trunk/samtools/misc/Makefile + +fixed minor things in Makefile + +------------------------------------------------------------------------ +r331 | lh3lh3 | 2009-06-12 15:07:05 +0100 (Fri, 12 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-5 (r331) + * no change to samtools itself. Version number is increased to reflect the + changes in the Makefile building system. + +------------------------------------------------------------------------ +r330 | lh3lh3 | 2009-06-12 15:03:38 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/AUTHORS + D /trunk/samtools/README + +update information... + +------------------------------------------------------------------------ +r329 | lh3lh3 | 2009-06-12 14:52:21 +0100 (Fri, 12 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/novo2sam.pl + + * updated novoalign converter by Colin Hercus et al. 
+ * this version works with indels + +------------------------------------------------------------------------ +r328 | lh3lh3 | 2009-06-12 14:50:53 +0100 (Fri, 12 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/INSTALL + M /trunk/samtools/Makefile + M /trunk/samtools/misc/Makefile + M /trunk/samtools/zlib/Makefile + + * update Makefile + * update INSTALL instruction + +------------------------------------------------------------------------ +r327 | lh3lh3 | 2009-06-12 14:18:29 +0100 (Fri, 12 Jun 2009) | 4 lines +Changed paths: + A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.generic:325) + D /trunk/samtools/Makefile.am + D /trunk/samtools/Makefile.generic + D /trunk/samtools/Makefile.lite + D /trunk/samtools/autogen.sh + D /trunk/samtools/cleanup.sh + D /trunk/samtools/configure.ac + A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.generic:305) + D /trunk/samtools/misc/Makefile.am + D /trunk/samtools/misc/Makefile.generic + M /trunk/samtools/razf.c + A /trunk/samtools/zlib + A /trunk/samtools/zlib/Makefile + A /trunk/samtools/zlib/adler32.c + A /trunk/samtools/zlib/compress.c + A /trunk/samtools/zlib/crc32.c + A /trunk/samtools/zlib/crc32.h + A /trunk/samtools/zlib/deflate.c + A /trunk/samtools/zlib/deflate.h + A /trunk/samtools/zlib/gzio.c + A /trunk/samtools/zlib/infback.c + A /trunk/samtools/zlib/inffast.c + A /trunk/samtools/zlib/inffast.h + A /trunk/samtools/zlib/inffixed.h + A /trunk/samtools/zlib/inflate.c + A /trunk/samtools/zlib/inflate.h + A /trunk/samtools/zlib/inftrees.c + A /trunk/samtools/zlib/inftrees.h + A /trunk/samtools/zlib/trees.c + A /trunk/samtools/zlib/trees.h + A /trunk/samtools/zlib/uncompr.c + A /trunk/samtools/zlib/zconf.h + A /trunk/samtools/zlib/zlib.h + A /trunk/samtools/zlib/zutil.c + A /trunk/samtools/zlib/zutil.h + D /trunk/samtools/zutil.h + + * added zlib-1.2.3 as razip requires that + * prepare to changed back to the Makefile building system + * unfinished! 
(will be soon) + +------------------------------------------------------------------------ +r326 | lh3lh3 | 2009-06-12 14:12:03 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + +Unfinished + +------------------------------------------------------------------------ +r325 | lh3lh3 | 2009-06-10 16:27:59 +0100 (Wed, 10 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-4 (r325) + * further avoid wrong consensus calls in repetitive regions. + +------------------------------------------------------------------------ +r324 | lh3lh3 | 2009-06-10 15:56:17 +0100 (Wed, 10 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam.h + + * samtools-0.1.4-3 (r324) + * make maqcns generate the correct call in repetitive regions. + * allow filtering on mapQ at the pileup command line + +------------------------------------------------------------------------ +r323 | lh3lh3 | 2009-06-10 10:04:21 +0100 (Wed, 10 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.3.2 (r322) + * indels and SNPs use different mapping quality threshold + +------------------------------------------------------------------------ +r322 | lh3lh3 | 2009-06-10 10:03:22 +0100 (Wed, 10 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/export2sam.pl + +fixed a typo + +------------------------------------------------------------------------ +r321 | lh3lh3 | 2009-06-09 09:21:48 +0100 (Tue, 09 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + +just typo. 
no real change + +------------------------------------------------------------------------ +r320 | lh3lh3 | 2009-06-08 14:32:51 +0100 (Mon, 08 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + +a little bit code cleanup + +------------------------------------------------------------------------ +r319 | lh3lh3 | 2009-06-08 14:22:33 +0100 (Mon, 08 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.3.1 + * change default parameters + * optionally print filtered variants + +------------------------------------------------------------------------ +r318 | lh3lh3 | 2009-06-08 14:14:26 +0100 (Mon, 08 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.3.0 + * combine snpFilter and indelFilter + +------------------------------------------------------------------------ +r317 | lh3lh3 | 2009-06-08 11:31:42 +0100 (Mon, 08 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.3 + * change a default parameter + +------------------------------------------------------------------------ +r316 | lh3lh3 | 2009-06-08 11:11:06 +0100 (Mon, 08 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + + * samtools-0.1.4-2 (r316) + * pileup: cap mapping quality at 60 (by default) + * pileup: always calculate RMS mapq + * pileup: allow to output variant sites only + +------------------------------------------------------------------------ +r312 | lh3lh3 | 2009-06-04 13:01:10 +0100 (Thu, 04 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.2 + * added pileup2fq + +------------------------------------------------------------------------ +r311 | lh3lh3 | 2009-06-03 09:40:40 +0100 (Wed, 03 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * in 
snpFilter, suppress non-SNP sites + +------------------------------------------------------------------------ +r310 | lh3lh3 | 2009-06-01 14:35:13 +0100 (Mon, 01 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.1 + * fixed a typo + +------------------------------------------------------------------------ +r309 | lh3lh3 | 2009-06-01 14:04:39 +0100 (Mon, 01 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.0 + * snpFilter + +------------------------------------------------------------------------ +r306 | lh3lh3 | 2009-05-28 11:49:35 +0100 (Thu, 28 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bgzf.c + + * minor changes to bgzf: return NULL if fd == -1 + * suggested by {kdj,jm18}@sanger.ac.uk + +------------------------------------------------------------------------ +r305 | lh3lh3 | 2009-05-28 11:16:08 +0100 (Thu, 28 May 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/interpolate_sam.pl + +Script for paired-end pileup, contributed by Stephen Montgomery. 
+ +------------------------------------------------------------------------ +r304 | lh3lh3 | 2009-05-28 11:08:49 +0100 (Thu, 28 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + + * samtools-0.1.4-1 (r304) + * fixed a minor bug in printing headers + +------------------------------------------------------------------------ +r297 | lh3lh3 | 2009-05-21 16:06:16 +0100 (Thu, 21 May 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/maq2sam.c + M /trunk/samtools/samtools.1 + +Release samtools-0.1.4 + +------------------------------------------------------------------------ +r296 | lh3lh3 | 2009-05-21 12:53:14 +0100 (Thu, 21 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-24 (r296) + * another similar bug in the indel caller + +------------------------------------------------------------------------ +r295 | lh3lh3 | 2009-05-21 12:50:28 +0100 (Thu, 21 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-23 (r295) + * fixed a critical bug in the indel caller + +------------------------------------------------------------------------ +r294 | lh3lh3 | 2009-05-20 13:00:20 +0100 (Wed, 20 May 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_stat.c + +added a missing header file + +------------------------------------------------------------------------ +r293 | lh3lh3 | 2009-05-19 23:44:25 +0100 (Tue, 19 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-22 (r293) + * open tview in the dot-view mode by default + +------------------------------------------------------------------------ +r292 | lh3lh3 | 2009-05-18 21:01:23 +0100 (Mon, 18 May 2009) | 6 lines +Changed paths: + M /trunk/samtools/samtools.1 + +Added a 
note to the manual. Currently SAMtools uses unaligned words in +several places. Although this does not cause bus errors for me, it may +affect portability. Please see the "Bus error" wiki page for more +information. Thanks also to James Bonfield for pointing this out. + + +------------------------------------------------------------------------ +r286 | lh3lh3 | 2009-05-14 15:23:13 +0100 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-21 (r286) + * declare bam_aux_get_core() in bam.h + +------------------------------------------------------------------------ +r276 | lh3lh3 | 2009-05-13 10:07:55 +0100 (Wed, 13 May 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-20 (r276) + * remove bam1_t::hash again. We need to modify the Perl API anyway to + make it work with the latest SVN. + * As suggested by Tim, scan "{base}.bai" and "{base}.bam.bai" for index + +------------------------------------------------------------------------ +r275 | lh3lh3 | 2009-05-12 21:14:10 +0100 (Tue, 12 May 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam.h + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-19 (r275) + * a minor change to the bam1_t struct: added back "void *hash" for the + backward compatibility with Bio::DB::Sam + +------------------------------------------------------------------------ +r273 | lh3lh3 | 2009-05-12 14:28:39 +0100 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_rmdupse.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-18 (r273) + * rmdupse: do not remove unmapped reads + +------------------------------------------------------------------------ +r272 | lh3lh3 | 2009-05-12 14:20:00 +0100 (Tue, 12 May 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_rmdupse.c + +change a parameter. 
It does nothing + +------------------------------------------------------------------------ +r271 | lh3lh3 | 2009-05-12 14:17:58 +0100 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile.am + M /trunk/samtools/Makefile.generic + M /trunk/samtools/Makefile.lite + A /trunk/samtools/bam_rmdupse.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/configure.ac + + * samtools-0.1.3-17 (r271) + * added 'rmdupse' command + +------------------------------------------------------------------------ +r267 | lh3lh3 | 2009-05-05 22:31:41 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.3-16 (r267) + * in sam_view.c, changed g_flag_on based on the suggestion by Angie Hinrichs + +------------------------------------------------------------------------ +r266 | lh3lh3 | 2009-05-05 22:23:27 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-15 (r266) + * report an error if a non-* reference is present while @SQ is absent + +------------------------------------------------------------------------ +r265 | lh3lh3 | 2009-05-05 22:09:00 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.3-14 (r262) + * make samopen() recognize @SQ header lines + +------------------------------------------------------------------------ +r261 | lh3lh3 | 2009-05-05 15:10:30 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.3-13 (r260) + * report error for file I/O error + +------------------------------------------------------------------------ +r260 | lh3lh3 | 2009-05-05 15:01:16 +0100 (Tue, 05 May 2009) | 2 
lines +Changed paths: + M /trunk/samtools/Makefile.am + +update Makefile.am + +------------------------------------------------------------------------ +r259 | lh3lh3 | 2009-05-05 14:52:25 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam.h + + * samtools-0.1.3-12 (r259) + * use the new I/O interface in pileup + +------------------------------------------------------------------------ +r258 | lh3lh3 | 2009-05-05 14:33:22 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile.generic + M /trunk/samtools/Makefile.lite + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + A /trunk/samtools/sam.c + A /trunk/samtools/sam.h + A /trunk/samtools/sam_view.c + + * samtools-0.1.3-11 (r258) + * unify the interface to BAM and SAM I/O + +------------------------------------------------------------------------ +r257 | lh3lh3 | 2009-05-05 09:53:35 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile.lite + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-10 (r257) + * allow hex with "pileup -m" + +------------------------------------------------------------------------ +r256 | lh3lh3 | 2009-05-04 19:16:50 +0100 (Mon, 04 May 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-9 (r256) + * fixed a bug in bam_lpileup.c + * I do not know if this also fixes the bug causing assertion failure in the tview + +------------------------------------------------------------------------ +r251 | lh3lh3 | 2009-04-28 13:53:23 +0100 (Tue, 28 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-8 (r251) + * fixed a bug when there are reads without coordinates + 
+------------------------------------------------------------------------ +r250 | lh3lh3 | 2009-04-28 13:43:33 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + A /trunk/samtools/AUTHORS + A /trunk/samtools/README + M /trunk/samtools/cleanup.sh + +added missing files + +------------------------------------------------------------------------ +r249 | lh3lh3 | 2009-04-28 13:37:16 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.generic + M /trunk/samtools/Makefile.lite + M /trunk/samtools/configure.ac + M /trunk/samtools/misc/Makefile.generic + +improve large file support in compilation + +------------------------------------------------------------------------ +r248 | lh3lh3 | 2009-04-28 13:33:24 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/INSTALL + +update INSTALL + +------------------------------------------------------------------------ +r247 | lh3lh3 | 2009-04-28 13:28:50 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.am + M /trunk/samtools/autogen.sh + M /trunk/samtools/cleanup.sh + M /trunk/samtools/configure.ac + A /trunk/samtools/misc/Makefile.am + +fixed various issues about the GNU building scripts + +------------------------------------------------------------------------ +r246 | lh3lh3 | 2009-04-28 13:10:23 +0100 (Tue, 28 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + D /trunk/samtools/Makefile + A /trunk/samtools/Makefile.am + A /trunk/samtools/Makefile.generic + A /trunk/samtools/autogen.sh + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + A /trunk/samtools/cleanup.sh + A /trunk/samtools/configure.ac + D /trunk/samtools/misc/Makefile + A /trunk/samtools/misc/Makefile.generic (from /trunk/samtools/misc/Makefile:245) + + * samtools-0.1.3-7 (r246) + * incorporated revisions from Nils Homer + * enhanced support of displaying color-space reads + 
+------------------------------------------------------------------------ +r244 | lh3lh3 | 2009-04-25 11:49:40 +0100 (Sat, 25 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-6 (r244) + * fixed segfault for unmapped reads + +------------------------------------------------------------------------ +r243 | lh3lh3 | 2009-04-24 21:27:26 +0100 (Fri, 24 Apr 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-5 (r243) + * fixed a long existing bug which may cause memory leak + * check MD + * consensus calling now works with "=", but indel calling not + +------------------------------------------------------------------------ +r242 | lh3lh3 | 2009-04-24 20:44:46 +0100 (Fri, 24 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-4 (r242) + * fixed a memory leak + +------------------------------------------------------------------------ +r240 | lh3lh3 | 2009-04-24 16:40:18 +0100 (Fri, 24 Apr 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/Makefile.lite + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + A /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-3 (r240) + * generate MD tag + * generate "=" bases + * the plain pileup now support "=" bases, but consensus calling and glfgen may fail + +------------------------------------------------------------------------ +r239 | lh3lh3 | 2009-04-24 12:08:20 +0100 (Fri, 24 Apr 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-2 (r239) + * fixed bugs in bam_aux.c (these functions were never used by samtools) + * removed bam_aux_init()/bam_aux_destroy() + * added tagview for testing bam_aux + 
+------------------------------------------------------------------------ +r235 | lh3lh3 | 2009-04-21 23:17:39 +0100 (Tue, 21 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-1 + * fixed a bug in pileup: the first read in a chromosome may not be printed + +------------------------------------------------------------------------ +r232 | lh3lh3 | 2009-04-16 15:25:43 +0100 (Thu, 16 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.lite + +a missing file in Makefile.lite + +------------------------------------------------------------------------ +r227 | lh3lh3 | 2009-04-15 22:02:53 +0100 (Wed, 15 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + +Release samtools-0.1.3 + +------------------------------------------------------------------------ +r223 | lh3lh3 | 2009-04-15 14:31:32 +0100 (Wed, 15 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-28 + * make samtools more robust to weird input such as empty file + +------------------------------------------------------------------------ +r222 | lh3lh3 | 2009-04-15 14:05:33 +0100 (Wed, 15 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/samtools.1 + +prepare for release 0.1.3 + +------------------------------------------------------------------------ +r221 | lh3lh3 | 2009-04-15 13:32:14 +0100 (Wed, 15 Apr 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/blast2sam.pl + +convert NCBI-BLASTN to SAM + +------------------------------------------------------------------------ +r220 | lh3lh3 | 2009-04-15 13:18:19 +0100 (Wed, 15 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-27 + * fixed a small memory leak in tview + +------------------------------------------------------------------------ +r219 | 
lh3lh3 | 2009-04-15 13:00:08 +0100 (Wed, 15 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_rmdup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-26 + * fixed a bug in rmdup when there are unmapped reads + +------------------------------------------------------------------------ +r218 | lh3lh3 | 2009-04-14 22:28:58 +0100 (Tue, 14 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + +proposed NEWS for the new release (have not yet) + +------------------------------------------------------------------------ +r216 | lh3lh3 | 2009-04-14 22:10:46 +0100 (Tue, 14 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.1.1 + * improve indelFilter to avoid filtering true indels. The new filter relies + on the new pileup indel line implemented in samtools-0.1.2-25 + +------------------------------------------------------------------------ +r215 | lh3lh3 | 2009-04-14 22:04:19 +0100 (Tue, 14 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.2-25 + * change the pileup indel line to shows the number of alignments actually + containing indels + +------------------------------------------------------------------------ +r211 | lh3lh3 | 2009-04-13 12:07:13 +0100 (Mon, 13 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + +update ChangeLog from "svn log" + +------------------------------------------------------------------------ +r210 | lh3lh3 | 2009-04-12 20:57:05 +0100 (Sun, 12 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kseq.h + + * samtools-0.1.2-24 + * in merge, gives a warning rather than error if the target sequence length is different + * allow empty header + 
+------------------------------------------------------------------------ +r209 | lh3lh3 | 2009-04-12 20:32:44 +0100 (Sun, 12 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-23 + * recognize '*' at the QUAL field + +------------------------------------------------------------------------ +r208 | lh3lh3 | 2009-04-12 20:08:02 +0100 (Sun, 12 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kseq.h + + * samtools-0.1.2-22 + * the field separator is TAB only, now + +------------------------------------------------------------------------ +r207 | lh3lh3 | 2009-04-08 15:18:03 +0100 (Wed, 08 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/ex1.sam.gz + + * fixed the problem in the example alignment due to the bug in fixmate + +------------------------------------------------------------------------ +r206 | lh3lh3 | 2009-04-08 15:15:05 +0100 (Wed, 08 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_mate.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/soap2sam.pl + + * samtools-0.1.2-21 + * fixed a nasty bug in `fixmate' + +------------------------------------------------------------------------ +r205 | lh3lh3 | 2009-04-08 10:57:08 +0100 (Wed, 08 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/bowtie2sam.pl + M /trunk/samtools/misc/soap2sam.pl + M /trunk/samtools/misc/wgsim_eval.pl + +make the script robust to the bugs in SOAP-2.1.7 + +------------------------------------------------------------------------ +r200 | lh3lh3 | 2009-04-02 15:14:56 +0100 (Thu, 02 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-20 + * check if file is truncated in flagstat + +------------------------------------------------------------------------ +r199 | lh3lh3 | 2009-04-02 15:09:10 +0100 (Thu, 02 Apr 2009) | 3 lines 
+Changed paths: + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-19 + * print the header if requested + +------------------------------------------------------------------------ +r193 | lh3lh3 | 2009-03-27 15:09:50 +0000 (Fri, 27 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-18 + * fixed a minor bug reported by Nils Homer + +------------------------------------------------------------------------ +r185 | lh3lh3 | 2009-03-24 11:50:32 +0000 (Tue, 24 Mar 2009) | 2 lines +Changed paths: + A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.std:184) + D /trunk/samtools/Makefile.std + A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.std:184) + D /trunk/samtools/misc/Makefile.std + +rename Makefile.std as Makefile. GNU building system is not ready and may take some time... + +------------------------------------------------------------------------ +r184 | lh3lh3 | 2009-03-24 10:36:38 +0000 (Tue, 24 Mar 2009) | 4 lines +Changed paths: + D /trunk/samtools/Makefile + A /trunk/samtools/Makefile.std (from /trunk/samtools/Makefile:183) + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + D /trunk/samtools/misc/Makefile + A /trunk/samtools/misc/Makefile.std (from /trunk/samtools/misc/Makefile:182) + M /trunk/samtools/samtools.1 + + * samtools-0.1.2-17 + * incorporating Nils' changes + * rename Makefile to Makefile.std and prepare to add the GNU building systems (also by Nils) + +------------------------------------------------------------------------ +r183 | lh3lh3 | 2009-03-24 10:30:23 +0000 (Tue, 24 Mar 2009) | 4 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kseq.h + A /trunk/samtools/kstring.c + A /trunk/samtools/kstring.h + + * samtools-0.1.2-16 + * made 
pileup take a list of proposed indels. An insertion is N at the moment. + * added my kstring library for a bit complex parsing of the position list. + +------------------------------------------------------------------------ +r169 | lh3lh3 | 2009-03-12 13:40:14 +0000 (Thu, 12 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/soap2sam.pl + + * soap2sam.pl-0.1.2 + * more robust to truncated soap output + +------------------------------------------------------------------------ +r168 | lh3lh3 | 2009-03-11 10:49:00 +0000 (Wed, 11 Mar 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.lite + +added bam_stat.o to Makefile.lite + +------------------------------------------------------------------------ +r167 | lh3lh3 | 2009-03-10 22:11:31 +0000 (Tue, 10 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-15 + * generate RMS of mapQ instead of max mapQ + +------------------------------------------------------------------------ +r166 | lh3lh3 | 2009-03-10 22:06:45 +0000 (Tue, 10 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/glf.c + M /trunk/samtools/glf.h + M /trunk/samtools/misc/Makefile + + * samtools-0.1.2-14 + * implemented GLFv3 + +------------------------------------------------------------------------ +r159 | lh3lh3 | 2009-03-03 11:26:08 +0000 (Tue, 03 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-13 + * fixed a minor bug in displaying pileup + +------------------------------------------------------------------------ +r158 | lh3lh3 | 2009-03-03 11:24:16 +0000 (Tue, 03 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-12 + * optionally print SAM header + +------------------------------------------------------------------------ +r153 | lh3lh3 | 2009-03-02 10:45:28 +0000 (Mon, 02 
Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/glf.c + + * samtools-0.1.2-11 + * use "GLF\3" as the magic for GLFv3 files + +------------------------------------------------------------------------ +r152 | lh3lh3 | 2009-03-02 10:39:09 +0000 (Mon, 02 Mar 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/glf.c + M /trunk/samtools/glf.h + + * samtools-0.1.2-10 + * fixed a bug in import: core.bin is undefined for unmapped reads + * this bug can be alleviated (not completely solved) in bam_index.c + * update to GLFv3: pos is changed to offset for better compression + +------------------------------------------------------------------------ +r151 | lh3lh3 | 2009-03-01 15:18:43 +0000 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/wgsim.c + + * wgsim-0.2.3 + * fixed a bug in simulating indels + +------------------------------------------------------------------------ +r145 | lh3lh3 | 2009-02-26 19:43:57 +0000 (Thu, 26 Feb 2009) | 4 lines +Changed paths: + M /trunk/samtools/misc/wgsim.c + + * wgsim-0.2.2 + * allow to print mismatch information as fastq comment. MAQ does + not like long read names. 
+ +------------------------------------------------------------------------ +r141 | lh3lh3 | 2009-02-26 14:53:03 +0000 (Thu, 26 Feb 2009) | 6 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/misc/wgsim.c + M /trunk/samtools/misc/wgsim_eval.pl + + * wgsim-0.2.1 + * fixed a bug about color read coordinates + * fixed a bug in read names + * wgsim_eval.pl-0.1.3 + * make the script work with color reads + +------------------------------------------------------------------------ +r140 | lh3lh3 | 2009-02-26 14:02:57 +0000 (Thu, 26 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/Makefile + M /trunk/samtools/misc/wgsim.c + + * wgsim: added a note + +------------------------------------------------------------------------ +r139 | lh3lh3 | 2009-02-26 11:39:08 +0000 (Thu, 26 Feb 2009) | 7 lines +Changed paths: + M /trunk/samtools/misc/wgsim.c + M /trunk/samtools/misc/wgsim_eval.pl + + * wgsim-0.2.0 + * considerable code clean up + * print number of substitutions/indels/errors on each read + * potentially support SOLiD simulation, though not tested at the moment + * wgsim_eval.pl-0.1.2 + * change in accordant with wgsim + +------------------------------------------------------------------------ +r129 | lh3lh3 | 2009-02-18 22:23:27 +0000 (Wed, 18 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-9 + * fixed a bug in bam_fetch, caused by completely contained adjacent chunks + +------------------------------------------------------------------------ +r128 | bhandsaker | 2009-02-18 19:06:57 +0000 (Wed, 18 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + +Fix annoying segv when invalid region specified. 
+ +------------------------------------------------------------------------ +r127 | lh3lh3 | 2009-02-17 10:49:55 +0000 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + D /trunk/samtools/misc/indel_filter.pl + A /trunk/samtools/misc/samtools.pl + + * move indel_filter.pl to samtools.pl + +------------------------------------------------------------------------ +r126 | lh3lh3 | 2009-02-14 21:22:30 +0000 (Sat, 14 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_mate.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-7 + * fixed a bug in fixmate: SE reads are flagged as BAM_FMUNMAP + +------------------------------------------------------------------------ +r125 | lh3lh3 | 2009-02-13 09:54:45 +0000 (Fri, 13 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-7 + * fixed a minor bug in flagstat + +------------------------------------------------------------------------ +r124 | lh3lh3 | 2009-02-12 11:15:32 +0000 (Thu, 12 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/indel_filter.pl + + * samtools-0.1.2-6 + * improve indel caller by setting maximum window size + +------------------------------------------------------------------------ +r123 | lh3lh3 | 2009-02-12 10:30:29 +0000 (Thu, 12 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * output max mapping quality in indel line + +------------------------------------------------------------------------ +r122 | lh3lh3 | 2009-02-11 10:59:10 +0000 (Wed, 11 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/maq2sam.c + +fixed a bug in generating tag AM + +------------------------------------------------------------------------ +r121 | lh3lh3 | 2009-02-03 10:43:11 +0000 (Tue, 03 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + +fixed a potential memory problem in indexing + 
+------------------------------------------------------------------------ +r120 | bhandsaker | 2009-02-02 15:52:52 +0000 (Mon, 02 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + +Pass LIBS to recursive targets to facilitate building at Broad. + +------------------------------------------------------------------------ +r119 | lh3lh3 | 2009-02-02 10:12:15 +0000 (Mon, 02 Feb 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-3 + * fixed a bug in generating GLFv2 for indels + * improve flagstat report a little bit + +------------------------------------------------------------------------ +r118 | lh3lh3 | 2009-01-29 12:33:23 +0000 (Thu, 29 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + A /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-1 + * added flagstat command + +------------------------------------------------------------------------ +r116 | lh3lh3 | 2009-01-28 13:31:12 +0000 (Wed, 28 Jan 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + +Release SAMtools-0.1.2 + +------------------------------------------------------------------------ +r115 | lh3lh3 | 2009-01-28 12:54:08 +0000 (Wed, 28 Jan 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/indel_filter.pl + +Script for filtering indel results + +------------------------------------------------------------------------ +r114 | lh3lh3 | 2009-01-25 11:45:37 +0000 (Sun, 25 Jan 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/zoom2sam.pl + +convert ZOOM to SAM + +------------------------------------------------------------------------ +r113 | lh3lh3 | 2009-01-24 14:25:07 +0000 (Sat, 24 Jan 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/novo2sam.pl + +add a script to convert novo alignment to SAM + 
+------------------------------------------------------------------------ +r112 | lh3lh3 | 2009-01-23 20:57:39 +0000 (Fri, 23 Jan 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/ChangeLog.old + M /trunk/samtools/samtools.1 + +update documentation and ChangeLog + +------------------------------------------------------------------------ +r111 | lh3lh3 | 2009-01-23 19:22:59 +0000 (Fri, 23 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.1-19 + * fixed a bug in "merge" command line + +------------------------------------------------------------------------ +r110 | lh3lh3 | 2009-01-22 15:36:48 +0000 (Thu, 22 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/Makefile + A /trunk/samtools/misc/bowtie2sam.pl (from /branches/dev/samtools/misc/bowtie2sam.pl:108) + M /trunk/samtools/misc/export2sam.pl + A /trunk/samtools/misc/soap2sam.pl (from /branches/dev/samtools/misc/soap2sam.pl:108) + A /trunk/samtools/misc/wgsim.c (from /branches/dev/samtools/misc/wgsim.c:108) + A /trunk/samtools/misc/wgsim_eval.pl (from /branches/dev/samtools/misc/wgsim_eval.pl:108) + + * merge from branches/dev/ + * all future development will happen here + +------------------------------------------------------------------------ +r109 | lh3lh3 | 2009-01-22 15:14:27 +0000 (Thu, 22 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/COPYING + M /trunk/samtools/ChangeLog + A /trunk/samtools/INSTALL (from /branches/dev/samtools/INSTALL:108) + M /trunk/samtools/Makefile + A /trunk/samtools/Makefile.lite (from /branches/dev/samtools/Makefile.lite:108) + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + A /trunk/samtools/bam_mate.c (from /branches/dev/samtools/bam_mate.c:108) + M /trunk/samtools/bam_pileup.c + M 
/trunk/samtools/bam_plcmd.c + A /trunk/samtools/bam_rmdup.c (from /branches/dev/samtools/bam_rmdup.c:108) + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.h + M /trunk/samtools/examples/00README.txt + A /trunk/samtools/examples/Makefile (from /branches/dev/samtools/examples/Makefile:108) + D /trunk/samtools/examples/ex1.fa.fai + M /trunk/samtools/examples/ex1.sam.gz + M /trunk/samtools/faidx.c + A /trunk/samtools/glf.c (from /branches/dev/samtools/glf.c:108) + M /trunk/samtools/glf.h + M /trunk/samtools/misc/Makefile + M /trunk/samtools/misc/maq2sam.c + M /trunk/samtools/razf.c + M /trunk/samtools/source.dot + + * Merge from branches/dev/ + * all future development will happen here at trunk/ + +------------------------------------------------------------------------ +r79 | bhandsaker | 2009-01-07 21:42:15 +0000 (Wed, 07 Jan 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_tview.c + +Fix problem with compiling without curses. 
+ +------------------------------------------------------------------------ +r63 | lh3lh3 | 2008-12-22 15:58:02 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /trunk/samtools (from /branches/dev/samtools:62) + +Create trunk copy + +------------------------------------------------------------------------ +r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/NEWS + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + +Release samtools-0.1.1 + +------------------------------------------------------------------------ +r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines +Changed paths: + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-66 + * fixed a bug in razf.c: reset z_eof when razf_seek() is called + * fixed a memory leak in parsing a region + * changed pileup a little bit when -s is in use: output ^ and $ + * when a bam is not indexed, output more meaningful error message + * fixed a bug in indexing for small alignment + * fixed a bug in the viewer when we come to the end of a reference file + * updated documentation + * prepare to release 0.1.1 + +------------------------------------------------------------------------ +r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/examples + A /branches/dev/samtools/examples/00README.txt + A /branches/dev/samtools/examples/ex1.fa + A /branches/dev/samtools/examples/ex1.fa.fai + A /branches/dev/samtools/examples/ex1.sam.gz + +example + +------------------------------------------------------------------------ +r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + 
+update ChangeLog + +------------------------------------------------------------------------ +r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/misc/export2sam.pl + + * added comments + * fixed several bugs + +------------------------------------------------------------------------ +r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/export2sam.pl + +convert Export format to SAM; not thoroughly tested + +------------------------------------------------------------------------ +r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/source.dot + + * samtools-0.1.0-65 + * pileup: generate maq-like simple output + * pileup: allow to output pileup at required sites + * source.dot: source file relationship graph + * tview: fixed a minor bug + +------------------------------------------------------------------------ +r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines +Changed paths: + D /branches/dev/samtools/misc/all2sam.pl + +remove all2sam.pl + +------------------------------------------------------------------------ +r54 | lh3lh3 | 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/COPYING + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/faidx.h + M /branches/dev/samtools/khash.h + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/ksort.h + M /branches/dev/samtools/samtools.1 + +Added copyright information and a bit more documentation. No code change. 
+ +------------------------------------------------------------------------ +r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-64 + * improved efficiency of the indel caller for spliced alignments + +------------------------------------------------------------------------ +r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-63 + * a bit code cleanup: reduce the dependency between source files + +------------------------------------------------------------------------ +r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-62 + * fixed a memory leak + +------------------------------------------------------------------------ +r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/samtools.1 + +update documentation, ChangeLog and a comment + +------------------------------------------------------------------------ +r49 | lh3lh3 | 2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-61 + 
* moved pileup command to a separate source file + * added indel caller + * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!) + * updated documentation + +------------------------------------------------------------------------ +r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-60 + * fixed another bug in maqcns when there is a nearby deletion + +------------------------------------------------------------------------ +r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-59 + * pileup: outputting consensus is now optional + * fixed a bug in glfgen. This bug also exists in maq's glfgen. However, + I am not quite sure why the previous version may have problem. + +------------------------------------------------------------------------ +r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-58 + * add maq consensus to pileup. However, I will move this part to a new + command as strictly speaking, consensus calling is not part of pileup, + and imposing it would make it harder to generate for other language + bindings. + +------------------------------------------------------------------------ +r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +Fix bug in tell() after reads that consume to the exact end of a block. 
+ +------------------------------------------------------------------------ +r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-57 + * fixed a bug in parser when there is auxiliary fields + * made the parser a bit more robust + +------------------------------------------------------------------------ +r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-56 + * fixed a bug in bgzf (only reading is affected) + * fixed a typo in bam_index.c + * in bam_index.c, check potential bugs in the underlying I/O library + +------------------------------------------------------------------------ +r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-55 + * tried to make pileup work with clipping (previously not), though NOT tested + * removed -v from pileup + * made pileup take the reference sequence + +------------------------------------------------------------------------ +r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-54 + * in 
parser, recognize "=", rather than ",", as a match + * in parser, correctly parse "=" at the MRNM field. + +------------------------------------------------------------------------ +r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/maq2sam.c + +fixed a bug in handling maq flag 64 and 192 + +------------------------------------------------------------------------ +r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +also calculate unordered md5sum check + +------------------------------------------------------------------------ +r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a minor bug when there are space in the sequence + +------------------------------------------------------------------------ +r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a potential memory leak + +------------------------------------------------------------------------ +r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * fixed a bug in import: bin is wrongly calculated + +------------------------------------------------------------------------ +r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/all2sam.pl + +nothing, really + +------------------------------------------------------------------------ +r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/md5.c + A 
/branches/dev/samtools/misc/md5.h + A /branches/dev/samtools/misc/md5fa.c + + * fixed two warnings in kseq.h + * added md5sum utilities + +------------------------------------------------------------------------ +r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/kseq.h + D /branches/dev/samtools/kstream.h + + * samtools-0.1.0-52 + * replace kstream with kseq. kseq is a superset of kstream. I need the + extra functions in kseq.h. + * also compile stand-alone faidx + +------------------------------------------------------------------------ +r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-51 + * sorting by read names is available + +------------------------------------------------------------------------ +r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/misc/maq2sam.c + + * samtools-0.1.0-50 + * format change to meet the latest specification + +------------------------------------------------------------------------ +r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/misc/maq2sam.c + + * minor change in maqcns: special care when n==0 + * change maq2sam to meet the latest specification + +------------------------------------------------------------------------ +r27 | lh3lh3 | 
2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + +considerable code clean up in razf + +------------------------------------------------------------------------ +r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/faidx.c + +make RAZF optional in faidx.c + +------------------------------------------------------------------------ +r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-49 + * added routines for retrieving aux data, NOT TESTED YET! + +------------------------------------------------------------------------ +r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-48 + * bgzf: fixed a potential integer overflow on 32-bit machines + * maqcns: set the minimum combined quality as 0 + * supporting hex strings + +------------------------------------------------------------------------ +r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-47 + * fixed the bug in maqcns + +------------------------------------------------------------------------ +r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + A 
/branches/dev/samtools/bam_maqcns.c + A /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/glf.h + + * samtools-0.1.0-46 + * add MAQ consensus caller, currently BUGGY! + +------------------------------------------------------------------------ +r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-45 + * tview: display padded alignment (but not P operation) + * better coordinates and reference sequence + +------------------------------------------------------------------------ +r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + +new ChangeLog + +------------------------------------------------------------------------ +r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + D /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6) + +Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from +the log of my personal SVN repository. 
+ +------------------------------------------------------------------------ +r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-44 + * declare fseeko and ftello as some Linux may not do this by default and + missing these declarations will make bgzf buggy + * get rid of some harmless warnings + * use BGZF by default, now + +------------------------------------------------------------------------ +r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + + * samtools-0.1.0-43 + * fixed a bug in razf_read() + * give more warnings when the file is truncated (or due to bugs in I/O library) + +------------------------------------------------------------------------ +r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +fixed a bug in bgzf.c at the end of the file + +------------------------------------------------------------------------ +r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-42 + * a lot happened to RAZF, although samtools itself is untouched. Better + also update the version number anyway to avoid confusion + +------------------------------------------------------------------------ +r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +a change from Jue, but I think it should not matter + +------------------------------------------------------------------------ +r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a potential bug in razf. 
However, it seems still buggy, just +rarely happens, very rarely. + +------------------------------------------------------------------------ +r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a bug in razf, with the help of Jue + +------------------------------------------------------------------------ +r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + +remove a comment + +------------------------------------------------------------------------ +r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + + * Jue has updated razf to realize Bob's scheme + +------------------------------------------------------------------------ +r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/samtools.1 + +the manual page + +------------------------------------------------------------------------ +r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/Makefile + A /branches/dev/samtools/bam.c + A /branches/dev/samtools/bam.h + A /branches/dev/samtools/bam_aux.c + A /branches/dev/samtools/bam_endian.h + A /branches/dev/samtools/bam_import.c + A /branches/dev/samtools/bam_index.c + A /branches/dev/samtools/bam_lpileup.c + A /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_sort.c + A /branches/dev/samtools/bam_tview.c + A /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/bgzf.c + A /branches/dev/samtools/bgzf.h + A /branches/dev/samtools/bgzip.c + A /branches/dev/samtools/faidx.c + A /branches/dev/samtools/faidx.h + A /branches/dev/samtools/khash.h + A /branches/dev/samtools/ksort.h + A 
/branches/dev/samtools/kstream.h + A /branches/dev/samtools/misc + A /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/all2sam.pl + A /branches/dev/samtools/misc/maq2sam.c + A /branches/dev/samtools/razf.c + A /branches/dev/samtools/razf.h + A /branches/dev/samtools/razip.c + A /branches/dev/samtools/zutil.h + +The initial version of samtools, replicated from my local SVN repository. +The current version is: 0.1.0-42. All future development will happen here. + +------------------------------------------------------------------------ +r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools + +samtools (C version) + +------------------------------------------------------------------------ diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..f1cf7aa --- /dev/null +++ b/INSTALL @@ -0,0 +1,29 @@ +System Requirements +=================== + +SAMtools depends on the zlib library . The latest +version 1.2.3 is preferred and with the latest version you can compile +razip and use it to compress a FASTA file. SAMtools' faidx is able to +index a razip-compressed FASTA file to save diskspace. Older zlib also +works with SAMtools, but razip cannot be compiled. + +The text-based viewer (tview) requires the GNU ncurses library +, which comes with Mac OS X and +most of the modern Linux/Unix distributions. If you do not have this +library installed, you can still compile the rest of SAMtools by +manually modifying one line in Makefile. + + +Compilation +=========== + +Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can +compile razip with `make razip'. + + +Installation +============ + +Simply copy `samtools' and other executables/scripts in `misc' to a +location you want (e.g. a directory in your $PATH). No further +configurations are required. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7bb4469 --- /dev/null +++ b/Makefile @@ -0,0 +1,69 @@ +CC= gcc +CXX= g++ +CFLAGS= -g -Wall -O2 #-m64 #-arch ppc +CXXFLAGS= $(CFLAGS) +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE #-D_NO_CURSES +LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ + bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \ + bam_sort.o +AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ + bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ + bamtk.o +PROG= samtools +INCLUDES= +SUBDIRS= . misc +LIBPATH= + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +lib:libbam.a + +libbam.a:$(LOBJS) + $(AR) -cru $@ $(LOBJS) + +### For the curses library: comment out `-lcurses' if you do not have curses installed +samtools:lib $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -lcurses -lz -L. 
-lbam + +razip:razip.o razf.o + $(CC) $(CFLAGS) -o $@ razf.o razip.o -lz + +bgzip:bgzip.o bgzf.o + $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz + +razip.o:razf.h +bam.o:bam.h razf.h bam_endian.h kstring.h +sam.o:sam.h bam.h +bam_import.o:bam.h kseq.h khash.h razf.h +bam_pileup.o:bam.h razf.h ksort.h +bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h +bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h +bam_lpileup.o:bam.h ksort.h +bam_tview.o:bam.h faidx.h bam_maqcns.h +bam_maqcns.o:bam.h ksort.h bam_maqcns.h +bam_sort.o:bam.h ksort.h razf.h +bam_md.o:bam.h faidx.h +glf.o:glf.h + +faidx.o:faidx.h razf.h khash.h +faidx_main.o:faidx.h razf.h + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM razip $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..149c090 --- /dev/null +++ b/NEWS @@ -0,0 +1,224 @@ +Beta Release 0.1.5 (7 July, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Support opening a BAM alignment on FTP. Users can now use "tview" to + view alignments at the NCBI ftp site. Please read manual for more + information. + + * In library, propagate errors rather than exit or complain assertion + failure. + + * Simplified the building system and fixed compiling errors caused by + zlib<1.2.2.1. + + * Fixed an issue about lost header information when a SAM is imported + with "view -t". + + * Implemented "samtools.pl varFilter" which filters both SNPs and short + indels. This command replaces "indelFilter". + + * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from + pileup output. + + * In pileup, cap mapping quality at 60. This helps filtering when + different aligners are in use. + + * In pileup, allow to output variant sites only. + + * Made pileup generate correct calls in repetitive region. At the same + time, I am considering to implement a simplified model in SOAPsnp, + although this has not happened yet. + + * In view, added '-u' option to output BAM without compression. 
This + option is preferred when the output is piped to other commands. + + * In view, added '-l' and '-r' to get the alignments for one library or + read group. The "@RG" header lines are now partially parsed. + + * Do not include command line utilities to libbam.a. + + * Fixed memory leaks in pileup and bam_view1(). + + * Made faidx more tolerant to empty lines right before or after FASTA > + lines. + + +Changes in other utilities: + + * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign. + + +This release involves several modifications to the key code base which +may potentially introduce new bugs even though we have tried to minimize +this by testing on several examples. Please let us know if you catch +bugs. + +(0.1.5: 7 July 2009, r373) + + + +Beta Release 0.1.4 (21 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Added the 'rmdupse' command: removing duplicates for SE reads. + + * Fixed a critical bug in the indel caller: clipped alignments are not + processed correctly. + + * Fixed a bug in the tview: gapped alignment may be incorrectly + displayed. + + * Unified the interface to BAM and SAM I/O. This is done by + implementing a wrapper on top of the old APIs and therefore old APIs + are still valid. The new I/O APIs also recognize the @SQ header + lines. + + * Generate the MD tag. + + * Generate "=" bases. However, the indel caller will not work when "=" + bases are present. + + * Enhanced support of color-read display (by Nils Homer). + + * Implemented the GNU building system. However, currently the building + system does not generate libbam.a. We will improve this later. For + the time being, `make -f Makefile.generic' is preferred. + + * Fixed a minor bug in pileup: the first read in a chromosome may be + skipped. + + * Fixed bugs in bam_aux.c. These bugs do not affect other components as + they were not used previously. + + * Output the 'SM' tag from maq2sam. 
+ +(0.1.4: 21 May 2009, r297) + + + +Beta Release 0.1.3 (15 April, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in SAMtools: + + * SAMtools is more consistent with the specification: a) '*' in the + QUAL field is allowed; b) the field separator is TAB only and SPACE + is treated as a character in a field; c) empty header is allowed. + + * Implemented GLFv3 support in pileup. + + * Fixed a severe bug in fixmate: strand information is wrongly + overwritten. + + * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are + not correctly retrieved sometimes. + + * Fixed a bug in rmdup: segfault if unmapped reads are present. + + * Move indel_filter.pl to samtools.pl and improved the filtering by + checking the actual number of alignments containing indels. The indel + pileup line is also changed a little to make this filtration easier. + + * Fixed a minor bug in indexing: the bin number of an unmapped read is + wrongly calculated. + + * Added `flagstat' command to show statistics on the FLAG field. + + * Improved indel caller by setting the maximum window size in local + realignment. + +Changes in other utilities: + + * Fixed a bug in maq2sam: a tag name is obsolete. + + * Improvement to wgsim: a) added support for SOLiD read simulation; b) + show the number of substitutions/indels/errors in read name; c) + considerable code clean up. + + * Various converters: improved functionality in general. + + * Updated the example SAM due to the previous bug in fixmate. + +(0.1.3: 15 April 2009, r227) + + + +Beta Release 0.1.2 (28 January, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in SAMtools: + + * Implemented a Bayesian indel caller. The new caller generate scores + and genotype and is potentially more accurate than Maq's indel + caller. The pileup format is also changed accordingly. + + * Implemented rmdup command: remove potential PCR duplicates. 
Note that + this command ONLY works for FR orientation and requires ISIZE is + correctly set. + + * Added fixmate command: fill in mate coordinates, ISIZE and mate + related flags from a name-sorted alignment. + + * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved. + + * Allow to select reads shown in the pileup output with a mask. + + * Generate GLFv2 from pileup. + + * Added two more flags for flagging PCR/optical duplicates and for QC + failure. + + * Fixed a bug in sort command: name sorting for large alignment did not + work. + + * Allow to completely disable RAZF (using Makefile.lite) as some people + have problem to compile it. + + * Fixed a bug in import command when there are reads without + coordinates. + + * Fixed a bug in tview: clipping broke the alignment viewer. + + * Fixed a compiling error when _NO_CURSES is applied. + + * Fixed a bug in merge command. + +Changes in other utilities: + + * Added wgsim, a paired-end reads simulator. Wgsim was adapted from + maq's reads simulator. Colin Hercus further improved it to allow + longer indels. + + * Added wgsim_eval.pl, a script that evaluates the accuracy of + alignment on reads generated by wgsim. + + * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not + work properly when multiple hits are output. + + * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will + be retained when multiple hits are present. + + * Fixed a bug in export2sam.pl for QC reads. + + * Support RG tag at MAQ->SAM converter. + + * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and + indel are not properly handled, though. + + * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the + default Illumina output. + +(0.1.2: 28 January 2008; r116) + + + +Beta Release 0.1.1 (22 December, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is the first public release of samtools. 
For more information, +please check the manual page `samtools.1' and the samtools website +http://samtools.sourceforge.net \ No newline at end of file diff --git a/bam.c b/bam.c new file mode 100644 index 0000000..1ff4a5a --- /dev/null +++ b/bam.c @@ -0,0 +1,290 @@ +#include +#include +#include +#include "bam.h" +#include "bam_endian.h" +#include "kstring.h" + +int bam_is_be = 0; + +/************************** + * CIGAR related routines * + **************************/ + +int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg) +{ + unsigned k; + int32_t x = c->pos, y = 0; + int state = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; // operation + int l = cigar[k] >> BAM_CIGAR_SHIFT; // length + if (state == 0 && (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CINS) && x + l > pos) { + reg->tbeg = x; reg->qbeg = y; reg->cbeg = k; + state = 1; + } + if (op == BAM_CMATCH) { x += l; y += l; } + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + if (state == 1 && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CREF_SKIP || k == c->n_cigar - 1)) { + reg->tend = x; reg->qend = y; reg->cend = k; + } + } + return state? 
0 : -1; +} + +uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) +{ + uint32_t k, end; + end = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) + end += cigar[k] >> BAM_CIGAR_SHIFT; + } + return end; +} + +int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) +{ + uint32_t k; + int32_t l = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) + l += cigar[k] >> BAM_CIGAR_SHIFT; + } + return l; +} + +/******************** + * BAM I/O routines * + ********************/ + +bam_header_t *bam_header_init() +{ + bam_is_be = bam_is_big_endian(); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); +} + +void bam_header_destroy(bam_header_t *header) +{ + int32_t i; + extern void bam_destroy_header_hash(bam_header_t *header); + if (header == 0) return; + if (header->target_name) { + for (i = 0; i < header->n_targets; ++i) + free(header->target_name[i]); + free(header->target_name); + free(header->target_len); + } + free(header->text); +#ifndef BAM_NO_HASH + if (header->rg2lib) bam_strmap_destroy(header->rg2lib); + bam_destroy_header_hash(header); +#endif + free(header); +} + +bam_header_t *bam_header_read(bamFile fp) +{ + bam_header_t *header; + char buf[4]; + int32_t i, name_len; + // read "BAM1" + if (bam_read(fp, buf, 4) != 4) return 0; + if (strncmp(buf, "BAM\001", 4)) { + fprintf(stderr, "[bam_header_read] wrong header\n"); + return 0; + } + header = bam_header_init(); + // read plain text and the number of reference sequences + bam_read(fp, &header->l_text, 4); + if (bam_is_be) bam_swap_endian_4p(&header->l_text); + header->text = (char*)calloc(header->l_text + 1, 1); + bam_read(fp, header->text, header->l_text); + bam_read(fp, &header->n_targets, 4); + if (bam_is_be) bam_swap_endian_4p(&header->n_targets); + // read reference sequence names and 
lengths + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); + for (i = 0; i != header->n_targets; ++i) { + bam_read(fp, &name_len, 4); + if (bam_is_be) bam_swap_endian_4p(&name_len); + header->target_name[i] = (char*)calloc(name_len, 1); + bam_read(fp, header->target_name[i], name_len); + bam_read(fp, &header->target_len[i], 4); + if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); + } + return header; +} + +int bam_header_write(bamFile fp, const bam_header_t *header) +{ + char buf[4]; + int32_t i, name_len, x; + // write "BAM1" + strncpy(buf, "BAM\001", 4); + bam_write(fp, buf, 4); + // write plain text and the number of reference sequences + if (bam_is_be) { + x = bam_swap_endian_4(header->l_text); + bam_write(fp, &x, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + x = bam_swap_endian_4(header->n_targets); + bam_write(fp, &x, 4); + } else { + bam_write(fp, &header->l_text, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + bam_write(fp, &header->n_targets, 4); + } + // write sequence names and lengths + for (i = 0; i != header->n_targets; ++i) { + char *p = header->target_name[i]; + name_len = strlen(p) + 1; + if (bam_is_be) { + x = bam_swap_endian_4(name_len); + bam_write(fp, &x, 4); + } else bam_write(fp, &name_len, 4); + bam_write(fp, p, name_len); + if (bam_is_be) { + x = bam_swap_endian_4(header->target_len[i]); + bam_write(fp, &x, 4); + } else bam_write(fp, &header->target_len[i], 4); + } + return 0; +} + +static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint8_t *s; + uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); + while (s < data + data_len) { + uint8_t type; + s += 2; // skip key + type = toupper(*s); ++s; // skip type + if (type == 'C' || 
type == 'A') ++s; + else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } + else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } + else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } +} + +int bam_read1(bamFile fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + + assert(BAM_CORE_SIZE == 32); + if ((ret = bam_read(fp, &block_len, 4)) != 4) { + if (ret == 0) return -1; // normal end-of-file + else return -2; // truncated + } + if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3; + if (bam_is_be) { + bam_swap_endian_4p(&block_len); + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->data_len = block_len - BAM_CORE_SIZE; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; + b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + return 4 + block_len; +} + +inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y; + int i; + assert(BAM_CORE_SIZE == 32); + x[0] = c->tid; + x[1] = c->pos; + x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; + x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + x[4] = c->l_qseq; + x[5] = c->mtid; + x[6] = c->mpos; + x[7] = c->isize; + if (bam_is_be) { + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + y = block_len; + bam_write(fp, bam_swap_endian_4p(&y), 4); + swap_endian_data(c, data_len, data); + } else bam_write(fp, &block_len, 
4); + bam_write(fp, x, BAM_CORE_SIZE); + bam_write(fp, data, data_len); + if (bam_is_be) swap_endian_data(c, data_len, data); + return 4 + block_len; +} + +int bam_write1(bamFile fp, const bam1_t *b) +{ + return bam_write1_core(fp, &b->core, b->data_len, b->data); +} + +char *bam_format1(const bam_header_t *header, const bam1_t *b) +{ + uint8_t *s = bam1_seq(b), *t = bam1_qual(b); + int i; + const bam1_core_t *c = &b->core; + kstring_t str; + str.l = str.m = 0; str.s = 0; + + ksprintf(&str, "%s\t%d\t", bam1_qname(b), c->flag); + if (c->tid < 0) kputs("*\t", &str); + else ksprintf(&str, "%s\t", header->target_name[c->tid]); + ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); + if (c->n_cigar == 0) kputc('*', &str); + else { + for (i = 0; i < c->n_cigar; ++i) + ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); + } + kputc('\t', &str); + if (c->mtid < 0) kputs("*\t", &str); + else if (c->mtid == c->tid) kputs("=\t", &str); + else ksprintf(&str, "%s\t", header->target_name[c->mtid]); + ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); + for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); + kputc('\t', &str); + if (t[0] == 0xff) kputc('*', &str); + else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); + s = bam1_aux(b); + while (s < b->data + b->data_len) { + uint8_t type, key[2]; + key[0] = s[0]; key[1] = s[1]; + s += 2; type = *s; ++s; + ksprintf(&str, "\t%c%c:", key[0], key[1]); + if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } + else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } + else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; } + else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } + else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } + else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } + else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } + else if (type == 'f') { 
ksprintf(&str, "f:%g", *(float*)s); s += 4; } + else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } + else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } + } + return str.s; +} + +void bam_view1(const bam_header_t *header, const bam1_t *b) +{ + char *s = bam_format1(header, b); + printf("%s\n", s); + free(s); +} diff --git a/bam.h b/bam.h new file mode 100644 index 0000000..83c03ad --- /dev/null +++ b/bam.h @@ -0,0 +1,714 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef BAM_BAM_H +#define BAM_BAM_H + +/*! + @header + + BAM library provides I/O and various operations on manipulating files + in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) + format. 
It now supports importing from or exporting to TAM, sorting, + merging, generating pileup, and quickly retrieval of reads overlapped + with a specified region. + + @copyright Genome Research Ltd. + */ + +#include +#include +#include +#include + +#define _IOLIB 2 + +#if _IOLIB == 1 && !defined(_NO_RAZF) +#define BAM_TRUE_OFFSET +#include "razf.h" +/*! @abstract BAM file handler */ +typedef RAZF *bamFile; +#define bam_open(fn, mode) razf_open(fn, mode) +#define bam_dopen(fd, mode) razf_dopen(fd, mode) +#define bam_close(fp) razf_close(fp) +#define bam_read(fp, buf, size) razf_read(fp, buf, size) +#define bam_write(fp, buf, size) razf_write(fp, buf, size) +#define bam_tell(fp) razf_tell(fp) +#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir) +#elif _IOLIB == 2 +#define BAM_VIRTUAL_OFFSET16 +#include "bgzf.h" +/*! @abstract BAM file handler */ +typedef BGZF *bamFile; +#define bam_open(fn, mode) bgzf_open(fn, mode) +#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) +#define bam_close(fp) bgzf_close(fp) +#define bam_read(fp, buf, size) bgzf_read(fp, buf, size) +#define bam_write(fp, buf, size) bgzf_write(fp, buf, size) +#define bam_tell(fp) bgzf_tell(fp) +#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) +#elif _IOLIB == 3 +#define BAM_VIRTUAL_OFFSET16 +#include "razf.h" +/*! @abstract BAM file handler */ +typedef RAZF *bamFile; +#define bam_open(fn, mode) razf_open2(fn, mode) +#define bam_dopen(fd, mode) razf_dopen2(fd, mode) +#define bam_close(fp) razf_close(fp) +#define bam_read(fp, buf, size) razf_read(fp, buf, size) +#define bam_write(fp, buf, size) razf_write(fp, buf, size) +#define bam_tell(fp) razf_tell2(fp) +#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir) +#endif + +/*! @typedef + @abstract Structure for the alignment header. 
+ @field n_targets number of reference sequences + @field target_name names of the reference sequences + @field target_len lengths of the reference sequences + @field hash hash table for fast name lookup + @field rg2lib hash table for @RG-ID -> LB lookup + @field l_text length of the plain text in the header + @field text plain text + + @discussion Field hash points to null by default. It is a private + member. + */ +typedef struct { + int32_t n_targets; + char **target_name; + uint32_t *target_len; + void *hash, *rg2lib; + int l_text; + char *text; +} bam_header_t; + +/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ +#define BAM_FPAIRED 1 +/*! @abstract the read is mapped in a proper pair */ +#define BAM_FPROPER_PAIR 2 +/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ +#define BAM_FUNMAP 4 +/*! @abstract the mate is unmapped */ +#define BAM_FMUNMAP 8 +/*! @abstract the read is mapped to the reverse strand */ +#define BAM_FREVERSE 16 +/*! @abstract the mate is mapped to the reverse strand */ +#define BAM_FMREVERSE 32 +/*! @abstract this is read1 */ +#define BAM_FREAD1 64 +/*! @abstract this is read2 */ +#define BAM_FREAD2 128 +/*! @abstract not primary alignment */ +#define BAM_FSECONDARY 256 +/*! @abstract QC failure */ +#define BAM_FQCFAIL 512 +/*! @abstract optical or PCR duplicate */ +#define BAM_FDUP 1024 + +/*! @abstract default mask for pileup */ +#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) + +#define BAM_CORE_SIZE sizeof(bam1_core_t) + +/** + * Describing how CIGAR operation/length is packed in a 32-bit integer. + */ +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +/* + CIGAR operations. + */ +/*! @abstract CIGAR: match */ +#define BAM_CMATCH 0 +/*! @abstract CIGAR: insertion to the reference */ +#define BAM_CINS 1 +/*! @abstract CIGAR: deletion from the reference */ +#define BAM_CDEL 2 +/*! 
@abstract CIGAR: skip on the reference (e.g. spliced alignment) */ +#define BAM_CREF_SKIP 3 +/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ +#define BAM_CSOFT_CLIP 4 +/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ +#define BAM_CHARD_CLIP 5 +/*! @abstract CIGAR: padding */ +#define BAM_CPAD 6 + +/*! @typedef + @abstract Structure for core alignment information. + @field tid chromosome ID, defined by bam_header_t + @field pos 0-based leftmost coordinate + @field strand not a struct member; derive it from flag with bam1_strand(): 0 for forward and 1 otherwise + @field bin bin calculated by bam_reg2bin() + @field qual mapping quality + @field l_qname length of the query name + @field flag bitwise flag + @field n_cigar number of CIGAR operations + @field l_qseq length of the query sequence (read) + */ +typedef struct { + int32_t tid; + int32_t pos; + uint32_t bin:16, qual:8, l_qname:8; + uint32_t flag:16, n_cigar:16; + int32_t l_qseq; + int32_t mtid; + int32_t mpos; + int32_t isize; +} bam1_core_t; + +/*! @typedef + @abstract Structure for one alignment. + @field core core information about the alignment + @field l_aux length of auxiliary data + @field data_len current length of bam1_t::data + @field m_data maximum length of bam1_t::data + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux + + @discussion Notes: + + 1. qname is zero-terminated and core.l_qname includes the trailing '\0'. + 2. l_qseq is calculated from the total length of an alignment block + on reading or from CIGAR. + */ +typedef struct { + bam1_core_t core; + int l_aux, data_len, m_data; + uint8_t *data; +} bam1_t; + +#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) +#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) + +/*! @function + @abstract Get the CIGAR array + @param b pointer to an alignment + @return pointer to the CIGAR array + + @discussion In the CIGAR array, each element is a 32-bit integer. 
The + lower 4 bits give a CIGAR operation and the higher 28 bits keep the + length of a CIGAR. + */ +#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) + +/*! @function + @abstract Get the name of the query + @param b pointer to an alignment + @return pointer to the name string, null terminated + */ +#define bam1_qname(b) ((char*)((b)->data)) + +/*! @function + @abstract Get query sequence + @param b pointer to an alignment + @return pointer to sequence + + @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, + 8 for T and 15 for N. Two bases are packed in one byte with the base + at the higher 4 bits having smaller coordinate on the read. It is + recommended to use bam1_seqi() macro to get the base. + */ +#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) + +/*! @function + @abstract Get query quality + @param b pointer to an alignment + @return pointer to quality string + */ +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2) + +/*! @function + @abstract Get a base on read + @param s Query sequence returned by bam1_seq() + @param i The i-th position, 0-based + @return 4-bit integer representing the base. + */ +#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) + +/*! @function + @abstract Get auxiliary data + @param b pointer to an alignment + @return pointer to the concatenated auxiliary data + */ +#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) + +#ifndef kroundup32 +/*! @function + @abstract Round an integer to the next closest power-2 integer. + @param x integer to be rounded (in place) + @discussion x will be modified. + */ +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +/*! + @abstract Whether the machine is big-endian; modified only in + bam_header_init(). + */ +extern int bam_is_be; + +/*! 
@abstract Table for converting a nucleotide character to the 4-bit encoding. */ +extern unsigned char bam_nt16_table[256]; + +/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */ +extern char *bam_nt16_rev_table; + +extern char bam_nt16_nt4_table[]; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! @abstract TAM file handler */ + typedef struct __tamFile_t *tamFile; + + /*! + @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. + @param fn SAM file name + @return SAM file handler + */ + tamFile sam_open(const char *fn); + + /*! + @abstract Close a SAM file handler + @param fp SAM file handler + */ + void sam_close(tamFile fp); + + /*! + @abstract Read one alignment from a SAM file handler + @param fp SAM file handler + @param header header information (ordered names of chromosomes) + @param b read alignment; all members in b will be updated + @return 0 if successful; otherwise negative + */ + int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); + + /*! + @abstract Read header information from a TAB-delimited list file. + @param fn_list file name for the list + @return a pointer to the header structure + + @discussion Each line in this file consists of chromosome name and + the length of chromosome. + */ + bam_header_t *sam_header_read2(const char *fn_list); + + /*! + @abstract Read header from a SAM file (if present) + @param fp SAM file handler + @return pointer to header struct; 0 if no @SQ lines available + */ + bam_header_t *sam_header_read(tamFile fp); + + /*! + @abstract Parse @SQ lines a update a header struct + @param h pointer to the header struct to be updated + @return number of target sequences + + @discussion bam_header_t::{n_targets,target_len,target_name} will + be destroyed in the first place. + */ + int sam_header_parse(bam_header_t *h); + + /*! 
+ @abstract Parse @RG lines a update a header struct + @param h pointer to the header struct to be updated + @return number of @RG lines + + @discussion bam_header_t::rg2lib will be destroyed in the first + place. + */ + int sam_header_parse_rg(bam_header_t *h); + +#define sam_write1(header, b) bam_view1(header, b) + + int bam_strmap_put(void *strmap, const char *rg, const char *lib); + const char *bam_strmap_get(const void *strmap, const char *rg); + void *bam_strmap_dup(const void*); + void *bam_strmap_init(); + void bam_strmap_destroy(void *strmap); + + /*! + @abstract Initialize a header structure. + @return the pointer to the header structure + + @discussion This function also modifies the global variable + bam_is_be. + */ + bam_header_t *bam_header_init(); + + /*! + @abstract Destroy a header structure. + @param header pointer to the header + */ + void bam_header_destroy(bam_header_t *header); + + /*! + @abstract Read a header structure from BAM. + @param fp BAM file handler, opened by bam_open() + @return pointer to the header structure + + @discussion The file position indicator must be placed at the + beginning of the file. Upon success, the position indicator will + be set at the start of the first alignment. + */ + bam_header_t *bam_header_read(bamFile fp); + + /*! + @abstract Write a header structure to BAM. + @param fp BAM file handler + @param header pointer to the header structure + @return always 0 currently + */ + int bam_header_write(bamFile fp, const bam_header_t *header); + + /*! + @abstract Read an alignment from BAM. + @param fp BAM file handler + @param b read alignment; all members are updated. + @return number of bytes read from the file + + @discussion The file position indicator must be + placed right before an alignment. Upon success, this function + will set the position indicator to the start of the next + alignment. This function is not affected by the machine + endianness. + */ + int bam_read1(bamFile fp, bam1_t *b); + + /*! 
+ @abstract Write an alignment to BAM. + @param fp BAM file handler + @param c pointer to the bam1_core_t structure + @param data_len total length of variable size data related to + the alignment + @param data pointer to the concatenated data + @return number of bytes written to the file + + @discussion This function is not affected by the machine + endianness. + */ + int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data); + + /*! + @abstract Write an alignment to BAM. + @param fp BAM file handler + @param b alignment to write + @return number of bytes written to the file + + @abstract It is equivalent to: + bam_write1_core(fp, &b->core, b->data_len, b->data) + */ + int bam_write1(bamFile fp, const bam1_t *b); + + /*! @function + @abstract Initiate a pointer to bam1_t struct + */ +#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) + + /*! @function + @abstract Free the memory allocated for an alignment. + @param b pointer to an alignment + */ +#define bam_destroy1(b) do { \ + free((b)->data); free(b); \ + } while (0) + + /*! + @abstract Format a BAM record in the SAM format + @param header pointer to the header structure + @param b alignment to print + @return a pointer to the SAM string + */ + char *bam_format1(const bam_header_t *header, const bam1_t *b); + + /*! @typedef + @abstract Structure for one alignment covering the pileup position. + @field b pointer to the alignment + @field qpos position of the read base at the pileup site, 0-based + @field indel indel length; 0 for no indel, positive for ins and negative for del + @field is_del 1 iff the base on the padded read is a deletion + @field level the level of the read in the "viewer" mode + + @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The + difference between the two functions is that the former does not + set bam_pileup1_t::level, while the later does. Level helps the + implementation of alignment viewers, but calculating this has some + overhead. 
+ */ + typedef struct { + bam1_t *b; + int32_t qpos; + int indel, level; + uint32_t is_del:1, is_head:1, is_tail:1; + } bam_pileup1_t; + + struct __bam_plbuf_t; + /*! @abstract pileup buffer */ + typedef struct __bam_plbuf_t bam_plbuf_t; + + void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + + /*! @typedef + @abstract Type of function to be called by bam_plbuf_push(). + @param tid chromosome ID as is defined in the header + @param pos start coordinate of the alignment, 0-based + @param n number of elements in pl array + @param pl array of alignments + @param data user provided data + @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t. + */ + typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); + + /*! + @abstract Reset a pileup buffer for another pileup process + @param buf the pileup buffer to be reset + */ + void bam_plbuf_reset(bam_plbuf_t *buf); + + /*! + @abstract Initialize a buffer for pileup. + @param func fucntion to be called by bam_pileup_core() + @param data user provided data + @return pointer to the pileup buffer + */ + bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); + + /*! + @abstract Destroy a pileup buffer. + @param buf pointer to the pileup buffer + */ + void bam_plbuf_destroy(bam_plbuf_t *buf); + + /*! + @abstract Push an alignment to the pileup buffer. + @param b alignment to be pushed + @param buf pileup buffer + @see bam_plbuf_init() + @return always 0 currently + + @discussion If all the alignments covering a particular site have + been collected, this function will call the user defined function + as is provided to bam_plbuf_init(). The coordinate of the site and + all the alignments will be transferred to the user defined + function as function parameters. + + When all the alignments are pushed to the buffer, this function + needs to be called with b equal to NULL. This will flush the + buffer. 
A pileup buffer can only be reused when bam_plbuf_reset() + is called. + */ + int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); + + struct __bam_lplbuf_t; + typedef struct __bam_lplbuf_t bam_lplbuf_t; + + void bam_lplbuf_reset(bam_lplbuf_t *buf); + + /*! @abstract bam_plbuf_init() equivalent with level calculated. */ + bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data); + + /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */ + void bam_lplbuf_destroy(bam_lplbuf_t *tv); + + /*! @abstract bam_plbuf_push() equivalent with level calculated. */ + int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); + + /*! @abstract bam_plbuf_file() equivalent with level calculated. */ + int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); + + struct __bam_index_t; + typedef struct __bam_index_t bam_index_t; + + /*! + @abstract Build index for a BAM file. + @discussion Index file "fn.bai" will be created. + @param fn name of the BAM file + @return always 0 currently + */ + int bam_index_build(const char *fn); + + /*! + @abstract Load index from file "fn.bai". + @param fn name of the BAM file (NOT the index file) + @return pointer to the index structure + */ + bam_index_t *bam_index_load(const char *fn); + + /*! + @abstract Destroy an index structure. + @param idx pointer to the index structure + */ + void bam_index_destroy(bam_index_t *idx); + + /*! @typedef + @abstract Type of function to be called by bam_fetch(). + @param b the alignment + @param data user provided data + */ + typedef int (*bam_fetch_f)(const bam1_t *b, void *data); + + /*! + @abstract Retrieve the alignments that are overlapped with the + specified region. + + @discussion A user defined function will be called for each + retrieved alignment ordered by its start position. 
+ + @param fp BAM file handler + @param idx pointer to the alignment index + @param tid chromosome ID as is defined in the header + @param beg start coordinate, 0-based + @param end end coordinate, 0-based + @param data user provided data (will be transferred to func) + @param func user defined function + */ + int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); + + /*! + @abstract Parse a region in the format: "chr2:100,000-200,000". + @discussion bam_header_t::hash will be initialized if empty. + @param header pointer to the header structure + @param str string to be parsed + @param ref_id the returned chromosome ID + @param begin the returned start coordinate + @param end the returned end coordinate + @return 0 on success; -1 on failure + */ + int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + + /*! + @abstract Retrieve data of a tag + @param b pointer to an alignment struct + @param tag two-character tag to be retrieved + + @return pointer to the type and data. The first character is the + type that can be 'iIsScCdfAZH'. + + @discussion Use bam_aux2?() series to convert the returned data to + the corresponding type. + */ + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); + + int32_t bam_aux2i(const uint8_t *s); + float bam_aux2f(const uint8_t *s); + double bam_aux2d(const uint8_t *s); + char bam_aux2A(const uint8_t *s); + char *bam_aux2Z(const uint8_t *s); + + void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); + + uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() + + /*! + @abstract Calculate the rightmost coordinate of an alignment on the + reference genome. 
+ + @param c pointer to the bam1_core_t structure + @param cigar the corresponding CIGAR array (from bam1_t::cigar) + @return the rightmost coordinate, 0-based + */ + uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar); + + /*! + @abstract Calculate the length of the query sequence from CIGAR. + @param c pointer to the bam1_core_t structure + @param cigar the corresponding CIGAR array (from bam1_t::cigar) + @return length of the query sequence + */ + int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar); + + typedef struct { + int32_t qbeg, qend; + int32_t tbeg, tend; + int32_t cbeg, cend; + } bam_segreg_t; + + int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg); + +#ifdef __cplusplus +} +#endif + +/*! + @abstract Calculate the minimum bin that contains a region [beg,end). + @param beg start of the region, 0-based + @param end end of the region, 0-based + @return bin + */ +static inline int bam_reg2bin(uint32_t beg, uint32_t end) +{ + --end; + if (beg>>14 == end>>14) return 4681 + (beg>>14); + if (beg>>17 == end>>17) return 585 + (beg>>17); + if (beg>>20 == end>>20) return 73 + (beg>>20); + if (beg>>23 == end>>23) return 9 + (beg>>23); + if (beg>>26 == end>>26) return 1 + (beg>>26); + return 0; +} + +/*! + @abstract Copy an alignment + @param bdst destination alignment struct + @param bsrc source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) +{ + uint8_t *data = bdst->data; + int m_data = bdst->m_data; // backup data and m_data + if (m_data < bsrc->m_data) { // double the capacity + m_data = bsrc->m_data; kroundup32(m_data); + data = (uint8_t*)realloc(data, m_data); + } + memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data + *bdst = *bsrc; // copy the rest + // restore the backup + bdst->m_data = m_data; + bdst->data = data; + return bdst; +} + +/*! 
+ @abstract Duplicate an alignment + @param src source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_dup1(const bam1_t *src) +{ + bam1_t *b; + b = bam_init1(); + *b = *src; + b->m_data = b->data_len; + b->data = (uint8_t*)calloc(b->data_len, 1); + memcpy(b->data, src->data, b->data_len); + return b; +} + +#endif diff --git a/bam_aux.c b/bam_aux.c new file mode 100644 index 0000000..7482500 --- /dev/null +++ b/bam_aux.c @@ -0,0 +1,232 @@ +#include <ctype.h> +#include "bam.h" +#include "khash.h" +typedef char *str_p; +KHASH_MAP_INIT_STR(s, int) +KHASH_MAP_INIT_STR(r2l, str_p) + +void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) +{ + int ori_len = b->data_len; + b->data_len += 3 + len; + b->l_aux += 3 + len; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; + b->data[ori_len + 2] = type; + memcpy(b->data + ori_len + 3, data, len); +} + +uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) +{ + return bam_aux_get(b, tag); +} + +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) +{ + uint8_t *s; + int y = tag[0]<<8 | tag[1]; + s = bam1_aux(b); + while (s < b->data + b->data_len) { + int type, x = (int)s[0]<<8 | s[1]; + s += 2; + if (x == y) return s; /* points at the type byte, followed by the value */ + type = toupper(*s); ++s; + if (type == 'C' || type == 'A') ++s; /* 'A' carries a one-byte value too; not skipping it desynchronises the scan */ + else if (type == 'S') s += 2; + else if (type == 'I' || type == 'F') s += 4; + else if (type == 'D') s += 8; + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } + return 0; +} + +void bam_init_header_hash(bam_header_t *header) +{ + if (header->hash == 0) { + int ret, i; + khiter_t iter; + khash_t(s) *h; + header->hash = h = kh_init(s); + for (i = 0; i < header->n_targets; ++i) { + iter = kh_put(s, h, header->target_name[i], &ret); + kh_value(h, iter) = i; + } + } +} + +void bam_destroy_header_hash(bam_header_t *header) 
+{ + if (header->hash) + kh_destroy(s, (khash_t(s)*)header->hash); +} + +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) +{ + khint_t k; + khash_t(s) *h = (khash_t(s)*)header->hash; + k = kh_get(s, h, seq_name); + return k == kh_end(h)? -1 : kh_value(h, k); +} + +int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + khiter_t iter; + khash_t(s) *h; + + bam_init_header_hash(header); + h = (khash_t(s)*)header->hash; + + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the ref_id */ + if (iter == kh_end(h)) { // name not found + *ref_id = -1; free(s); + return -1; + } + *ref_id = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; /* all outputs are valid here, so report success as documented */ + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) { + fprintf(stderr, "[bam_parse_region] invalid region.\n"); + return -1; + } + return 0; +} + +int32_t bam_aux2i(const uint8_t *s) +{ + int type; + if (s == 0) return 0; + type = *s++; + if (type == 'c') return (int32_t)*(int8_t*)s; + else if (type == 'C') return (int32_t)*(uint8_t*)s; + else if (type == 's') return (int32_t)*(int16_t*)s; + else if (type == 'S') return (int32_t)*(uint16_t*)s; + else if (type == 'i' || type == 'I') return *(int32_t*)s; + else return 0; +} + +float bam_aux2f(const uint8_t *s) +{ + int type; + if (s == 0) return 0.0; /* check before dereferencing, as bam_aux2i() does */ + type = *s++; + if (type == 'f') return *(float*)s; + else return 0.0; +} + +double bam_aux2d(const uint8_t *s) +{ + int type; + if (s == 0) return 0.0; /* check before dereferencing, as bam_aux2i() does */ + type = *s++; + if (type == 'd') return *(double*)s; + 
else return 0.0; +} + +char bam_aux2A(const uint8_t *s) +{ + int type; + if (s == 0) return 0; /* check before dereferencing, as bam_aux2i() does */ + type = *s++; + if (type == 'A') return *(char*)s; + else return 0; +} + +char *bam_aux2Z(const uint8_t *s) +{ + int type; + if (s == 0) return 0; /* check before dereferencing, as bam_aux2i() does */ + type = *s++; + if (type == 'Z' || type == 'H') return (char*)s; + else return 0; +} + +/****************** + * rg2lib related * + ******************/ + +int bam_strmap_put(void *rg2lib, const char *rg, const char *lib) +{ + int ret; + khint_t k; + khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; + char *key; + if (h == 0) return 1; + key = strdup(rg); + k = kh_put(r2l, h, key, &ret); + if (ret) kh_val(h, k) = strdup(lib); + else { + fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg); + free(key); + } + return 0; +} + +const char *bam_strmap_get(const void *rg2lib, const char *rg) +{ + const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; + khint_t k; + if (h == 0) return 0; + k = kh_get(r2l, h, rg); + if (k != kh_end(h)) return (const char*)kh_val(h, k); + else return 0; +} + +void *bam_strmap_dup(const void *rg2lib) +{ + const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; + khash_t(r2l) *g; + khint_t k, l; + int ret; + if (h == 0) return 0; + g = kh_init(r2l); + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + char *key = strdup(kh_key(h, k)); + l = kh_put(r2l, g, key, &ret); + kh_val(g, l) = strdup(kh_val(h, k)); + } + } + return g; +} + +void *bam_strmap_init() +{ + return (void*)kh_init(r2l); +} + +void bam_strmap_destroy(void *rg2lib) +{ + khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; + khint_t k; + if (h == 0) return; + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free((char*)kh_key(h, k)); free(kh_val(h, k)); + } + } + kh_destroy(r2l, h); +} diff --git a/bam_color.c b/bam_color.c new file mode 100644 index 0000000..75aedd6 --- /dev/null +++ b/bam_color.c @@ -0,0 +1,127 @@ +#include <ctype.h> +#include "bam.h" + +/*! 
+ @abstract Get the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color + + @discussion Returns 0 no color information is found. + */ +char bam_aux_getCSi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + + // return the base if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) i = strlen(cs) - 1 - i; + else i++; + return cs[i]; +} + +/*! + @abstract Get the color quality of the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color quality + + @discussion Returns 0 no color information is found. + */ +char bam_aux_getCQi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CQ"); + char *cq = NULL; + + // return the base if the tag was not found + if(0 == c) return 0; + + cq = bam_aux2Z(c); + // adjust for strandedness + if(bam1_strand(b)) i = strlen(cq) - 1 - i; + return cq[i]; +} + +char bam_aux_nt2int(char a) +{ + switch(toupper(a)) { + case 'A': + return 0; + break; + case 'C': + return 1; + break; + case 'G': + return 2; + break; + case 'T': + return 3; + break; + default: + return 4; + break; + } +} + +char bam_aux_ntnt2cs(char a, char b) +{ + a = bam_aux_nt2int(a); + b = bam_aux_nt2int(b); + if(4 == a || 4 == b) return '4'; + return "0123"[(int)(a ^ b)]; +} + +/*! + @abstract Get the color error profile at the give position + @param b pointer to an alignment + @return the original color if the color was an error, '-' (dash) otherwise + + @discussion Returns 0 no color information is found. 
+ */ +char bam_aux_getCEi(bam1_t *b, int i) +{ + int cs_i; + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + char prev_b, cur_b; + char cur_color, cor_color; + + // return the base if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) { //reverse strand + cs_i = strlen(cs) - 1 - i; + // get current color + cur_color = cs[cs_i]; + // get previous base + prev_b = (0 == cs_i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + else { + cs_i=i+1; + // get current color + cur_color = cs[cs_i]; + // get previous base + prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + + // corrected color + cor_color = bam_aux_ntnt2cs(prev_b, cur_b); + + if(cur_color == cor_color) { + return '-'; + } + else { + return cur_color; + } +} diff --git a/bam_endian.h b/bam_endian.h new file mode 100644 index 0000000..0fc74a8 --- /dev/null +++ b/bam_endian.h @@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 
16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif diff --git a/bam_import.c b/bam_import.c new file mode 100644 index 0000000..fccaa02 --- /dev/null +++ b/bam_import.c @@ -0,0 +1,475 @@ +#include +#include +#include +#include +#include +#include +#include +#include "kstring.h" +#include "bam.h" +#include "kseq.h" +#include "khash.h" + +KSTREAM_INIT(gzFile, gzread, 8192) +KHASH_MAP_INIT_STR(ref, uint64_t) + +void bam_init_header_hash(bam_header_t *header); +void bam_destroy_header_hash(bam_header_t *header); +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + +unsigned char bam_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"; + +struct __tamFile_t { + gzFile fp; + kstream_t *ks; + kstring_t *str; + uint64_t n_lines; + int is_first; +}; + +char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only +{ + char **list = 0, *s; + int n = 0, dret, m = 0; + 
gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + kstream_t *ks; + kstring_t *str; + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, '\n', str, &dret) > 0) { + if (n == m) { + m = m? m << 1 : 16; + list = (char**)realloc(list, m * sizeof(char*)); + } + if (str->s[str->l-1] == '\r') + str->s[--str->l] = '\0'; + s = list[n++] = (char*)calloc(str->l + 1, 1); + strcpy(s, str->s); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + *_n = n; + return list; +} + +static bam_header_t *hash2header(const kh_ref_t *hash) +{ + bam_header_t *header; + khiter_t k; + header = bam_header_init(); + header->n_targets = kh_size(hash); + header->target_name = (char**)calloc(kh_size(hash), sizeof(char*)); + header->target_len = (uint32_t*)calloc(kh_size(hash), 4); + for (k = kh_begin(hash); k != kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + int i = (int)kh_value(hash, k); + header->target_name[i] = (char*)kh_key(hash, k); + header->target_len[i] = kh_value(hash, k)>>32; + } + } + bam_init_header_hash(header); + return header; +} +bam_header_t *sam_header_read2(const char *fn) +{ + bam_header_t *header; + int c, dret, ret; + gzFile fp; + kstream_t *ks; + kstring_t *str; + kh_ref_t *hash; + khiter_t k; + hash = kh_init(ref); + fp = (strcmp(fn, "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + assert(fp); + ks = ks_init(fp); + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + while (ks_getuntil(ks, 0, str, &dret) > 0) { + char *s = strdup(str->s); + int len, i; + i = kh_size(hash); + ks_getuntil(ks, 0, str, &dret); + len = atoi(str->s); + k = kh_put(ref, hash, s, &ret); + kh_value(hash, k) = (uint64_t)len<<32 | i; + if (dret != '\n') + while ((c = ks_getc(ks)) != '\n' && c != -1); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); + header = hash2header(hash); + kh_destroy(ref, hash); + return header; +} +static inline uint8_t *alloc_data(bam1_t *b, int size) +{ + if (b->m_data < size) { + b->m_data = size; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + return b->data; +} +static inline void parse_error(int64_t n_lines, const char * __restrict msg) +{ + fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg); + abort(); +} +static inline void append_text(bam_header_t *header, kstring_t *str) +{ + int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null + kroundup32(x); kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. 
+ header->l_text += str->l + 1; + header->text[header->l_text] = 0; +} + +int sam_header_parse_rg(bam_header_t *h) +{ + kstring_t *rgid, *rglib; + char *p, *q, *s, *r; + int n = 0; + + // free + if (h == 0) return 0; + bam_strmap_destroy(h->rg2lib); h->rg2lib = 0; + if (h->l_text < 3) return 0; + // parse @RG lines + h->rg2lib = bam_strmap_init(); + rgid = calloc(1, sizeof(kstring_t)); + rglib = calloc(1, sizeof(kstring_t)); + s = h->text; + while ((s = strstr(s, "@RG")) != 0) { + if (rgid->l && rglib->l) { + bam_strmap_put(h->rg2lib, rgid->s, rglib->s); + ++n; + } + rgid->l = rglib->l = 0; + s += 3; + r = s; + if ((p = strstr(s, "ID:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + kputsn(q, p - q, rgid); + } else { + fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n"); + break; + } + if (r < p) r = p; + if ((p = strstr(s, "LB:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + kputsn(q, p - q, rglib); + } else { + fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n"); + break; + } + if (r < p) r = p; + s = r + 3; + } + if (rgid->l && rglib->l) { + bam_strmap_put(h->rg2lib, rgid->s, rglib->s); + ++n; + } + free(rgid->s); free(rgid); + free(rglib->s); free(rglib); + if (n == 0) { + bam_strmap_destroy(h->rg2lib); + h->rg2lib = 0; + } + return n; +} + +int sam_header_parse(bam_header_t *h) +{ + int i; + char *s, *p, *q, *r; + + // free + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + if (h->l_text < 3) return 0; + // count number of @SQ + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + ++h->n_targets; + s += 3; + } + if (h->n_targets == 0) return 0; + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + // parse @SQ lines + i = 0; + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + s += 3; + r = s; + if ((p = strstr(s, 
"SN:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + h->target_name[i] = (char*)calloc(p - q + 1, 1); + strncpy(h->target_name[i], q, p - q); + } else goto header_err_ret; + if (r < p) r = p; + if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10); + else goto header_err_ret; + if (r < p) r = p; + s = r + 3; + ++i; + } + sam_header_parse_rg(h); + return h->n_targets; + +header_err_ret: + fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n"); + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + return 0; +} + +bam_header_t *sam_header_read(tamFile fp) +{ + int ret, dret; + bam_header_t *header = bam_header_init(); + kstring_t *str = fp->str; + while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header + str->s[str->l] = dret; // note that str->s is NOT null terminated!! + append_text(header, str); + if (dret != '\n') { + ret = ks_getuntil(fp->ks, '\n', str, &dret); + str->s[str->l] = '\n'; // NOT null terminated!! 
+ append_text(header, str); + } + ++fp->n_lines; + } + sam_header_parse(header); + bam_init_header_hash(header); + fp->is_first = 1; + return header; +} + +int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) +{ + int ret, doff, doff0, dret, z = 0; + bam1_core_t *c = &b->core; + kstring_t *str = fp->str; + kstream_t *ks = fp->ks; + + if (fp->is_first) { + fp->is_first = 0; + ret = str->l; + } else { + do { // special consideration for empty lines + ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); + if (ret >= 0) z += str->l + 1; + } while (ret == 0); + } + if (ret < 0) return -1; + ++fp->n_lines; + doff = 0; + + { // name + c->l_qname = strlen(str->s) + 1; + memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname); + doff += c->l_qname; + } + { // flag, tid, pos, qual + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s); + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); + if (c->tid < 0 && strcmp(str->s, "*")) { + if (header->n_targets == 0) { + fprintf(stderr, "[sam_read1] missing header? Abort!\n"); + exit(1); + } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s); + } + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? 
atoi(str->s) : 0; + if (ret < 0) return -2; + } + { // cigar + char *s, *t; + int i, op; + long x; + c->n_cigar = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; + z += str->l + 1; + if (str->s[0] != '*') { + for (s = str->s; *s; ++s) { + if (isalpha(*s)) ++c->n_cigar; + else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); + } + b->data = alloc_data(b, doff + c->n_cigar * 4); + for (i = 0, s = str->s; i != c->n_cigar; ++i) { + x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M') op = BAM_CMATCH; + else if (op == 'I') op = BAM_CINS; + else if (op == 'D') op = BAM_CDEL; + else if (op == 'N') op = BAM_CREF_SKIP; + else if (op == 'S') op = BAM_CSOFT_CLIP; + else if (op == 'H') op = BAM_CHARD_CLIP; + else if (op == 'P') op = BAM_CPAD; + else parse_error(fp->n_lines, "invalid CIGAR operation"); + s = t + 1; + bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; + } + if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); + c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); + doff += c->n_cigar * 4; + } else c->bin = bam_reg2bin(c->pos, c->pos + 1); + } + { // mtid, mpos, isize + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? 
atoi(str->s) : 0; + if (ret < 0) return -4; + } + { // seq and qual + int i; + uint8_t *p; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq + z += str->l + 1; + c->l_qseq = strlen(str->s); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; + bzero(p, (c->l_qseq+1)/2); + for (i = 0; i < c->l_qseq; ++i) + p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual + z += str->l + 1; + if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) + parse_error(fp->n_lines, "sequence and quality are inconsistent"); + p += (c->l_qseq+1)/2; + if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff; + else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; + doff += c->l_qseq + (c->l_qseq+1)/2; + } + doff0 = doff; + if (dret != '\n' && dret != '\r') { // aux + while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) { + uint8_t *s, type, key[2]; + z += str->l + 1; + if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') + parse_error(fp->n_lines, "missing colon in auxiliary data"); + key[0] = str->s[0]; key[1] = str->s[1]; + type = str->s[3]; + s = alloc_data(b, doff + 3) + doff; + s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility + s = alloc_data(b, doff + 2) + doff; + *s++ = 'A'; *s = str->s[5]; + doff += 2; + } else if (type == 'I' || type == 'i') { + long long x; + s = alloc_data(b, doff + 5) + doff; + x = (long long)atoll(str->s + 5); + if (x < 0) { + if (x >= -127) { + *s++ = 'c'; *(int8_t*)s = (int8_t)x; + s += 1; doff += 2; + } else if (x >= -32767) { + *s++ = 's'; *(int16_t*)s = (int16_t)x; + s += 2; doff += 3; + } else { + *s++ = 'i'; *(int32_t*)s = (int32_t)x; + s += 4; doff += 5; + if (x < 
-2147483648ll) + fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", + (long long)fp->n_lines, x); + } + } else { + if (x <= 255) { + *s++ = 'C'; *s++ = (uint8_t)x; + doff += 2; + } else if (x <= 65535) { + *s++ = 'S'; *(uint16_t*)s = (uint16_t)x; + s += 2; doff += 3; + } else { + *s++ = 'I'; *(uint32_t*)s = (uint32_t)x; + s += 4; doff += 5; + if (x > 4294967295ll) + fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", + (long long)fp->n_lines, x); + } + } + } else if (type == 'f') { + s = alloc_data(b, doff + 5) + doff; + *s++ = 'f'; + *(float*)s = (float)atof(str->s + 5); + s += 4; doff += 5; + } else if (type == 'd') { + s = alloc_data(b, doff + 9) + doff; + *s++ = 'd'; + *(float*)s = (float)atof(str->s + 9); + s += 8; doff += 9; + } else if (type == 'Z' || type == 'H') { + int size = 1 + (str->l - 5) + 1; + if (type == 'H') { // check whether the hex string is valid + int i; + if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even"); + for (i = 0; i < str->l - 5; ++i) { + int c = toupper(str->s[5 + i]); + if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) + parse_error(fp->n_lines, "invalid hex character"); + } + } + s = alloc_data(b, doff + size) + doff; + *s++ = type; + memcpy(s, str->s + 5, str->l - 5); + s[str->l - 5] = 0; + doff += size; + } else parse_error(fp->n_lines, "unrecognized type"); + if (dret == '\n' || dret == '\r') break; + } + } + b->l_aux = doff - doff0; + b->data_len = doff; + return z; +} + +tamFile sam_open(const char *fn) +{ + tamFile fp; + fp = (tamFile)calloc(1, sizeof(struct __tamFile_t)); + fp->str = (kstring_t*)calloc(1, sizeof(kstring_t)); + fp->fp = (strcmp(fn, "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + fp->ks = ks_init(fp->fp); + return fp; +} + +void sam_close(tamFile fp) +{ + if (fp) { + ks_destroy(fp->ks); + gzclose(fp->fp); + free(fp->str->s); free(fp->str); + free(fp); + } +} diff --git a/bam_index.c b/bam_index.c new file mode 100644 index 0000000..72ef270 --- /dev/null +++ b/bam_index.c @@ -0,0 +1,551 @@ +#include +#include +#include "bam.h" +#include "khash.h" +#include "ksort.h" +#include "bam_endian.h" +#include "knetfile.h" + +/*! + @header + + Alignment indexing. Before indexing, BAM must be sorted based on the + leftmost coordinate of alignments. In indexing, BAM uses two indices: + a UCSC binning index and a simple linear index. The binning index is + efficient for alignments spanning long distance, while the auxiliary + linear index helps to reduce unnecessary seek calls especially for + short alignments. + + The UCSC binning scheme was suggested by Richard Durbin and Lincoln + Stein and is explained by Kent et al. (2002). In this scheme, each bin + represents a contiguous genomic region which can be fully contained in + another bin; each alignment is associated with a bin which represents + the smallest region containing the entire alignment. The binning + scheme is essentially another representation of R-tree. A distinct bin + uniquely corresponds to a distinct internal node in a R-tree. Bin A is + a child of Bin B if region A is contained in B. + + In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin + 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp, + 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to + find the alignments overlapped with a region [rbeg,rend), we need to + calculate the list of bins that may be overlapped the region and test + the alignments in the bins to confirm the overlaps. If the specified + region is short, typically only a few alignments in six bins need to + be retrieved. 
The overlapping alignments can be quickly fetched. + + */ + +#define BAM_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. +#define BAM_LIDX_SHIFT 14 + +typedef struct { + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) +KSORT_INIT(off, pair64_t, pair64_lt) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} bam_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bam_lidx_t; + +KHASH_MAP_INIT_INT(i, bam_binlist_t) + +struct __bam_index_t { + int32_t n; + khash_t(i) **index; + bam_lidx_t *index2; +}; + +// requirement: len <= LEN_MASK +static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + bam_binlist_t *l; + int ret; + k = kh_put(i, h, bin, &ret); + l = &kh_value(h, k); + if (ret) { // not present + l->m = 1; l->n = 0; + l->list = (pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; l->list[l->n++].v = end; +} + +static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) +{ + int i, beg, end; + beg = b->core.pos >> BAM_LIDX_SHIFT; + end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + for (i = beg + 1; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + index2->n = end + 1; +} + +static void merge_chunks(bam_index_t *idx) +{ +#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) + khash_t(i) *index; + int i, l, m; + khint_t k; + for (i = 0; i < idx->n; ++i) { + index = idx->index[i]; + for (k = kh_begin(index); k != kh_end(index); ++k) { + bam_binlist_t *p; + if (!kh_exist(index, k)) continue; + p = &kh_value(index, k); + m = 0; + for (l = 1; l < p->n; ++l) { +#ifdef 
BAM_TRUE_OFFSET + if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; +#else + if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; +#endif + else p->list[++m] = p->list[l]; + } // ~for(l) + p->n = m + 1; + } // ~for(k) + } // ~for(i) +#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) +} + +bam_index_t *bam_index_core(bamFile fp) +{ + bam1_t *b; + bam_header_t *h; + int i, ret; + bam_index_t *idx; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; + bam1_core_t *c; + uint64_t save_off, last_off; + + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + h = bam_header_read(fp); + c = &b->core; + + idx->n = h->n_targets; + bam_header_destroy(h); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); + idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); + + save_bin = save_tid = last_tid = last_bin = 0xffffffffu; + save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; + while ((ret = bam_read1(fp, b)) >= 0) { + if (last_tid != c->tid) { // change of chromosomes + last_tid = c->tid; + last_bin = 0xffffffffu; + } else if (last_coor > c->pos) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", + bam1_qname(b), last_coor, c->pos, c->tid+1); + exit(1); + } + if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->bin != last_bin) { // then possibly write the binning index + if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + save_off = last_off; + save_bin = last_bin = c->bin; + save_tid = c->tid; + if (save_tid < 0) break; + } + if (bam_tell(fp) <= last_off) { + fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", + (unsigned long long)bam_tell(fp), (unsigned 
long long)last_off); + exit(1); + } + last_off = bam_tell(fp); + last_coor = b->core.pos; + } + if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + merge_chunks(idx); + if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); + free(b->data); free(b); + return idx; +} + +void bam_index_destroy(bam_index_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) + free(kh_value(index, k).list); + } + kh_destroy(i, index); + free(index2->offset); + } + free(idx->index); free(idx->index2); + free(idx); +} + +void bam_index_save(const bam_index_t *idx, FILE *fp) +{ + int32_t i, size; + khint_t k; + fwrite("BAI\1", 1, 4, fp); + if (bam_is_be) { + uint32_t x = idx->n; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&idx->n, 4, 1, fp); + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + // write binning index + size = kh_size(index); + if (bam_is_be) { // big endian + uint32_t x = size; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&size, 4, 1, fp); + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) { + bam_binlist_t *p = &kh_value(index, k); + if (bam_is_be) { // big endian + uint32_t x; + x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + fwrite(p->list, 16, p->n, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } else { + fwrite(&kh_key(index, k), 4, 1, fp); + fwrite(&p->n, 4, 1, fp); + fwrite(p->list, 16, p->n, fp); + } + } + } + // write 
linear index (index2) + if (bam_is_be) { + int x = index2->n; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&index2->n, 4, 1, fp); + if (bam_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + fwrite(index2->offset, 8, index2->n, fp); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else fwrite(index2->offset, 8, index2->n, fp); + } + fflush(fp); +} + +static bam_index_t *bam_index_load_core(FILE *fp) +{ + int i; + char magic[4]; + bam_index_t *idx; + if (fp == 0) { + fprintf(stderr, "[bam_index_load_core] fail to load index.\n"); + return 0; + } + fread(magic, 1, 4, fp); + if (strncmp(magic, "BAI\1", 4)) { + fprintf(stderr, "[bam_index_load] wrong magic number.\n"); + fclose(fp); + return 0; + } + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); + fread(&idx->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&idx->n); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index; + bam_lidx_t *index2 = idx->index2 + i; + uint32_t key, size; + khint_t k; + int j, ret; + bam_binlist_t *p; + index = idx->index[i] = kh_init(i); + // load binning index + fread(&size, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&size); + for (j = 0; j < (int)size; ++j) { + fread(&key, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&key); + k = kh_put(i, index, key, &ret); + p = &kh_value(index, k); + fread(&p->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&p->n); + p->m = p->n; + p->list = (pair64_t*)malloc(p->m * 16); + fread(p->list, 16, p->n, fp); + if (bam_is_be) { + int x; + for (x = 0; x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } + } + // load linear index + fread(&index2->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = 
(uint64_t*)calloc(index2->m, 8); + fread(index2->offset, index2->n, 8, fp); + if (bam_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +bam_index_t *bam_index_load_local(const char *_fn) +{ + FILE *fp; + char *fnidx, *fn; + + if (strstr(_fn, "ftp://") == _fn) { + const char *p; + int l = strlen(_fn); + for (p = _fn + l - 1; p >= _fn; --p) + if (*p == '/') break; + fn = strdup(p + 1); + } else fn = strdup(_fn); + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + fp = fopen(fnidx, "r"); + if (fp == 0) { // try "{base}.bai" + char *s = strstr(fn, "bam"); + if (s == fn + strlen(fn) - 3) { + strcpy(fnidx, fn); + fnidx[strlen(fn)-1] = 'i'; + fp = fopen(fnidx, "r"); + } + } + free(fnidx); free(fn); + if (fp) { + bam_index_t *idx = bam_index_load_core(fp); + fclose(fp); + return idx; + } else return 0; +} + +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} + +bam_index_t *bam_index_load(const char *fn) +{ + bam_index_t *idx; + idx = bam_index_load_local(fn); + if (idx == 0 && strstr(fn, "ftp://") == fn) { + char *fnidx = calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bai"); + fprintf(stderr, 
"[bam_index_load] attempting to download the remote index file.\n"); + download_from_remote(fnidx); + idx = bam_index_load_local(fn); + } + if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); + return idx; +} + +int bam_index_build2(const char *fn, const char *_fnidx) +{ + char *fnidx; + FILE *fpidx; + bamFile fp; + bam_index_t *idx; + if ((fp = bam_open(fn, "r")) == 0) { + fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n"); + return -1; + } + idx = bam_index_core(fp); + bam_close(fp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + } else fnidx = strdup(_fnidx); + fpidx = fopen(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + bam_index_save(idx, fpidx); + bam_index_destroy(idx); + fclose(fpidx); + free(fnidx); + return 0; +} + +int bam_index_build(const char *fn) +{ + return bam_index_build2(fn, 0); +} + +int bam_index(int argc, char *argv[]) +{ + if (argc < 2) { + fprintf(stderr, "Usage: samtools index []\n"); + return 1; + } + if (argc >= 3) bam_index_build2(argv[1], argv[2]); + else bam_index_build(argv[1]); + return 0; +} + +#define MAX_BIN 37450 // =(8^6-1)/7+1 + +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +{ + int i = 0, k; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; +} + +static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) +{ + uint32_t rbeg = b->core.pos; + uint32_t rend = b->core.n_cigar? 
bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; + return (rend > beg && rbeg < end); +} + +int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +{ + uint16_t *bins; + int i, n_bins, n_off; + pair64_t *off; + khint_t k; + khash_t(i) *index; + uint64_t min_off; + + bins = (uint16_t*)calloc(MAX_BIN, 2); + n_bins = reg2bins(beg, end, bins); + index = idx->index[tid]; + min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) + n_off += kh_value(index, k).n; + } + if (n_off == 0) { + free(bins); return 0; + } + off = (pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { + int j; + bam_binlist_t *p = &kh_value(index, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + free(bins); + { + bam1_t *b; + int l, ret, n_seeks; + uint64_t curr_off; + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + ks_introsort(off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + { // merge adjacent blocks +#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) + for (i = 1, l = 0; i < n_off; ++i) { +#ifdef BAM_TRUE_OFFSET + if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; +#else + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; +#endif + else off[++l] = off[i]; + } + n_off = l + 1; +#endif + } + // retrive alignments + n_seeks = 0; i = -1; curr_off = 0; + for (;;) { + if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk + if (i == n_off - 
1) break; // no more chunks + if (i >= 0) assert(curr_off == off[i].v); // otherwise bug + if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek + bam_seek(fp, off[i+1].u, SEEK_SET); + curr_off = bam_tell(fp); + ++n_seeks; + } + ++i; + } + if ((ret = bam_read1(fp, b)) > 0) { + curr_off = bam_tell(fp); + if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed + else if (is_overlap(beg, end, b)) func(b, data); + } else break; // end of file + } +// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); + bam_destroy1(b); + } + free(off); + return 0; +} diff --git a/bam_lpileup.c b/bam_lpileup.c new file mode 100644 index 0000000..425290e --- /dev/null +++ b/bam_lpileup.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include "bam.h" +#include "ksort.h" + +#define TV_GAP 2 + +typedef struct __freenode_t { + uint32_t level:28, cnt:4; + struct __freenode_t *next; +} freenode_t, *freenode_p; + +#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) +KSORT_INIT(node, freenode_p, freenode_lt) + +/* Memory pool, similar to the one in bam_pileup.c */ +typedef struct { + int cnt, n, max; + freenode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + return (mempool_t*)calloc(1, sizeof(mempool_t)); +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) free(mp->buf[k]); + free(mp->buf); free(mp); +} +static inline freenode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, freenode_t *p) +{ + --mp->cnt; p->next = 0; p->cnt = TV_GAP; + if (mp->n == mp->max) { + mp->max = mp->max? 
mp->max<<1 : 256; + mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* core part */ +struct __bam_lplbuf_t { + int max, n_cur, n_pre; + int max_level, *cur_level, *pre_level; + mempool_t *mp; + freenode_t **aux, *head, *tail; + int n_nodes, m_aux; + bam_pileup_f func; + void *user_data; + bam_plbuf_t *plbuf; +}; + +void bam_lplbuf_reset(bam_lplbuf_t *buf) +{ + freenode_t *p, *q; + bam_plbuf_reset(buf->plbuf); + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; + buf->max_level = 0; + buf->n_cur = buf->n_pre = 0; + buf->n_nodes = 0; +} + +static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + bam_lplbuf_t *tv = (bam_lplbuf_t*)data; + freenode_t *p; + int i, l, max_level; + // allocate memory if necessary + if (tv->max < n) { // enlarge + tv->max = n; + kroundup32(tv->max); + tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); + tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); + } + tv->n_cur = n; + // update cnt + for (p = tv->head; p->next; p = p->next) + if (p->cnt > 0) --p->cnt; + // calculate cur_level[] + max_level = 0; + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) { + if (tv->head->next && tv->head->cnt == 0) { // then take a free slot + freenode_t *p = tv->head->next; + tv->cur_level[i] = tv->head->level; + mp_free(tv->mp, tv->head); + tv->head = p; + --tv->n_nodes; + } else tv->cur_level[i] = ++tv->max_level; + } else { + tv->cur_level[i] = tv->pre_level[l++]; + if (p->is_tail) { // then return a free slot + tv->tail->level = tv->cur_level[i]; + tv->tail->next = mp_alloc(tv->mp); + tv->tail = tv->tail->next; + ++tv->n_nodes; + } + } + if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; + ((bam_pileup1_t*)p)->level = tv->cur_level[i]; + } + assert(l == tv->n_pre); + tv->func(tid, pos, n, pl, tv->user_data); + // 
sort the linked list + if (tv->n_nodes) { + freenode_t *q; + if (tv->n_nodes + 1 > tv->m_aux) { // enlarge + tv->m_aux = tv->n_nodes + 1; + kroundup32(tv->m_aux); + tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); + } + for (p = tv->head, i = l = 0; p->next;) { + if (p->level > max_level) { // then discard this entry + q = p->next; + mp_free(tv->mp, p); + p = q; + } else { + tv->aux[i++] = p; + p = p->next; + } + } + tv->aux[i] = tv->tail; // add a proper tail for the loop below + tv->n_nodes = i; + if (tv->n_nodes) { + ks_introsort(node, tv->n_nodes, tv->aux); + for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; + tv->head = tv->aux[0]; + } else tv->head = tv->tail; + } + // clean up + tv->max_level = max_level; + memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); + // squeeze out terminated levels + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!p->is_tail) + tv->pre_level[l++] = tv->pre_level[i]; + } + tv->n_pre = l; +/* + fprintf(stderr, "%d\t", pos+1); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) fprintf(stderr, "^"); + if (p->is_tail) fprintf(stderr, "$"); + fprintf(stderr, "%d,", p->level); + } + fprintf(stderr, "\n"); +*/ + return 0; +} + +bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) +{ + bam_lplbuf_t *tv; + tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); + tv->mp = mp_init(); + tv->head = tv->tail = mp_alloc(tv->mp); + tv->func = func; + tv->user_data = data; + tv->plbuf = bam_plbuf_init(tview_func, tv); + return (bam_lplbuf_t*)tv; +} + +void bam_lplbuf_destroy(bam_lplbuf_t *tv) +{ + freenode_t *p, *q; + free(tv->cur_level); free(tv->pre_level); + bam_plbuf_destroy(tv->plbuf); + free(tv->aux); + for (p = tv->head; p->next;) { + q = p->next; + mp_free(tv->mp, p); p = q; + } + mp_free(tv->mp, p); + assert(tv->mp->cnt == 0); + mp_destroy(tv->mp); + free(tv); +} + +int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) +{ + return 
bam_plbuf_push(b, tv->plbuf); +} + +int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) +{ + bam_lplbuf_t *buf; + int ret; + bam1_t *b; + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + buf = bam_lplbuf_init(func, func_data); + bam_plbuf_set_mask(buf->plbuf, mask); + while ((ret = bam_read1(fp, b)) >= 0) + bam_lplbuf_push(b, buf); + bam_lplbuf_push(0, buf); + bam_lplbuf_destroy(buf); + free(b->data); free(b); + return 0; +} diff --git a/bam_maqcns.c b/bam_maqcns.c new file mode 100644 index 0000000..464288a --- /dev/null +++ b/bam_maqcns.c @@ -0,0 +1,526 @@ +#include +#include "bam.h" +#include "bam_maqcns.h" +#include "ksort.h" +KSORT_INIT_GENERIC(uint32_t) + +#define MAX_WINDOW 33 + +typedef struct __bmc_aux_t { + int max; + uint32_t *info; +} bmc_aux_t; + +typedef struct { + float esum[4], fsum[4]; + uint32_t c[4]; + uint32_t rms_mapQ; +} glf_call_aux_t; + +char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +/* + P() = \theta \sum_{i=1}^{N-1} 1/i + P(D|) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2] + p_k = i/k / \sum_{i=1}^{N-1} 1/i + */ +static void cal_het(bam_maqcns_t *aa) +{ + int k, n1, n2; + double sum_harmo; // harmonic sum + double poly_rate; + double p1 = 0.0, p3 = 0.0; // just for testing + + free(aa->lhet); + aa->lhet = (double*)calloc(256 * 256, sizeof(double)); + sum_harmo = 0.0; + for (k = 1; k <= aa->n_hap - 1; ++k) + sum_harmo += 1.0 / k; + for (n1 = 0; n1 < 256; ++n1) { + for (n2 = 0; n2 < 256; ++n2) { + long double sum = 0.0; + double lC = lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1} + for (k = 1; k <= aa->n_hap - 1; ++k) { + double pk = 1.0 / k / sum_harmo; + double log1 = log((double)k/aa->n_hap); + double log2 = log(1.0 - (double)k/aa->n_hap); + sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2)); + } + aa->lhet[n1<<8|n2] = lC + logl(sum); + if (n1 == 17 && n2 == 3) p3 = lC + logl(expl(logl(0.5) * 20)); + if (n1 
== 19 && n2 == 1) p1 = lC + logl(expl(logl(0.5) * 20)); + } + } + poly_rate = aa->het_rate * sum_harmo; + aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate)); +} + +/** initialize the helper structure */ +static void cal_coef(bam_maqcns_t *aa) +{ + int k, n, q; + long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256]; + double *lC; + + lC = (double*)calloc(256 * 256, sizeof(double)); + // aa->lhet will be allocated and initialized + free(aa->fk); free(aa->coef); + aa->fk = (double*)calloc(256, sizeof(double)); + aa->coef = (double*)calloc(256*256*64, sizeof(double)); + aa->fk[0] = fk2[0] = 1.0; + for (n = 1; n != 256; ++n) { + aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta; + fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands + } + for (n = 1; n != 256; ++n) + for (k = 1; k <= n; ++k) + lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); + for (q = 1; q != 64; ++q) { + double e = pow(10.0, -q/10.0); + double le = log(e); + double le1 = log(1.0-e); + for (n = 1; n != 256; ++n) { + double *coef = aa->coef + (q<<16|n<<8); + sum_a[n+1] = 0.0; + for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k} + sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1); + b[k] = sum_a[k+1] / sum_a[k]; + if (b[k] > 0.99) b[k] = 0.99; + } + for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k}) + q_c[k] = -4.343 * fk2[k] * logl(b[k] / e); + for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i + for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9 + tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k]))); + coef[k] = (k? 
q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk} + } + } + } + free(lC); +} + +bam_maqcns_t *bam_maqcns_init() +{ + bam_maqcns_t *bm; + bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t)); + bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t)); + bm->het_rate = 0.001; + bm->theta = 0.85; + bm->n_hap = 2; + bm->eta = 0.03; + bm->cap_mapQ = 60; + return bm; +} + +void bam_maqcns_prepare(bam_maqcns_t *bm) +{ + cal_coef(bm); cal_het(bm); +} + +void bam_maqcns_destroy(bam_maqcns_t *bm) +{ + if (bm == 0) return; + free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); + free(bm->aux); free(bm); +} + +glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm) +{ + glf_call_aux_t *b; + int i, j, k, w[8], c, n; + glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t)); + float p[16], min_p = 1e30; + uint64_t rms; + + g->ref_base = ref_base; + if (_n == 0) return g; + + // construct aux array + if (bm->aux->max < _n) { + bm->aux->max = _n; + kroundup32(bm->aux->max); + bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max); + } + for (i = n = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + uint32_t q, x = 0, qq; + if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; + q = (uint32_t)bam1_qual(p->b)[p->qpos]; + x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; + if (p->b->core.qual < q) q = p->b->core.qual; + x |= q << 24; + qq = bam1_seqi(bam1_seq(p->b), p->qpos); + q = bam_nt16_nt4_table[qq? 
qq : ref_base]; + if (!p->is_del && q < 4) x |= 1 << 21 | q << 16; + bm->aux->info[n++] = x; + } + ks_introsort(uint32_t, n, bm->aux->info); + // generate esum and fsum + b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t)); + for (k = 0; k != 8; ++k) w[k] = 0; + rms = 0; + for (j = n - 1; j >= 0; --j) { // calculate esum and fsum + uint32_t info = bm->aux->info[j]; + int tmp; + if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); + k = info>>16&7; + if (info>>24 > 0) { + b->esum[k&3] += bm->fk[w[k]] * (info>>24); + b->fsum[k&3] += bm->fk[w[k]]; + if (w[k] < 0xff) ++w[k]; + ++b->c[k&3]; + } + tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ; + rms += tmp * tmp; + } + b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499); + // rescale ->c[] + for (j = c = 0; j != 4; ++j) c += b->c[j]; + if (c > 255) { + for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5); + for (j = c = 0; j != 4; ++j) c += b->c[j]; + } + // generate likelihood + for (j = 0; j != 4; ++j) { + // homozygous + float tmp1, tmp3; + int tmp2, bar_e; + for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) { + if (j == k) continue; + tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.5); + if (bar_e < 4) bar_e = 4; // should not happen + if (bar_e > 63) bar_e = 63; + p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; + } else p[j<<2|j] = 0.0; // all the bases are j + // heterozygous + for (k = j + 1; k < 4; ++k) { + for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) { + if (i == j || i == k) continue; + tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.5); + if (bar_e < 4) bar_e = 4; + if (bar_e > 63) bar_e = 63; + p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; + } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k + } + // + for 
(k = 0; k != 4; ++k) + if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0; + } + + { // fix p[k<<2|k] + float max1, max2, min1, min2; + int max_k, min_k; + max_k = min_k = -1; + max1 = max2 = -1.0; min1 = min2 = 1e30; + for (k = 0; k < 4; ++k) { + if (b->esum[k] > max1) { + max2 = max1; max1 = b->esum[k]; max_k = k; + } else if (b->esum[k] > max2) max2 = b->esum[k]; + } + for (k = 0; k < 4; ++k) { + if (p[k<<2|k] < min1) { + min2 = min1; min1 = p[k<<2|k]; min_k = k; + } else if (p[k<<2|k] < min2) min2 = p[k<<2|k]; + } + if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) + p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0; + } + + // convert necessary information to glf1_t + g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ; + g->depth = n > 16777215? 16777215 : n; + for (j = 0; j != 4; ++j) + for (k = j; k < 4; ++k) + if (p[j<<2|k] < min_p) min_p = p[j<<2|k]; + g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5); + for (j = c = 0; j != 4; ++j) + for (k = j; k < 4; ++k) + g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5); + + free(b); + return g; +} + +uint32_t glf2cns(const glf1_t *g, int q_r) +{ + int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1; + uint32_t x = 0; + for (i = k = 0; i < 4; ++i) + for (j = i; j < 4; ++j) { + tmp[j<<2|i] = -1; + tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r); + } + for (i = 0; i < 16; ++i) { + if (tmp[i] < 0) continue; + if (tmp[i] < min) { + min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i; + } else if (tmp[i] < min2) { + min3 = min2; min2 = tmp[i]; min_g2 = i; + } else if (tmp[i] < min3) min3 = tmp[i]; + } + x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xf << 28; + x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xf << 24; + x |= (uint32_t)g->max_mapQ << 16; + x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8; + x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? 
min3 - min2 : 255) : 0xff; + return x; +} + +uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) +{ + glf1_t *g; + uint32_t x; + if (n) { + g = bam_maqcns_glfgen(n, pl, 0xf, bm); + x = glf2cns(g, (int)(bm->q_r + 0.5)); + free(g); + } else x = 0xfU<<28 | 0xfU<<24; + return x; +} + +/************** *****************/ + +bam_maqindel_opt_t *bam_maqindel_opt_init() +{ + bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); + mi->q_indel = 40; + mi->r_indel = 0.00015; + // + mi->mm_penalty = 3; + mi->indel_err = 4; + mi->ambi_thres = 10; + return mi; +} + +void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) +{ + if (mir == 0) return; + free(mir->s[0]); free(mir->s[1]); free(mir); +} + +#define MINUS_CONST 0x10000000 + +bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types) +{ + int i, j, n_types, *types, left, right; + bam_maqindel_ret_t *ret = 0; + // if there is no proposed indel, check if there is an indel from the alignment + if (_n_types == 0) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; + } + if (i == n) return 0; // no indel + } + { // calculate how many types of indels are available (set n_types and types) + int m; + uint32_t *aux; + aux = (uint32_t*)calloc(n + _n_types + 1, 4); + m = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) + aux[m++] = MINUS_CONST + p->indel; + } + if (_n_types) // then also add this to aux[] + for (i = 0; i < _n_types; ++i) + if (_types[i]) aux[m++] = MINUS_CONST + _types[i]; + ks_introsort(uint32_t, m, aux); + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + types = (int*)calloc(n_types, sizeof(int)); + j = 0; + 
types[j++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) { + if (aux[i] != aux[i-1]) + types[j++] = aux[i] - MINUS_CONST; + } + free(aux); + } + { // calculate left and right boundary + bam_segreg_t seg; + left = 0x7fffffff; right = 0; + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP)) { + bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg); + if (seg.tbeg < left) left = seg.tbeg; + if (seg.tend > right) right = seg.tend; + } + } + if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW; + if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW; + } + { // the core part + char *ref2, *inscns = 0; + int k, l, *score, *pscore, max_ins = types[n_types-1]; + ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1); + if (max_ins > 0) { // get the consensus of inserted sequences + int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); + // count occurrences + for (i = 0; i < n_types; ++i) { + if (types[i] <= 0) continue; // not insertion + for (j = 0; j < n; ++j) { + const bam_pileup1_t *p = pl + j; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) { + for (k = 1; k <= p->indel; ++k) { + int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)]; + if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c]; + } + } + } + } + // construct the consensus of inserted sequence + inscns = (char*)calloc(n_types * max_ins, sizeof(char)); + for (i = 0; i < n_types; ++i) { + for (j = 0; j < types[i]; ++j) { + int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4; + for (k = 0; k < 4; ++k) { + if (ia[k] > max) { + max = ia[k]; + max_k = k; + } + } + inscns[i*max_ins + j] = max? 
1<b->core; + int s, ps; + bam_segreg_t seg; + if (c->flag&BAM_FUNMAP) continue; + cigar = bam1_cigar(p->b); + bam_segreg(pos, c, cigar, &seg); + for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { + int cq = bam1_seqi(bam1_seq(p->b), l), ct; + // in the following line, "<" will happen if reads are too long + ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15; + if (cq < 15 && ct < 15) { + s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } + } + score[i*n + j] = s; pscore[i*n + j] = ps; + if (types[i] != 0) { // then try the other way to calculate the score + for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { + int cq = bam1_seqi(bam1_seq(p->b), l), ct; + ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15; + if (cq < 15 && ct < 15) { + s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } + } + } + if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores + if (pscore[i*n+j] > ps) pscore[i*n+j] = ps; + if (types[i] != 0) score[i*n+j] -= mi->indel_err; + //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j, + // score[i*n+j], pscore[i*n+j]); + } + } + { // get final result + int *sum, max1, max2, max1_i, max2_i; + // pick up the best two score + sum = (int*)calloc(n_types, sizeof(int)); + for (i = 0; i < n_types; ++i) + for (j = 0; j < n; ++j) + sum[i] += -pscore[i*n+j]; + max1 = max2 = -0x7fffffff; max1_i = max2_i = -1; + for (i = 0; i < n_types; ++i) { + if (sum[i] > max1) { + max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i; + } else if (sum[i] > max2) { + max2 = sum[i]; max2_i = i; + } + } + free(sum); + // write ret + ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t)); + ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; + ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); + ret->s[1] = 
(char*)calloc(abs(ret->indel2) + 2, 1); + // write indel sequence + if (ret->indel1 > 0) { + ret->s[0][0] = '+'; + for (k = 0; k < ret->indel1; ++k) + ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; + } else if (ret->indel1 < 0) { + ret->s[0][0] = '-'; + for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) + ret->s[0][k+1] = ref[pos + k + 1]; + } else ret->s[0][0] = '*'; + if (ret->indel2 > 0) { + ret->s[1][0] = '+'; + for (k = 0; k < ret->indel2; ++k) + ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; + } else if (ret->indel2 < 0) { + ret->s[1][0] = '-'; + for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k) + ret->s[1][k+1] = ref[pos + k + 1]; + } else ret->s[1][0] = '*'; + // write count + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel == ret->indel1) ++ret->cnt1; + else if (p->indel == ret->indel2) ++ret->cnt2; + else ++ret->cnt_anti; + } + // write gl[] + ret->gl[0] = ret->gl[1] = 0; + for (j = 0; j < n; ++j) { + int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; + //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2); + if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel; + else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel; + } + } + free(score); free(pscore); free(ref2); free(inscns); + } + { // call genotype + int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); + int min1, min2, min1_i; + q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; + q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; + q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel; + min1 = min2 = 0x7fffffff; min1_i = -1; + for (i = 0; i < 3; ++i) { + if (q[i] < min1) { + min2 = min1; min1 = q[i]; min1_i = i; + } else if (q[i] < min2) min2 = q[i]; + } + ret->gt = min1_i; + ret->q_cns = min2 - min1; + // set q_ref + if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 
0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3; + else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2]; + if (ret->q_ref < 0) ret->q_ref = 0; + } + free(types); + return ret; +} diff --git a/bam_maqcns.h b/bam_maqcns.h new file mode 100644 index 0000000..36704d7 --- /dev/null +++ b/bam_maqcns.h @@ -0,0 +1,55 @@ +#ifndef BAM_MAQCNS_H +#define BAM_MAQCNS_H + +#include "glf.h" + +struct __bmc_aux_t; + +typedef struct { + float het_rate, theta; + int n_hap, cap_mapQ; + + float eta, q_r; + double *fk, *coef; + double *lhet; + struct __bmc_aux_t *aux; +} bam_maqcns_t; + +typedef struct { + int q_indel; + float r_indel; + // hidden parameters, unchangeable from command line + int mm_penalty, indel_err, ambi_thres; +} bam_maqindel_opt_t; + +typedef struct { + int indel1, indel2; + int cnt1, cnt2, cnt_ambi, cnt_anti; + char *s[2]; + // + int gt, gl[2]; + int q_cns, q_ref; +} bam_maqindel_ret_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bam_maqcns_t *bam_maqcns_init(); + void bam_maqcns_prepare(bam_maqcns_t *bm); + void bam_maqcns_destroy(bam_maqcns_t *bm); + glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); + uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); + // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2 + uint32_t glf2cns(const glf1_t *g, int q_r); + + bam_maqindel_opt_t *bam_maqindel_opt_init(); + bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types); + void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bam_mate.c b/bam_mate.c new file mode 100644 index 0000000..61f808a --- /dev/null +++ b/bam_mate.c @@ -0,0 +1,70 @@ +#include +#include +#include "bam.h" + +// currently, this function ONLY works if each read has one hit +void bam_mating_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b[2]; + int curr, 
has_prev; + + header = bam_header_read(in); + bam_header_write(out, header); + + b[0] = bam_init1(); + b[1] = bam_init1(); + curr = 0; has_prev = 0; + while (bam_read1(in, b[curr]) >= 0) { + bam1_t *cur = b[curr], *pre = b[1-curr]; + if (has_prev) { + if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name + cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; + pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; + if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) + { + uint32_t cur5, pre5; + cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; + cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; + } else cur->core.isize = pre->core.isize = 0; + if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; + else cur->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; + else pre->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } + if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_write1(out, pre); + bam_write1(out, cur); + has_prev = 0; + } else { // unpaired or singleton + pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; + if (pre->core.flag & BAM_FPAIRED) { + pre->core.flag |= BAM_FMUNMAP; + pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; + } + bam_write1(out, pre); + } + } else has_prev = 1; + curr = 1 - curr; + } + if (has_prev) bam_write1(out, b[1-curr]); + bam_header_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); +} + +int bam_mating(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "samtools fixmate \n"); + return 1; + } + in = 
(strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + bam_mating_core(in, out); + bam_close(in); bam_close(out); + return 0; +} diff --git a/bam_md.c b/bam_md.c new file mode 100644 index 0000000..a20f9b3 --- /dev/null +++ b/bam_md.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include "faidx.h" +#include "bam.h" +#include "kstring.h" + +void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +{ + uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, u = 0; + kstring_t *str; + uint8_t *old_md; + + old_md = bam_aux_get(b, "MD"); + if (c->flag & BAM_FUNMAP) return; + if (old_md && !is_equal) return; // no need to add MD + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { + if (is_equal) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; + ++u; + } else { + ksprintf(str, "%d", u); + kputc(ref[x+j], str); + u = 0; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL) { + ksprintf(str, "%d", u); + kputc('^', str); + for (j = 0; j < l; ++j) { + if (ref[x+j] == 0) break; + kputc(ref[x+j], str); + } + u = 0; + if (j < l) break; + x += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + } else if (op == BAM_CREF_SKIP) { + x += l; + } + } + ksprintf(str, "%d", u); + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } else is_diff = 1; + if (is_diff) + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' != '%s'\n", bam1_qname(b), old_md+1, str->s); + } + free(str->s); free(str); +} + +int bam_fillmd(int argc, char *argv[]) +{ + int c, is_equal = 0, tid = -2, ret, len; + bamFile fp, fpout = 0; + bam_header_t *header; + faidx_t *fai; + char *ref = 0; + bam1_t *b; + + while ((c = getopt(argc, argv, "e")) >= 0) { + switch (c) { + case 'e': is_equal = 1; break; + default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; + } + } + if (optind + 1 >= argc) { + fprintf(stderr, "Usage: bam fillmd [-e] \n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + fpout = bam_dopen(fileno(stdout), "w"); + bam_header_write(fpout, header); + fai = fai_load(argv[optind+1]); + + b = bam_init1(); + while ((ret = bam_read1(fp, b)) >= 0) { + if (b->core.tid >= 0) { + if (tid != b->core.tid) { + free(ref); + ref = fai_fetch(fai, header->target_name[b->core.tid], &len); + tid = b->core.tid; + } + bam_fillmd1(b, ref, is_equal); + } + bam_write1(fpout, b); + } + bam_destroy1(b); + + free(ref); + fai_destroy(fai); + bam_header_destroy(header); + bam_close(fp); bam_close(fpout); + return 0; +} diff --git a/bam_pileup.c b/bam_pileup.c new file mode 100644 index 0000000..3ffd528 --- /dev/null +++ b/bam_pileup.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include "sam.h" + +typedef struct __linkbuf_t { + bam1_t b; + uint32_t beg, end; + struct __linkbuf_t *next; +} lbnode_t; + +/* --- BEGIN: Memory pool */ + +typedef struct { + int cnt, n, max; + lbnode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + mempool_t *mp; + mp = (mempool_t*)calloc(1, sizeof(mempool_t)); + return mp; +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) { + free(mp->buf[k]->b.data); + free(mp->buf[k]); + } + free(mp->buf); + free(mp); +} +static inline lbnode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, lbnode_t *p) +{ + --mp->cnt; p->next = 0; // clear lbnode_t::next here + if (mp->n == mp->max) { + mp->max = mp->max? 
mp->max<<1 : 256; + mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* --- END: Memory pool */ + +/* --- BEGIN: Auxiliary functions */ + +static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) +{ + unsigned k; + bam1_t *b = p->b; + bam1_core_t *c = &b->core; + uint32_t x = c->pos, y = 0; + int ret = 1, is_restart = 1; + + if (c->flag&BAM_FUNMAP) return 0; // unmapped read + assert(x <= pos); // otherwise a bug + p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation + int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length + if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip + if (x + l > pos) { // overlap with pos + p->indel = p->is_del = 0; + p->qpos = y + (pos - x); + if (x == pos && is_restart) p->is_head = 1; + if (x + l - 1 == pos) { // come to the end of a match + if (k < c->n_cigar - 1) { // there are additional operation(s) + uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR + int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation + if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del + else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins + if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP) + p->is_tail = 1; // tail + } else p->is_tail = 1; // this is the last operation; set tail + } + } + x += l; y += l; + } else if (op == BAM_CDEL) { // then set ->is_del + if (x + l > pos) { + p->indel = 0; p->is_del = 1; + p->qpos = y + (pos - x); + } + x += l; + } else if (op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); + if (x > pos) { + if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all + break; + } + } + 
assert(x > pos); // otherwise a bug + return ret; +} + +/* --- END: Auxiliary functions */ + +struct __bam_plbuf_t { + mempool_t *mp; + lbnode_t *head, *tail, *dummy; + bam_pileup_f func; + void *func_data; + int32_t tid, pos, max_tid, max_pos; + int max_pu, is_eof; + bam_pileup1_t *pu; + int flag_mask; +}; + +void bam_plbuf_reset(bam_plbuf_t *buf) +{ + lbnode_t *p, *q; + buf->max_tid = buf->max_pos = -1; + buf->tid = buf->pos = 0; + buf->is_eof = 0; + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; +} + +void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) +{ + if (mask < 0) buf->flag_mask = BAM_DEF_MASK; + else buf->flag_mask = BAM_FUNMAP | mask; +} + +bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +{ + bam_plbuf_t *buf; + buf = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t)); + buf->func = func; buf->func_data = data; + buf->mp = mp_init(); + buf->head = buf->tail = mp_alloc(buf->mp); + buf->dummy = mp_alloc(buf->mp); + buf->max_tid = buf->max_pos = -1; + buf->flag_mask = BAM_DEF_MASK; + return buf; +} + +void bam_plbuf_destroy(bam_plbuf_t *buf) +{ + mp_free(buf->mp, buf->dummy); + mp_free(buf->mp, buf->head); + if (buf->mp->cnt != 0) + fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", buf->mp->cnt); + mp_destroy(buf->mp); + free(buf->pu); + free(buf); +} + +int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +{ + if (b) { // fill buffer + if (b->core.tid < 0) return 0; + if (b->core.flag & buf->flag_mask) return 0; + bam_copy1(&buf->tail->b, b); + buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); + if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted. 
Abort!\n"); + abort(); + } + buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg; + if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { + buf->tail->next = mp_alloc(buf->mp); + buf->tail = buf->tail->next; + } + } else buf->is_eof = 1; + while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) { + int n_pu = 0; + lbnode_t *p, *q; + buf->dummy->next = buf->head; + for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list + q->next = p->next; mp_free(buf->mp, p); p = q; + } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup + if (n_pu == buf->max_pu) { // then double the capacity + buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256; + buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu); + } + buf->pu[n_pu].b = &p->b; + if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP + } + } + buf->head = buf->dummy->next; // dummy->next may be changed + if (n_pu) { // then call user defined function + buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data); + } + // update tid and pos + if (buf->head->next) { + if (buf->tid > buf->head->b.core.tid) { + fprintf(stderr, "[bam_plbuf_push] unsorted input. 
Pileup aborts.\n"); + return 1; + } + } + if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence + buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference + } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid + buf->pos = buf->head->beg; // jump to the next position + } else ++buf->pos; // scan contiguously + if (buf->is_eof && buf->head->next == 0) break; + } + return 0; +} diff --git a/bam_plcmd.c b/bam_plcmd.c new file mode 100644 index 0000000..5d5506f --- /dev/null +++ b/bam_plcmd.c @@ -0,0 +1,385 @@ +#include +#include +#include +#include +#include "sam.h" +#include "faidx.h" +#include "bam_maqcns.h" +#include "khash.h" +#include "glf.h" +#include "kstring.h" + +typedef int *indel_list_t; +KHASH_MAP_INIT_INT64(64, indel_list_t) + +#define BAM_PLF_SIMPLE 0x01 +#define BAM_PLF_CNS 0x02 +#define BAM_PLF_INDEL_ONLY 0x04 +#define BAM_PLF_GLF 0x08 +#define BAM_PLF_VAR_ONLY 0x10 +#define BAM_PLF_2ND 0x20 + +typedef struct { + bam_header_t *h; + bam_maqcns_t *c; + bam_maqindel_opt_t *ido; + faidx_t *fai; + khash_t(64) *hash; + uint32_t format; + int tid, len, last_pos; + int mask; + char *ref; + glfFile fp_glf; // for glf output only +} pu_data_t; + +char **__bam_get_lines(const char *fn, int *_n); +void bam_init_header_hash(bam_header_t *header); +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + +static khash_t(64) *load_pos(const char *fn, bam_header_t *h) +{ + char **list; + int i, j, n, *fields, max_fields; + khash_t(64) *hash; + bam_init_header_hash(h); + list = __bam_get_lines(fn, &n); + hash = kh_init(64); + max_fields = 0; fields = 0; + for (i = 0; i < n; ++i) { + char *str = list[i]; + int chr, n_fields, ret; + khint_t k; + uint64_t x; + n_fields = ksplit_core(str, 0, &max_fields, &fields); + if (n_fields < 2) continue; + chr = bam_get_tid(h, str + fields[0]); + if (chr < 0) { + fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + 
fields[0]); + continue; + } + x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1); + k = kh_put(64, hash, x, &ret); + if (ret == 0) { + fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]); + continue; + } + kh_val(hash, k) = 0; + if (n_fields > 2) { + // count + for (j = 2; j < n_fields; ++j) { + char *s = str + fields[j]; + if ((*s != '+' && *s != '-') || !isdigit(s[1])) break; + } + if (j > 2) { // update kh_val() + int *q, y, z; + q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int)); + q[0] = j - 2; z = j; y = 1; + for (j = 2; j < z; ++j) + q[y++] = atoi(str + fields[j]); + } + } + free(str); + } + free(list); free(fields); + return hash; +} + +// an analogy to pileup_func() below +static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) +{ + pu_data_t *d = (pu_data_t*)data; + bam_maqindel_ret_t *r = 0; + int rb, *proposed_indels = 0; + glf1_t *g; + glf3_t *g3; + + if (d->fai == 0) { + fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n"); + exit(1); + } + if (d->hash) { // only output a list of sites + khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); + if (k == kh_end(d->hash)) return 0; + proposed_indels = kh_val(d->hash, k); + } + g3 = glf3_init1(); + if (d->fai && (int)tid != d->tid) { + if (d->ref) { // then write the end mark + g3->rtype = GLF3_RTYPE_END; + glf3_write1(d->fp_glf, g3); + } + glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference + free(d->ref); + d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->tid = tid; + d->last_pos = 0; + } + rb = (d->ref && (int)pos < d->len)? 
d->ref[pos] : 'N'; + g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); + memcpy(g3, g, sizeof(glf1_t)); + g3->rtype = GLF3_RTYPE_SUB; + g3->offset = pos - d->last_pos; + d->last_pos = pos; + glf3_write1(d->fp_glf, g3); + if (proposed_indels) + r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + if (r) { // then write indel line + int het = 3 * n, min; + min = het; + if (min > r->gl[0]) min = r->gl[0]; + if (min > r->gl[1]) min = r->gl[1]; + g3->ref_base = 0; + g3->rtype = GLF3_RTYPE_INDEL; + memset(g3->lk, 0, 10); + g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255; + g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255; + g3->lk[2] = het - min < 255? het - min : 255; + g3->offset = 0; + g3->indel_len[0] = r->indel1; + g3->indel_len[1] = r->indel2; + g3->min_lk = min < 255? min : 255; + g3->max_len = (abs(r->indel1) > abs(r->indel2)? abs(r->indel1) : abs(r->indel2)) + 1; + g3->indel_seq[0] = strdup(r->s[0]+1); + g3->indel_seq[1] = strdup(r->s[1]+1); + glf3_write1(d->fp_glf, g3); + bam_maqindel_ret_destroy(r); + } + free(g); + glf3_destroy1(g3); + return 0; +} +

/*
 * Pileup callback: print one text pileup line for 0-based position `pos` on
 * reference `tid`, optionally followed by an indel line.
 *
 *   n    number of reads covering the position
 *   pu   array of `n` pileup entries
 *   data the pu_data_t holding options, reference and consensus state
 *
 * Always returns 0.  When BAM_PLF_GLF is set the work is delegated to
 * glt3_func() and its return value is passed through.
 */
static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
{
	pu_data_t *d = (pu_data_t*)data;
	bam_maqindel_ret_t *r = 0;
	int i, j, rb, rb4, rms_mapq = -1, *proposed_indels = 0;
	uint64_t rms_aux;
	uint32_t cns = 0;

	// if GLF is required, suppress -c completely
	if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data);
	// if d->hash is initialized, only output the sites in the hash table
	if (d->hash) {
		khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
		if (k == kh_end(d->hash)) return 0;
		proposed_indels = kh_val(d->hash, k);
	}
	// update d->ref if necessary
	if (d->fai && (int)tid != d->tid) {
		free(d->ref);
		d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
		d->tid = tid;
	}
	// Read the base through unsigned char: `rb` is used below as a table
	// index and as a <ctype.h> argument, neither of which accepts the
	// negative value a plain `char` may have.
	rb = (d->ref && (int)pos < d->len)? (unsigned char)d->ref[pos] : 'N';
	rb4 = bam_nt16_table[rb];
	// when the indel-only mode is asked for, return if no reads mapped with indels
	if (d->format & BAM_PLF_INDEL_ONLY) {
		for (i = 0; i < n; ++i)
			if (pu[i].indel != 0) break;
		if (i == n) return 0;
	}
	// call the consensus and indel
	if (d->format & BAM_PLF_CNS) // call consensus
		cns = bam_maqcns_call(n, pu, d->c);
	if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref) { // call indels
		if (proposed_indels) // the first element gives the size of the array
			r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
		else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
	}
	// when only variant sites are asked for, test if the site is a variant
	if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) {
		int is_snp = (rb4 != 15 && cns>>28 != rb4);
		int is_indel = (r && (r->gt == 2 || strcmp(r->s[r->gt], "*")));
		if (!is_snp && !is_indel) {
			if (r) bam_maqindel_ret_destroy(r);
			return 0;
		}
	}
	// print the first 3 columns
	printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
	// print consensus information if required
	if (d->format & BAM_PLF_CNS) {
		int ref_q = 0;
		if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP
			ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff);
			if (ref_q > 255) ref_q = 255;
		}
		rms_mapq = cns>>16&0xff;
		printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq);
	}
	// print pileup sequences
	printf("%d\t", n);
	rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line
	for (i = 0; i < n; ++i) {
		const bam_pileup1_t *p = pu + i;
		int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
		rms_aux += tmp * tmp;
		if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
		if (!p->is_del) {
			int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
			if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
			else c = bam1_strand(p->b)? tolower(c) : toupper(c);
			putchar(c);
			if (p->indel > 0) { // inserted bases come from the read
				printf("+%d", p->indel);
				for (j = 1; j <= p->indel; ++j) {
					c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
					putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
				}
			} else if (p->indel < 0) { // deleted bases come from the reference
				printf("%d", p->indel);
				for (j = 1; j <= -p->indel; ++j) {
					c = (d->ref && (int)pos+j < d->len)? (unsigned char)d->ref[pos+j] : 'N';
					putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
				}
			}
		} else putchar('*');
		if (p->is_tail) putchar('$');
	}
	// finalize rms_mapq; guard the division in case the callback is ever given n == 0
	rms_aux = n > 0? (uint64_t)(sqrt((double)rms_aux / n) + .499) : 0;
	if (rms_mapq < 0) rms_mapq = rms_aux;
	putchar('\t');
	// print quality
	for (i = 0; i < n; ++i) {
		const bam_pileup1_t *p = pu + i;
		int c = bam1_qual(p->b)[p->qpos] + 33;
		if (c > 126) c = 126;
		putchar(c);
	}
	if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities
		const unsigned char *q;
		putchar('\t');
		for (i = 0; i < n; ++i) {
			const bam_pileup1_t *p = pu + i;
			q = bam_aux_get(p->b, "E2");
			putchar(q? q[p->qpos + 1] : 'N');
		}
		putchar('\t');
		for (i = 0; i < n; ++i) {
			const bam_pileup1_t *p = pu + i;
			q = bam_aux_get(p->b, "U2");
			putchar(q? q[p->qpos + 1] : '!');
		}
	}
	// print mapping quality if -s is flagged on the command line
	if (d->format & BAM_PLF_SIMPLE) {
		putchar('\t');
		for (i = 0; i < n; ++i) {
			int c = pu[i].b->core.qual + 33;
			if (c > 126) c = 126;
			putchar(c);
		}
	}
	putchar('\n');
	// print the indel line if r has been calculated. This only happens if:
	// a) -c or -i are flagged, AND b) the reference sequence is available
	if (r) {
		printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1);
		if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]);
		else printf("%s/%s\t", r->s[0], r->s[1]);
		printf("%d\t%d\t", r->q_cns, r->q_ref);
		printf("%d\t%d\t", rms_mapq, n);
		printf("%s\t%s\t", r->s[0], r->s[1]);
		//printf("%d\t%d\t", r->gl[0], r->gl[1]);
		printf("%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_anti);
		bam_maqindel_ret_destroy(r);
	}
	return 0;
}

/*
 * Entry point of `samtools pileup`.
 *
 * Parses the command line, opens the input (BAM, or SAM with -S/-t), and
 * runs sampileup() with pileup_func() as the per-position callback.
 * Returns 0 on success and 1 on a usage or open error.  Every exit path
 * goes through the single cleanup section, so the option state allocated
 * at the top is released on errors as well.
 */
int bam_pileup(int argc, char *argv[])
{
	int c, is_SAM = 0, ret = 1, glf_opened = 0;
	char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
	samfile_t *fp = 0;
	pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
	if (d == 0) {
		fprintf(stderr, "[bam_pileup] insufficient memory.\n");
		return 1;
	}
	d->tid = -1; d->mask = BAM_DEF_MASK;
	d->c = bam_maqcns_init();
	d->ido = bam_maqindel_opt_init();
	while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2")) >= 0) {
		switch (c) {
		case 's': d->format |= BAM_PLF_SIMPLE; break;
		// free first so that a repeated option does not leak the earlier copy
		case 't': free(fn_list); fn_list = strdup(optarg); break;
		case 'l': free(fn_pos); fn_pos = strdup(optarg); break;
		case 'f': free(fn_fa); fn_fa = strdup(optarg); break;
		case 'T': d->c->theta = atof(optarg); break;
		case 'N': d->c->n_hap = atoi(optarg); break;
		case 'r': d->c->het_rate = atof(optarg); break;
		case 'M': d->c->cap_mapQ = atoi(optarg); break;
		case 'c': d->format |= BAM_PLF_CNS; break;
		case 'i': d->format |= BAM_PLF_INDEL_ONLY; break;
		case 'v': d->format |= BAM_PLF_VAR_ONLY; break;
		case 'm': d->mask = strtol(optarg, 0, 0); break;
		case 'g': d->format |= BAM_PLF_GLF; break;
		case '2': d->format |= BAM_PLF_2ND; break;
		case 'I': d->ido->q_indel = atoi(optarg); break;
		case 'G': d->ido->r_indel = atof(optarg); break;
		case 'S': is_SAM = 1; break;
		default:
			// getopt() returns '?' here; the offending letter is in optopt
			fprintf(stderr, "Unrecognized option '-%c'.\n", optopt);
			goto cleanup;
		}
	}
	if (fn_list) is_SAM = 1;
	if (optind == argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage: samtools pileup [options] <in.bam>|<in.sam>\n\n");
		fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n");
		fprintf(stderr, " -S the input is in SAM\n");
		fprintf(stderr, " -2 output the 2nd best call and quality\n");
		fprintf(stderr, " -i only show lines/consensus with indels\n");
		fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask);
		fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ);
		fprintf(stderr, " -t FILE list of reference sequences (assume the input is in SAM)\n");
		fprintf(stderr, " -l FILE list of sites at which pileup is output\n");
		fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n");
		fprintf(stderr, " -c output the maq consensus sequence\n");
		fprintf(stderr, " -v print variants only (for -c)\n");
		fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n");
		fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
		fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap);
		fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate);
		fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel);
		fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel);
		fprintf(stderr, "\n");
		goto cleanup;
	}
	if (fn_fa) d->fai = fai_load(fn_fa);
	if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling
	if (d->format & BAM_PLF_GLF) { // for glf output
		glf3_header_t *h;
		h = glf3_header_init();
		d->fp_glf = bgzf_fdopen(fileno(stdout), "w");
		glf_opened = 1;
		glf3_header_write(d->fp_glf, h);
		glf3_header_destroy(h);
	}
	if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)))
		fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n");
	fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0);
	if (fp == 0 || fp->header == 0) {
		// NOTE(review): a non-NULL fp with no header is not closed here because
		// samclose()'s behaviour on a header-less handle is not visible in this file.
		fprintf(stderr, "[bam_pileup] fail to read the header: non-exisiting file or wrong format.\n");
		goto cleanup;
	}
	d->h = fp->header;
	if (fn_pos) d->hash = load_pos(fn_pos, d->h);
	sampileup(fp, d->mask, pileup_func, d);
	samclose(fp); // d->h will be destroyed here
	ret = 0;

cleanup:
	if (glf_opened) bgzf_close(d->fp_glf);
	if (d->hash) { // free the hash table
		khint_t k;
		for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k)
			if (kh_exist(d->hash, k)) free(kh_val(d->hash, k));
		kh_destroy(64, d->hash);
	}
	free(fn_pos); free(fn_list); free(fn_fa);
	if (d->fai) fai_destroy(d->fai);
	bam_maqcns_destroy(d->c);
	free(d->ido); free(d->ref); free(d);
	return ret;
}
diff --git a/bam_rmdup.c b/bam_rmdup.c new file mode 100644 index 0000000..1fa6cad --- /dev/null +++ b/bam_rmdup.c @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include "bam.h" + +typedef bam1_t *bam1_p; +#include "khash.h" +KHASH_SET_INIT_STR(name) +KHASH_MAP_INIT_INT64(pos, bam1_p) + +#define BUFFER_SIZE 0x40000 + +typedef struct { + int n, max; + bam1_t **a; +} tmp_stack_t; + +static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) +{ + if (stack->n == stack->max) { + stack->max = stack->max?
stack->max<<1 : 0x10000; + stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); + } + stack->a[stack->n++] = b; +} + +static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out) +{ + int i; + for (i = 0; i != stack->n; ++i) { + bam_write1(out, stack->a[i]); + bam_destroy1(stack->a[i]); + } + stack->n = 0; + if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash); +} + +static void clear_del_set(khash_t(name) *del_set) +{ + khint_t k; + for (k = kh_begin(del_set); k < kh_end(del_set); ++k) + if (kh_exist(del_set, k)) + free((char*)kh_key(del_set, k)); + kh_clear(name, del_set); +} + +void bam_rmdup_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b; + int last_tid = -1, last_pos = -1; + uint64_t n_checked = 0, n_removed = 0; + tmp_stack_t stack; + khint_t k; + khash_t(pos) *best_hash; + khash_t(name) *del_set; + + best_hash = kh_init(pos); + del_set = kh_init(name); + b = bam_init1(); + memset(&stack, 0, sizeof(tmp_stack_t)); + header = bam_header_read(in); + bam_header_write(out, header); + + kh_resize(name, del_set, 4 * BUFFER_SIZE); + kh_resize(pos, best_hash, 3 * BUFFER_SIZE); + while (bam_read1(in, b) >= 0) { + bam1_core_t *c = &b->core; + if (c->tid != last_tid || last_pos != c->pos) { + dump_best(&stack, best_hash, out); // write the result + if (c->tid != last_tid) { + kh_clear(pos, best_hash); + if (kh_size(del_set)) { // check + fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); + clear_del_set(del_set); + } + if ((int)c->tid == -1) { // append unmapped reads + bam_write1(out, b); + while (bam_read1(in, b) >= 0) bam_write1(out, b); + break; + } + last_tid = c->tid; + fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]); + } + } + if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { + bam_write1(out, b); + } else if (c->isize > 0) { // paired, head + uint64_t 
key = (uint64_t)c->pos<<32 | c->isize; + int ret; + ++n_checked; + k = kh_put(pos, best_hash, key, &ret); + if (ret == 0) { // found in best_hash + bam1_t *p = kh_val(best_hash, k); + ++n_removed; + if (p->core.qual < c->qual) { // the current alignment is better + kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed + bam_copy1(p, b); // replaced as b + } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed + if (ret == 0) + fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b)); + } else { // not found in best_hash + kh_val(best_hash, k) = bam_dup1(b); + stack_insert(&stack, kh_val(best_hash, k)); + } + } else { // paired, tail + k = kh_get(name, del_set, bam1_qname(b)); + if (k != kh_end(del_set)) { + free((char*)kh_key(del_set, k)); + kh_del(name, del_set, k); + } else bam_write1(out, b); + } + last_pos = c->pos; + } + dump_best(&stack, best_hash, out); + + bam_header_destroy(header); + clear_del_set(del_set); + kh_destroy(name, del_set); + kh_destroy(pos, best_hash); + free(stack.a); + bam_destroy1(b); + fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked, + (double)n_removed/n_checked); +} +int bam_rmdup(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "Usage: samtools rmdup \n"); + return 1; + } + in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? 
bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + if (in == 0 || out == 0) { + fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); + return 1; + } + bam_rmdup_core(in, out); + bam_close(in); + bam_close(out); + return 0; +} diff --git a/bam_rmdupse.c b/bam_rmdupse.c new file mode 100644 index 0000000..df03717 --- /dev/null +++ b/bam_rmdupse.c @@ -0,0 +1,177 @@ +#include +#include "sam.h" +#include "khash.h" + +typedef struct { + int n, m; + int *a; +} listelem_t; + +KHASH_MAP_INIT_INT(32, listelem_t) + +#define BLOCK_SIZE 65536 + +typedef struct { + bam1_t *b; + int rpos, score; +} elem_t; + +typedef struct { + int n, max, x; + elem_t *buf; +} buffer_t; + +static int fill_buf(samfile_t *in, buffer_t *buf) +{ + int i, ret, last_tid, min_rpos = 0x7fffffff, capacity; + bam1_t *b = bam_init1(); + bam1_core_t *c = &b->core; + // squeeze out the empty cells at the beginning + for (i = 0; i < buf->n; ++i) + if (buf->buf[i].b) break; + if (i < buf->n) { // squeeze + if (i > 0) { + memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i)); + buf->n = buf->n - i; + } + } else buf->n = 0; + // calculate min_rpos + for (i = 0; i < buf->n; ++i) { + elem_t *e = buf->buf + i; + if (e->b && e->rpos >= 0 && e->rpos < min_rpos) + min_rpos = buf->buf[i].rpos; + } + // fill the buffer + buf->x = -1; + last_tid = buf->n? buf->buf[0].b->core.tid : -1; + capacity = buf->n + BLOCK_SIZE; + while ((ret = samread(in, b)) >= 0) { + elem_t *e; + uint8_t *qual = bam1_qual(b); + int is_mapped; + if (last_tid < 0) last_tid = c->tid; + if (c->tid != last_tid) { + if (buf->x < 0) buf->x = buf->n; + } + if (buf->n >= buf->max) { // enlarge + buf->max = buf->max? 
buf->max<<1 : 8; + buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max); + } + e = &buf->buf[buf->n++]; + e->b = bam_dup1(b); + e->rpos = -1; e->score = 0; + for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1; + e->score = (double)e->score / sqrt(c->l_qseq + 1); + is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1; + if (!is_mapped) e->score = -1; + if (is_mapped && (c->flag & BAM_FREVERSE)) { + e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b)); + if (min_rpos > e->rpos) min_rpos = e->rpos; + } + if (buf->n >= capacity) { + if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE; + else break; + } + } + if (ret >= 0 && buf->x < 0) buf->x = buf->n; + bam_destroy1(b); + return buf->n; +} + +static void rmdupse_buf(buffer_t *buf) +{ + khash_t(32) *h; + uint32_t key; + khint_t k; + int mpos, i, upper; + listelem_t *p; + mpos = 0x7fffffff; + mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff; + upper = (buf->x < 0)? 
buf->n : buf->x; + // fill the hash table + h = kh_init(32); + for (i = 0; i < upper; ++i) { + elem_t *e = buf->buf + i; + int ret; + if (e->score < 0) continue; + if (e->rpos >= 0) { + if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1; + else continue; + } else { + if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1; + else continue; + } + k = kh_put(32, h, key, &ret); + p = &kh_val(h, k); + if (ret == 0) { // present in the hash table + if (p->n == p->m) { + p->m <<= 1; + p->a = (int*)realloc(p->a, p->m * sizeof(int)); + } + p->a[p->n++] = i; + } else { + p->m = p->n = 1; + p->a = (int*)calloc(p->m, sizeof(int)); + p->a[0] = i; + } + } + // rmdup + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + int max, maxi; + p = &kh_val(h, k); + // get the max + for (i = max = 0, maxi = -1; i < p->n; ++i) { + if (buf->buf[p->a[i]].score > max) { + max = buf->buf[p->a[i]].score; + maxi = i; + } + } + // mark the elements + for (i = 0; i < p->n; ++i) { + buf->buf[p->a[i]].score = -1; + if (i != maxi) { + bam_destroy1(buf->buf[p->a[i]].b); + buf->buf[p->a[i]].b = 0; + } + } + // free + free(p->a); + } + } + kh_destroy(32, h); +} + +static void dump_buf(buffer_t *buf, samfile_t *out) +{ + int i; + for (i = 0; i < buf->n; ++i) { + elem_t *e = buf->buf + i; + if (e->score != -1) break; + if (e->b) { + samwrite(out, e->b); + bam_destroy1(e->b); + e->b = 0; + } + } +} + +int bam_rmdupse(int argc, char *argv[]) +{ + samfile_t *in, *out; + buffer_t *buf; + if (argc < 3) { + fprintf(stderr, "Usage: samtools rmdupse \n"); + return 1; + } + buf = calloc(1, sizeof(buffer_t)); + in = samopen(argv[1], "rb", 0); + out = samopen(argv[2], "wb", in->header); + while (fill_buf(in, buf)) { + rmdupse_buf(buf); + dump_buf(buf, out); + } + samclose(in); samclose(out); + free(buf->buf); free(buf); + return 0; +} diff --git a/bam_sort.c b/bam_sort.c new file mode 100644 index 0000000..402792a --- /dev/null +++ b/bam_sort.c @@ -0,0 +1,257 @@ +#include +#include +#include 
+#include +#include +#include +#include "bam.h" +#include "ksort.h" + +static int g_is_by_qname = 0; + +static inline int strnum_cmp(const char *a, const char *b) +{ + char *pa, *pb; + pa = (char*)a; pb = (char*)b; + while (*pa && *pb) { + if (isdigit(*pa) && isdigit(*pb)) { + long ai, bi; + ai = strtol(pa, &pa, 10); + bi = strtol(pb, &pb, 10); + if (ai != bi) return aibi? 1 : 0; + } else { + if (*pa != *pb) break; + ++pa; ++pb; + } + } + if (*pa == *pb) + return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; + return *pa<*pb? -1 : *pa>*pb? 1 : 0; +} + +#define HEAP_EMPTY 0xffffffffffffffffull + +typedef struct { + int i; + uint64_t pos; + bam1_t *b; +} heap1_t; + +static inline int heap_lt(const heap1_t a, const heap1_t b) +{ + if (g_is_by_qname) { + int t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); + return (t > 0 || (t == 0 && a.pos > b.pos)); + } else return (a.pos > b.pos); +} + +KSORT_INIT(heap, heap1_t, heap_lt) + +/*! + @abstract Merge multiple sorted BAM. + @param is_by_qname whether to sort by query name + @param out output BAM file name + @param n number of files to be merged + @param fn names of files to be merged + + @discussion Padding information may NOT correctly maintained. This + function is NOT thread safe. + */ +void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) +{ + bamFile fpout, *fp; + heap1_t *heap; + bam_header_t *hout = 0; + int i, j; + + g_is_by_qname = by_qname; + fp = (bamFile*)calloc(n, sizeof(bamFile)); + heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + for (i = 0; i != n; ++i) { + heap1_t *h; + bam_header_t *hin; + assert(fp[i] = bam_open(fn[i], "r")); + hin = bam_header_read(fp[i]); + if (i == 0) hout = hin; + else { // validate multiple baf + if (hout->n_targets != hin->n_targets) { + fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. 
Abort!\n", fn[i]); + exit(1); + } + for (j = 0; j < hout->n_targets; ++j) { + if (strcmp(hout->target_name[j], hin->target_name[j])) { + fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n", + hout->target_name[j], hin->target_name[j], fn[i]); + exit(1); + } + if (hout->target_len[j] != hin->target_len[j]) + fprintf(stderr, "[bam_merge_core] different target sequence length: %d != %d in file '%s'. Continue.\n", + hout->target_len[j], hin->target_len[j], fn[i]); + } + bam_header_destroy(hin); + } + h = heap + i; + h->i = i; + h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); + if (bam_read1(fp[i], h->b) >= 0) + h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); + else h->pos = HEAP_EMPTY; + } + fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); + assert(fpout); + bam_header_write(fpout, hout); + bam_header_destroy(hout); + + ks_heapmake(heap, n, heap); + while (heap->pos != HEAP_EMPTY) { + bam1_t *b = heap->b; + bam_write1_core(fpout, &b->core, b->data_len, b->data); + if ((j = bam_read1(fp[heap->i], b)) >= 0) + heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); + else if (j == -1) heap->pos = HEAP_EMPTY; + else fprintf(stderr, "[bam_merge_core] '%s' is truncated. 
Continue anyway.\n", fn[heap->i]); + ks_heapadjust(heap, 0, n, heap); + } + + for (i = 0; i != n; ++i) { + bam_close(fp[i]); + free(heap[i].b->data); + free(heap[i].b); + } + bam_close(fpout); + free(fp); free(heap); +} +int bam_merge(int argc, char *argv[]) +{ + int c, is_by_qname = 0; + while ((c = getopt(argc, argv, "n")) >= 0) { + switch (c) { + case 'n': is_by_qname = 1; break; + } + } + if (optind + 2 >= argc) { + fprintf(stderr, "Usage: samtools merge [-n] [...]\n"); + return 1; + } + bam_merge_core(is_by_qname, argv[optind], argc - optind - 1, argv + optind + 1); + return 0; +} + +typedef bam1_t *bam1_p; + +static inline int bam1_lt(const bam1_p a, const bam1_p b) +{ + if (g_is_by_qname) { + int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); + return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)))); + } else return (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)); +} +KSORT_INIT(sort, bam1_p, bam1_lt) + +static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h) +{ + char *name; + int i; + bamFile fp; + ks_mergesort(sort, k, buf, 0); + name = (char*)calloc(strlen(prefix) + 20, 1); + if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); + else sprintf(name, "%s.bam", prefix); + assert(fp = bam_open(name, "w")); + free(name); + bam_header_write(fp, h); + for (i = 0; i < k; ++i) + bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data); + bam_close(fp); +} + +/*! + @abstract Sort an unsorted BAM file based on the chromosome order + and the leftmost position of an alignment + + @param is_by_qname whether to sort by query name + @param fn name of the file to be sorted + @param prefix prefix of the output and the temporary files; upon + sucessess, prefix.bam will be written. 
+ @param max_mem approxiate maximum memory (very inaccurate) + + @discussion It may create multiple temporary subalignment files + and then merge them by calling bam_merge_core(). This function is + NOT thread safe. + */ +void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) +{ + int n, ret, k, i; + size_t mem; + bam_header_t *header; + bamFile fp; + bam1_t *b, **buf; + + g_is_by_qname = is_by_qname; + n = k = 0; mem = 0; + fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); + // write sub files + for (;;) { + if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); + b = buf[k]; + if ((ret = bam_read1(fp, b)) < 0) break; + mem += ret; + ++k; + if (mem >= max_mem) { + sort_blocks(n++, k, buf, prefix, header); + mem = 0; k = 0; + } + } + if (ret != -1) + fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); + if (n == 0) sort_blocks(-1, k, buf, prefix, header); + else { // then merge + char **fns, *fnout; + fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); + sort_blocks(n++, k, buf, prefix, header); + fnout = (char*)calloc(strlen(prefix) + 20, 1); + sprintf(fnout, "%s.bam", prefix); + fns = (char**)calloc(n, sizeof(char*)); + for (i = 0; i < n; ++i) { + fns[i] = (char*)calloc(strlen(prefix) + 20, 1); + sprintf(fns[i], "%s.%.4d.bam", prefix, i); + } + bam_merge_core(is_by_qname, fnout, n, fns); + free(fnout); + for (i = 0; i < n; ++i) { + unlink(fns[i]); + free(fns[i]); + } + free(fns); + } + for (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) { + if (buf[k]) { + free(buf[k]->data); + free(buf[k]); + } + } + free(buf); + bam_header_destroy(header); + bam_close(fp); +} + +int bam_sort(int argc, char *argv[]) +{ + size_t max_mem = 500000000; + int c, is_by_qname = 0; + while ((c = getopt(argc, argv, "nm:")) >= 0) { + switch (c) { + case 'n': is_by_qname = 1; break; + 
case 'm': max_mem = atol(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: samtools sort [-n] [-m ] \n"); + return 1; + } + bam_sort_core(is_by_qname, argv[optind], argv[optind+1], max_mem); + return 0; +} diff --git a/bam_stat.c b/bam_stat.c new file mode 100644 index 0000000..c1c4a43 --- /dev/null +++ b/bam_stat.c @@ -0,0 +1,78 @@ +#include +#include +#include "bam.h" + +typedef struct { + long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; + long long n_sgltn, n_read1, n_read2; + long long n_qcfail, n_dup; + long long n_diffchr, n_diffhigh; +} bam_flagstat_t; + +#define flagstat_loop(s, c) do { \ + ++(s)->n_reads; \ + if ((c)->flag & BAM_FPAIRED) { \ + ++(s)->n_pair_all; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ + if ((c)->flag & BAM_FMUNMAP) ++(s)->n_sgltn; \ + if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ + ++(s)->n_pair_map; \ + if ((c)->mtid != (c)->tid) { \ + ++(s)->n_diffchr; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh; \ + } \ + } \ + } \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ + if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ + } while (0) + +bam_flagstat_t *bam_flagstat_core(bamFile fp) +{ + bam_flagstat_t *s; + bam1_t *b; + bam1_core_t *c; + int ret; + s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); + b = bam_init1(); + c = &b->core; + while ((ret = bam_read1(fp, b)) >= 0) + flagstat_loop(s, c); + bam_destroy1(b); + if (ret != -1) + fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + return s; +} +int bam_flagstat(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *header; + bam_flagstat_t *s; + if (argc == optind) { + fprintf(stderr, "Usage: samtools flagstat \n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + s = bam_flagstat_core(fp); + printf("%lld in total\n", s->n_reads); + printf("%lld QC failure\n", s->n_qcfail); + printf("%lld duplicates\n", s->n_dup); + printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); + printf("%lld paired in sequencing\n", s->n_pair_all); + printf("%lld read1\n", s->n_read1); + printf("%lld read2\n", s->n_read2); + printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); + printf("%lld with itself and mate mapped\n", s->n_pair_map); + printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); + printf("%lld with mate mapped to a different chr\n", s->n_diffchr); + printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); + free(s); + bam_header_destroy(header); + bam_close(fp); + return 0; +} diff --git a/bam_tview.c b/bam_tview.c new file mode 100644 index 0000000..be2579c --- /dev/null +++ b/bam_tview.c @@ -0,0 +1,379 @@ +#ifndef _NO_CURSES +#include +#ifdef NCURSES_VERSION +#include +#include +#include +#include "bam.h" +#include "faidx.h" +#include "bam_maqcns.h" + +char bam_aux_getCEi(bam1_t *b, int i); +char bam_aux_getCSi(bam1_t *b, int i); +char bam_aux_getCQi(bam1_t *b, int i); + +#define TV_MIN_ALNROW 2 +#define TV_MAX_GOTO 40 +#define TV_LOW_MAPQ 10 + +#define TV_COLOR_MAPQ 0 +#define TV_COLOR_BASEQ 1 +#define TV_COLOR_NUCL 2 +#define TV_COLOR_COL 3 +#define TV_COLOR_COLQ 4 + +#define TV_BASE_NUCL 0 +#define TV_BASE_COLOR_SPACE 1 + +typedef struct { + int mrow, mcol; + WINDOW *wgoto, *whelp; + + bam_index_t *idx; + bam_lplbuf_t *lplbuf; + bam_header_t *header; + bamFile fp; + int curr_tid, left_pos; + faidx_t *fai; + bam_maqcns_t *bmc; + + int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins; + char *ref; +} tview_t; + +int tv_pl_func(uint32_t tid, uint32_t pos, int n, 
const bam_pileup1_t *pl, void *data) +{ + tview_t *tv = (tview_t*)data; + int i, j, c, rb, attr, max_ins = 0; + uint32_t call = 0; + if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen + // print referece + rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; + for (i = tv->last_pos + 1; i < pos; ++i) { + if (i%10 == 0) mvprintw(0, tv->ccol, "%-d", i+1); + c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; + mvaddch(1, tv->ccol++, c); + } + if (pos%10 == 0) mvprintw(0, tv->ccol, "%-d", pos+1); + // print consensus + call = bam_maqcns_call(n, pl, tv->bmc); + attr = A_UNDERLINE; + c = ",ACMGRSVTWYHKDBN"[call>>28&0xf]; + i = (call>>8&0xff)/10+1; + if (i > 4) i = 4; + attr |= COLOR_PAIR(i); + if (c == toupper(rb)) c = '.'; + attron(attr); + mvaddch(2, tv->ccol, c); + attroff(attr); + if(tv->ins) { + // calculate maximum insert + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel; + } + } + // core loop + for (j = 0; j <= max_ins; ++j) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + int row = TV_MIN_ALNROW + p->level - tv->row_shift; + if (j == 0) { + if (!p->is_del) { + if (tv->base_for == TV_BASE_COLOR_SPACE && + (c = bam_aux_getCSi(p->b, p->qpos))) { + c = bam_aux_getCSi(p->b, p->qpos); + // assume that if we found one color, we will be able to get the color error + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.'; + } + else { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + } + } else c = '*'; + } else { // padding + if (j > p->indel) c = '*'; + else { // insertion + if (tv->base_for == TV_BASE_NUCL) { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? 
',' : '.'; + } + else { + c = bam_aux_getCSi(p->b, p->qpos + j); + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.'; + } + } + } + if (row > TV_MIN_ALNROW && row < tv->mrow) { + int x; + attr = 0; + if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) + || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE; + if (tv->color_for == TV_COLOR_BASEQ) { + x = bam1_qual(p->b)[p->qpos]/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } else if (tv->color_for == TV_COLOR_MAPQ) { + x = p->b->core.qual/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } else if (tv->color_for == TV_COLOR_NUCL) { + x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5; + attr |= COLOR_PAIR(x); + } else if(tv->color_for == TV_COLOR_COL) { + x = 0; + switch(bam_aux_getCSi(p->b, p->qpos)) { + case '0': x = 0; break; + case '1': x = 1; break; + case '2': x = 2; break; + case '3': x = 3; break; + case '4': x = 4; break; + default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break; + } + x+=5; + attr |= COLOR_PAIR(x); + } else if(tv->color_for == TV_COLOR_COLQ) { + x = bam_aux_getCQi(p->b, p->qpos); + if(0 == x) x = bam1_qual(p->b)[p->qpos]; + x = x/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } + attron(attr); + mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); + attroff(attr); + } + } + c = j? 
'*' : rb; + if (c == '*') { + attr = COLOR_PAIR(8); + attron(attr); + mvaddch(1, tv->ccol++, c); + attroff(attr); + } else mvaddch(1, tv->ccol++, c); + } + tv->last_pos = pos; + return 0; +} + +tview_t *tv_init(const char *fn, const char *fn_fa) +{ + tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t)); + tv->is_dot = 1; + tv->idx = bam_index_load(fn); + if (tv->idx == 0) exit(1); + tv->fp = bam_open(fn, "r"); + bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); + assert(tv->fp); + tv->header = bam_header_read(tv->fp); + tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); + if (fn_fa) tv->fai = fai_load(fn_fa); + tv->bmc = bam_maqcns_init(); + tv->ins = 1; + bam_maqcns_prepare(tv->bmc); + + initscr(); + keypad(stdscr, TRUE); + clear(); + noecho(); + cbreak(); +#ifdef NCURSES_VERSION + getmaxyx(stdscr, tv->mrow, tv->mcol); +#else + tv->mrow = 80; tv->mcol = 40; +#endif + tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); + tv->whelp = newwin(27, 40, 5, 5); + tv->color_for = TV_COLOR_MAPQ; + start_color(); + init_pair(1, COLOR_BLUE, COLOR_BLACK); + init_pair(2, COLOR_GREEN, COLOR_BLACK); + init_pair(3, COLOR_YELLOW, COLOR_BLACK); + init_pair(4, COLOR_WHITE, COLOR_BLACK); + init_pair(5, COLOR_GREEN, COLOR_BLACK); + init_pair(6, COLOR_CYAN, COLOR_BLACK); + init_pair(7, COLOR_YELLOW, COLOR_BLACK); + init_pair(8, COLOR_RED, COLOR_BLACK); + init_pair(9, COLOR_BLUE, COLOR_BLACK); + return tv; +} + +void tv_destroy(tview_t *tv) +{ + delwin(tv->wgoto); delwin(tv->whelp); + endwin(); + + bam_lplbuf_destroy(tv->lplbuf); + bam_maqcns_destroy(tv->bmc); + bam_index_destroy(tv->idx); + if (tv->fai) fai_destroy(tv->fai); + free(tv->ref); + bam_header_destroy(tv->header); + bam_close(tv->fp); + free(tv); +} + +int tv_fetch_func(const bam1_t *b, void *data) +{ + tview_t *tv = (tview_t*)data; + bam_lplbuf_push(b, tv->lplbuf); + return 0; +} + +int tv_draw_aln(tview_t *tv, int tid, int pos) +{ + // reset + clear(); + tv->curr_tid = tid; tv->left_pos = pos; + tv->last_pos = tv->left_pos - 1; + tv->ccol 
= 0; + // print ref and consensus + if (tv->fai) { + char *str; + if (tv->ref) free(tv->ref); + str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); + sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); + tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); + free(str); + } + // draw aln + bam_lplbuf_reset(tv->lplbuf); + bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func); + bam_lplbuf_push(0, tv->lplbuf); + return 0; +} + +static void tv_win_goto(tview_t *tv, int *tid, int *pos) +{ + char str[256]; + int i, l = 0; + wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(tv->wgoto, 1, 2, "Goto: "); + for (;;) { + int c = wgetch(tv->wgoto); + wrefresh(tv->wgoto); + if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { + --l; + } else if (c == KEY_ENTER || c == '\012' || c == '\015') { + int _tid = -1, _beg, _end; + bam_parse_region(tv->header, str, &_tid, &_beg, &_end); + if (_tid >= 0) { + *tid = _tid; *pos = _beg; + return; + } + } else if (isgraph(c)) { + if (l < TV_MAX_GOTO) str[l++] = c; + } else if (c == '\027') l = 0; + else if (c == '\033') return; + str[l] = '\0'; + for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); + mvwprintw(tv->wgoto, 1, 8, "%s", str); + } +} + +static void tv_win_help(tview_t *tv) { + int r = 1; + WINDOW *win = tv->whelp; + wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(win, r++, 2, " -=- Help -=- "); + r++; + mvwprintw(win, r++, 2, "? 
This window"); + mvwprintw(win, r++, 2, "Arrows Small scroll movement"); + mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); + mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); + mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); + mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); + mvwprintw(win, r++, 2, "space Scroll one screen"); + mvwprintw(win, r++, 2, "backspace Scroll back one screen"); + mvwprintw(win, r++, 2, "g Go to specific location"); + mvwprintw(win, r++, 2, "m Color for mapping qual"); + mvwprintw(win, r++, 2, "n Color for nucleotide"); + mvwprintw(win, r++, 2, "b Color for base quality"); + mvwprintw(win, r++, 2, "c Color for cs color"); + mvwprintw(win, r++, 2, "z Color for cs qual"); + mvwprintw(win, r++, 2, ". Toggle on/off dot view"); + mvwprintw(win, r++, 2, "N Turn on nt view"); + mvwprintw(win, r++, 2, "C Turn on cs view"); + mvwprintw(win, r++, 2, "i Toggle on/off ins"); + mvwprintw(win, r++, 2, "q Exit"); + r++; + mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); + mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); + mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); + wrefresh(win); + wgetch(win); +} + +void tv_loop(tview_t *tv) +{ + int tid, pos; + tid = tv->curr_tid; pos = tv->left_pos; + while (1) { + int c = getch(); + //if(256 < c) {c = 1 + (c%256);} // Terminal was displaying ctrl-H as 263 via ssh from Mac OS X 10.5 computer + switch (c) { + case '?': tv_win_help(tv); break; + case '\033': + case 'q': goto end_loop; + case 'g': tv_win_goto(tv, &tid, &pos); break; + case 'm': tv->color_for = TV_COLOR_MAPQ; break; + case 'b': tv->color_for = TV_COLOR_BASEQ; break; + case 'n': tv->color_for = TV_COLOR_NUCL; break; + case 'c': tv->color_for = TV_COLOR_COL; break; + case 'z': tv->color_for = TV_COLOR_COLQ; break; + case KEY_LEFT: + case 'h': --pos; break; + case KEY_RIGHT: + case 'l': ++pos; break; + case KEY_SLEFT: + case 'H': pos -= 20; break; + case KEY_SRIGHT: + case 'L': pos += 20; break; + case '.': tv->is_dot = 
!tv->is_dot; break; + case 'N': tv->base_for = TV_BASE_NUCL; break; + case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; + case 'i': tv->ins = !tv->ins; break; + case '\010': pos -= 1000; break; + case '\014': pos += 1000; break; + case ' ': pos += tv->mcol; break; + case KEY_UP: + case 'j': --tv->row_shift; break; + case KEY_DOWN: + case 'k': ++tv->row_shift; break; + case KEY_BACKSPACE: + case '\177': pos -= tv->mcol; break; +#ifdef KEY_RESIZE + case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; +#endif + default: continue; + } + if (pos < 0) pos = 0; + if (tv->row_shift < 0) tv->row_shift = 0; + tv_draw_aln(tv, tid, pos); + } +end_loop: + return; +} + +int bam_tview_main(int argc, char *argv[]) +{ + tview_t *tv; + if (argc == 1) { + fprintf(stderr, "Usage: bamtk tview [ref.fasta]\n"); + return 1; + } + tv = tv_init(argv[1], (argc == 2)? 0 : argv[2]); + tv_draw_aln(tv, 0, 0); + tv_loop(tv); + tv_destroy(tv); + return 0; +} +#else // #ifdef NCURSES_VERSION +#warning "The ncurses library is unavailable; tview is disabled." 
+int bam_tview_main(int argc, char *argv[]) +{ + fprintf(stderr, "[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n"); + return 1; +} +#endif +#endif // #ifndef _NO_CURSES diff --git a/bamtk.c b/bamtk.c new file mode 100644 index 0000000..3386836 --- /dev/null +++ b/bamtk.c @@ -0,0 +1,118 @@ +#include +#include +#include +#include "bam.h" + +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "0.1.5c (r385)" +#endif + +int bam_taf2baf(int argc, char *argv[]); +int bam_pileup(int argc, char *argv[]); +int bam_merge(int argc, char *argv[]); +int bam_index(int argc, char *argv[]); +int bam_sort(int argc, char *argv[]); +int bam_tview_main(int argc, char *argv[]); +int bam_mating(int argc, char *argv[]); +int bam_rmdup(int argc, char *argv[]); +int bam_rmdupse(int argc, char *argv[]); +int bam_flagstat(int argc, char *argv[]); +int bam_fillmd(int argc, char *argv[]); + +int main_samview(int argc, char *argv[]); +int main_import(int argc, char *argv[]); + +int faidx_main(int argc, char *argv[]); +int glf3_view_main(int argc, char *argv[]); + +int bam_tagview(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *header; + bam1_t *b; + char tag[2]; + int ret; + if (argc < 3) { + fprintf(stderr, "Usage: samtools tagview \n"); + return 1; + } + fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + if (header == 0) { + fprintf(stderr, "[bam_view] fail to read the BAM header. 
Abort!\n"); + return 1; + } + tag[0] = argv[2][0]; tag[1] = argv[2][1]; + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + while ((ret = bam_read1(fp, b)) >= 0) { + uint8_t *d = bam_aux_get(b, tag); + if (d) { + printf("%s\t%d\t", bam1_qname(b), b->core.flag); + if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d)); + else if (d[0] == 'f') printf("%f\n", bam_aux2f(d)); + else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d)); + else if (d[0] == 'A') printf("%c\n", bam_aux2A(d)); + else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d)); + else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d)); + else printf("\n"); + } + } + if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. (%d)\n", ret); + free(b->data); free(b); + bam_header_destroy(header); + bam_close(fp); + return 0; +} + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); + fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: samtools [options]\n\n"); + fprintf(stderr, "Command: import import from SAM (obsolete; use `view')\n"); + fprintf(stderr, " view export to the text format\n"); + fprintf(stderr, " sort sort alignment file\n"); + fprintf(stderr, " merge merge multiple sorted alignment files\n"); + fprintf(stderr, " pileup generate pileup output\n"); + fprintf(stderr, " faidx index/extract FASTA\n"); +#ifndef _NO_CURSES + fprintf(stderr, " tview text alignment viewer\n"); +#endif + fprintf(stderr, " index index alignment\n"); + fprintf(stderr, " fixmate fix mate information\n"); + fprintf(stderr, " rmdup remove PCR duplicates\n"); + fprintf(stderr, " glfview print GLFv3 file\n"); + fprintf(stderr, " flagstat simple stats\n"); + fprintf(stderr, " fillmd fill the MD tag and change identical base to =\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) return usage(); + if 
(strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); + else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); + else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); + else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); + else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); + else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); + else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); + else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); + else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1); + else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); + else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); + else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); + else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); +#ifndef _NO_CURSES + else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); +#endif + else { + fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); + return 1; + } + return 0; +} diff --git a/bgzf.c b/bgzf.c new file mode 100644 index 0000000..fe4e31d --- /dev/null +++ b/bgzf.c @@ -0,0 +1,634 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. + 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. 
+ 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); + +typedef int8_t byte; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +inline +int +min(int x, int y) +{ + return (x < y) ? 
x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); + return fp; +} + +static +BGZF* +open_read(int fd) +{ +#ifdef _USE_KNETFILE + knetFile *file = knet_dopen(fd, "r"); +#else + FILE* file = fdopen(fd, "r"); +#endif + BGZF* fp; + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = fd; + fp->open_mode = 'r'; +#ifdef _USE_KNETFILE + fp->x.fpr = file; +#else + fp->file = file; +#endif + return fp; +} + +static +BGZF* +open_write(int fd, bool is_uncompressed) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp; + if (file == 0) return 0; + fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; +#ifdef _USE_KNETFILE + fp->x.fpw = file; +#else + fp->file = file; +#endif + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. 
*/ +#ifdef _USE_KNETFILE + knetFile *file = knet_open(path, mode); + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = -1; + fp->open_mode = 'r'; + fp->x.fpr = file; +#else + int oflag = O_RDONLY; + int fd = open(path, oflag); + if (fd == -1) return 0; + fp = open_read(fd); +#endif + } else if (mode[0] == 'w' || mode[0] == 'W') { + int oflag = O_WRONLY | O_CREAT | O_TRUNC; + int fd = open(path, oflag, 0644); + if (fd == -1) return 0; + fp = open_write(fd, strstr(mode, "u")? 1 : 0); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (fd == -1) return 0; + if (mode[0] == 'r' || mode[0] == 'R') { + return open_read(fd); + } else if (mode[0] == 'w' || mode[0] == 'W') { + return open_write(fd, strstr(mode, "u")? 1 : 0); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + byte* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + int compress_level = fp->is_uncompressed? 
0 : Z_DEFAULT_COMPRESSION; + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* 
fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const byte* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (byte) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + 
khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + +static +int +read_block(BGZF* fp) +{ + byte header[BLOCK_HEADER_LENGTH]; + int size = 0; +#ifdef _USE_KNETFILE + int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; + int count = knet_read(fp->x.fpr, header, sizeof(header)); +#else + int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; + int count = fread(header, 1, sizeof(header), fp->file); +#endif + if (count == 0) { + fp->block_length = 0; + return 0; + } + size = count; + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + byte* compressed_block = (byte*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; +#ifdef _USE_KNETFILE + count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); +#else + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); +#endif + if (count != remaining) { + report_error(fp, "read failed"); + 
return -1; + } + size += count; + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + byte* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = min(length-bytes_read, available); + byte* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } +#ifdef _USE_KNETFILE + int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + return -1; + } + + if (fp->uncompressed_block == NULL) { + 
fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const byte* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = min(block_length - fp->block_offset, length - bytes_written); + byte* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } +#ifdef _USE_KNETFILE + if (fflush(fp->x.fpw) != 0) { +#else + if (fflush(fp->file) != 0) { +#endif + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { +#ifdef _USE_KNETFILE + int ret; + if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); + else ret = knet_close(fp->x.fpr); + if (ret != 0) return -1; +#else + if (fclose(fp->file) != 0) { + return -1; + } +#endif + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +int64_t +bgzf_tell(BGZF* fp) +{ + return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { +#else + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { +#endif + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates 
current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} + diff --git a/bgzf.h b/bgzf.h new file mode 100644 index 0000000..d5eeafe --- /dev/null +++ b/bgzf.h @@ -0,0 +1,120 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include +#include +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. 
+ */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpretation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +int64_t bgzf_tell(BGZF* fp); + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. + */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bgzip.c b/bgzip.c new file mode 100644 index 0000000..c58d55d --- /dev/null +++ b/bgzip.c @@ -0,0 +1,166 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality.
+ */ +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static int bgzip_main_usage() +{ + printf("\n"); + printf("Usage: bgzip [options] [file] ...\n\n"); + printf("Options: -c write on standard output, keep original files unchanged\n"); + printf(" -d decompress\n"); + // printf(" -l list compressed file contents\n"); + printf(" -b INT decompress at virtual file pointer INT\n"); + printf(" -s INT decompress INT bytes in the uncompressed file\n"); + printf(" -h give this help\n"); + printf("\n"); + return 0; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644)) < 0 && errno == EEXIST) { + printf("bgzip: %s already exists; do you wish to overwrite (y or n)? ", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + printf("bgzip: not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) { + fprintf(stderr, "bgzip: %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +static +void +fail(BGZF* fp) +{ + printf("Error: %s\n", fp->error); + exit(1); +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + BGZF *rz; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ + switch(c){ + case 'h': return bgzip_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + // case 'l': compress = 2; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if(end >= 0 && end < start){ + fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); + return 1; + } + if(compress == 1){ + int f_src, f_dst = -1; + if(argc > optind){ + 
if((f_src = open(argv[optind], O_RDONLY)) < 0){ + fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); + return 1; + } + if(pstdout){ + f_dst = fileno(stdout); + } else { + char *name = malloc(strlen(argv[optind]) + 5); /* input name + ".gz" + NUL; sizeof() of the expression only yielded sizeof(size_t) bytes */ + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } else if(pstdout){ + f_src = fileno(stdin); + f_dst = fileno(stdout); + } else return bgzip_main_usage(); + rz = bgzf_fdopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) { + if (bgzf_write(rz, buffer, c) < 0) { + fail(rz); + } + } + // f_dst will be closed here + if (bgzf_close(rz) < 0) { + fail(rz); + } + if (argc > optind && !pstdout) unlink(argv[optind]); /* -c promises to keep the original file, as the decompress branch already does */ + free(buffer); + close(f_src); + return 0; + } else { + if(argc <= optind) return bgzip_main_usage(); + int f_dst; + if (argc > optind && !pstdout) { + char *name; + if (strlen(argv[optind]) < 3 || strcmp(argv[optind] + strlen(argv[optind]) - 3, ".gz") != 0) { /* test the real suffix; strstr() matched the first ".gz" and could return NULL */ + printf("bgzip: %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } else f_dst = fileno(stdout); + rz = bgzf_open(argv[optind], "r"); + if (rz == NULL) { + printf("Could not open file: %s\n", argv[optind]); + return 1; + } + buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(rz, start, SEEK_SET) < 0) { + fail(rz); + } + while(1){ + if(end < 0) c = bgzf_read(rz, buffer, WINDOW_SIZE); + else c = bgzf_read(rz, buffer, (end - start > WINDOW_SIZE)?
WINDOW_SIZE:(end - start)); + if(c == 0) break; + if (c < 0) fail(rz); + start += c; + write(f_dst, buffer, c); + if(end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(rz) < 0) { + fail(rz); + } + if (!pstdout) unlink(argv[optind]); + return 0; + } +} + diff --git a/examples/00README.txt b/examples/00README.txt new file mode 100644 index 0000000..dbb276f --- /dev/null +++ b/examples/00README.txt @@ -0,0 +1,23 @@ +File ex1.fa contains two sequences cut from the human genome +build36. They were extracted with command: + + samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 + +Sequence names were changed manually for simplicity. File ex1.sam.gz +contains MAQ alignments extracted with: + + (samtools view NA18507_maq.bam 2:2044001-2045500; + samtools view NA18507_maq.bam 20:68001-69500) + +and processed with `samtools fixmate' to make it self-consistent as a +standalone alignment. + +To try samtools, you may run the following commands: + + samtools faidx ex1.fa # index the reference FASTA + samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM + samtools index ex1.bam # index BAM + samtools tview ex1.bam ex1.fa # view alignment + samtools pileup -cf ex1.fa ex1.bam # pileup and consensus + samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz + diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 0000000..3fe3e5a --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,27 @@ +all:../libbam.a ../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz calDepth + @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo; + +ex1.fa.fai:ex1.fa + ../samtools faidx ex1.fa +ex1.bam:ex1.sam.gz ex1.fa.fai + ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam +ex1.bam.bai:ex1.bam + ../samtools index ex1.bam +ex1.pileup.gz:ex1.bam ex1.fa + ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz +ex1.glf:ex1.bam ex1.fa + ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf +ex1.glfview.gz:ex1.glf +
../samtools glfview ex1.glf | gzip > ex1.glfview.gz + +../samtools: + (cd ..; make samtools) + +../libbam.a: + (cd ..; make libbam.a) + +calDepth:../libbam.a calDepth.c + gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam + +clean: + rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM \ No newline at end of file diff --git a/examples/calDepth.c b/examples/calDepth.c new file mode 100644 index 0000000..7a3239c --- /dev/null +++ b/examples/calDepth.c @@ -0,0 +1,62 @@ +#include +#include "sam.h" + +typedef struct { + int beg, end; + samfile_t *in; +} tmpstruct_t; + +// callback for bam_fetch() +static int fetch_func(const bam1_t *b, void *data) +{ + bam_plbuf_t *buf = (bam_plbuf_t*)data; + bam_plbuf_push(b, buf); + return 0; +} +// callback for bam_plbuf_init() +static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + tmpstruct_t *tmp = (tmpstruct_t*)data; + if ((int)pos >= tmp->beg && (int)pos < tmp->end) + printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n); + return 0; +} + +int main(int argc, char *argv[]) +{ + tmpstruct_t tmp; + if (argc == 1) { + fprintf(stderr, "Usage: calDepth [region]\n"); + return 1; + } + tmp.beg = 0; tmp.end = 0x7fffffff; + tmp.in = samopen(argv[1], "rb", 0); + if (tmp.in == 0) { + fprintf(stderr, "Fail to open BAM file %s\n", argv[1]); + return 1; + } + if (argc == 2) { // if a region is not specified + sampileup(tmp.in, -1, pileup_func, &tmp); + } else { + int ref; + bam_index_t *idx; + bam_plbuf_t *buf; + idx = bam_index_load(argv[1]); // load BAM index + if (idx == 0) { + fprintf(stderr, "BAM indexing file is not available.\n"); + return 1; + } + bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region + if (ref < 0) { + fprintf(stderr, "Invalid region %s\n", argv[2]); + return 1; + } + buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup + bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func); + 
bam_plbuf_push(0, buf); // finalize pileup + bam_index_destroy(idx); + bam_plbuf_destroy(buf); + } + samclose(tmp.in); + return 0; +} diff --git a/examples/ex1.fa b/examples/ex1.fa new file mode 100644 index 0000000..ef611b4 --- /dev/null +++ b/examples/ex1.fa @@ -0,0 +1,56 @@ +>seq1 +CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT +GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC +GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG +TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC +AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA +CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC +AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT +CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA +ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC +AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC +AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC +ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC +CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT +TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT +TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT +GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT +ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA +ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG +TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA +CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG +TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC +TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC +TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG +TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG +AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA +TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC +TCCCTCGTCTTCTTA +>seq2 +TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG 
+CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT +TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT +CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA +AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT +AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC +ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG +GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT +CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT +TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA +AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA +ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT +TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA +AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC +TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA +GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT +AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA +AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT +AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT +AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT +ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT +GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG +CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA +GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA +AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA +TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC +CAGAAAAAAATATTTACAGTAACT diff --git a/examples/ex1.sam.gz b/examples/ex1.sam.gz new file mode 100644 index 0000000..44c07ee Binary files /dev/null and b/examples/ex1.sam.gz differ diff --git a/faidx.c b/faidx.c new file mode 100644 index 0000000..36366c2 --- /dev/null +++ b/faidx.c @@ -0,0 +1,311 @@ +#include +#include +#include +#include +#include "faidx.h" +#include "khash.h" + +typedef struct { + uint64_t len:32, line_len:16, line_blen:16; + 
uint64_t offset; +} faidx1_t; +KHASH_MAP_INIT_STR(s, faidx1_t) + +#ifndef _NO_RAZF +#include "razf.h" +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#define RAZF FILE +#define razf_read(fp, buf, size) fread(buf, 1, size, fp) +#define razf_open(fn, mode) fopen(fn, mode) +#define razf_close(fp) fclose(fp) +#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) +#define razf_tell(fp) ftello(fp) +#endif + +struct __faidx_t { + RAZF *rz; + int n, m; + char **name; + khash_t(s) *hash; +}; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) +{ + khint_t k; + int ret; + faidx1_t t; + if (idx->n == idx->m) { + idx->m = idx->m? idx->m<<1 : 16; + idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); + } + idx->name[idx->n] = strdup(name); + k = kh_put(s, idx->hash, idx->name[idx->n], &ret); + t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; + kh_value(idx->hash, k) = t; + ++idx->n; +} + +faidx_t *fai_build_core(RAZF *rz) +{ + char c, *name; + int l_name, m_name, ret; + int len, line_len, line_blen, state; + int l1, l2; + faidx_t *idx; + uint64_t offset; + + idx = (faidx_t*)calloc(1, sizeof(faidx_t)); + idx->hash = kh_init(s); + name = 0; l_name = m_name = 0; + len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; + while (razf_read(rz, &c, 1)) { + if (c == '\n') { // an empty line + if (state == 1) { + offset = razf_tell(rz); + continue; + } else if ((state == 0 && len < 0) || state == 2) continue; + } + if (c == '>') { // fasta header + if (len >= 0) + fai_insert_index(idx, name, len, line_len, line_blen, offset); + l_name = 0; + while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { + if (m_name < l_name + 2) { + m_name = l_name + 2; + 
kroundup32(m_name); + name = (char*)realloc(name, m_name); + } + name[l_name++] = c; + } + name[l_name] = '\0'; + if (ret == 0) { + fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); + free(name); fai_destroy(idx); + return 0; + } + if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); + state = 1; len = 0; + offset = razf_tell(rz); + } else { + if (state == 3) { + fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 2) state = 3; + l1 = l2 = 0; + do { + ++l1; + if (isgraph(c)) ++l2; + } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); + if (state == 3 && l2) { + fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + ++l1; len += l2; + if (l2 >= 0x10000) { + fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 1) line_len = l1, line_blen = l2, state = 0; + else if (state == 0) { + if (l1 != line_len || l2 != line_blen) state = 2; + } + } + } + fai_insert_index(idx, name, len, line_len, line_blen, offset); + free(name); + return idx; +} + +void fai_save(const faidx_t *fai, FILE *fp) +{ + khint_t k; + int i; + for (i = 0; i < fai->n; ++i) { + faidx1_t x; + k = kh_get(s, fai->hash, fai->name[i]); + x = kh_value(fai->hash, k); + fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); + } +} + +faidx_t *fai_read(FILE *fp) +{ + faidx_t *fai; + char *buf, *p; + int len, line_len, line_blen; + long long offset; + fai = (faidx_t*)calloc(1, sizeof(faidx_t)); + fai->hash = kh_init(s); + buf = (char*)calloc(0x10000, 1); + while (!feof(fp) && fgets(buf, 0x10000, fp)) { + for (p = buf; *p && isgraph(*p); ++p); + *p = 0; ++p; + sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); + 
fai_insert_index(fai, buf, len, line_len, line_blen, offset); + } + free(buf); + return fai; +} + +void fai_destroy(faidx_t *fai) +{ + int i; + for (i = 0; i < fai->n; ++i) free(fai->name[i]); + free(fai->name); + kh_destroy(s, fai->hash); + if (fai->rz) razf_close(fai->rz); + free(fai); +} + +int fai_build(const char *fn) +{ + char *str; + RAZF *rz; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + rz = razf_open(fn, "r"); + if (rz == 0) { + fprintf(stderr, "[fai_build] fail to open the FASTA file.\n"); + free(str); + return -1; + } + fai = fai_build_core(rz); + razf_close(rz); + if (fai == 0) { free(str); return -1; } /* fai_build_core() returns 0 on malformed input and has already reported it */ + fp = fopen(str, "w"); + if (fp == 0) { + fprintf(stderr, "[fai_build] fail to write FASTA index.\n"); + fai_destroy(fai); free(str); + return -1; + } + fai_save(fai, fp); + fclose(fp); + fai_destroy(fai); free(str); + return 0; +} + +faidx_t *fai_load(const char *fn) +{ + char *str; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + fp = fopen(str, "r"); + if (fp == 0) { + fprintf(stderr, "[fai_load] build FASTA index.\n"); + fai_build(fn); + fp = fopen(str, "r"); + if (fp == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); + free(str); + return 0; + } + } + fai = fai_read(fp); + fclose(fp); + fai->rz = razf_open(fn, "r"); + free(str); + if (fai->rz == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); + fai_destroy(fai); return 0; /* release the index read above instead of leaking it */ + } + return fai; +} + +char *fai_fetch(const faidx_t *fai, const char *str, int *len) +{ + char *s, *p, c; + int i, l, k; + khiter_t iter; + faidx1_t val; + khash_t(s) *h; + int beg, end; + + beg = end = -1; + h = fai->hash; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the ref_id */ + if (iter == kh_end(h)) { + *len = 0; +
free(s); return 0; + } + val = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + beg = 0; end = val.len; + } else { + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + beg = atoi(p); + if (i < k) { + p = s + i + 1; + end = atoi(p); + } else end = val.len; + } + if (beg > 0) --beg; + if (beg >= val.len) beg = val.len; + if (end >= val.len) end = val.len; + if (beg > end) beg = end; + free(s); + + // now retrieve the sequence + l = 0; + s = (char*)malloc(end - beg + 2); + razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); + while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg) + if (isgraph(c)) s[l++] = c; + s[l] = '\0'; + *len = l; + return s; +} + +int faidx_main(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "Usage: faidx [ [...]]\n"); + return 1; + } else { + if (argc == 2) fai_build(argv[1]); + else { + int i, j, k, l; + char *s; + faidx_t *fai; + fai = fai_load(argv[1]); + if (fai == 0) return 1; + for (i = 2; i != argc; ++i) { + printf(">%s\n", argv[i]); + s = fai_fetch(fai, argv[i], &l); + for (j = 0; j < l; j += 60) { + for (k = 0; k < 60 && k < l - j; ++k) + putchar(s[j + k]); + putchar('\n'); + } + free(s); + } + fai_destroy(fai); + } + } + return 0; +} + +#ifdef FAIDX_MAIN +int main(int argc, char *argv[]) { return faidx_main(argc, argv); } +#endif diff --git a/faidx.h b/faidx.h new file mode 100644 index 0000000..1a52fb7 --- /dev/null +++ b/faidx.h @@ -0,0 +1,82 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef FAIDX_H +#define FAIDX_H + +/*! + @header + + Index FASTA files and extract subsequence. + + @copyright The Wellcome Trust Sanger Institute. + */ + +struct __faidx_t; +typedef struct __faidx_t faidx_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Build index for a FASTA or razip compressed FASTA file. + @param fn FASTA file name + @return 0 on success; or -1 on failure + @discussion File "fn.fai" will be generated. + */ + int fai_build(const char *fn); + + /*! + @abstract Destroy a faidx_t struct. + @param fai Pointer to the struct to be destroyed + */ + void fai_destroy(faidx_t *fai); + + /*! + @abstract Load index from "fn.fai". + @param fn File name of the FASTA file + */ + faidx_t *fai_load(const char *fn); + + /*! + @abstract Fetch the sequence in a region.
+ @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @param len Length of the region + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *fai_fetch(const faidx_t *fai, const char *reg, int *len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/glf.c b/glf.c new file mode 100644 index 0000000..8d5346a --- /dev/null +++ b/glf.c @@ -0,0 +1,236 @@ +#include +#include +#include "glf.h" + +#ifdef _NO_BGZF +// then alias bgzf_*() functions +#endif + +static int glf3_is_BE = 0; + +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} + +glf3_header_t *glf3_header_init() +{ + glf3_is_BE = bam_is_big_endian(); + return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); +} + +glf3_header_t *glf3_header_read(glfFile fp) +{ + glf3_header_t *h; + char magic[4]; + h = glf3_header_init(); + bgzf_read(fp, magic, 4); + if (strncmp(magic, "GLF\3", 4)) { + fprintf(stderr, "[glf3_header_read] invalid magic.\n"); + glf3_header_destroy(h); + return 0; + } + bgzf_read(fp, &h->l_text, 4); + if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); + if (h->l_text) { + h->text = (uint8_t*)calloc(h->l_text + 1, 1); + bgzf_read(fp, h->text, h->l_text); + } + return h; +} + +void glf3_header_write(glfFile fp, const glf3_header_t *h) +{ + int32_t x; + bgzf_write(fp, "GLF\3", 4); + x = glf3_is_BE? 
bam_swap_endian_4(h->l_text) : h->l_text; + bgzf_write(fp, &x, 4); + if (h->l_text) bgzf_write(fp, h->text, h->l_text); +} + +void glf3_header_destroy(glf3_header_t *h) +{ + free(h->text); + free(h); +} + +char *glf3_ref_read(glfFile fp, int *len) +{ + int32_t n, x; + char *str; + *len = 0; + if (bgzf_read(fp, &n, 4) != 4) return 0; + if (glf3_is_BE) n = bam_swap_endian_4(n); + if (n < 0) { + fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); + return 0; + } + str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact + x = bgzf_read(fp, str, n); + x += bgzf_read(fp, len, 4); + if (x != n + 4) { + free(str); *len = -1; return 0; // truncated + } + if (glf3_is_BE) *len = bam_swap_endian_4(*len); + return str; +} + +void glf3_ref_write(glfFile fp, const char *str, int len) +{ + int32_t m, n = strlen(str) + 1; + m = glf3_is_BE? bam_swap_endian_4(n) : n; + bgzf_write(fp, &m, 4); + bgzf_write(fp, str, n); + if (glf3_is_BE) len = bam_swap_endian_4(len); + bgzf_write(fp, &len, 4); +} + +void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) +{ + int j; + if (g3->rtype == GLF3_RTYPE_END) return; + printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, + g3->rtype == GLF3_RTYPE_INDEL? '*' : "XACMGRSVTWYHKDBN"[g3->ref_base], + g3->depth, g3->rms_mapQ, g3->min_lk); + if (g3->rtype == GLF3_RTYPE_SUB) + for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); + else { + printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], + g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? 
g3->indel_seq[1] : "*"); + } + printf("\n"); +} + +int glf3_write1(glfFile fp, const glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + c = g3->rtype<<4 | g3->ref_base; + r = bgzf_write(fp, &c, 1); + if (g3->rtype == GLF3_RTYPE_END) return r; + y[0] = g3->offset; + y[1] = g3->min_lk<<24 | g3->depth; + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + r += bgzf_write(fp, y, 8); + r += bgzf_write(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); + else { + int16_t x[2]; + r += bgzf_write(fp, g3->lk, 3); + x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; + x[1] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; + r += bgzf_write(fp, x, 4); + if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); + if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); + } + return r; +} + +#ifndef kv_roundup32 +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +int glf3_read1(glfFile fp, glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + r = bgzf_read(fp, &c, 1); + if (r == 0) return 0; + g3->ref_base = c & 0xf; + g3->rtype = c>>4; + if (g3->rtype == GLF3_RTYPE_END) return r; + r += bgzf_read(fp, y, 8); + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + g3->offset = y[0]; + g3->min_lk = y[1]>>24; + g3->depth = y[1]<<8>>8; + r += bgzf_read(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); + else { + int16_t x[2], max; + r += bgzf_read(fp, g3->lk, 3); + r += bgzf_read(fp, x, 4); + if (glf3_is_BE) { + x[0] = bam_swap_endian_2(x[0]); + x[1] = bam_swap_endian_2(x[1]); + } + g3->indel_len[0] = x[0]; + g3->indel_len[1] = x[1]; + x[0] = abs(x[0]); x[1] = abs(x[1]); + max = (x[0] > x[1]? 
x[0] : x[1]) + 1; + if (g3->max_len < max) { + g3->max_len = max; + kv_roundup32(g3->max_len); + g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); + g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); + } + r += bgzf_read(fp, g3->indel_seq[0], x[0]); + r += bgzf_read(fp, g3->indel_seq[1], x[1]); + g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; + } + return r; +} + +void glf3_view(glfFile fp) +{ + glf3_header_t *h; + char *name; + glf3_t *g3; + int len; + h = glf3_header_read(fp); + g3 = glf3_init1(); + while ((name = glf3_ref_read(fp, &len)) != 0) { + int pos = 0; + while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { + pos += g3->offset; + glf3_view1(name, g3, pos); + } + free(name); + } + glf3_header_destroy(h); + glf3_destroy1(g3); +} + +int glf3_view_main(int argc, char *argv[]) +{ + glfFile fp; + if (argc == 1) { + fprintf(stderr, "Usage: glfview \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "Fail to open file '%s'\n", argv[1]); + return 1; + } + glf3_view(fp); + bgzf_close(fp); + return 0; +} + +#ifdef GLFVIEW_MAIN +int main(int argc, char *argv[]) +{ + return glf3_view_main(argc, argv); +} +#endif diff --git a/glf.h b/glf.h new file mode 100644 index 0000000..12e5400 --- /dev/null +++ b/glf.h @@ -0,0 +1,56 @@ +#ifndef GLF_H_ +#define GLF_H_ + +typedef struct { + unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + unsigned char max_mapQ; /** maximum mapping quality */ + unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ + unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ +} glf1_t; + +#include +#include "bgzf.h" +typedef BGZF *glfFile; + +#define GLF3_RTYPE_END 0 +#define GLF3_RTYPE_SUB 1 +#define GLF3_RTYPE_INDEL 2 + +typedef struct { + uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the 
reference base */ + uint8_t rms_mapQ; /** RMS mapping quality */ + uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ + uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ + int32_t offset; /** the first base in a chromosome has offset zero. */ + // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) + int16_t indel_len[2]; + int32_t max_len; // maximum indel len; will be modified by glf3_read1() + char *indel_seq[2]; +} glf3_t; + +typedef struct { + int32_t l_text; + uint8_t *text; +} glf3_header_t; + +#ifdef __cplusplus +extern "C" { +#endif + +#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) +#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) + + glf3_header_t *glf3_header_init(); + glf3_header_t *glf3_header_read(glfFile fp); + void glf3_header_write(glfFile fp, const glf3_header_t *h); + void glf3_header_destroy(glf3_header_t *h); + char *glf3_ref_read(glfFile fp, int *len); + void glf3_ref_write(glfFile fp, const char *name, int len); + int glf3_write1(glfFile fp, const glf3_t *g3); + int glf3_read1(glfFile fp, glf3_t *g3); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..1d583ef --- /dev/null +++ b/khash.h @@ -0,0 +1,486 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+ + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.2" + +#include +#include +#include + +typedef uint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + uint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i 
= k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + uint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } 
\ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! 
@function
+  @abstract     Integer hash function
+  @param  key   The integer [uint32_t]
+  @return       The hash value [khint_t]
+  @discussion   The key is its own hash: the macro only casts it to uint32_t.
+ */
+#define kh_int_hash_func(key) (uint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [uint64_t]
+  @return       The hash value [khint_t]
+  @discussion   XORs key>>33, key and key<<11, then truncates to 32 bits.
+                The argument is expanded three times, so it must not have
+                side effects.
+ */
+#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+  @discussion   Starts from the first character and, for every following
+                character, computes h = h*31 + c (written as (h<<5) - h + c).
+                The empty string hashes to 0.
+ */
+static inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = *s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+  @discussion   Keys are compared by content with strcmp(), not by pointer.
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other necessary macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: 0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+                the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! 
@function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+	KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+	KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! 
@function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/knetfile.c b/knetfile.c new file mode 100644 index 0000000..cef197d --- /dev/null +++ b/knetfile.c @@ -0,0 +1,300 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "knetfile.h" + +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); + if (ret == -1) perror("select"); + return ret; +} + +static int kftp_get_response(knetFile *ftp) +{ + unsigned char c; + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (read(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if 
(n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + write(ftp->ctrl_fd, cmd, strlen(cmd)); + return is_get? kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + +static int kftp_pasv_connect(knetFile *ftp) +{ +#define __err_pasv_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + struct addrinfo hints, *res; + struct linger lng = { 0, 0 }; + int on = 1; + char host[80], port[10]; + + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + if (getaddrinfo(host, port, &hints, &res) != 0) { perror("getaddrinfo"); return -1; } + if ((ftp->fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_pasv_connect("socket"); + if (setsockopt(ftp->fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 
-1) __err_pasv_connect("setsockopt"); + if (setsockopt(ftp->fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_pasv_connect("setsockopt"); + if (connect(ftp->fd, res->ai_addr, res->ai_addrlen) != 0) __err_pasv_connect("connect"); + freeaddrinfo(res); + return 0; +} + +int kftp_connect(knetFile *ftp) +{ +#define __err_connect(func) do { perror(func); return -1; } while (0) + + int on = 1; + { // open socket + struct addrinfo hints, *res; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if (getaddrinfo(ftp->host, "21", &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((ftp->ctrl_fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + if (setsockopt(ftp->ctrl_fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (connect(ftp->ctrl_fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + kftp_get_response(ftp); + } + { // login + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + } + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd >= 0) { + close(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + close(ftp->fd); + return kftp_connect(ftp); +} + +// initialize ->type, ->host and ->retr +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + fp->host = calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->seek_offset = -1; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int 
ret; + if (fp->fd >= 0) { + close(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + if (fp->offset) { + char tmp[32]; + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + close(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + if (fp->fd < 0) { + knet_close(fp); + return 0; + } + } else { + int fd = open(fn, O_RDONLY); + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +off_t knet_read(knetFile *fp, void *buf, off_t len) +{ + off_t l = 0; + if (fp->fd < 0) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + off_t rest = len, curr; + while (rest) { + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; + l += curr; rest -= curr; + } + fp->offset += l; + } else { + off_t rest = len, curr; + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + fp->is_ready = 1; + } + while (rest) { + if (socket_wait(fp->fd, 1) <= 0) break; // socket is not ready for reading + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; // FIXME: end of file or bad network? I do not know... 
+ l += curr; rest -= curr; + } + fp->offset += l; + } + return l; +} + +int knet_seek(knetFile *fp, off_t off, int whence) +{ + if (fp->type == KNF_TYPE_LOCAL) { + if (lseek(fp->fd, off, whence) == -1) { + perror("lseek"); + return -1; + } + fp->offset = off; + return 0; + } + if (fp->type == KNF_TYPE_FTP) { + if (whence != SEEK_SET) { // FIXME: we can surely allow SEEK_CUR and SEEK_END in future + fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP. Offset is unchanged.\n"); + return -1; + } + fp->offset = off; + fp->is_ready = 0; + return 0; + } + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd >= 0) close(fp->ctrl_fd); + if (fp->fd >= 0) close(fp->fd); + free(fp->response); free(fp->retr); free(fp->host); + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char buf[256]; + knetFile *fp; +// fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); knet_seek(fp, 2500000000ll, SEEK_SET); + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); knet_seek(fp, 2000, SEEK_SET); +// fp = knet_open("knetfile.c", "r"); knet_seek(fp, 2000, SEEK_SET); + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + knet_close(fp); + return 0; +} +#endif diff --git a/knetfile.h b/knetfile.h new file mode 100644 index 0000000..bf45f3d --- /dev/null +++ b/knetfile.h @@ -0,0 +1,55 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include +#include + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr; + int64_t seek_offset; // for lazy seek +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef 
__cplusplus +extern "C" { +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + int knet_seek(knetFile *fp, off_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/kseq.h b/kseq.h new file mode 100644 index 0000000..bbe0125 --- /dev/null +++ b/kseq.h @@ -0,0 +1,223 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +/* Last Modified: 12APR2009 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_MAX 1 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if 
(delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->l == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ 
+ seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff --git a/ksort.h b/ksort.h new file mode 100644 index 0000000..16a03fd --- /dev/null +++ b/ksort.h @@ -0,0 +1,271 @@ +/* The MIT License + + Copyright (c) 
2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. 
+ + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 
+ return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) 
ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/kstring.c b/kstring.c new file mode 100644 index 0000000..dc20fae --- /dev/null +++ b/kstring.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? 
max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + return 0; +} +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..221ade2 --- /dev/null +++ b/kstring.h @@ -0,0 +1,59 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static 
inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/misc/Makefile b/misc/Makefile new file mode 100644 index 0000000..4404ccc --- /dev/null +++ b/misc/Makefile @@ -0,0 +1,54 @@ +CC= gcc +CXX= g++ +CFLAGS= -g -Wall -O2 -m64 #-arch ppc +CXXFLAGS= $(CFLAGS) +DFLAGS= -D_FILE_OFFSET_BITS=64 +OBJS= +PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim +INCLUDES= -I.. +SUBDIRS= . + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +lib-recur all-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" $$target || exit 1; \ + cd $$wdir; \ + done; + +lib: + +wgsim:wgsim.o + $(CC) $(CFLAGS) -o $@ wgsim.o -lm + +md5fa:md5.o md5fa.o md5.h ../kseq.h + $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz + +md5sum-lite:md5sum-lite.o + $(CC) $(CFLAGS) -o $@ md5sum-lite.o + +md5sum-lite.o:md5.c md5.h + $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c + +maq2sam-short:maq2sam.c + $(CC) $(CFLAGS) -o $@ maq2sam.c -lz + +maq2sam-long:maq2sam.c + $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz + +md5fa.o:md5.h md5fa.c + $(CC) $(CFLAGS) -c -I.. 
-o $@ md5fa.c + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/misc/blast2sam.pl b/misc/blast2sam.pl new file mode 100755 index 0000000..084f018 --- /dev/null +++ b/misc/blast2sam.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; +use Getopt::Std; + +&blast2sam; + +sub blast2sam { + my %opts = (); + getopts('s', \%opts); + die("Usage: blast2sam.pl \n") if (-t STDIN && @ARGV == 0); + my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq); + $show_seq = defined($opts{s}); + @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*'); + while (<>) { + if (@cigar && (/^Query=/ || /Score =.*bits.*Expect/)) { # print + &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); + @cigar = (); + } + if (/^Query= (\S+)/) { + $sam[0] = $1; + } elsif (/\((\S+)\s+letters\)/) { + $qlen = $1; $qlen =~ s/,//g; + } elsif (/^>(\S+)/) { + $sam[2] = $1; + } elsif (/Length = (\d+)/) { + $slen = $1; + } elsif (/Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/) { # the start of an alignment block + my ($as, $ev) = (int($1 + .499), $3); + $ev = "1$ev" if ($ev =~ /^e/); + @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev"); + @cigar = (); $qbeg = 0; + @cmaux = (0, 0, 0, ''); + } elsif (/Strand = (\S+) \/ (\S+)/) { + $sam[1] |= 0x10 if ($2 eq 'Minus'); + } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) { + $q = $2; + unless ($qbeg) { + $qbeg = $1; + push(@cigar, ($1-1) . "H") if ($1 > 1); + } + $qend = $3; + if ($show_seq) { + my $x = $q; + $x =~ s/-//g; $sam[9] .= $x; + } + } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) { + $s = $2; + if ($sam[1] & 0x10) { + $sam[3] = $3; + } else { + $sam[3] = $1 unless ($sam[3]); + } + &aln2cm(\@cigar, \$q, \$s, \@cmaux); + } + } + &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); +} + +sub blast_print_sam { + my ($sam, $cigar, $cmaux, $qrest) = @_; + push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1)); + push(@$cigar, $qrest . 
'H') if ($qrest); + if ($sam->[1] & 0x10) { + @$cigar = reverse(@$cigar); + $sam->[9] = reverse($sam->[9]); + $sam->[9] =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/; + } + $sam->[9] = '*' if (!$sam->[9]); + $sam->[5] = join('', @$cigar); + print join("\t", @$sam), "\n"; +} + +sub aln2cm { + my ($cigar, $q, $s, $cmaux) = @_; + my $l = length($$q); + for (my $i = 0; $i < $l; ++$i) { + my $op; + # set $op + if (substr($$q, $i, 1) eq '-') { $op = 2; } + elsif (substr($$s, $i, 1) eq '-') { $op = 1; } + else { $op = 0; } + # for CIGAR + if ($cmaux->[0] == $op) { + ++$cmaux->[1]; + } else { + push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1)); + $cmaux->[0] = $op; $cmaux->[1] = 1; + } + } +} diff --git a/misc/bowtie2sam.pl b/misc/bowtie2sam.pl new file mode 100755 index 0000000..5dff88d --- /dev/null +++ b/misc/bowtie2sam.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.1 + +use strict; +use warnings; +use Getopt::Std; + +&bowtie2sam; +exit; + +sub bowtie2sam { + my %opts = (); + die("Usage: bowtie2sam.pl \n") if (@ARGV == 0 && -t STDIN); + # core loop + my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k); + $last = ''; + while (<>) { + my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches + if ($name eq $last) { + # I do not know whether the multiple hits are ordered on the + # number of mismatches. I assume they are not and so I have to + # keep all these multiple hits in memory. 
+ @{$staging[$k]} = @s; + if ($best_s > $nm) { + $subbest_s = $best_s; + $best_s = $nm; + $best_k = $k; + } elsif ($subbest_s > $nm) { + $subbest_s = $nm; + } + ++$k; + } else { + if ($last) { + if ($best_s == $subbest_s) { + $staging[$best_k][4] = 0; + } elsif ($subbest_s - $best_s == 1) { + $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15); + } + print join("\t", @{$staging[$best_k]}), "\n"; + } + $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0; + @{$staging[0]} = @s; + $last = $name; + } + } + print join("\t", @{$staging[$best_k]}), "\n" if ($best_k >= 0); +} + +sub bowtie2sam_aux { + my ($line, $s) = @_; + chomp($line); + my @t = split("\t", $line); + my $ret; + @$s = (); + # read name + $s->[0] = $ret = $t[0]; + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + # read & quality + $s->[9] = $t[4]; $s->[10] = $t[5]; + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + $s->[2] = $t[2]; $s->[3] = $t[3] + 1; + $s->[1] |= 0x10 if ($t[1] eq '-'); + # mapQ + $s->[4] = $t[6] == 0? 25 : 0; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + my $nm = @t - 7; + push(@$s, "NM:i:" . (@t-7)); + push(@$s, "X$nm:i:" . ($t[6]+1)); + my $md = ''; + if ($t[7]) { + $_ = $t[7]; + my $a = 0; + while (/(\d+):[ACGTN]>([ACGTN])/gi) { + my ($y, $z) = ($1, $2); + $md .= (int($y)-$a) . 
$z; + $a += $y - $a + 1; + } + $md .= length($s->[9]) - $a; + } else { + $md = length($s->[9]); + } + push(@$s, "MD:Z:$md"); + return ($ret, $nm); +} diff --git a/misc/export2sam.pl b/misc/export2sam.pl new file mode 100755 index 0000000..8e3e280 --- /dev/null +++ b/misc/export2sam.pl @@ -0,0 +1,107 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.2 (03JAN2009) + +use strict; +use warnings; +use Getopt::Std; + +&export2sam; +exit; + +sub export2sam { + my ($fh1, $fh2, $is_paired); + $is_paired = (@ARGV >= 2); + die("export2sam.pl []\n") if (@ARGV == 0); + open($fh1, $ARGV[0]) || die; + if ($is_paired) { + open($fh2, $ARGV[1]) || die; + } + # conversion table + my @conv_table; + for (-64..64) { + $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499)); + } + # core loop + while (<$fh1>) { + my (@s1, @s2); + &export2sam_aux($_, \@s1, \@conv_table, $is_paired); + if ($is_paired) { + $_ = <$fh2>; + &export2sam_aux($_, \@s2, \@conv_table, $is_paired); + if (@s1 && @s2) { # then set mate coordinate + my $isize = 0; + if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize + my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3]; + my $x2 = ($s2[1] & 0x10)? $s2[3] + length($s2[9]) : $s2[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2[2] ne '*') { + @s1[6..8] = (($s2[2] eq $s1[2])? "=" : $s2[2], $s2[3], $isize); + $s1[1] |= 0x20 if ($s2[1] & 0x10); + } else { + $s1[1] |= 0x8; + } + if ($s1[2] ne '*') { + @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize); + $s2[1] |= 0x20 if ($s1[1] & 0x10); + } else { + $s2[1] |= 0x8; + } + } + } + print join("\t", @s1), "\n" if (@s1); + print join("\t", @s2), "\n" if (@s2 && $is_paired); + } + close($fh1); + close($fh2) if ($is_paired); +} + +sub export2sam_aux { + my ($line, $s, $ct, $is_paired) = @_; + chomp($line); + my @t = split("\t", $line); + @$s = (); + return if ($t[21] ne 'Y'); + # read name + $s->[0] = $t[1]? 
"$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<(5 + $t[7]) if ($is_paired); + # read & quality + $s->[9] = $t[8]; $s->[10] = $t[9]; + if ($t[13] eq 'R') { # then reverse the sequence and quality + $s->[9] = reverse($t[8]); + $s->[9] =~ tr/ACGTacgt/TGCAtgca/; + $s->[10] = reverse($t[9]); + } + $s->[10] =~ s/(.)/$ct->[ord($1)]/eg; # change coding + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + my $has_coor = 0; + $s->[2] = "*"; + if ($t[10] eq 'NM' || $t[10] eq 'QC') { + $s->[1] |= 0x4; # unmapped + } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) { + $s->[1] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? + push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") + } else { + $s->[2] = $t[10]; + $has_coor = 1; + } + $s->[3] = $has_coor? $t[12] : 0; + $s->[1] |= 0x10 if ($has_coor && $t[13] eq 'R'); + # mapQ (TODO: should I choose the larger between $t[15] and $t[16]?) + $s->[4] = 0; + $s->[4] = $t[15] if ($t[15] ne ''); + $s->[4] = $t[16] if ($t[16] ne '' && $s->[4] < $t[16]); + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "BC:Z:$t[6]") if ($t[6]); + push(@$s, "MD:Z:$t[14]") if ($has_coor); + push(@$s, "SM:i:$t[15]") if ($is_paired && $has_coor); +} diff --git a/misc/interpolate_sam.pl b/misc/interpolate_sam.pl new file mode 100755 index 0000000..6cd6831 --- /dev/null +++ b/misc/interpolate_sam.pl @@ -0,0 +1,125 @@ +#!/usr/bin/perl +use strict; + +###Builds interpolated pileup from SAM file +##@description counts bases between paired ends and piles up single end reads. +##@output, uses a #header for the RNAME and then the number of reads per base +##@author sm8@sanger.ac.uk, Stephen B. 
Montgomery + +##@caveats +##Requires RNAME to have format as per example +## chromosome:NCBI36:18:1:76117153:1 +## supercontig::NT_113883:1:137703:1 +## clone::AC138827.3:1:149397:1 +##Expects simple CIGAR characters, M, I and D +##Expects SAM file to be sorted. +##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77) + +##Verify and read in SAM file +my $sam_file = $ARGV[0]; +if(!defined($sam_file)) { die("No sam file defined on arg 1"); } +unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); } +open(SAM, $sam_file) || die("Cannot open sam file"); + +##Globals +my $current_location = ""; ##Current RNAME being processed +my $current_size = 0; ##Size of sequence region being processed +my $current_position = 1; ##Current base being processed +my $open = 0; ##Number of open reads (PE reads that have not been closed) +my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the + ##contained value from those open and deletes the indexed position from the hash + +while (my $line = ) { + my @tokens = split /\t/, $line; + + if ($current_location ne $tokens[2]) { ##Start a new sequence region + for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region + if (defined($close{$i})) { + $open = $open - $close{$i}; + delete $close{$i}; + } + print $open . "\n"; + } + if ($current_location ne "") { + print "\n"; + } + + ##Initiate a new sequence region + my @location_tokens = split /:/, $tokens[2]; + $current_position = 1; + $current_location = $tokens[2]; + $current_size = $location_tokens[4]; + $open = 0; + %close = (); + print "#" . $tokens[2] . "\n"; + + ##Print pileup to just before the first read (will be 0) + for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) { + print $open . 
"\n"; + } + $current_position = $tokens[3]; + + } else { ##Sequence region already open + if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position + ##cycle through to catch up to the current position + for (my $i = $current_position; $i < $tokens[3]; $i++) { + if (defined($close{$i})) { + $open = $open - $close{$i}; + delete $close{$i}; + } + print $open . "\n"; + } + $current_position = $tokens[3]; + } + } + $open++; ##Increment the number of open reads + + if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition + $open--; + my $parsed_cig = &parseCigar($tokens[5]); + my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; + if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } + $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; + } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition + my $parsed_cig = &parseCigar($tokens[5]); + my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; + if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } + $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; + } else { + #do nothing + } +} +for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region + if (defined($close{$i})) { + $open = $open - $close{$i}; + delete $close{$i}; + } + print $open . 
"\n"; +} +print "\n"; +close(SAM); +exit(0); + +##reads and tokenizes simple cigarline +sub parseCigar() { + my $cigar_line = shift; + $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g; + my @cigar_tokens = split /\t/, $cigar_line; + my %parsed = ('M' => 0, + 'I' => 0, + 'D' => 0); + my @events = (); + for(my $i = 0; $i < scalar(@cigar_tokens); $i++) { + if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) { + if (!defined($parsed{$2})) { $parsed{$2} = 0; } + my $nt = $2; + if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; } + $parsed{$nt} += $1; + my %event_el = ("t" => $nt, + "n" => $1); + push @events, \%event_el; + } + } + $parsed{'events'} = \@events; + return \%parsed; +} diff --git a/misc/maq2sam.c b/misc/maq2sam.c new file mode 100644 index 0000000..758a698 --- /dev/null +++ b/misc/maq2sam.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include + +#define PACKAGE_VERSION "0.1.2 (20090521)" + +//#define MAQ_LONGREADS + +#ifdef MAQ_LONGREADS +# define MAX_READLEN 128 +#else +# define MAX_READLEN 64 +#endif + +#define MAX_NAMELEN 36 +#define MAQMAP_FORMAT_OLD 0 +#define MAQMAP_FORMAT_NEW -1 + +#define PAIRFLAG_FF 0x01 +#define PAIRFLAG_FR 0x02 +#define PAIRFLAG_RF 0x04 +#define PAIRFLAG_RR 0x08 +#define PAIRFLAG_PAIRED 0x10 +#define PAIRFLAG_DIFFCHR 0x20 +#define PAIRFLAG_NOMATCH 0x40 +#define PAIRFLAG_SW 0x80 + +typedef struct +{ + uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. 
*/ + uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual; + uint32_t seqid, pos; + int dist; + char name[MAX_NAMELEN]; +} maqmap1_t; + +typedef struct +{ + int format, n_ref; + char **ref_name; + uint64_t n_mapped_reads; + maqmap1_t *mapped_reads; +} maqmap_t; + +maqmap_t *maq_new_maqmap() +{ + maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t)); + mm->format = MAQMAP_FORMAT_NEW; + return mm; +} +void maq_delete_maqmap(maqmap_t *mm) +{ + int i; + if (mm == 0) return; + for (i = 0; i < mm->n_ref; ++i) + free(mm->ref_name[i]); + free(mm->ref_name); + free(mm->mapped_reads); + free(mm); +} +maqmap_t *maqmap_read_header(gzFile fp) +{ + maqmap_t *mm; + int k, len; + mm = maq_new_maqmap(); + gzread(fp, &mm->format, sizeof(int)); + if (mm->format != MAQMAP_FORMAT_NEW) { + if (mm->format > 0) { + fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n"); + exit(3); + } + assert(mm->format == MAQMAP_FORMAT_NEW); + } + gzread(fp, &mm->n_ref, sizeof(int)); + mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*)); + for (k = 0; k != mm->n_ref; ++k) { + gzread(fp, &len, sizeof(int)); + mm->ref_name[k] = (char*)malloc(len * sizeof(char)); + gzread(fp, mm->ref_name[k], len); + } + /* read number of mapped reads */ + gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t)); + return mm; +} + +void maq2tam_core(gzFile fp, const char *rg) +{ + maqmap_t *mm; + maqmap1_t mm1, *m1; + int ret; + m1 = &mm1; + mm = maqmap_read_header(fp); + while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) { + int j, flag = 0, se_mapq = m1->seq[MAX_READLEN-1]; + if (m1->flag) flag |= 1; + if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2; + if (m1->flag == 192) flag |= 4; + if (m1->flag == 64) flag |= 8; + if (m1->pos&1) flag |= 0x10; + if ((flag&1) && m1->dist != 0) { + int c; + if (m1->dist > 0) { + if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0; + else if 
(m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1; + else c = m1->pos&1; + } else { + if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0; + else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1; + else c = m1->pos&1; + } + flag |= c; + } + if (flag) { + int l = strlen(m1->name); + if (m1->name[l-2] == '/') { + flag |= (m1->name[l-1] == '1')? 0x40 : 0x80; + m1->name[l-2] = '\0'; + } + } + printf("%s\t%d\t", m1->name, flag); + printf("%s\t%d\t", mm->ref_name[m1->seqid], (m1->pos>>1)+1); + if (m1->flag == 130) { + int c = (int8_t)m1->seq[MAX_READLEN-1]; + printf("%d\t", m1->alt_qual); + if (c == 0) printf("%dM\t", m1->size); + else { + if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c); + else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual); + } + se_mapq = 0; // zero SE mapQ for reads aligned by SW + } else { + if (flag&4) printf("0\t*\t"); + else printf("%d\t%dM\t", m1->map_qual, m1->size); + } + printf("*\t0\t%d\t", m1->dist); + for (j = 0; j != m1->size; ++j) { + if (m1->seq[j] == 0) putchar('N'); + else putchar("ACGT"[m1->seq[j]>>6&3]); + } + putchar('\t'); + for (j = 0; j != m1->size; ++j) + putchar((m1->seq[j]&0x3f) + 33); + putchar('\t'); + if (rg) printf("RG:Z:%s\t", rg); + if (flag&4) { // unmapped + printf("MF:i:%d\n", m1->flag); + } else { + printf("MF:i:%d\t", m1->flag); + if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq); + printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]); + } + } + if (ret > 0) + fprintf(stderr, "Truncated! Continue anyway.\n"); + maq_delete_maqmap(mm); +} + +int main(int argc, char *argv[]) +{ + gzFile fp; + if (argc == 1) { + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: maq2sam []\n"); + return 1; + } + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + maq2tam_core(fp, argc > 2? 
argv[2] : 0); + gzclose(fp); + return 0; +} diff --git a/misc/md5.c b/misc/md5.c new file mode 100644 index 0000000..ccead0e --- /dev/null +++ b/misc/md5.c @@ -0,0 +1,307 @@ +/* + ********************************************************************** + ** md5.c ** + ** RSA Data Security, Inc. MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. 
** + ********************************************************************** + */ + +#include "md5.h" + +/* forward declaration */ +static void Transform (); + +static unsigned char PADDING[64] = { + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* F, G and H are basic MD5 functions: selection, majority, parity */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */ +/* Rotation is separate from addition to prevent recomputation */ +#define FF(a, b, c, d, x, s, ac) \ + {(a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) \ + {(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) \ + {(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) \ + {(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +void MD5Init (mdContext) +MD5_CTX *mdContext; +{ + mdContext->i[0] = mdContext->i[1] = (UINT4)0; + + /* Load magic initialization constants. 
+ */ + mdContext->buf[0] = (UINT4)0x67452301; + mdContext->buf[1] = (UINT4)0xefcdab89; + mdContext->buf[2] = (UINT4)0x98badcfe; + mdContext->buf[3] = (UINT4)0x10325476; +} + +void MD5Update (mdContext, inBuf, inLen) +MD5_CTX *mdContext; +unsigned char *inBuf; +unsigned int inLen; +{ + UINT4 in[16]; + int mdi; + unsigned int i, ii; + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* update number of bits */ + if ((mdContext->i[0] + ((UINT4)inLen << 3)) < mdContext->i[0]) + mdContext->i[1]++; + mdContext->i[0] += ((UINT4)inLen << 3); + mdContext->i[1] += ((UINT4)inLen >> 29); + + while (inLen--) { + /* add new character to buffer, increment mdi */ + mdContext->in[mdi++] = *inBuf++; + + /* transform if necessary */ + if (mdi == 0x40) { + for (i = 0, ii = 0; i < 16; i++, ii += 4) + in[i] = (((UINT4)mdContext->in[ii+3]) << 24) | + (((UINT4)mdContext->in[ii+2]) << 16) | + (((UINT4)mdContext->in[ii+1]) << 8) | + ((UINT4)mdContext->in[ii]); + Transform (mdContext->buf, in); + mdi = 0; + } + } +} + +void MD5Final (mdContext) +MD5_CTX *mdContext; +{ + UINT4 in[16]; + int mdi; + unsigned int i, ii; + unsigned int padLen; + + /* save number of bits */ + in[14] = mdContext->i[0]; + in[15] = mdContext->i[1]; + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* pad out to 56 mod 64 */ + padLen = (mdi < 56) ? 
(56 - mdi) : (120 - mdi); + MD5Update (mdContext, PADDING, padLen); + + /* append length in bits and transform */ + for (i = 0, ii = 0; i < 14; i++, ii += 4) + in[i] = (((UINT4)mdContext->in[ii+3]) << 24) | + (((UINT4)mdContext->in[ii+2]) << 16) | + (((UINT4)mdContext->in[ii+1]) << 8) | + ((UINT4)mdContext->in[ii]); + Transform (mdContext->buf, in); + + /* store buffer in digest */ + for (i = 0, ii = 0; i < 4; i++, ii += 4) { + mdContext->digest[ii] = (unsigned char)(mdContext->buf[i] & 0xFF); + mdContext->digest[ii+1] = + (unsigned char)((mdContext->buf[i] >> 8) & 0xFF); + mdContext->digest[ii+2] = + (unsigned char)((mdContext->buf[i] >> 16) & 0xFF); + mdContext->digest[ii+3] = + (unsigned char)((mdContext->buf[i] >> 24) & 0xFF); + } +} + +/* Basic MD5 step. Transform buf based on in. + */ +static void Transform (buf, in) +UINT4 *buf; +UINT4 *in; +{ + UINT4 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF ( a, b, c, d, in[ 0], S11, 3614090360u); /* 1 */ + FF ( d, a, b, c, in[ 1], S12, 3905402710u); /* 2 */ + FF ( c, d, a, b, in[ 2], S13, 606105819u); /* 3 */ + FF ( b, c, d, a, in[ 3], S14, 3250441966u); /* 4 */ + FF ( a, b, c, d, in[ 4], S11, 4118548399u); /* 5 */ + FF ( d, a, b, c, in[ 5], S12, 1200080426u); /* 6 */ + FF ( c, d, a, b, in[ 6], S13, 2821735955u); /* 7 */ + FF ( b, c, d, a, in[ 7], S14, 4249261313u); /* 8 */ + FF ( a, b, c, d, in[ 8], S11, 1770035416u); /* 9 */ + FF ( d, a, b, c, in[ 9], S12, 2336552879u); /* 10 */ + FF ( c, d, a, b, in[10], S13, 4294925233u); /* 11 */ + FF ( b, c, d, a, in[11], S14, 2304563134u); /* 12 */ + FF ( a, b, c, d, in[12], S11, 1804603682u); /* 13 */ + FF ( d, a, b, c, in[13], S12, 4254626195u); /* 14 */ + FF ( c, d, a, b, in[14], S13, 2792965006u); /* 15 */ + FF ( b, c, d, a, in[15], S14, 1236535329u); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG ( a, b, c, d, in[ 1], S21, 4129170786u); /* 
17 */ + GG ( d, a, b, c, in[ 6], S22, 3225465664u); /* 18 */ + GG ( c, d, a, b, in[11], S23, 643717713u); /* 19 */ + GG ( b, c, d, a, in[ 0], S24, 3921069994u); /* 20 */ + GG ( a, b, c, d, in[ 5], S21, 3593408605u); /* 21 */ + GG ( d, a, b, c, in[10], S22, 38016083u); /* 22 */ + GG ( c, d, a, b, in[15], S23, 3634488961u); /* 23 */ + GG ( b, c, d, a, in[ 4], S24, 3889429448u); /* 24 */ + GG ( a, b, c, d, in[ 9], S21, 568446438u); /* 25 */ + GG ( d, a, b, c, in[14], S22, 3275163606u); /* 26 */ + GG ( c, d, a, b, in[ 3], S23, 4107603335u); /* 27 */ + GG ( b, c, d, a, in[ 8], S24, 1163531501u); /* 28 */ + GG ( a, b, c, d, in[13], S21, 2850285829u); /* 29 */ + GG ( d, a, b, c, in[ 2], S22, 4243563512u); /* 30 */ + GG ( c, d, a, b, in[ 7], S23, 1735328473u); /* 31 */ + GG ( b, c, d, a, in[12], S24, 2368359562u); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH ( a, b, c, d, in[ 5], S31, 4294588738u); /* 33 */ + HH ( d, a, b, c, in[ 8], S32, 2272392833u); /* 34 */ + HH ( c, d, a, b, in[11], S33, 1839030562u); /* 35 */ + HH ( b, c, d, a, in[14], S34, 4259657740u); /* 36 */ + HH ( a, b, c, d, in[ 1], S31, 2763975236u); /* 37 */ + HH ( d, a, b, c, in[ 4], S32, 1272893353u); /* 38 */ + HH ( c, d, a, b, in[ 7], S33, 4139469664u); /* 39 */ + HH ( b, c, d, a, in[10], S34, 3200236656u); /* 40 */ + HH ( a, b, c, d, in[13], S31, 681279174u); /* 41 */ + HH ( d, a, b, c, in[ 0], S32, 3936430074u); /* 42 */ + HH ( c, d, a, b, in[ 3], S33, 3572445317u); /* 43 */ + HH ( b, c, d, a, in[ 6], S34, 76029189u); /* 44 */ + HH ( a, b, c, d, in[ 9], S31, 3654602809u); /* 45 */ + HH ( d, a, b, c, in[12], S32, 3873151461u); /* 46 */ + HH ( c, d, a, b, in[15], S33, 530742520u); /* 47 */ + HH ( b, c, d, a, in[ 2], S34, 3299628645u); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II ( a, b, c, d, in[ 0], S41, 4096336452u); /* 49 */ + II ( d, a, b, c, in[ 7], S42, 1126891415u); /* 50 */ + II ( c, d, a, b, 
in[14], S43, 2878612391u); /* 51 */ + II ( b, c, d, a, in[ 5], S44, 4237533241u); /* 52 */ + II ( a, b, c, d, in[12], S41, 1700485571u); /* 53 */ + II ( d, a, b, c, in[ 3], S42, 2399980690u); /* 54 */ + II ( c, d, a, b, in[10], S43, 4293915773u); /* 55 */ + II ( b, c, d, a, in[ 1], S44, 2240044497u); /* 56 */ + II ( a, b, c, d, in[ 8], S41, 1873313359u); /* 57 */ + II ( d, a, b, c, in[15], S42, 4264355552u); /* 58 */ + II ( c, d, a, b, in[ 6], S43, 2734768916u); /* 59 */ + II ( b, c, d, a, in[13], S44, 1309151649u); /* 60 */ + II ( a, b, c, d, in[ 4], S41, 4149444226u); /* 61 */ + II ( d, a, b, c, in[11], S42, 3174756917u); /* 62 */ + II ( c, d, a, b, in[ 2], S43, 718787259u); /* 63 */ + II ( b, c, d, a, in[ 9], S44, 3951481745u); /* 64 */ + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +/* lh3: the following code is added by me */ + +#ifdef MD5SUM_MAIN +#include +#include +#include +#define HEX_STR "0123456789abcdef" + +static void md5_one(const char *fn) +{ + unsigned char buf[4096]; + MD5_CTX md5; + int l; + FILE *fp; + + fp = strcmp(fn, "-")? fopen(fn, "r") : stdin; + if (fp == 0) { + fprintf(stderr, "md5sum: %s: No such file or directory\n", fn); + exit(1); + } + MD5Init(&md5); + while ((l = fread(buf, 1, 4096, fp)) > 0) + MD5Update(&md5, buf, l); + MD5Final(&md5); + if (fp != stdin) fclose(fp); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[md5.digest[l]>>4&0xf], HEX_STR[md5.digest[l]&0xf]); + printf(" %s\n", fn); +} +int main(int argc, char *argv[]) +{ + int i; + if (argc == 1) md5_one("-"); + else for (i = 1; i < argc; ++i) md5_one(argv[i]); + return 0; +} +#endif diff --git a/misc/md5.h b/misc/md5.h new file mode 100644 index 0000000..678ac27 --- /dev/null +++ b/misc/md5.h @@ -0,0 +1,68 @@ +/* + ********************************************************************** + ** md5.h -- Header file for implementation of MD5 ** + ** RSA Data Security, Inc. 
MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version ** + ** Revised (for MD5): RLR 4/27/91 ** + ** -- G modified to have y&~z instead of y&z ** + ** -- FF, GG, HH modified to add in last register done ** + ** -- Access pattern: round 2 works mod 5, round 3 works mod 3 ** + ** -- distinct additive constant for each step ** + ** -- round 4 added, working mod 7 ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. 
** + ********************************************************************** + */ + +#ifndef MD5_H +#define MD5_H + +#include + +/* typedef a 32 bit type */ +typedef uint32_t UINT4; + +/* Data structure for MD5 (Message Digest) computation */ +typedef struct { + UINT4 i[2]; /* number of _bits_ handled mod 2^64 */ + UINT4 buf[4]; /* scratch buffer */ + unsigned char in[64]; /* input buffer */ + unsigned char digest[16]; /* actual digest after MD5Final call */ +} MD5_CTX; + +#ifdef __cplusplus +extern "C" { +#endif + + void MD5Init(MD5_CTX *mdContext); + void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, unsigned intinLen); + void MD5Final(MD5_CTX *mdContext); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/misc/md5fa.c b/misc/md5fa.c new file mode 100644 index 0000000..c41db2d --- /dev/null +++ b/misc/md5fa.c @@ -0,0 +1,58 @@ +#include +#include +#include "md5.h" +#include "kseq.h" + +#define HEX_STR "0123456789abcdef" + +KSEQ_INIT(gzFile, gzread) + +static void md5_one(const char *fn) +{ + MD5_CTX md5_one, md5_all; + int l, i, k; + gzFile fp; + kseq_t *seq; + unsigned char unordered[16]; + + for (l = 0; l < 16; ++l) unordered[l] = 0; + fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "md5fa: %s: No such file or directory\n", fn); + exit(1); + } + + MD5Init(&md5_all); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + /* keep letters only, upper-cased; <ctype.h> functions need an unsigned char value */ + for (i = k = 0; i < seq->seq.l; ++i) { + if (islower((unsigned char)seq->seq.s[i])) seq->seq.s[k++] = toupper((unsigned char)seq->seq.s[i]); + else if (isupper((unsigned char)seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; + } + MD5Init(&md5_one); + MD5Update(&md5_one, (unsigned char*)seq->seq.s, k); + MD5Final(&md5_one); + for (l = 0; l < 16; ++l) { + printf("%c%c", HEX_STR[md5_one.digest[l]>>4&0xf], HEX_STR[md5_one.digest[l]&0xf]); + unordered[l] ^= md5_one.digest[l]; + } + printf(" %s %s\n", fn, seq->name.s); + MD5Update(&md5_all, (unsigned char*)seq->seq.s, k); + } + MD5Final(&md5_all); + kseq_destroy(seq); + gzclose(fp); /* close the stream opened above; nothing else in this function releases it */ + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[md5_all.digest[l]>>4&0xf], HEX_STR[md5_all.digest[l]&0xf]); + printf(" %s >ordered\n", fn); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]); + printf(" %s >unordered\n", fn); +} + +int main(int argc, char *argv[]) +{ + int i; + if (argc == 1) md5_one("-"); + else for (i = 1; i < argc; ++i) md5_one(argv[i]); + return 0; +} diff --git a/misc/novo2sam.pl b/misc/novo2sam.pl new file mode 100755 index 0000000..3d3436c --- /dev/null +++ b/misc/novo2sam.pl @@ -0,0 +1,281 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.3 + +#Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com) + +#use strict; +#use warnings; +use Data::Dumper; +use Getopt::Std; + +&novo2sam; +exit; + +sub mating { + my ($s1, $s2) = @_; + my $isize = 0; + if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize + my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; + my $x2 = ($s2->[1] & 0x10)? 
$s2->[3] + length($s2->[9]) : $s2->[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2->[2] ne '*') { + @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); + $s1->[1] |= 0x20 if ($s2->[1] & 0x10); + } else { + $s1->[1] |= 0x8; + } + if ($s1->[2] ne '*') { + @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); + $s2->[1] |= 0x20 if ($s1->[1] & 0x10); + } else { + $s2->[1] |= 0x8; + } +} + +sub novo2sam { + my %opts = (); + getopts("p", \%opts); + die("Usage: novo2sam.pl [-p] \n") if (@ARGV == 0); + my $is_paired = defined($opts{p}); + # core loop + my @s1 = (); + my @s2 = (); + my ($s_last, $s_curr) = (\@s1, \@s2); + while (<>) { + next if (/^#/); + next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/); + &novo2sam_aux($_, $s_curr, $is_paired); + if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { + &mating($s_last, $s_curr); + print join("\t", @$s_last), "\n"; + print join("\t", @$s_curr), "\n"; + @$s_last = (); @$s_curr = (); + } else { + print join("\t", @$s_last), "\n" if (@$s_last != 0); + my $s = $s_last; $s_last = $s_curr; $s_curr = $s; + } + } + print join("\t", @$s_last), "\n" if (@$s_last != 0); +} + +sub novo2sam_aux { + my ($line, $s, $is_paired) = @_; + + chomp($line); + my @t = split(/\s+/, $line); + my @variations = @t[13 .. $#t]; + @$s = (); + return if ($t[4] ne 'U'); + my $len = length($t[2]); + # read name + $s->[0] = substr($t[0], 1); + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7); + $s->[1] |= 2 if ($t[10] eq '.'); + # read & quality + if ($t[9] eq 'R') { + $s->[9] = reverse($t[2]); + $s->[10] = reverse($t[3]); + $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; + } else { + $s->[9] = $t[2]; $s->[10] = $t[3]; + } + # cigar + my $cigarstring =""; + if (scalar @variations ==0 ) { + $s->[5] = $len . 
"M"; # IMPORTANT: this cigar is not correct for gapped alignment + } else { + #convert to correct CIGAR + my $tmpstr = join" ",@variations ; + if ( $tmpstr=~ /\+|\-/ ) { + $cigarstring = cigar_method($line,\@variations,$len); + $s->[5]=$cigarstring; + } else { + $s->[5]=$len. "M"; + } +} + +# coor + $s->[2] = substr($t[7], 1); $s->[3] = $t[8]; + $s->[1] |= 0x10 if ($t[9] eq 'R'); + # mapQ + $s->[4] = $t[5] > $t[6]? $t[5] : $t[6]; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "NM:i:".(@t-13)); + my $md = ''; + $md = mdtag($md,$line,\@variations,$len); + push(@$s, "MD:Z:$md"); + +} + +sub mdtag { + my $oldmd = shift; + my $line = shift; + my $ref =shift; + my $rdlen = shift; + my @variations = @$ref; + my $string=""; + my $mdtag=""; + my $t=1; + my $q=1; + my $deleteflag=0; + my $len =0; + foreach $string (@variations) { + my ($indeltype,$insert) = indeltype($string); + if ($indeltype eq "+") { + $len = length ($insert); + $q+=$len; + next; + } + my $pos = $1 if $string =~ /^(\d+)/; + $len = $pos - $t; + if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) { + $mdtag.=$len; + } + $t+=$len; + $q+=$len; + if ($indeltype eq ">") { + $mdtag.=$insert; + $deleteflag=0; + $t+=1; + $q+=1; + } + if ($indeltype eq "-") { + my $deletedbase = $2 if $string =~ /(\d+)\-([A-Z]+)/; + if ($deleteflag == 0 ) { + $mdtag.="^"; + } + $mdtag.=$deletedbase; + $deleteflag=1; + $t+=1; + } + } + $len = $rdlen - $q + 1; + if ($len > 0) { + $mdtag.="$len"; + } +# print "In:$line\n"; +# print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n"; + + return $mdtag; +} + +sub indeltype { + my $string = shift; + my $insert=""; + my $indeltype; + if ($string =~ /([A-Z]+)\>/) { + $indeltype=">"; + $insert=$1; + } elsif ($string =~ /\-/) { + $indeltype="-"; + } elsif ($string =~ /\+([A-Z]+)/) { + $indeltype="+"; + $insert=$1; + } + return ($indeltype,$insert); + +} + + +sub cigar_method { + my $line = shift; + my $ref =shift; + my $rdlen = shift; + my @variations = @$ref; + 
my $string=""; + my $type=""; + my $t =1; + my $q=1; + my $indeltype=""; + my $cigar= ""; + my $insert = ""; + my $len=0; + my @cig=(); + foreach $string (@variations) { + next if $string =~ />/; + my $pos = $1 if $string =~ /^(\d+)/; + + if ($string =~ /\+([A-Z]+)/) { + $indeltype="+"; + $insert = $1; + }elsif ($string =~ /\-([A-Z]+)/) { + $indeltype="-"; + $insert = $1; + } +#print "$pos $indeltype $insert $t $q\n"; + $len = $pos - $t; + if ( $len > 0) { + $cigar.=$len."M"; + push(@cig,$len."M"); + } + $t+=$len; + $q+=$len; + + if ($indeltype eq "-") { + $cigar.="D"; + push(@cig,"D"); + $t++; + } + if ($indeltype eq "+") { + $len = length ($insert); + if ($len == 1) { + $cigar.="I"; + push(@cig,"I"); + } + if ($len > 1) { + $cigar.=$len."I"; + push(@cig,$len."I") + } + $q+=$len; + } + $insert=""; + } + $len= $rdlen - $q + 1; + if ($len > 0) { + $cigar.=$len."M"; + push(@cig,$len."M"); + } + + $cigar = newcigar($cigar,'D'); + $cigar = newcigar($cigar,'I'); + + #print "$line\n"; + #print "c CIGAR:\t$cigar\n\n"; + return $cigar; + +} + + + +sub newcigar { + my $cigar = shift; + my $char = shift; + my $new = ""; + my $copy = $cigar; +#print "$cigar\n"; + $copy =~ s/^($char+)/$1;/g; +#print "$copy\n"; + $copy =~ s/([^0-9$char])($char+)/$1;$2;/g; +#print "$copy\n"; + my @parts = split(/;/,$copy); + my $el=""; + foreach $el (@parts) { +#print "$el\n"; + if ($el =~ /^$char+$/) { + $new.=length($el).$char; + }else { + $new.=$el; + } + + } + return $new; +} diff --git a/misc/samtools.pl b/misc/samtools.pl new file mode 100755 index 0000000..c014c52 --- /dev/null +++ b/misc/samtools.pl @@ -0,0 +1,255 @@ +#!/usr/bin/perl -w + +# Author: lh3 + +use strict; +use warnings; +use Getopt::Std; + +my $version = '0.3.2 (r321)'; +&usage if (@ARGV < 1); + +my $command = shift(@ARGV); +my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter); + +die("Unknown command \"$command\".\n") if (!defined($func{$command})); +&{$func{$command}}; +exit(0); + +# +# 
showALEN +# + +sub showALEN { + die(qq/Usage: samtools.pl showALEN \n/) if (@ARGV == 0 && -t STDIN); + while (<>) { + my @t = split; + my $l = 0; + $_ = $t[5]; + s/(\d+)[SMI]/$l+=$1/eg; + print join("\t", @t[0..5]), "\t$l\t", join("\t", @t[6..$#t]), "\n"; + } +} + +# +# varFilter +# + +sub varFilter { + my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef); + # every option documented in the usage text below must be declared here; -q was missing + getopts('pd:D:l:Q:q:w:W:N:G:', \%opts); + die(qq/ +Usage: samtools.pl varFilter [options] + +Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] + -q INT minimum RMS mapping quality for gaps [$opts{q}] + -d INT minimum read depth [$opts{d}] + -D INT maximum read depth [$opts{D}] + + -G INT min indel score for nearby SNP filtering [$opts{G}] + -w INT SNP within INT bp around a gap to be filtered [$opts{w}] + + -W INT window size for filtering dense SNPs [$opts{W}] + -N INT max number of SNPs in a window [$opts{N}] + + -l INT window size for filtering adjacent gaps [$opts{l}] + + -p print filtered variants +\n/) if (@ARGV == 0 && -t STDIN); + + # calculate the window size + my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W}); + my $max_dist = $ol > $ow? 
$ol : $ow; + $max_dist = $oW if ($max_dist < $oW); + # the core loop + my @staging; # (indel_filtering_score, flt_tag) + while (<>) { + my @t = split; + next if ($t[2] eq $t[3] || $t[3] eq '*/*'); # skip non-var sites + # clear the out-of-range elements + while (@staging) { + last if ($staging[0][2] eq $t[0] && $staging[0][3] + $max_dist >= $t[1]); + varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much + } + my ($flt, $score) = (0, -1); + # first a simple filter + if ($t[7] < $opts{d}) { + $flt = 2; + } elsif ($t[7] > $opts{D}) { + $flt = 3; + } + # site dependent filters + if ($flt == 0) { + if ($t[2] eq '*') { # an indel + $flt = 1 if ($t[6] < $opts{q}); + # filtering SNPs + if ($t[5] >= $opts{G}) { + for my $x (@staging) { + next if ($x->[0] >= 0 || $x->[3] + $ow < $t[1]); + $x->[1] = 5 if ($x->[1] == 0); + } + } + # calculate the filtering score (different from indel quality) + $score = $t[5]; + $score += $opts{s} * $t[10] if ($t[8] ne '*'); + $score += $opts{s} * $t[11] if ($t[9] ne '*'); + # check the staging list for indel filtering + for my $x (@staging) { + next if ($x->[0] < 0 || $x->[3] + $ol < $t[1]); + if ($x->[0] < $score) { + $x->[1] = 6; + } else { + $flt = 6; last; + } + } + } else { # a SNP + $flt = 1 if ($t[6] < $opts{Q}); + # check adjacent SNPs + my $k = 1; + for my $x (@staging) { + ++$k if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5)); + } + # filtering is necessary + if ($k > $opts{N}) { + $flt = 4; + for my $x (@staging) { + $x->[1] = 4 if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && $x->[1] == 0); + } + } else { # then check gap filter + for my $x (@staging) { + next if ($x->[0] < 0 || $x->[3] + $ow < $t[1]); + if ($x->[0] >= $opts{G}) { + $flt = 5; last; + } + } + } + } + } + push(@staging, [$score, $flt, @t]); + } + # output the last few elements in the staging list + while (@staging) { + varFilter_aux(shift @staging, $opts{p}); + } +} + +sub varFilter_aux { + 
my ($first, $is_print) = @_; + if ($first->[1] == 0) { + print join("\t", @$first[2 .. @$first-1]), "\n"; + } elsif ($is_print) { + print STDERR join("\t", substr("UQdDWGgX", $first->[1], 1), @$first[2 .. @$first-1]), "\n"; + } +} + +# +# pileup2fq +# + +sub pileup2fq { + my %opts = (d=>3, D=>255, Q=>25, G=>25, l=>10); + getopts('d:D:Q:G:l:', \%opts); + die(qq/ +Usage: samtools.pl pileup2fq [options] + +Options: -d INT minimum depth [$opts{d}] + -D INT maximum depth [$opts{D}] + -Q INT min RMS mapQ [$opts{Q}] + -G INT minimum indel score [$opts{G}] + -l INT indel filter winsize [$opts{l}]\n +/) if (@ARGV == 0 && -t STDIN); + + my ($last_chr, $seq, $qual, @gaps, $last_pos); + my $_Q = $opts{Q}; + my $_d = $opts{d}; + my $_D = $opts{D}; + + $last_chr = ''; + while (<>) { + my @t = split; + if ($last_chr ne $t[0]) { + &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); + $last_chr = $t[0]; + $last_pos = 0; + $seq = ''; $qual = ''; + @gaps = (); + } + if ($t[1] - $last_pos != 1) { + $seq .= 'n' x ($t[1] - $last_pos - 1); + $qual .= '!' x ($t[1] - $last_pos - 1); + } + if ($t[2] eq '*') { + push(@gaps, $t[1]) if ($t[5] >= $opts{G}); + } else { + $seq .= ($t[6] >= $_Q && $t[7] >= $_d && $t[7] <= $_D)? uc($t[3]) : lc($t[3]); + my $q = $t[4] + 33; + $q = 126 if ($q > 126); + $qual .= chr($q); + } + $last_pos = $t[1]; + } + &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); +} + +sub p2q_post_process { + my ($chr, $seq, $qual, $gaps, $l) = @_; + &p2q_filter_gaps($seq, $gaps, $l); + print "\@$chr\n"; &p2q_print_str($seq); + print "+\n"; &p2q_print_str($qual); +} + +sub p2q_filter_gaps { + my ($seq, $gaps, $l) = @_; + for my $g (@$gaps) { + my $x = $g > $l? 
$g - $l : 0; + substr($$seq, $x, $l + $l) = lc(substr($$seq, $x, $l + $l)); + } +} + +sub p2q_print_str { + my ($s) = @_; + my $l = length($$s); + for (my $i = 0; $i < $l; $i += 60) { + print substr($$s, $i, 60), "\n"; + } +} + +# +# varStats +# + +sub varStats { + my %opts = (d=>'', c=>5); + getopts('d:c:', \%opts); + die("Usage: samtools.pl varStats [-d dbSNP.snp] [-c $opts{c}] \n") if (@ARGV == 0 && -t STDIN); + my (@cnt, %hash); + my $col = $opts{c} - 1; + while (<>) { + my @t = split; + if ($t[2] eq '*') { + } else { + my $q = $t[$col]; + $q = 99 if ($q > 99); + $q = int($q/10); + my $is_het = ($t[3] =~ /^[ACGT]$/)? 0 : 1; + ++$cnt[$q][$is_het]; + $hash{$t[0],$t[1]} = $q; + } + } +} + +# +# Usage +# + +sub usage { + die(qq/ +Program: samtools.pl (helper script for SAMtools) +Version: $version +Contact: Heng Li \n +Usage: samtools.pl []\n +Command: varFilter filtering SNPs and short indels + pileup2fq generate fastq from `pileup -c' + showALEN print alignment length (ALEN) following CIGAR +\n/); +} diff --git a/misc/soap2sam.pl b/misc/soap2sam.pl new file mode 100755 index 0000000..b37135e --- /dev/null +++ b/misc/soap2sam.pl @@ -0,0 +1,109 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.1 + +use strict; +use warnings; +use Getopt::Std; + +&soap2sam; +exit; + +sub mating { + my ($s1, $s2) = @_; + my $isize = 0; + if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize + my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; + my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2->[2] ne '*') { + @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); + $s1->[1] |= 0x20 if ($s2->[1] & 0x10); + } else { + $s1->[1] |= 0x8; + } + if ($s1->[2] ne '*') { + @$s2[6..8] = (($s1->[2] eq $s2->[2])? 
"=" : $s1->[2], $s1->[3], -$isize); + $s2->[1] |= 0x20 if ($s1->[1] & 0x10); + } else { + $s2->[1] |= 0x8; + } +} + +sub soap2sam { + my %opts = (); + getopts("p", \%opts); + die("Usage: soap2sam.pl [-p] \n") if (@ARGV == 0 && -t STDIN); + my $is_paired = defined($opts{p}); + # core loop + my @s1 = (); + my @s2 = (); + my ($s_last, $s_curr) = (\@s1, \@s2); + while (<>) { + s/[\177-\377]|[\000-\010]|[\012-\040]//g; + next if (&soap2sam_aux($_, $s_curr, $is_paired) < 0); + if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { + &mating($s_last, $s_curr); + print join("\t", @$s_last), "\n"; + print join("\t", @$s_curr), "\n"; + @$s_last = (); @$s_curr = (); + } else { + print join("\t", @$s_last), "\n" if (@$s_last != 0); + my $s = $s_last; $s_last = $s_curr; $s_curr = $s; + } + } + print join("\t", @$s_last), "\n" if (@$s_last != 0); +} + +sub soap2sam_aux { + my ($line, $s, $is_paired) = @_; + chomp($line); + my @t = split(/\s+/, $line); + return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]); + @$s = (); + # fix SOAP-2.1.x bugs + @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/); + # read name + $s->[0] = $t[0]; + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7); + $s->[1] |= 2 if ($is_paired); + # read & quality + $s->[9] = $t[1]; + $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2]; + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + $s->[2] = $t[7]; $s->[3] = $t[8]; + $s->[1] |= 0x10 if ($t[6] eq '-'); + # mapQ + $s->[4] = $t[3] == 1? 30 : 0; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "NM:i:$t[9]"); + my $md = ''; + if ($t[9]) { + my @x; + for (10 .. $#t) { + push(@x, sprintf("%.3d,$1", $2)) if ($t[$_] =~ /^([ACGT])->(\d+)/i); + } + @x = sort(@x); + my $a = 0; + for (@x) { + my ($y, $z) = split(","); + $md .= (int($y)-$a) . 
$z; + $a += $y - $a + 1; + } + $md .= length($t[1]) - $a; + } else { + $md = length($t[1]); + } + push(@$s, "MD:Z:$md"); + return 0; +} diff --git a/misc/wgsim.c b/misc/wgsim.c new file mode 100644 index 0000000..1522eee --- /dev/null +++ b/misc/wgsim.c @@ -0,0 +1,502 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* This program is separated from maq's read simulator with Colin + * Hercus' modification to allow longer indels. Colin is the chief + * developer of novoalign. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PACKAGE_VERSION "0.2.3" + +const uint8_t nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; + +/* Simple normal random number generator, copied from genran.c */ + +double ran_normal() +{ + static int iset = 0; + static double gset; + double fac, rsq, v1, v2; + if (iset == 0) { + do { + v1 = 2.0 * drand48() - 1.0; + v2 = 2.0 * drand48() - 1.0; + rsq = v1 * v1 + v2 * v2; + } while (rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0 * log(rsq) / rsq); + gset = v1 * fac; + iset = 1; + return v2 * fac; + } else { + iset = 0; + return gset; + } +} + +/* FASTA parser, copied from seq.c */ + +typedef struct { + int l, m; /* length and maximum buffer size */ + unsigned char *s; /* sequence */ +} seq_t; + +#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 + +static int SEQ_BLOCK_SIZE = 512; + +void seq_set_block_size(int size) +{ + SEQ_BLOCK_SIZE = size; +} + +int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) +{ + int c, l, max; + char *p; + + c = 0; + while (!feof(fp) && fgetc(fp) != '>'); + if 
(feof(fp)) return -1; + p = locus; + while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') + if (c != '\r') *p++ = c; + *p = '\0'; + if (comment) { + p = comment; + if (c != '\n') { + while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); + if (c != '\n') { + *p++ = c; + while (!feof(fp) && (c = fgetc(fp)) != '\n') + if (c != '\r') *p++ = c; + } + } + *p = '\0'; + } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); + l = 0; max = seq->m; + while (!feof(fp) && (c = fgetc(fp)) != '>') { + if (isalpha(c) || c == '-' || c == '.') { + if (l + 1 >= max) { + max += SEQ_BLOCK_SIZE; + seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); + } + seq->s[l++] = (unsigned char)c; + } + } + if (c == '>') ungetc(c,fp); + seq->s[l] = 0; + seq->m = max; seq->l = l; + return l; +} + +/* Error-checking open, copied from utils.c */ + +#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) + +FILE *err_xopen_core(const char *func, const char *fn, const char *mode) +{ + FILE *fp = 0; + if (strcmp(fn, "-") == 0) + return (strstr(mode, "r"))? stdin : stdout; + if ((fp = fopen(fn, mode)) == 0) { + fprintf(stderr, "[%s] fail to open file '%s'. 
Abort!\n", func, fn); + abort(); + } + return fp; +} + +/* wgsim */ + +enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; +typedef unsigned short mut_t; +static mut_t mutmsk = (mut_t)0xf000; + +typedef struct { + int l, m; /* length and maximum buffer size */ + mut_t *s; /* sequence */ +} mutseq_t; + +static double ERR_RATE = 0.02; +static double MUT_RATE = 0.001; +static double INDEL_FRAC = 0.1; +static double INDEL_EXTEND = 0.3; +static int IS_SOLID = 0; +static int SHOW_MM_INFO = 1; + +void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +{ + int i, deleting = 0; + mutseq_t *ret[2]; + + ret[0] = hap1; ret[1] = hap2; + ret[0]->l = seq->l; ret[1]->l = seq->l; + ret[0]->m = seq->m; ret[1]->m = seq->m; + ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); + for (i = 0; i != seq->l; ++i) { + int c; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; + if (deleting) { + if (drand48() < INDEL_EXTEND) { + if (deleting & 1) ret[0]->s[i] |= DELETE; + if (deleting & 2) ret[1]->s[i] |= DELETE; + continue; + } else deleting = 0; + } + if (c < 4 && drand48() < MUT_RATE) { // mutation + if (drand48() >= INDEL_FRAC) { // substitution + double r = drand48(); + c = (c + (int)(r * 3.0 + 1)) & 3; + if (is_hap || drand48() < 0.333333) { // hom + ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c; + } else { // het + ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c; + } + } else { // indel + if (drand48() < 0.5) { // deletion + if (is_hap || drand48() < 0.333333) { // hom-del + ret[0]->s[i] = ret[1]->s[i] = DELETE; + deleting = 3; + } else { // het-del + deleting = drand48()<0.5?1:2; + ret[deleting-1]->s[i] = DELETE; + } + } else { // insertion + int num_ins = 0, ins = 0; + do { + num_ins++; + ins = (ins << 2) | (int)(drand48() * 4.0); + } while (num_ins < 4 && drand48() < INDEL_EXTEND); + + if (is_hap || drand48() < 0.333333) { // hom-ins + ret[0]->s[i] = 
ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } else { // het-ins + ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } + } + } + } + } +} +/* Print one line per mutated reference position: name, 1-based position, + * reference base ('-' for an insertion), the new base(s) ('-' for a deletion), + * and a last column that is '-' when both haplotypes carry the same change + * (hom) and '+' when they differ (het). Positions whose reference base is + * not A/C/G/T are skipped. + */ +void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) +{ + int i; + for (i = 0; i != seq->l; ++i) { + int c[3]; + c[0] = nst_nt4_table[(int)seq->s[i]]; + c[1] = hap1->s[i]; c[2] = hap2->s[i]; + if (c[0] >= 4) continue; + /* report the site if EITHER haplotype is mutated (the second test used to repeat c[1]) */ + if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { + printf("%s\t%d\t", name, i+1); + if (c[1] == c[2]) { // hom + if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); + } else if ((c[1]&mutmsk) == DELETE) { // del + printf("%c\t-\t-\n", "ACGTN"[c[0]]); + } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while(n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; /* advance to the next 2-bit inserted base */ + n--; + } + printf("\t-\n"); + } else assert(0); + } else { // het + if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]); + } else if ((c[1]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if ((c[2]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if ((c[1] & mutmsk) != NOCHANGE && ((c[1] & mutmsk) >> 12) <= 4) { // ins1 (an unchanged hap1 must fall through to ins2) + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; /* advance to the next 2-bit inserted base */ + n--; + } + printf("\t+\n"); + } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2 + printf("-\t"); + int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t+\n"); + } else assert(0); + } + } + } +} + +void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +{ + seq_t seq; + mutseq_t rseq[2]; + uint64_t tot_len, ii; + int i, l, n_ref; + char name[256], *qstr; + int 
size[2], Q; + uint8_t *tmp_seq[2]; + mut_t *target; + + INIT_SEQ(seq); + srand48(time(0)); + seq_set_block_size(0x1000000); + l = size_l > size_r? size_l : size_r; + qstr = (char*)calloc(l+1, 1); + tmp_seq[0] = (uint8_t*)calloc(l+2, 1); + tmp_seq[1] = (uint8_t*)calloc(l+2, 1); + size[0] = size_l; size[1] = size_r; + + Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + + tot_len = n_ref = 0; + while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + tot_len += l; + ++n_ref; + } + fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len); + rewind(fp_fa); + + while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); + if (l < dist + 3 * std_dev) { + fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); + continue; + } + + // generate mutations and print them out + maq_mut_diref(&seq, is_hap, rseq, rseq+1); + maq_print_mutref(name, &seq, rseq, rseq+1); + + for (ii = 0; ii != n_pairs; ++ii) { // the core loop + double ran; + int d, pos, s[2], is_flip = 0; + int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k; + FILE *fpo[2]; + + do { // avoid boundary failure + ran = ran_normal(); + ran = ran * std_dev + dist; + d = (int)(ran + 0.5); + pos = (int)((l - d + 1) * drand48()); + } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); + + // flip or not + if (drand48() < 0.5) { + fpo[0] = fpout1; fpo[1] = fpout2; + s[0] = size[0]; s[1] = size[1]; + } else { + fpo[1] = fpout1; fpo[0] = fpout2; + s[1] = size[0]; s[0] = size[1]; + is_flip = 1; + } + + // generate the read sequences + target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated + n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; + +#define __gen_read(x, start, iter) do { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ + int c = target[i], mut_type = c 
& mutmsk; \ + if (ext_coor[x] < 0) { \ + if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ + ext_coor[x] = i; \ + } \ + if (mut_type == DELETE) ++n_indel[x]; \ + else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \ + tmp_seq[x][k++] = c & 0xf; \ + if (mut_type == SUBSTITUTE) ++n_sub[x]; \ + } else { \ + int n, ins; \ + ++n_indel[x]; \ + tmp_seq[x][k++] = c & 0xf; \ + for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \ + tmp_seq[x][k++] = ins & 0x3; \ + } \ + } \ + if (k != s[x]) ext_coor[x] = -10; \ + } while (0) + + if (!IS_SOLID) { + __gen_read(0, pos, ++i); + __gen_read(1, pos + d - 1, --i); + for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement + } else { + int c1, c2, c; + ++s[0]; ++s[1]; // temporarily increase read length by 1 + if (is_flip) { // RR pair + __gen_read(0, pos + s[0], --i); + __gen_read(1, pos + d - 1, --i); + } else { // FF pair + __gen_read(0, pos, ++i); + __gen_read(1, pos + d - 1 - s[1], ++i); + ++ext_coor[0]; ++ext_coor[1]; + } + // change to color sequence: (0,1,2,3) -> (A,C,G,T) + for (j = 0; j < 2; ++j) { + c1 = tmp_seq[j][0]; + for (i = 1; i < s[j]; ++i) { + c2 = tmp_seq[j][i]; + c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<= 4) c = 4; // actually c should be never larger than 4 if everything is correct + else if (drand48() < ERR_RATE) { + c = (c + (int)(drand48() * 3.0 + 1)) & 3; + ++n_err[j]; + } + tmp_seq[j][i] = c; + } + } + + // print + for (j = 0; j < 2; ++j) { + for (i = 0; i < s[j]; ++i) qstr[i] = Q; + qstr[i] = 0; + if (SHOW_MM_INFO) { + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? is_flip+1 : 2-is_flip); + } else { + fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, + (long long)ii, j==0? 
is_flip+1 : 2-is_flip, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); + } + for (i = 0; i < s[j]; ++i) + fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); + fprintf(fpo[j], "\n+\n%s\n", qstr); + } + } + free(rseq[0].s); free(rseq[1].s); + } + free(seq.s); free(qstr); + free(tmp_seq[0]); free(tmp_seq[1]); +} + +static int simu_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: wgsim (short read simulator)\n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Contact: Heng Li \n\n"); + fprintf(stderr, "Usage: wgsim [options] \n\n"); + fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE); + fprintf(stderr, " -d INT outer distance between the two ends [500]\n"); + fprintf(stderr, " -s INT standard deviation [50]\n"); + fprintf(stderr, " -N INT number of read pairs [1000000]\n"); + fprintf(stderr, " -1 INT length of the first read [70]\n"); + fprintf(stderr, " -2 INT length of the second read [70]\n"); + fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); + fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); + fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); + fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); + fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); + fprintf(stderr, " -h haplotype mode\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + int64_t N; + int dist, std_dev, c, size_l, size_r, is_hap = 0; + FILE *fpout1, *fpout2, *fp_fa; + + N = 1000000; dist = 500; std_dev = 50; + size_l = size_r = 70; + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { + switch (c) { + case 'd': dist = atoi(optarg); break; + case 's': std_dev = atoi(optarg); break; + case 'N': N = atoi(optarg); break; + case '1': size_l = atoi(optarg); break; + case '2': 
size_r = atoi(optarg); break; + case 'e': ERR_RATE = atof(optarg); break; + case 'r': MUT_RATE = atof(optarg); break; + case 'R': INDEL_FRAC = atof(optarg); break; + case 'X': INDEL_EXTEND = atof(optarg); break; + case 'c': IS_SOLID = 1; break; + case 'C': SHOW_MM_INFO = 0; break; + case 'h': is_hap = 1; break; + } + } + if (argc - optind < 3) return simu_usage(); + fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r"); + fpout1 = xopen(argv[optind+1], "w"); + fpout2 = xopen(argv[optind+2], "w"); + wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); + + fclose(fpout1); fclose(fpout2); fclose(fp_fa); + return 0; +} diff --git a/misc/wgsim_eval.pl b/misc/wgsim_eval.pl new file mode 100755 index 0000000..99e2ac9 --- /dev/null +++ b/misc/wgsim_eval.pl @@ -0,0 +1,74 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.3 + +use strict; +use warnings; +use Getopt::Std; + +&wgsim_eval; +exit; + +sub wgsim_eval { + my %opts; + getopts('pc', \%opts); + die("Usage: wgsim_eval.pl [-pc] \n") if (@ARGV == 0 && -t STDIN); + my (@c0, @c1); + my ($max_q, $flag) = (0, 0); + my $gap = 5; + $flag |= 1 if (defined $opts{p}); + $flag |= 2 if (defined $opts{c}); + while (<>) { + my @t = split; + my $line = $_; + my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]); + $max_q = $q if ($q > $max_q); + # right coordinate + $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; + --$rght; + # correct for soft clipping + $left -= $1 if (/^(\d+)S/); + $rght += $1 if (/(\d+)S$/); + # skip unmapped reads + next if (($t[1]&0x4) || $chr eq '*'); + # parse read name and check + if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) { + if ($1 ne $chr) { # different chr + $is_correct = 0; + } else { + if ($flag & 2) { + if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward + $is_correct = 0 if (abs($2 - $left) > $gap); + } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse + $is_correct = 0 if (abs($3 - $rght) > $gap); + } elsif (($t[1]&0x80) && 
!($t[1]&0x10)) { # R3, forward + $is_correct = 0 if (abs($3 - $left) > $gap); + } else { # R3, reverse + $is_correct = 0 if (abs($2 - $rght) > $gap); + } + } else { + if ($t[1] & 0x10) { # reverse + $is_correct = 0 if (abs($3 - $rght) > $gap); # in case of indels that are close to the end of a reads + } else { + $is_correct = 0 if (abs($2 - $left) > $gap); + } + } + } + } else { + warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n"); + next; + } + ++$c0[$q]; + ++$c1[$q] unless ($is_correct); + print STDERR $line if (($flag&1) && !$is_correct && $q > 0); + } + # print + my ($cc0, $cc1) = (0, 0); + for (my $i = $max_q; $i >= 0; --$i) { + $c0[$i] = 0 unless (defined $c0[$i]); + $c1[$i] = 0 unless (defined $c1[$i]); + $cc0 += $c0[$i]; $cc1 += $c1[$i]; + printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0); + } +} diff --git a/misc/zoom2sam.pl b/misc/zoom2sam.pl new file mode 100755 index 0000000..5306bfa --- /dev/null +++ b/misc/zoom2sam.pl @@ -0,0 +1,97 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.0 + +use strict; +use warnings; +use Getopt::Std; + +&zoom2sam; +exit; + +sub mating { + my ($s1, $s2) = @_; + my $isize = 0; + if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize + my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; + my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2->[2] ne '*') { + @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); + $s1->[1] |= 0x20 if ($s2->[1] & 0x10); + } else { + $s1->[1] |= 0x8; + } + if ($s1->[2] ne '*') { + @$s2[6..8] = (($s1->[2] eq $s2->[2])? 
"=" : $s1->[2], $s1->[3], -$isize); + $s2->[1] |= 0x20 if ($s1->[1] & 0x10); + } else { + $s2->[1] |= 0x8; + } +} + +sub zoom2sam { + my %opts = (); + getopts("p", \%opts); + die("Usage: zoom2sam.pl [-p] +Warnings: This script only supports the default Illumina outputs.\n") if (@ARGV < 2); + my $is_paired = defined($opts{p}); + my $len = shift(@ARGV); + # core loop + my @s1 = (); + my @s2 = (); + my ($s_last, $s_curr) = (\@s1, \@s2); + while (<>) { + &zoom2sam_aux($_, $s_curr, $is_paired, $len); + if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { + &mating($s_last, $s_curr); + print join("\t", @$s_last), "\n"; + print join("\t", @$s_curr), "\n"; + @$s_last = (); @$s_curr = (); + } else { + print join("\t", @$s_last), "\n" if (@$s_last != 0); + my $s = $s_last; $s_last = $s_curr; $s_curr = $s; + } + } + print join("\t", @$s_last), "\n" if (@$s_last != 0); +} + +sub zoom2sam_aux { + my ($line, $s, $is_paired, $len) = @_; + chomp($line); + my @t = split("\t", $line); + @$s = (); + # read name + $s->[0] = $t[0]; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<6 if ($s->[0] =~ /_F$/); + $s->[1] |= 1 | 1<<7 if ($s->[0] =~ /_R$/); + $s->[1] |= 2 if ($is_paired); + # read & quality + $s->[9] = "*"; $s->[10] = "*"; + # cigar + $s->[5] = $len . "M"; + # coor + my @s = split(/\s+/, $t[1]); + $s->[2] = $s[0]; + $t[1] =~ /:(\d+)$/; + $s->[3] = $1 + 1; + if ($s->[0] =~ /_[FR]$/) { + my $u = ($s->[0] =~ /_F$/)? 1 : 0; + my $w = ($t[2] eq '+')? 
1 : 0; + $s->[1] |= 0x10 if ($u ^ $w); + $s->[0] =~ s/_[FR]$//; + } else { + $s->[1] |= 0x10 if ($t[2] eq '-'); + } + # mapQ + $s->[4] = 30; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "NM:i:$t[3]"); +} diff --git a/razf.c b/razf.c new file mode 100644 index 0000000..b56065b --- /dev/null +++ b/razf.c @@ -0,0 +1,684 @@ +/* + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan , Heng Li + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NO_RAZF + +#include +#include +#include +#include +#include +#include "razf.h" + +#if ZLIB_VERNUM < 0x1221 +struct _gz_header_s { + int text; + uLong time; + int xflags; + int os; + Bytef *extra; + uInt extra_len; + uInt extra_max; + Bytef *name; + uInt name_max; + Bytef *comment; + uInt comm_max; + int hcrc; + int done; +}; +#warning "zlib < 1.2.2.1; RAZF writing is disabled." +#endif + +#define DEF_MEM_LEVEL 8 + +static inline uint32_t byte_swap_4(uint32_t v){ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint64_t byte_swap_8(uint64_t v){ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} + +static inline int is_big_endian(){ + int x = 0x01; + char *c = (char*)&x; + return (c[0] != 0x01); +} + +#ifndef _RZ_READONLY +static void add_zindex(RAZF *rz, int64_t in, int64_t out){ + if(rz->index->size == rz->index->cap){ + rz->index->cap = rz->index->cap * 1.5 + 2; + rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap); + rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1)); + } + if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out; + rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE]; + rz->index->size ++; +} + +static void save_zindex(RAZF *rz, int fd){ + int32_t i, v32; + int is_be; + is_be = is_big_endian(); + if(is_be) write(fd, &rz->index->size, sizeof(int)); + else { + v32 = byte_swap_4((uint32_t)rz->index->size); + write(fd, &v32, sizeof(uint32_t)); + } + v32 = rz->index->size / RZ_BIN_SIZE + 1; + if(!is_be){ + for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + 
for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } + write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); + write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size); +} +#endif + +static void load_zindex(RAZF *rz, int fd){ + int32_t i, v32; + int is_be; + if(!rz->load_index) return; + if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex)); + is_be = is_big_endian(); + read(fd, &rz->index->size, sizeof(int)); + if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size); + rz->index->cap = rz->index->size; + v32 = rz->index->size / RZ_BIN_SIZE + 1; + rz->index->bin_offsets = malloc(sizeof(int64_t) * v32); + read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); + rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size); + read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size); + if(!is_be){ + for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } +} + +#ifdef _RZ_READONLY +static RAZF* razf_open_w(int fd) +{ + fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n"); + return 0; +} +#else +static RAZF* razf_open_w(int fd){ + RAZF *rz; + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'w'; + rz->filedes = fd; + rz->stream = calloc(sizeof(z_stream), 1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->index = calloc(sizeof(ZBlockIndex), 1); + deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->header = calloc(sizeof(gz_header), 1); + rz->header->os = 0x03; //Unix + rz->header->text = 0; + rz->header->time = 0; + rz->header->extra = malloc(7); + strncpy((char*)rz->header->extra, "RAZF", 4); + rz->header->extra[4] = 1; // 
obsolete field + // block size = RZ_BLOCK_SIZE, Big-Endian + rz->header->extra[5] = RZ_BLOCK_SIZE >> 8; + rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF; + rz->header->extra_len = 7; + rz->header->name = rz->header->comment = 0; + rz->header->hcrc = 0; + deflateSetHeader(rz->stream, rz->header); + rz->block_pos = rz->block_off = 0; + return rz; +} + +static void _razf_write(RAZF* rz, const void *data, int size){ + int tout; + rz->stream->avail_in = size; + rz->stream->next_in = (void*)data; + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_NO_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out) break; + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + if(rz->stream->avail_in == 0) break; + }; + rz->in += size - rz->stream->avail_in; + rz->block_off += size - rz->stream->avail_in; +} + +static void razf_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + if(rz->stream->avail_out){ + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FULL_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out == 0){ + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } + rz->block_pos = rz->out; + rz->block_off = 0; +} + +static void razf_end_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FINISH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out < RZ_BUFFER_SIZE){ + 
write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } +} + +static void _razf_buffered_write(RAZF *rz, const void *data, int size){ + int i, n; + while(1){ + if(rz->buf_len == RZ_BUFFER_SIZE){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_len = 0; + } + if(size + rz->buf_len < RZ_BUFFER_SIZE){ + for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; + rz->buf_len += size; + return; + } else { + n = RZ_BUFFER_SIZE - rz->buf_len; + for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; + size -= n; + data += n; + rz->buf_len += n; + } + } +} + +int razf_write(RAZF* rz, const void *data, int size){ + int ori_size, n; + int64_t next_block; + ori_size = size; + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + while(rz->in + rz->buf_len + size >= next_block){ + n = next_block - rz->in - rz->buf_len; + _razf_buffered_write(rz, data, n); + data += n; + size -= n; + razf_flush(rz); + add_zindex(rz, rz->in, rz->out); + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + } + _razf_buffered_write(rz, data, size); + return ori_size; +} +#endif + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */ +#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define RESERVED 0xE0 /* bits 5..7: reserved */ + +static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){ + int method, flags, n, len; + if(size < 2) return 0; + if(data[0] != 0x1f || data[1] != 0x8b) return 0; + if(size < 4) return 0; + method = data[2]; + flags = data[3]; + if(method != Z_DEFLATED || (flags & RESERVED)) return 0; + n = 4 + 6; // Skip 6 bytes + *extra_off = n + 2; + *extra_len = 0; + if(flags & EXTRA_FIELD){ 
+ if(size < n + 2) return 0; + len = ((int)data[n + 1] << 8) | data[n]; + n += 2; + *extra_off = n; + while(len){ + if(n >= size) return 0; + n ++; + len --; + } + *extra_len = n - (*extra_off); + } + if(flags & ORIG_NAME) while(n < size && data[n++]); + if(flags & COMMENT) while(n < size && data[n++]); + if(flags & HEAD_CRC){ + if(n + 2 > size) return 0; + n += 2; + } + return n; +} + +static RAZF* razf_open_r(int fd, int _load_index){ + RAZF *rz; + int ext_off, ext_len; + int n, is_be, ret; + int64_t end; + unsigned char c[] = "RAZF"; + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'r'; + rz->filedes = fd; + rz->stream = calloc(sizeof(z_stream), 1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL; + n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); + ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len); + if(ret == 0){ + PLAIN_FILE: + rz->in = n; + rz->file_type = FILE_TYPE_PLAIN; + memcpy(rz->outbuf, rz->inbuf, n); + rz->buf_len = n; + free(rz->stream); + rz->stream = NULL; + return rz; + } + rz->header_size = ret; + ret = inflateInit2(rz->stream, -WINDOW_BITS); + if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;} + rz->stream->avail_in = n - rz->header_size; + rz->stream->next_in = rz->inbuf + rz->header_size; + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->file_type = FILE_TYPE_GZ; + rz->in = rz->header_size; + rz->block_pos = rz->header_size; + rz->next_block_pos = rz->header_size; + rz->block_off = 0; + if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz; + if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){ + fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. 
in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__); + return rz; + } + rz->load_index = _load_index; + rz->file_type = FILE_TYPE_RZ; + if(lseek(fd, -16, SEEK_END) == -1){ + UNSEEKABLE: + rz->seekable = 0; + rz->index = NULL; + rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL; + } else { + is_be = is_big_endian(); + rz->seekable = 1; + read(fd, &end, sizeof(int64_t)); + if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end); + else rz->src_end = end; + read(fd, &end, sizeof(int64_t)); + if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end); + else rz->end = end; + if(n > rz->end){ + rz->stream->avail_in -= n - rz->end; + n = rz->end; + } + if(rz->end > rz->src_end){ + lseek(fd, rz->in, SEEK_SET); + goto UNSEEKABLE; + } + if(lseek(fd, rz->end, SEEK_SET) != rz->end){ + lseek(fd, rz->in, SEEK_SET); + goto UNSEEKABLE; + } + load_zindex(rz, fd); + lseek(fd, n, SEEK_SET); + } + return rz; +} + +RAZF* razf_dopen(int fd, const char *mode){ + if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 1); + else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd); + else return NULL; +} + +RAZF* razf_dopen2(int fd, const char *mode) +{ + if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 0); + else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd); + else return NULL; +} + +static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){ + int fd; + RAZF *rz; + if(strcasecmp(mode, "r") == 0){ + fd = open(filename, O_RDONLY); + rz = razf_open_r(fd, _load_index); + } else if(strcasecmp(mode, "w") == 0){ + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + rz = razf_open_w(fd); + } else return NULL; + return rz; +} + +RAZF* razf_open(const char *filename, const char *mode){ + return _razf_open(filename, mode, 1); +} + +RAZF* razf_open2(const char *filename, const char *mode){ + return _razf_open(filename, mode, 0); +} + +int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){ + int64_t n; + if(rz->mode != 
'r' && rz->mode != 'R') return 0; + switch(rz->file_type){ + case FILE_TYPE_PLAIN: + if(rz->end == 0x7fffffffffffffffLL){ + if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0; + rz->end = lseek(rz->filedes, 0, SEEK_END); + lseek(rz->filedes, n, SEEK_SET); + } + *u_size = *c_size = rz->end; + return 1; + case FILE_TYPE_GZ: + return 0; + case FILE_TYPE_RZ: + if(rz->src_end == rz->end) return 0; + *u_size = rz->src_end; + *c_size = rz->end; + return 1; + default: + return 0; + } +} + +static int _razf_read(RAZF* rz, void *data, int size){ + int ret, tin; + if(rz->z_eof || rz->z_err) return 0; + if (rz->file_type == FILE_TYPE_PLAIN) { + ret = read(rz->filedes, data, size); + if (ret == 0) rz->z_eof = 1; + return ret; + } + rz->stream->avail_out = size; + rz->stream->next_out = data; + while(rz->stream->avail_out){ + if(rz->stream->avail_in == 0){ + if(rz->in >= rz->end){ rz->z_eof = 1; break; } + if(rz->end - rz->in < RZ_BUFFER_SIZE){ + rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in); + } else { + rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); + } + if(rz->stream->avail_in == 0){ + rz->z_eof = 1; + break; + } + rz->stream->next_in = rz->inbuf; + } + tin = rz->stream->avail_in; + ret = inflate(rz->stream, Z_BLOCK); + rz->in += tin - rz->stream->avail_in; + if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){ + fprintf(stderr, "[_razf_read] inflate error: %d (at %s:%d)\n", ret, __FILE__, __LINE__); + rz->z_err = 1; + break; + } + if(ret == Z_STREAM_END){ + rz->z_eof = 1; + break; + } + if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){ + rz->buf_flush = 1; + rz->next_block_pos = rz->in; + break; + } + } + return size - rz->stream->avail_out; +} + +int razf_read(RAZF *rz, void *data, int size){ + int ori_size, i; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + for(i=0;ioutbuf + rz->buf_off)[i]; + rz->buf_off += size; + rz->buf_len -= size; + data += size; + 
rz->block_off += size; + size = 0; + break; + } else { + for(i=0;ibuf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i]; + data += rz->buf_len; + size -= rz->buf_len; + rz->block_off += rz->buf_len; + rz->buf_off = 0; + rz->buf_len = 0; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof && rz->buf_len == 0) break; + } + rz->out += ori_size - size; + return ori_size - size; +} + +int razf_skip(RAZF* rz, int size){ + int ori_size; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + rz->buf_off += size; + rz->buf_len -= size; + rz->block_off += size; + size = 0; + break; + } else { + size -= rz->buf_len; + rz->buf_off = 0; + rz->buf_len = 0; + rz->block_off += rz->buf_len; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof) break; + } + rz->out += ori_size - size; + return ori_size - size; +} + +static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){ + lseek(rz->filedes, in, SEEK_SET); + rz->in = in; + rz->out = out; + rz->block_pos = in; + rz->next_block_pos = in; + rz->block_off = 0; + rz->buf_flush = 0; + rz->z_eof = rz->z_err = 0; + inflateReset(rz->stream); + rz->stream->avail_in = 0; + rz->buf_off = rz->buf_len = 0; +} + +int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){ + int64_t pos; + rz->z_eof = 0; + if(rz->file_type == FILE_TYPE_PLAIN){ + rz->buf_off = rz->buf_len = 0; + pos = block_start + block_offset; + pos = lseek(rz->filedes, pos, 
SEEK_SET); + rz->out = rz->in = pos; + return pos; + } + if(block_start == rz->block_pos && block_offset >= rz->block_off) { + block_offset -= rz->block_off; + goto SKIP; // Needn't reset inflate + } + if(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start + _razf_reset_read(rz, block_start, 0); + SKIP: + if(block_offset) razf_skip(rz, block_offset); + return rz->block_off; +} + +int64_t razf_seek(RAZF* rz, int64_t pos, int where){ + int64_t idx; + int64_t seek_pos, new_out; + rz->z_eof = 0; + if (where == SEEK_CUR) pos += rz->out; + else if (where == SEEK_END) pos += rz->src_end; + if(rz->file_type == FILE_TYPE_PLAIN){ + seek_pos = lseek(rz->filedes, pos, SEEK_SET); + rz->buf_off = rz->buf_len = 0; + rz->out = rz->in = seek_pos; + return seek_pos; + } else if(rz->file_type == FILE_TYPE_GZ){ + if(pos >= rz->out) goto SKIP; + return rz->out; + } + if(pos == rz->out) return pos; + if(pos > rz->src_end) return rz->out; + if(!rz->seekable || !rz->load_index){ + if(pos >= rz->out) goto SKIP; + } + idx = pos / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + new_out = (idx + 1) * RZ_BLOCK_SIZE; + if(pos > rz->out && new_out <= rz->out) goto SKIP; + _razf_reset_read(rz, seek_pos, new_out); + SKIP: + razf_skip(rz, (int)(pos - rz->out)); + return rz->out; +} + +uint64_t razf_tell2(RAZF *rz) +{ + /* + if (rz->load_index) { + int64_t idx, seek_pos; + idx = rz->out / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 0)? 
rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off) + fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n", + (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off); + } + */ + return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff); +} + +int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where) +{ + if (where != SEEK_SET) return -1; + return razf_jump(rz, voffset>>16, voffset&0xffff); +} + +void razf_close(RAZF *rz){ + if(rz->mode == 'w'){ +#ifndef _RZ_READONLY + razf_end_flush(rz); + deflateEnd(rz->stream); + save_zindex(rz, rz->filedes); + if(is_big_endian()){ + write(rz->filedes, &rz->in, sizeof(int64_t)); + write(rz->filedes, &rz->out, sizeof(int64_t)); + } else { + uint64_t v64 = byte_swap_8((uint64_t)rz->in); + write(rz->filedes, &v64, sizeof(int64_t)); + v64 = byte_swap_8((uint64_t)rz->out); + write(rz->filedes, &v64, sizeof(int64_t)); + } +#endif + } else if(rz->mode == 'r'){ + if(rz->stream) inflateEnd(rz->stream); + } + if(rz->inbuf) free(rz->inbuf); + if(rz->outbuf) free(rz->outbuf); + if(rz->header){ + free(rz->header->extra); + free(rz->header->name); + free(rz->header->comment); + free(rz->header); + } + if(rz->index){ + free(rz->index->bin_offsets); + free(rz->index->cell_offsets); + free(rz->index); + } + free(rz->stream); + close(rz->filedes); + free(rz); +} + +#endif diff --git a/razf.h b/razf.h new file mode 100644 index 0000000..f7e5097 --- /dev/null +++ b/razf.h @@ -0,0 +1,123 @@ + /*- + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan , Heng Li + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + +#ifndef __RAZF_RJ_H +#define __RAZF_RJ_H + +#include +#include +#include "zlib.h" + +#if ZLIB_VERNUM < 0x1221 +#define _RZ_READONLY +struct _gz_header_s; +typedef struct _gz_header_s _gz_header; +#define gz_header _gz_header +#endif + +#define WINDOW_BITS 15 + +#ifndef RZ_BLOCK_SIZE +#define RZ_BLOCK_SIZE (1<mode from HEAD to TYPE after call inflateReset */ + int buf_off, buf_len; + int z_err, z_eof; + int seekable; + /* Indice where the source is seekable */ + int load_index; + /* set has_index to 0 in mode 'w', then index will be discarded */ +} RAZF; + +#ifdef __cplusplus +extern "C" { +#endif + + RAZF* razf_dopen(int data_fd, const char *mode); + RAZF *razf_open(const char *fn, const char *mode); + int razf_write(RAZF* rz, const void *data, int size); + int razf_read(RAZF* rz, void *data, int size); + int64_t razf_seek(RAZF* rz, int64_t pos, int where); + void razf_close(RAZF* rz); + +#define razf_tell(rz) ((rz)->out) + + RAZF* razf_open2(const char *filename, const char *mode); + RAZF* razf_dopen2(int fd, const char *mode); + uint64_t razf_tell2(RAZF *rz); + int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/razip.c b/razip.c new file mode 100644 index 0000000..2b49883 --- /dev/null +++ b/razip.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include +#include "razf.h" + +#define WINDOW_SIZE 4096 + +static int razf_main_usage() +{ + printf("\n"); + printf("Usage: razip [options] [file] ...\n\n"); + printf("Options: -c write on standard output, keep original files unchanged\n"); + printf(" -d decompress\n"); + printf(" -l list compressed file contents\n"); + printf(" -b INT decompress at INT position in the uncompressed file\n"); + printf(" -s INT decompress INT bytes in the uncompressed file\n"); + printf(" -h give this help\n"); + printf("\n"); + return 0; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + 
if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644)) < 0 && errno == EEXIST) { + printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + printf("razip: not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) { + fprintf(stderr, "razip: %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + RAZF *rz; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ + switch(c){ + case 'h': return razf_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'l': compress = 2; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if(end >= 0 && end < start){ + fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); + return 1; + } + if(compress == 1){ + int f_src, f_dst = -1; + if(argc > optind){ + if((f_src = open(argv[optind], O_RDONLY)) < 0){ + fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); + return 1; + } + if(pstdout){ + f_dst = fileno(stdout); + } else { + char *name = malloc(strlen(argv[optind]) + 5); // room for ".rz" and the terminating NUL + strcpy(name, argv[optind]); + strcat(name, ".rz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } else if(pstdout){ + f_src = fileno(stdin); + f_dst = fileno(stdout); + } else return razf_main_usage(); + rz = razf_dopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); + razf_close(rz); // f_dst will be closed here + if (argc > optind && !pstdout) unlink(argv[optind]); // -c keeps the original file + free(buffer); + close(f_src); + return 0; + } else { + if(argc <= optind) 
return razf_main_usage(); + if(compress == 2){ + rz = razf_open(argv[optind], "r"); + if(rz->file_type == FILE_TYPE_RZ) { + printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name"); + printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end, + argv[optind]); + } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]); + } else { + int f_dst; + if (argc > optind && !pstdout) { + char *name; + if (strlen(argv[optind]) < 3 || strcmp(argv[optind] + strlen(argv[optind]) - 3, ".rz") != 0) { // name must end in ".rz" + printf("razip: %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } else f_dst = fileno(stdout); + rz = razf_open(argv[optind], "r"); + buffer = malloc(WINDOW_SIZE); + razf_seek(rz, start, SEEK_SET); + while(1){ + if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE); + else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? 
WINDOW_SIZE:(end - start)); + if(c <= 0) break; + start += c; + write(f_dst, buffer, c); + if(end >= 0 && start >= end) break; + } + free(buffer); + if (!pstdout) unlink(argv[optind]); + } + razf_close(rz); + return 0; + } +} + diff --git a/sam.c b/sam.c new file mode 100644 index 0000000..45cb05c --- /dev/null +++ b/sam.c @@ -0,0 +1,151 @@ +#include +#include "sam.h" + +#define TYPE_BAM 1 +#define TYPE_READ 2 + +bam_header_t *bam_header_dup(const bam_header_t *h0) +{ + bam_header_t *h; + int i; + h = bam_header_init(); + *h = *h0; + h->hash = 0; + h->text = (char*)calloc(h->l_text + 1, 1); + memcpy(h->text, h0->text, h->l_text); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + } + if (h0->rg2lib) h->rg2lib = bam_strmap_dup(h0->rg2lib); + return h; +} +static void append_header_text(bam_header_t *header, char* text, int len) +{ + int x = header->l_text + 1; + int y = header->l_text + len + 1; // 1 byte null + if (text == 0) return; + kroundup32(x); + kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. + header->l_text += len; + header->text[header->l_text] = 0; +} + +samfile_t *samopen(const char *fn, const char *mode, const void *aux) +{ + samfile_t *fp; + fp = (samfile_t*)calloc(1, sizeof(samfile_t)); + if (mode[0] == 'r') { // read + fp->type |= TYPE_READ; + if (mode[1] == 'b') { // binary + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? 
bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + if (fp->x.bam == 0) goto open_err_ret; + fp->header = bam_header_read(fp->x.bam); + } else { // text + fp->x.tamr = sam_open(fn); + if (fp->x.tamr == 0) goto open_err_ret; + fp->header = sam_header_read(fp->x.tamr); + if (fp->header->n_targets == 0) { // no @SQ fields + if (aux) { // check if aux is present + bam_header_t *textheader = fp->header; + fp->header = sam_header_read2((const char*)aux); + append_header_text(fp->header, textheader->text, textheader->l_text); + bam_header_destroy(textheader); + } + if (fp->header->n_targets == 0) + fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); + } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } + sam_header_parse_rg(fp->header); + } else if (mode[0] == 'w') { // write + fp->header = bam_header_dup((const bam_header_t*)aux); + if (mode[1] == 'b') { // binary + char bmode[3]; + bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); + if (fp->x.bam == 0) goto open_err_ret; + bam_header_write(fp->x.bam, fp->header); + } else { // text + // open file + fp->x.tamw = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; + if (fp->x.tamr == 0) goto open_err_ret; + // write header + if (strstr(mode, "h")) { + int i; + bam_header_t *alt; + // parse the header text + alt = bam_header_init(); + alt->l_text = fp->header->l_text; alt->text = fp->header->text; + sam_header_parse(alt); + alt->l_text = 0; alt->text = 0; + // check if there are @SQ lines in the header + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} + if (alt->n_targets != fp->header->n_targets) + fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + } else { // then dump ->target_{name,len} + for (i = 0; i < fp->header->n_targets; ++i) + fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); + } + bam_header_destroy(alt); + } + } + } + return fp; + +open_err_ret: + free(fp); + return 0; +} + +void samclose(samfile_t *fp) +{ + if (fp == 0) return; + if (fp->header) bam_header_destroy(fp->header); + if (fp->type & TYPE_BAM) bam_close(fp->x.bam); + else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); + else fclose(fp->x.tamw); + free(fp); +} + +int samread(samfile_t *fp, bam1_t *b) +{ + if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading + if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); + else return sam_read1(fp->x.tamr, fp->header, b); +} + +int samwrite(samfile_t *fp, const bam1_t *b) +{ + if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing + if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); + else { + char *s = bam_format1(fp->header, b); + int l = strlen(s); + fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); + free(s); + return l + 1; + } +} + +int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data) +{ + bam_plbuf_t *buf; + int ret; + bam1_t *b; + b = bam_init1(); + buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); + while ((ret = 
samread(fp, b)) >= 0) + bam_plbuf_push(b, buf); + bam_plbuf_push(0, buf); + bam_plbuf_destroy(buf); + bam_destroy1(b); + return 0; +} diff --git a/sam.h b/sam.h new file mode 100644 index 0000000..970cf2d --- /dev/null +++ b/sam.h @@ -0,0 +1,94 @@ +#ifndef BAM_SAM_H +#define BAM_SAM_H + +#include "bam.h" + +/*! + @header + + This file provides higher level of I/O routines and unifies the APIs + for SAM and BAM formats. These APIs are more convenient and + recommended. + + @copyright Genome Research Ltd. + */ + +/*! @typedef + @abstract SAM/BAM file handler + @field type type of the handler; bit 1 for BAM and bit 2 for reading + @field bam BAM file handler; valid if (type&1) == 1 + @field tamr SAM file handler for reading; valid if type == 2 + @field tamw SAM file handler for writing; valid if type == 0 + @field header header struct + */ +typedef struct { + int type; + union { + tamFile tamr; + bamFile bam; + FILE *tamw; + } x; + bam_header_t *header; +} samfile_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Open a SAM/BAM file + + @param fn SAM/BAM file name; "-" is recognized as stdin (for + reading) or stdout (for writing). + + @param mode open mode /[rw](b?)(u?)(h?)/: 'r' for reading, 'w' for + writing, 'b' for BAM I/O, 'u' for uncompressed BAM output and 'h' + for outputting header in SAM. If 'b' present, it must immediately + follow 'r' or 'w'. Valid modes are "r", "w", "wh", "rb", "wb" and + "wbu" exclusively. + + @param aux auxiliary data; if mode[0]=='w', aux points to + bam_header_t; if strcmp(mode, "r")==0 and @SQ header lines in SAM + are absent, aux points to the file name of the list of the references; + aux is not used otherwise. + + @return SAM/BAM file handler; 0 if the file cannot be opened + */ + samfile_t *samopen(const char *fn, const char *mode, const void *aux); + + /*! + @abstract Close a SAM/BAM handler + @param fp file handler to be closed + */ + void samclose(samfile_t *fp); + + /*! 
+ @abstract Read one alignment + @param fp file handler + @param b alignment + @return bytes read; -1 if fp is not open for reading + */ + int samread(samfile_t *fp, bam1_t *b); + + /*! + @abstract Write one alignment + @param fp file handler + @param b alignment + @return bytes written; -1 if fp is not open for writing + */ + int samwrite(samfile_t *fp, const bam1_t *b); + + /*! + @abstract Get the pileup for a whole alignment file + @param fp file handler + @param mask mask transferred to bam_plbuf_set_mask() + @param func user defined function called in the pileup process + @param data user provided data for func() + */ + int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sam_view.c b/sam_view.c new file mode 100644 index 0000000..02aee3c --- /dev/null +++ b/sam_view.c @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include "sam.h" + +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; +static char *g_library, *g_rg; + +static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) +{ + if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) + return 1; + if (g_library || g_rg) { + uint8_t *s = bam_aux_get(b, "RG"); + if (s) { + if (g_rg && strcmp(g_rg, (char*)(s + 1)) == 0) return 0; + if (g_library) { + const char *p = bam_strmap_get(h->rg2lib, (char*)(s + 1)); + return (p && strcmp(p, g_library) == 0)? 
0 : 1; + } return 1; + } else return 1; + } else return 0; +} + +// callback function for bam_fetch() +static int view_func(const bam1_t *b, void *data) +{ + if (!__g_skip_aln(((samfile_t*)data)->header, b)) + samwrite((samfile_t*)data, b); + return 0; +} + +static int usage(void); + +int main_samview(int argc, char *argv[]) +{ + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0; + samfile_t *in = 0, *out = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0; + + /* parse command-line options */ + strcpy(in_mode, "r"); strcpy(out_mode, "w"); + while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:")) >= 0) { + switch (c) { + case 'S': is_bamin = 0; break; + case 'b': is_bamout = 1; break; + case 't': fn_list = strdup(optarg); is_bamin = 0; break; + case 'h': is_header = 1; break; + case 'H': is_header_only = 1; break; + case 'o': fn_out = strdup(optarg); break; + case 'f': g_flag_on = strtol(optarg, 0, 0); break; + case 'F': g_flag_off = strtol(optarg, 0, 0); break; + case 'q': g_min_mapQ = atoi(optarg); break; + case 'u': is_uncompressed = 1; break; + case 'l': g_library = strdup(optarg); break; + case 'r': g_rg = strdup(optarg); break; + default: return usage(); + } + } + if (is_uncompressed) is_bamout = 1; + if (is_header_only) is_header = 1; + if (is_bamout) strcat(out_mode, "b"); + if (is_bamin) strcat(in_mode, "b"); + if (is_header) strcat(out_mode, "h"); + if (is_uncompressed) strcat(out_mode, "u"); + if (argc == optind) return usage(); + + // open file handlers + if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { + fprintf(stderr, "[main_samview] fail to open file for reading.\n"); + ret = 1; goto view_end; // exit non-zero when the input cannot be opened + } + if ((out = samopen(fn_out? 
fn_out : "-", out_mode, in->header)) == 0) { + fprintf(stderr, "[main_samview] fail to open file for writing.\n"); + ret = 1; goto view_end; // exit non-zero when the output cannot be opened + } + if (is_header_only) goto view_end; // no need to print alignments + + if (argc == optind + 1) { // convert/print the entire file + bam1_t *b = bam_init1(); + int r; + while ((r = samread(in, b)) >= 0) // read one alignment from `in' + if (!__g_skip_aln(in->header, b)) + samwrite(out, b); // write the alignment to `out' + if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n"); + bam_destroy1(b); + } else { // retrieve alignments in specified regions + int i; + bam_index_t *idx = 0; + if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index + if (idx == 0) { // index is unavailable + fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n"); + ret = 1; + goto view_end; + } + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' + if (tid < 0) { // reference name is not found + fprintf(stderr, "[main_samview] fail to get the reference name. 
Continue anyway.\n"); + continue; + } + bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments + } + bam_index_destroy(idx); // destroy the BAM index + } + +view_end: + // close files, free and return + free(fn_list); free(fn_out); free(g_library); free(g_rg); + samclose(in); + samclose(out); + return ret; +} + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools view [options] | [region1 [...]]\n\n"); + fprintf(stderr, "Options: -b output BAM\n"); + fprintf(stderr, " -h print header for the SAM output\n"); + fprintf(stderr, " -H print header only (no alignments)\n"); + fprintf(stderr, " -S input is SAM\n"); + fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); + fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); + fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); + fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); + fprintf(stderr, " -q INT minimum mapping quality [0]\n"); + fprintf(stderr, " -l STR only output reads in library STR [null]\n"); + fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); + fprintf(stderr, "\n\ +Notes:\n\ +\n\ + 1. By default, this command assumes the file on the command line is in\n\ + the BAM format and it prints the alignments in SAM. If `-t' is\n\ + applied, the input file is assumed to be in the SAM format. The\n\ + file supplied with `-t' is SPACE/TAB delimited with the first two\n\ + fields of each line consisting of the reference name and the\n\ + corresponding sequence length. The `.fai' file generated by `faidx'\n\ + can be used here. This file may be empty if reads are unaligned.\n\ +\n\ + 2. SAM->BAM conversion: `samtools view -bt ref.fa.fai in.sam.gz'.\n\ +\n\ + 3. BAM->SAM conversion: `samtools view in.bam'.\n\ +\n\ + 4. A region should be presented in one of the following formats:\n\ + `chr1', `chr2:1,000' and `chr3:1000-2,000'. 
When a region is\n\ + specified, the input alignment file must be an indexed BAM file.\n\ +\n\ + 5. Option `-u' is preferred over `-b' when the output is piped to\n\ + another samtools command.\n\ +\n"); + return 1; +} + +int main_import(int argc, char *argv[]) +{ + int argc2, ret; + char **argv2; + if (argc != 4) { + fprintf(stderr, "Usage: bamtk import \n"); + return 1; + } + argc2 = 6; + argv2 = calloc(6, sizeof(char*)); + argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; + ret = main_samview(argc2, argv2); + free(argv2); + return ret; +} diff --git a/samtools.1 b/samtools.1 new file mode 100644 index 0000000..45e1612 --- /dev/null +++ b/samtools.1 @@ -0,0 +1,422 @@ +.TH samtools 1 "6 July 2009" "samtools-0.1.5" "Bioinformatics tools" +.SH NAME +.PP +samtools - Utilities for the Sequence Alignment/Map (SAM) format +.SH SYNOPSIS +.PP +samtools view -bt ref_list.txt -o aln.bam aln.sam.gz +.PP +samtools sort aln.bam aln.sorted +.PP +samtools index aln.sorted.bam +.PP +samtools view aln.sorted.bam chr2:20,100,000-20,200,000 +.PP +samtools merge out.bam in1.bam in2.bam in3.bam +.PP +samtools faidx ref.fasta +.PP +samtools pileup -f ref.fasta aln.sorted.bam +.PP +samtools tview aln.sorted.bam ref.fasta + +.SH DESCRIPTION +.PP +Samtools is a set of utilities that manipulate alignments in the BAM +format. It imports from and exports to the SAM (Sequence Alignment/Map) +format, does sorting, merging and indexing, and allows to retrieve reads +in any regions swiftly. + +Samtools is designed to work on a stream. It regards an input file `-' +as the standard input (stdin) and an output file `-' as the standard +output (stdout). Several commands can thus be combined with Unix +pipes. Samtools always output warning and error messages to the standard +error output (stderr). + +Samtools is also able to open a BAM (not SAM) file on a remote FTP +server if the BAM file name starts with `ftp://'. 
Samtools checks the +current working directory for the index file and will download the index +upon absence. Samtools achieves random FTP file access with the `REST' +ftp command. It does not retrieve the entire alignment file unless it is +asked to do so. + +.SH COMMANDS AND OPTIONS + +.TP 10 +.B import +samtools import + +Since 0.1.4, this command is an alias of: + +samtools view -bt -o + +.TP +.B sort +samtools sort [-n] [-m maxMem] + +Sort alignments by leftmost coordinates. File +.I .bam +will be created. This command may also create temporary files +.I .%d.bam +when the whole alignment cannot be fitted into memory (controlled by +option -m). + +.B OPTIONS: +.RS +.TP 8 +.B -n +Sort by read names rather than by chromosomal coordinates +.TP +.B -m INT +Approximately the maximum required memory. [500000000] +.RE + +.TP +.B merge +samtools merge [-n] [...] + +Merge multiple sorted alignments. The header of +.I +will be copied to +.I +and the headers of other files will be ignored. + +.B OPTIONS: +.RS +.TP 8 +.B -n +The input alignments are sorted by read names rather than by chromosomal +coordinates +.RE + +.TP +.B index +samtools index + +Index sorted alignment for fast random access. Index file +.I .bai +will be created. + +.TP +.B view +samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F +skipFlag] [-q minMapQ] [-l library] [-r readGroup] | [region1 [...]] + +Extract/print all or sub alignments in SAM or BAM format. If no region +is specified, all the alignments will be printed; otherwise only +alignments overlapping the specified regions will be output. An +alignment may be given multiple times if it is overlapping several +regions. A region can be presented, for example, in the following +format: `chr2', `chr2:1000000' or `chr2:1,000,000-2,000,000'. The +coordinate is 1-based. + +.B OPTIONS: +.RS +.TP 8 +.B -b +Output in the BAM format. +.TP +.B -u +Output uncompressed BAM. 
This option saves time spent on +compression/decompression and is thus preferred when the output is piped +to another samtools command. +.TP +.B -h +Include the header in the output. +.TP +.B -H +Output the header only. +.TP +.B -S +Input is in SAM. If @SQ header lines are absent, the +.B `-t' +option is required. +.TP +.B -t FILE +This file is TAB-delimited. Each line must contain the reference name +and the length of the reference, one line for each distinct reference; +additional fields are ignored. This file also defines the order of the +reference sequences in sorting. If you run `samtools faidx ', +the resultant index file +.I .fai +can be used as this +.I +file. +.TP +.B -o FILE +Output file [stdout] +.TP +.B -f INT +Only output alignments with all bits in INT present in the FLAG +field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0] +.TP +.B -F INT +Skip alignments with bits present in INT [0] +.TP +.B -q INT +Skip alignments with MAPQ smaller than INT [0] +.TP +.B -l STR +Only output reads in library STR [null] +.TP +.B -r STR +Only output reads in read group STR [null] +.RE + +.TP +.B faidx +samtools faidx [region1 [...]] + +Index reference sequence in the FASTA format or extract subsequence from +indexed reference sequence. If no region is specified, +.B faidx +will index the file and create +.I .fai +on the disk. If regions are specified, the subsequences will be +retrieved and printed to stdout in the FASTA format. The input file can +be compressed in the +.B RAZF +format. + +.TP +.B pileup +samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] +[-iscgS2] [-T theta] [-N nHap] [-r pairDiffRate] | + +Print the alignment in the pileup format. In the pileup format, each +line represents a genomic position, consisting of chromosome name, +coordinate, reference base, read bases, read qualities and alignment +mapping qualities. 
Information on match, mismatch, indel, strand, +mapping quality and start and end of a read are all encoded at the read +base column. At this column, a dot stands for a match to the reference +base on the forward strand, a comma for a match on the reverse strand, +`ACGTN' for a mismatch on the forward strand and `acgtn' for a mismatch +on the reverse strand. A pattern `\\+[0-9]+[ACGTNacgtn]+' indicates +there is an insertion between this reference position and the next +reference position. The length of the insertion is given by the integer +in the pattern, followed by the inserted sequence. Similarly, a pattern +`-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. The +deleted bases will be presented as `*' in the following lines. Also at +the read base column, a symbol `^' marks the start of a read segment +which is a contiguous subsequence on the read separated by `N/S/H' CIGAR +operations. The ASCII of the character following `^' minus 33 gives the +mapping quality. A symbol `$' marks the end of a read segment. + +If option +.B -c +is applied, the consensus base, consensus quality, SNP quality and RMS +mapping quality of the reads covering the site will be inserted between +the `reference base' and the `read bases' columns. An indel occupies an +additional line. Each indel line consists of chromosome name, +coordinate, a star, the genotype, consensus quality, SNP quality, RMS +mapping quality, # covering reads, the first allele, the second allele, +# reads supporting the first allele, # reads supporting the second +allele and # reads containing indels different from the top two alleles. + +.B OPTIONS: +.RS + +.TP 10 +.B -s +Print the mapping quality as the last column. This option makes the +output easier to parse, although this format is not space efficient. + +.TP +.B -S +The input file is in SAM. + +.TP +.B -i +Only output pileup lines containing indels. + +.TP +.B -f FILE +The reference sequence in the FASTA format. 
Index file +.I FILE.fai +will be created if +absent. + +.TP +.B -M INT +Cap mapping quality at INT [60] + +.TP +.B -t FILE +List of reference names and sequence lengths, in the format described +for the +.B import +command. If this option is present, samtools assumes the input +.I +is in SAM format; otherwise it assumes BAM format. + +.TP +.B -l FILE +List of sites at which pileup is output. This file is space +delimited. The first two columns are required to be chromosome and +1-based coordinate. Additional columns are ignored. It is +recommended to use option +.B -s +together with +.B -l +as in the default format we may not know the mapping quality. + +.TP +.B -c +Call the consensus sequence using MAQ consensus model. Options +.B -T, +.B -N, +.B -I +and +.B -r +are only effective when +.B -c +or +.B -g +is in use. + +.TP +.B -g +Generate genotype likelihood in the binary GLFv3 format. This option +suppresses -c, -i and -s. + +.TP +.B -T FLOAT +The theta parameter (error dependency coefficient) in the maq consensus +calling model [0.85] + +.TP +.B -N INT +Number of haplotypes in the sample (>=2) [2] + +.TP +.B -r FLOAT +Expected fraction of differences between a pair of haplotypes [0.001] + +.TP +.B -I INT +Phred probability of an indel in sequencing/prep. [40] + +.RE + +.TP +.B tview +samtools tview [ref.fasta] + +Text alignment viewer (based on the ncurses library). In the viewer, +press `?' for help and press `g' to check the alignment start from a +region in the format like `chr10:10,000,000'. Note that if the region +shown on the screen contains no mapped reads, a blank screen will be +seen. This is a known issue and will be improved later. + +.RE + +.TP +.B fixmate +samtools fixmate + +Fill in mate coordinates, ISIZE and mate related flags from a +name-sorted alignment. + +.TP +.B rmdup +samtools rmdup + +Remove potential PCR duplicates: if multiple read pairs have identical +external coordinates, only retain the pair with highest mapping quality. 
+This command +.B ONLY +works with FR orientation and requires ISIZE to be correctly set. + +.RE + +.TP +.B rmdupse +samtools rmdupse + +Remove potential duplicates for single-ended reads. This command will +treat all reads as single-ended even if they are paired in fact. + +.RE + +.TP +.B fillmd +samtools fillmd [-e] + +Generate the MD tag. If the MD tag is already present, this command will +give a warning if the MD tag generated is different from the existing +tag. + +.B OPTIONS: +.RS +.TP 8 +.B -e +Convert the read base to = if it is identical to the aligned reference +base. Indel caller does not support the = bases at the moment. + +.RE + +.SH SAM FORMAT + +SAM is TAB-delimited. Apart from the header lines, which are started +with the `@' symbol, each alignment line consists of: + +.TS +center box; +cb | cb | cb +n | l | l . +Col Field Description +_ +1 QNAME Query (pair) NAME +2 FLAG bitwise FLAG +3 RNAME Reference sequence NAME +4 POS 1-based leftmost POSition/coordinate of clipped sequence +5 MAPQ MAPping Quality (Phred-scaled) +6 CIGAR extended CIGAR string +7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) +8 MPOS 1-based Mate POSition +9 ISIZE Inferred insert SIZE +10 SEQ query SEQuence on the same strand as the reference +11 QUAL query QUALity (ASCII-33 gives the Phred base quality) +12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE +.TE + +.PP +Each bit in the FLAG field is defined as: + +.TS +center box; +cb | cb +l | l . 
+Flag Description +_ +0x0001 the read is paired in sequencing +0x0002 the read is mapped in a proper pair +0x0004 the query sequence itself is unmapped +0x0008 the mate is unmapped +0x0010 strand of the query (1 for reverse) +0x0020 strand of the mate +0x0040 the read is the first read in a pair +0x0080 the read is the second read in a pair +0x0100 the alignment is not primary +0x0200 the read fails platform/vendor quality checks +0x0400 the read is either a PCR or an optical duplicate +.TE + +.SH LIMITATIONS +.PP +.IP o 2 +Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. +.IP o 2 +CIGAR operation P is not properly handled at the moment. + +.SH AUTHOR +.PP +Heng Li from the Sanger Institute wrote the C version of samtools. Bob +Handsaker from the Broad Institute implemented the BGZF library and Jue +Ruan from Beijing Genomics Institute wrote the RAZF library. Various +people in the 1000Genomes Project contributed to the SAM format +specification. + +.SH SEE ALSO +.PP +Samtools website: http://samtools.sourceforge.net diff --git a/source.dot b/source.dot new file mode 100644 index 0000000..1735774 --- /dev/null +++ b/source.dot @@ -0,0 +1,19 @@ +digraph { + faidx[label="faidx.c\n(faidx)"] + import[label="bam_import.c\n(import)"] + plcmd[label="bam_plcmd.c\n(pileup)"] + sort[label="bam_sort.c\n(sort, merge)"] + index[label="bam_index.c\n(index)"] + tview[label="bam_tview.c\n(tview)"] + glf[label="glf.c\n(glfview)"] + rmdup[label="bam_rmdup.c\n(rmdup)"] + fixmate[label="bam_mate.c\n(fixmate)"] + "bam_aux.c" -> {"bam.c", import} + glf -> {"bam_maqcns.c", plcmd} + "bgzf.c" -> {"bam.c", glf} + "bam.c" -> {index, "bam_pileup.c", sort, import, rmdup, fixmate} + "bam_pileup.c" -> {"bam_lpileup.c", plcmd} + {"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview + {import, faidx, "bam_maqcns.c"} -> plcmd + {tview, plcmd, faidx, sort, import, index, glf, rmdup, fixmate} -> "bamtk.c\n(view)" +} \ No newline at end of file