From b27e00385f41769d03a8cca4dbd71275fc9fa906 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 8 Sep 2009 19:38:06 +0900 Subject: [PATCH] Imported Upstream version 0.1.5c --- AUTHORS | 16 + COPYING | 21 + ChangeLog | 2099 +++++++++++++++++++++++++++++++++++++++ INSTALL | 29 + Makefile | 69 ++ NEWS | 224 +++++ bam.c | 290 ++++++ bam.h | 714 +++++++++++++ bam_aux.c | 232 +++++ bam_color.c | 127 +++ bam_endian.h | 42 + bam_import.c | 475 +++++++++ bam_index.c | 551 ++++++++++ bam_lpileup.c | 214 ++++ bam_maqcns.c | 526 ++++++++++ bam_maqcns.h | 55 + bam_mate.c | 70 ++ bam_md.c | 117 +++ bam_pileup.c | 214 ++++ bam_plcmd.c | 385 +++++++ bam_rmdup.c | 144 +++ bam_rmdupse.c | 177 ++++ bam_sort.c | 257 +++++ bam_stat.c | 78 ++ bam_tview.c | 379 +++++++ bamtk.c | 118 +++ bgzf.c | 634 ++++++++++++ bgzf.h | 120 +++ bgzip.c | 166 ++++ examples/00README.txt | 23 + examples/Makefile | 27 + examples/calDepth.c | 62 ++ examples/ex1.fa | 56 ++ examples/ex1.sam.gz | Bin 0 -> 114565 bytes faidx.c | 311 ++++++ faidx.h | 82 ++ glf.c | 236 +++++ glf.h | 56 ++ khash.h | 486 +++++++++ knetfile.c | 300 ++++++ knetfile.h | 55 + kseq.h | 223 +++++ ksort.h | 271 +++++ kstring.c | 81 ++ kstring.h | 59 ++ misc/Makefile | 54 + misc/blast2sam.pl | 92 ++ misc/bowtie2sam.pl | 92 ++ misc/export2sam.pl | 107 ++ misc/interpolate_sam.pl | 125 +++ misc/maq2sam.c | 173 ++++ misc/md5.c | 307 ++++++ misc/md5.h | 68 ++ misc/md5fa.c | 58 ++ misc/novo2sam.pl | 281 ++++++ misc/samtools.pl | 255 +++++ misc/soap2sam.pl | 109 ++ misc/wgsim.c | 502 ++++++++++ misc/wgsim_eval.pl | 74 ++ misc/zoom2sam.pl | 97 ++ razf.c | 684 +++++++++++++ razf.h | 123 +++ razip.c | 141 +++ sam.c | 151 +++ sam.h | 94 ++ sam_view.c | 172 ++++ samtools.1 | 422 ++++++++ source.dot | 19 + 68 files changed, 15071 insertions(+) create mode 100644 AUTHORS create mode 100644 COPYING create mode 100644 ChangeLog create mode 100644 INSTALL create mode 100644 Makefile create mode 100644 NEWS create mode 100644 bam.c create mode 100644 
bam.h create mode 100644 bam_aux.c create mode 100644 bam_color.c create mode 100644 bam_endian.h create mode 100644 bam_import.c create mode 100644 bam_index.c create mode 100644 bam_lpileup.c create mode 100644 bam_maqcns.c create mode 100644 bam_maqcns.h create mode 100644 bam_mate.c create mode 100644 bam_md.c create mode 100644 bam_pileup.c create mode 100644 bam_plcmd.c create mode 100644 bam_rmdup.c create mode 100644 bam_rmdupse.c create mode 100644 bam_sort.c create mode 100644 bam_stat.c create mode 100644 bam_tview.c create mode 100644 bamtk.c create mode 100644 bgzf.c create mode 100644 bgzf.h create mode 100644 bgzip.c create mode 100644 examples/00README.txt create mode 100644 examples/Makefile create mode 100644 examples/calDepth.c create mode 100644 examples/ex1.fa create mode 100644 examples/ex1.sam.gz create mode 100644 faidx.c create mode 100644 faidx.h create mode 100644 glf.c create mode 100644 glf.h create mode 100644 khash.h create mode 100644 knetfile.c create mode 100644 knetfile.h create mode 100644 kseq.h create mode 100644 ksort.h create mode 100644 kstring.c create mode 100644 kstring.h create mode 100644 misc/Makefile create mode 100755 misc/blast2sam.pl create mode 100755 misc/bowtie2sam.pl create mode 100755 misc/export2sam.pl create mode 100755 misc/interpolate_sam.pl create mode 100644 misc/maq2sam.c create mode 100644 misc/md5.c create mode 100644 misc/md5.h create mode 100644 misc/md5fa.c create mode 100755 misc/novo2sam.pl create mode 100755 misc/samtools.pl create mode 100755 misc/soap2sam.pl create mode 100644 misc/wgsim.c create mode 100755 misc/wgsim_eval.pl create mode 100755 misc/zoom2sam.pl create mode 100644 razf.c create mode 100644 razf.h create mode 100644 razip.c create mode 100644 sam.c create mode 100644 sam.h create mode 100644 sam_view.c create mode 100644 samtools.1 create mode 100644 source.dot diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..435431c --- /dev/null +++ b/AUTHORS @@ -0,0 +1,16 
@@ +Heng Li from the Sanger Institute wrote most of the initial source code +of SAMtools and various converters. + +Bob Handsaker from the Broad Institute is a major contributor to the +SAM/BAM specification. He designed and implemented the BGZF format, the +underlying indexable compression format for the BAM format. BGZF does +not support arithmetic between file offsets. + +Jue Ruan from the Beijing Genome Institute designed and implemented the +RAZF format, an alternative indexable compression format. RAZF supports +arithmetic between file offsets, at the cost of increased index file +size and the full compatibility with gzip. RAZF is optional and only +used in `faidx' for indexing RAZF compressed fasta files. + +Colin Hercus updated novo2sam.pl to support gapped alignment by +novoalign. diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..82fa2f4 --- /dev/null +++ b/COPYING @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2008-2009 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..3bf82a5 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,2099 @@ +------------------------------------------------------------------------ +r372 | lh3lh3 | 2009-07-07 09:49:27 +0100 (Tue, 07 Jul 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + + * samtools-0.1.4-23 (r372) + * keep header text if "view -t" is used (by Gerton) + +------------------------------------------------------------------------ +r371 | lh3lh3 | 2009-07-07 01:13:32 +0100 (Tue, 07 Jul 2009) | 2 lines +Changed paths: + M /trunk/samtools/samtools.1 + +update documentation + +------------------------------------------------------------------------ +r370 | bhandsaker | 2009-07-02 22:24:34 +0100 (Thu, 02 Jul 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + +Introduced LIBPATH variable so this could be overridden to allow samtools to build correctly at the Broad. + +------------------------------------------------------------------------ +r369 | lh3lh3 | 2009-07-02 13:36:53 +0100 (Thu, 02 Jul 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-22 (r369) + * in pileup, optionally print E2 and U2 + * remove the debugging code in bam_aux_get() (Drat!) 
+ +------------------------------------------------------------------------ +r368 | lh3lh3 | 2009-07-02 11:32:26 +0100 (Thu, 02 Jul 2009) | 6 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_rmdup.c + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/faidx.c + M /trunk/samtools/faidx.h + M /trunk/samtools/glf.c + + * samtools-0.1.4-21 (r368) + * propagate errors rather than exit or complain assertion failure. Assertion + should be only used for checking internal bugs, but not for external input + inconsistency. I was just a bit lazy. + * small memory leak may be present on failure, though + +------------------------------------------------------------------------ +r367 | lh3lh3 | 2009-06-30 16:18:42 +0100 (Tue, 30 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +reduce the chance of blocking in FTP connection + +------------------------------------------------------------------------ +r366 | lh3lh3 | 2009-06-30 15:35:21 +0100 (Tue, 30 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +minor changes to knetfile: invalid fd equals -1 rather than 0 + +------------------------------------------------------------------------ +r365 | lh3lh3 | 2009-06-30 14:04:30 +0100 (Tue, 30 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + + * samtools-0.1.4-20 (r365) + * download the BAM index file if it is not found in the current working directory. 
+ +------------------------------------------------------------------------ +r364 | lh3lh3 | 2009-06-30 12:39:07 +0100 (Tue, 30 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/knetfile.c + + * samtools-0.1.4-19 (r364) + * knetfile: report error when the file is not present on FTP + +------------------------------------------------------------------------ +r363 | lh3lh3 | 2009-06-29 23:23:32 +0100 (Mon, 29 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/bgzf.h + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + + * samtools-0.1.4-18 (r363) + * knetfile: do not trigger network communication in FTP seek (lazy seek) + * bgzf: cache recent blocks (disabled by default) + +------------------------------------------------------------------------ +r362 | lh3lh3 | 2009-06-25 21:04:34 +0100 (Thu, 25 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/bgzf.c + +write changelog + +------------------------------------------------------------------------ +r361 | lh3lh3 | 2009-06-25 21:03:10 +0100 (Thu, 25 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-17 (r361) + * if a file is given on FTP, search locally for the BAM index + +------------------------------------------------------------------------ +r360 | lh3lh3 | 2009-06-25 20:44:52 +0100 (Thu, 25 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/bgzf.h + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + + * samtools-0.1.4-16 (r360) + * report more information in index when the input is not sorted + * change the behaviour of knet_seek() such that it returns 0 on success + * support knetfile library in BGZF + 
+------------------------------------------------------------------------ +r359 | lh3lh3 | 2009-06-25 17:10:55 +0100 (Thu, 25 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + M /trunk/samtools/knetfile.h + +fixed bugs in knetfile.* + +------------------------------------------------------------------------ +r358 | lh3lh3 | 2009-06-25 13:53:19 +0100 (Thu, 25 Jun 2009) | 2 lines +Changed paths: + A /trunk/samtools/knetfile.h + +this is the header file + +------------------------------------------------------------------------ +r357 | lh3lh3 | 2009-06-25 13:52:03 +0100 (Thu, 25 Jun 2009) | 3 lines +Changed paths: + A /trunk/samtools/knetfile.c + + * open a file at FTP + * preliminary version + +------------------------------------------------------------------------ +r354 | lh3lh3 | 2009-06-24 14:02:25 +0100 (Wed, 24 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-15 (r354) + * fixed a memory leak in bam_view1(), although samtools is not using this routine. + +------------------------------------------------------------------------ +r351 | lh3lh3 | 2009-06-18 00:16:26 +0100 (Thu, 18 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/faidx.c + + * samtools-0.1.4-13 (r351) + * make faidx more tolerant to empty lines right before or after > lines + * hope this does not introduce new bugs... 
+ +------------------------------------------------------------------------ +r350 | lh3lh3 | 2009-06-16 14:37:01 +0100 (Tue, 16 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-13 (r350) + * fixed a small memory leak in pileup, caused by recent modifications + +------------------------------------------------------------------------ +r347 | lh3lh3 | 2009-06-13 21:20:49 +0100 (Sat, 13 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-12 (r347) + * added `-S' to pileup, similar to `view -S' + +------------------------------------------------------------------------ +r346 | lh3lh3 | 2009-06-13 17:52:31 +0100 (Sat, 13 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam_view.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.4-11 (r346) + * allow to select a read group at view command-line + +------------------------------------------------------------------------ +r344 | lh3lh3 | 2009-06-13 14:06:24 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/calDepth.c + +added more comments + +------------------------------------------------------------------------ +r343 | lh3lh3 | 2009-06-13 14:01:22 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/calDepth.c + +nothing really + +------------------------------------------------------------------------ +r342 | lh3lh3 | 2009-06-13 13:58:48 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/Makefile + A /trunk/samtools/examples/calDepth.c + +added an example of calculating read depth + +------------------------------------------------------------------------ +r341 | lh3lh3 | 2009-06-13 13:00:08 +0100 (Sat, 13 Jun 2009) | 6 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam.h + M 
/trunk/samtools/bam_aux.c + A /trunk/samtools/bam_color.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam.h + + * samtools-0.1.4-10 (r341) + * only include key APIs in libbam.a + * move color-specific routines to bam_color.c + * update documentations + * remove the support of -q in pileup + +------------------------------------------------------------------------ +r340 | lh3lh3 | 2009-06-13 11:17:14 +0100 (Sat, 13 Jun 2009) | 6 lines +Changed paths: + M /trunk/samtools/INSTALL + M /trunk/samtools/Makefile + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/razf.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-9 (r340) + * added a warning to razf.c if zlib<1.2.2.1 + * fixed a compilation warning + * fixed a segfault caused by @RG parsing + * detect NCURSES in bam_tview.c + +------------------------------------------------------------------------ +r339 | lh3lh3 | 2009-06-13 10:35:19 +0100 (Sat, 13 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/INSTALL + +update INSTALL + +------------------------------------------------------------------------ +r338 | lh3lh3 | 2009-06-13 00:15:24 +0100 (Sat, 13 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kstring.h + M /trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-8 (r338) + * parse the @RG header lines and allow to choose library at the "samtools view" + command line + +------------------------------------------------------------------------ +r337 | lh3lh3 | 2009-06-12 21:25:50 +0100 (Fri, 12 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/bgzf.h + M 
/trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.4-7 (r337) + * bgzf.c: support mode string "wu": uncompressed output + * "samtools view" support "-u" command-line option + +------------------------------------------------------------------------ +r336 | lh3lh3 | 2009-06-12 17:20:12 +0100 (Fri, 12 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/misc/Makefile + M /trunk/samtools/razf.c + M /trunk/samtools/razf.h + M /trunk/samtools/razip.c + + * no changes to samtools itself + * remove zlib source codes + * make RAZF reading compatible with old version of zlib + * on old version of zlib, writing is not available + +------------------------------------------------------------------------ +r335 | lh3lh3 | 2009-06-12 16:47:33 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + D /trunk/samtools/zlib + +remove zlib for simplification... + +------------------------------------------------------------------------ +r334 | lh3lh3 | 2009-06-12 15:43:36 +0100 (Fri, 12 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-6 (r334) + * do not export bam_aux_get_core() for Bio::DB::Sam because it has already + been implemented in that. 
+ * this version works with the latest Bio::DB::Sam (20090612) + +------------------------------------------------------------------------ +r333 | lh3lh3 | 2009-06-12 15:33:42 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r332 | lh3lh3 | 2009-06-12 15:21:21 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/AUTHORS + M /trunk/samtools/Makefile + M /trunk/samtools/misc/Makefile + +fixed minor things in Makefile + +------------------------------------------------------------------------ +r331 | lh3lh3 | 2009-06-12 15:07:05 +0100 (Fri, 12 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-5 (r331) + * no change to samtools itself. Version number is increased to reflect the + changes in the Makefile building system. + +------------------------------------------------------------------------ +r330 | lh3lh3 | 2009-06-12 15:03:38 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/AUTHORS + D /trunk/samtools/README + +update information... + +------------------------------------------------------------------------ +r329 | lh3lh3 | 2009-06-12 14:52:21 +0100 (Fri, 12 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/novo2sam.pl + + * updated novoalign converter by Colin Hercus et al. 
+ * this version works with indels + +------------------------------------------------------------------------ +r328 | lh3lh3 | 2009-06-12 14:50:53 +0100 (Fri, 12 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/INSTALL + M /trunk/samtools/Makefile + M /trunk/samtools/misc/Makefile + M /trunk/samtools/zlib/Makefile + + * update Makefile + * update INSTALL instruction + +------------------------------------------------------------------------ +r327 | lh3lh3 | 2009-06-12 14:18:29 +0100 (Fri, 12 Jun 2009) | 4 lines +Changed paths: + A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.generic:325) + D /trunk/samtools/Makefile.am + D /trunk/samtools/Makefile.generic + D /trunk/samtools/Makefile.lite + D /trunk/samtools/autogen.sh + D /trunk/samtools/cleanup.sh + D /trunk/samtools/configure.ac + A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.generic:305) + D /trunk/samtools/misc/Makefile.am + D /trunk/samtools/misc/Makefile.generic + M /trunk/samtools/razf.c + A /trunk/samtools/zlib + A /trunk/samtools/zlib/Makefile + A /trunk/samtools/zlib/adler32.c + A /trunk/samtools/zlib/compress.c + A /trunk/samtools/zlib/crc32.c + A /trunk/samtools/zlib/crc32.h + A /trunk/samtools/zlib/deflate.c + A /trunk/samtools/zlib/deflate.h + A /trunk/samtools/zlib/gzio.c + A /trunk/samtools/zlib/infback.c + A /trunk/samtools/zlib/inffast.c + A /trunk/samtools/zlib/inffast.h + A /trunk/samtools/zlib/inffixed.h + A /trunk/samtools/zlib/inflate.c + A /trunk/samtools/zlib/inflate.h + A /trunk/samtools/zlib/inftrees.c + A /trunk/samtools/zlib/inftrees.h + A /trunk/samtools/zlib/trees.c + A /trunk/samtools/zlib/trees.h + A /trunk/samtools/zlib/uncompr.c + A /trunk/samtools/zlib/zconf.h + A /trunk/samtools/zlib/zlib.h + A /trunk/samtools/zlib/zutil.c + A /trunk/samtools/zlib/zutil.h + D /trunk/samtools/zutil.h + + * added zlib-1.2.3 as razip requires that + * prepare to changed back to the Makefile building system + * unfinished! 
(will be soon) + +------------------------------------------------------------------------ +r326 | lh3lh3 | 2009-06-12 14:12:03 +0100 (Fri, 12 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + +Unfinished + +------------------------------------------------------------------------ +r325 | lh3lh3 | 2009-06-10 16:27:59 +0100 (Wed, 10 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.4-4 (r325) + * further avoid wrong consensus calls in repetitive regions. + +------------------------------------------------------------------------ +r324 | lh3lh3 | 2009-06-10 15:56:17 +0100 (Wed, 10 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam.h + + * samtools-0.1.4-3 (r324) + * make maqcns generate the correct call in repetitive regions. + * allow filtering on mapQ at the pileup command line + +------------------------------------------------------------------------ +r323 | lh3lh3 | 2009-06-10 10:04:21 +0100 (Wed, 10 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.3.2 (r322) + * indels and SNPs use different mapping quality threshold + +------------------------------------------------------------------------ +r322 | lh3lh3 | 2009-06-10 10:03:22 +0100 (Wed, 10 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/export2sam.pl + +fixed a typo + +------------------------------------------------------------------------ +r321 | lh3lh3 | 2009-06-09 09:21:48 +0100 (Tue, 09 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + +just typo. 
no real change + +------------------------------------------------------------------------ +r320 | lh3lh3 | 2009-06-08 14:32:51 +0100 (Mon, 08 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + +a little bit code cleanup + +------------------------------------------------------------------------ +r319 | lh3lh3 | 2009-06-08 14:22:33 +0100 (Mon, 08 Jun 2009) | 4 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.3.1 + * change default parameters + * optionally print filtered variants + +------------------------------------------------------------------------ +r318 | lh3lh3 | 2009-06-08 14:14:26 +0100 (Mon, 08 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.3.0 + * combine snpFilter and indelFilter + +------------------------------------------------------------------------ +r317 | lh3lh3 | 2009-06-08 11:31:42 +0100 (Mon, 08 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.3 + * change a default parameter + +------------------------------------------------------------------------ +r316 | lh3lh3 | 2009-06-08 11:11:06 +0100 (Mon, 08 Jun 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + + * samtools-0.1.4-2 (r316) + * pileup: cap mapping quality at 60 (by default) + * pileup: always calculate RMS mapq + * pileup: allow to output variant sites only + +------------------------------------------------------------------------ +r312 | lh3lh3 | 2009-06-04 13:01:10 +0100 (Thu, 04 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.2 + * added pileup2fq + +------------------------------------------------------------------------ +r311 | lh3lh3 | 2009-06-03 09:40:40 +0100 (Wed, 03 Jun 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * in 
snpFilter, suppress non-SNP sites + +------------------------------------------------------------------------ +r310 | lh3lh3 | 2009-06-01 14:35:13 +0100 (Mon, 01 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.1 + * fixed a typo + +------------------------------------------------------------------------ +r309 | lh3lh3 | 2009-06-01 14:04:39 +0100 (Mon, 01 Jun 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.2.0 + * snpFilter + +------------------------------------------------------------------------ +r306 | lh3lh3 | 2009-05-28 11:49:35 +0100 (Thu, 28 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bgzf.c + + * minor changes to bgzf: return NULL if fd == -1 + * suggested by {kdj,jm18}@sanger.ac.uk + +------------------------------------------------------------------------ +r305 | lh3lh3 | 2009-05-28 11:16:08 +0100 (Thu, 28 May 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/interpolate_sam.pl + +Script for paired-end pileup, contributed by Stephen Montgomery. 
+ +------------------------------------------------------------------------ +r304 | lh3lh3 | 2009-05-28 11:08:49 +0100 (Thu, 28 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + + * samtools-0.1.4-1 (r304) + * fixed a minor bug in printing headers + +------------------------------------------------------------------------ +r297 | lh3lh3 | 2009-05-21 16:06:16 +0100 (Thu, 21 May 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/maq2sam.c + M /trunk/samtools/samtools.1 + +Release samtools-0.1.4 + +------------------------------------------------------------------------ +r296 | lh3lh3 | 2009-05-21 12:53:14 +0100 (Thu, 21 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-24 (r296) + * another similar bug in the indel caller + +------------------------------------------------------------------------ +r295 | lh3lh3 | 2009-05-21 12:50:28 +0100 (Thu, 21 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-23 (r295) + * fixed a critical bug in the indel caller + +------------------------------------------------------------------------ +r294 | lh3lh3 | 2009-05-20 13:00:20 +0100 (Wed, 20 May 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_stat.c + +added a missing header file + +------------------------------------------------------------------------ +r293 | lh3lh3 | 2009-05-19 23:44:25 +0100 (Tue, 19 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-22 (r293) + * open tview in the dot-view mode by default + +------------------------------------------------------------------------ +r292 | lh3lh3 | 2009-05-18 21:01:23 +0100 (Mon, 18 May 2009) | 6 lines +Changed paths: + M /trunk/samtools/samtools.1 + +Added a 
note to the manual. Currently SAMtools uses unaligned words in +several places. Although this does not cause bus errors for me, it may +affect portability. Please see the "Bus error" wiki page for more +information. Also thank James Bonfield for pointing this out. + + +------------------------------------------------------------------------ +r286 | lh3lh3 | 2009-05-14 15:23:13 +0100 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-21 (r286) + * declare bam_aux_get_core() in bam.h + +------------------------------------------------------------------------ +r276 | lh3lh3 | 2009-05-13 10:07:55 +0100 (Wed, 13 May 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-20 (r276) + * remove bam1_t::hash again. We need to modify the Perl API anyway to + make it work with the latest SVN. + * As is suggested by Tim, scan "{base}.bai" and "{base}.bam.bai" for index + +------------------------------------------------------------------------ +r275 | lh3lh3 | 2009-05-12 21:14:10 +0100 (Tue, 12 May 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam.h + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-19 (r275) + * a minor change to the bam1_t struct: added back "void *hash" for the + backward compatibility with Bio::DB::Sam + +------------------------------------------------------------------------ +r273 | lh3lh3 | 2009-05-12 14:28:39 +0100 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_rmdupse.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-18 (r273) + * rmdupse: do not remove unmapped reads + +------------------------------------------------------------------------ +r272 | lh3lh3 | 2009-05-12 14:20:00 +0100 (Tue, 12 May 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_rmdupse.c + +change a parameter. 
It does nothing + +------------------------------------------------------------------------ +r271 | lh3lh3 | 2009-05-12 14:17:58 +0100 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile.am + M /trunk/samtools/Makefile.generic + M /trunk/samtools/Makefile.lite + A /trunk/samtools/bam_rmdupse.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/configure.ac + + * samtools-0.1.3-17 (r271) + * added 'rmdupse' command + +------------------------------------------------------------------------ +r267 | lh3lh3 | 2009-05-05 22:31:41 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.3-16 (r267) + * in sam_view.c, changed g_flag_on based on the suggestion by Angie Hinrichs + +------------------------------------------------------------------------ +r266 | lh3lh3 | 2009-05-05 22:23:27 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-15 (r266) + * report an error if a non-* reference is present while @SQ is absent + +------------------------------------------------------------------------ +r265 | lh3lh3 | 2009-05-05 22:09:00 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.3-14 (r262) + * make samopen() recognize @SQ header lines + +------------------------------------------------------------------------ +r261 | lh3lh3 | 2009-05-05 15:10:30 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam_view.c + + * samtools-0.1.3-13 (r260) + * report error for file I/O error + +------------------------------------------------------------------------ +r260 | lh3lh3 | 2009-05-05 15:01:16 +0100 (Tue, 05 May 2009) | 2 
lines +Changed paths: + M /trunk/samtools/Makefile.am + +update Makefile.am + +------------------------------------------------------------------------ +r259 | lh3lh3 | 2009-05-05 14:52:25 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/sam.c + M /trunk/samtools/sam.h + + * samtools-0.1.3-12 (r259) + * use the new I/O interface in pileup + +------------------------------------------------------------------------ +r258 | lh3lh3 | 2009-05-05 14:33:22 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile.generic + M /trunk/samtools/Makefile.lite + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + A /trunk/samtools/sam.c + A /trunk/samtools/sam.h + A /trunk/samtools/sam_view.c + + * samtools-0.1.3-11 (r258) + * unify the interface to BAM and SAM I/O + +------------------------------------------------------------------------ +r257 | lh3lh3 | 2009-05-05 09:53:35 +0100 (Tue, 05 May 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile.lite + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-10 (r257) + * allow hex with "pileup -m" + +------------------------------------------------------------------------ +r256 | lh3lh3 | 2009-05-04 19:16:50 +0100 (Mon, 04 May 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-9 (r256) + * fixed a bug in bam_lpileup.c + * I do not know if this also fixes the bug causing assertion failure in the tview + +------------------------------------------------------------------------ +r251 | lh3lh3 | 2009-04-28 13:53:23 +0100 (Tue, 28 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-8 (r251) + * fixed a bug when there are reads without coordinates + 
+------------------------------------------------------------------------ +r250 | lh3lh3 | 2009-04-28 13:43:33 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + A /trunk/samtools/AUTHORS + A /trunk/samtools/README + M /trunk/samtools/cleanup.sh + +added missing files + +------------------------------------------------------------------------ +r249 | lh3lh3 | 2009-04-28 13:37:16 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.generic + M /trunk/samtools/Makefile.lite + M /trunk/samtools/configure.ac + M /trunk/samtools/misc/Makefile.generic + +improve large file support in compilation + +------------------------------------------------------------------------ +r248 | lh3lh3 | 2009-04-28 13:33:24 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/INSTALL + +update INSTALL + +------------------------------------------------------------------------ +r247 | lh3lh3 | 2009-04-28 13:28:50 +0100 (Tue, 28 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.am + M /trunk/samtools/autogen.sh + M /trunk/samtools/cleanup.sh + M /trunk/samtools/configure.ac + A /trunk/samtools/misc/Makefile.am + +fixed various issues about the GNU building scripts + +------------------------------------------------------------------------ +r246 | lh3lh3 | 2009-04-28 13:10:23 +0100 (Tue, 28 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + D /trunk/samtools/Makefile + A /trunk/samtools/Makefile.am + A /trunk/samtools/Makefile.generic + A /trunk/samtools/autogen.sh + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + A /trunk/samtools/cleanup.sh + A /trunk/samtools/configure.ac + D /trunk/samtools/misc/Makefile + A /trunk/samtools/misc/Makefile.generic (from /trunk/samtools/misc/Makefile:245) + + * samtools-0.1.3-7 (r246) + * incorporated revisions from Nils Homer + * enhanced support of displaying color-space reads + 
+------------------------------------------------------------------------ +r244 | lh3lh3 | 2009-04-25 11:49:40 +0100 (Sat, 25 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-6 (r244) + * fixed segfault for unmapped reads + +------------------------------------------------------------------------ +r243 | lh3lh3 | 2009-04-24 21:27:26 +0100 (Fri, 24 Apr 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-5 (r243) + * fixed a long existing bug which may cause memory leak + * check MD + * consensus calling now works with "=", but indel calling not + +------------------------------------------------------------------------ +r242 | lh3lh3 | 2009-04-24 20:44:46 +0100 (Fri, 24 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-4 (r242) + * fixed a memory leak + +------------------------------------------------------------------------ +r240 | lh3lh3 | 2009-04-24 16:40:18 +0100 (Fri, 24 Apr 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/Makefile.lite + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + A /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-3 (r240) + * generate MD tag + * generate "=" bases + * the plain pileup now supports "=" bases, but consensus calling and glfgen may fail + +------------------------------------------------------------------------ +r239 | lh3lh3 | 2009-04-24 12:08:20 +0100 (Fri, 24 Apr 2009) | 5 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_aux.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-2 (r239) + * fixed bugs in bam_aux.c (these functions were never used by samtools) + * removed bam_aux_init()/bam_aux_destroy() + * added tagview for testing bam_aux +
+------------------------------------------------------------------------ +r235 | lh3lh3 | 2009-04-21 23:17:39 +0100 (Tue, 21 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.3-1 + * fixed a bug in pileup: the first read in a chromosome may not be printed + +------------------------------------------------------------------------ +r232 | lh3lh3 | 2009-04-16 15:25:43 +0100 (Thu, 16 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.lite + +a missing file in Makefile.lite + +------------------------------------------------------------------------ +r227 | lh3lh3 | 2009-04-15 22:02:53 +0100 (Wed, 15 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + +Release samtools-0.1.3 + +------------------------------------------------------------------------ +r223 | lh3lh3 | 2009-04-15 14:31:32 +0100 (Wed, 15 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-28 + * make samtools more robust to weird input such as empty file + +------------------------------------------------------------------------ +r222 | lh3lh3 | 2009-04-15 14:05:33 +0100 (Wed, 15 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/samtools.1 + +prepare for release 0.1.3 + +------------------------------------------------------------------------ +r221 | lh3lh3 | 2009-04-15 13:32:14 +0100 (Wed, 15 Apr 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/blast2sam.pl + +convert NCBI-BLASTN to SAM + +------------------------------------------------------------------------ +r220 | lh3lh3 | 2009-04-15 13:18:19 +0100 (Wed, 15 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-27 + * fixed a small memory leak in tview + +------------------------------------------------------------------------ +r219 | 
lh3lh3 | 2009-04-15 13:00:08 +0100 (Wed, 15 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_rmdup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-26 + * fixed a bug in rmdup when there are unmapped reads + +------------------------------------------------------------------------ +r218 | lh3lh3 | 2009-04-14 22:28:58 +0100 (Tue, 14 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + +proposed NEWS for the new release (have not yet) + +------------------------------------------------------------------------ +r216 | lh3lh3 | 2009-04-14 22:10:46 +0100 (Tue, 14 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/misc/samtools.pl + + * samtools.pl-0.1.1 + * improve indelFilter to avoid filtering true indels. The new filter relies + on the new pileup indel line implemented in samtools-0.1.2-25 + +------------------------------------------------------------------------ +r215 | lh3lh3 | 2009-04-14 22:04:19 +0100 (Tue, 14 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.2-25 + * change the pileup indel line to shows the number of alignments actually + containing indels + +------------------------------------------------------------------------ +r211 | lh3lh3 | 2009-04-13 12:07:13 +0100 (Mon, 13 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + +update ChangeLog from "svn log" + +------------------------------------------------------------------------ +r210 | lh3lh3 | 2009-04-12 20:57:05 +0100 (Sun, 12 Apr 2009) | 4 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kseq.h + + * samtools-0.1.2-24 + * in merge, gives a warning rather than error if the target sequence length is different + * allow empty header + 
+------------------------------------------------------------------------ +r209 | lh3lh3 | 2009-04-12 20:32:44 +0100 (Sun, 12 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-23 + * recognize '*' at the QUAL field + +------------------------------------------------------------------------ +r208 | lh3lh3 | 2009-04-12 20:08:02 +0100 (Sun, 12 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_import.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kseq.h + + * samtools-0.1.2-22 + * the field separator is TAB only, now + +------------------------------------------------------------------------ +r207 | lh3lh3 | 2009-04-08 15:18:03 +0100 (Wed, 08 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/examples/ex1.sam.gz + + * fixed the problem in the example alignment due to the bug in fixmate + +------------------------------------------------------------------------ +r206 | lh3lh3 | 2009-04-08 15:15:05 +0100 (Wed, 08 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_mate.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/soap2sam.pl + + * samtools-0.1.2-21 + * fixed a nasty bug in `fixmate' + +------------------------------------------------------------------------ +r205 | lh3lh3 | 2009-04-08 10:57:08 +0100 (Wed, 08 Apr 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/bowtie2sam.pl + M /trunk/samtools/misc/soap2sam.pl + M /trunk/samtools/misc/wgsim_eval.pl + +make the script robust to the bugs in SOAP-2.1.7 + +------------------------------------------------------------------------ +r200 | lh3lh3 | 2009-04-02 15:14:56 +0100 (Thu, 02 Apr 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-20 + * check if file is truncated in flagstat + +------------------------------------------------------------------------ +r199 | lh3lh3 | 2009-04-02 15:09:10 +0100 (Thu, 02 Apr 2009) | 3 lines
+Changed paths: + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-19 + * print the header if requested + +------------------------------------------------------------------------ +r193 | lh3lh3 | 2009-03-27 15:09:50 +0000 (Fri, 27 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-18 + * fixed a minor bug reported by Nils Homer + +------------------------------------------------------------------------ +r185 | lh3lh3 | 2009-03-24 11:50:32 +0000 (Tue, 24 Mar 2009) | 2 lines +Changed paths: + A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.std:184) + D /trunk/samtools/Makefile.std + A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.std:184) + D /trunk/samtools/misc/Makefile.std + +rename Makefile.std as Makefile. GNU building system is not ready and may take some time... + +------------------------------------------------------------------------ +r184 | lh3lh3 | 2009-03-24 10:36:38 +0000 (Tue, 24 Mar 2009) | 4 lines +Changed paths: + D /trunk/samtools/Makefile + A /trunk/samtools/Makefile.std (from /trunk/samtools/Makefile:183) + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + D /trunk/samtools/misc/Makefile + A /trunk/samtools/misc/Makefile.std (from /trunk/samtools/misc/Makefile:182) + M /trunk/samtools/samtools.1 + + * samtools-0.1.2-17 + * incorporating Nils' changes + * rename Makefile to Makefile.std and prepare to add the GNU building systems (also by Nils) + +------------------------------------------------------------------------ +r183 | lh3lh3 | 2009-03-24 10:30:23 +0000 (Tue, 24 Mar 2009) | 4 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/kseq.h + A /trunk/samtools/kstring.c + A /trunk/samtools/kstring.h + + * samtools-0.1.2-16 + * made
pileup take a list of proposed indels. An insertion is N at the moment. + * added my kstring library for a bit complex parsing of the position list. + +------------------------------------------------------------------------ +r169 | lh3lh3 | 2009-03-12 13:40:14 +0000 (Thu, 12 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/soap2sam.pl + + * soap2sam.pl-0.1.2 + * more robust to truncated soap output + +------------------------------------------------------------------------ +r168 | lh3lh3 | 2009-03-11 10:49:00 +0000 (Wed, 11 Mar 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.lite + +added bam_stat.o to Makefile.lite + +------------------------------------------------------------------------ +r167 | lh3lh3 | 2009-03-10 22:11:31 +0000 (Tue, 10 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-15 + * generate RMS of mapQ instead of max mapQ + +------------------------------------------------------------------------ +r166 | lh3lh3 | 2009-03-10 22:06:45 +0000 (Tue, 10 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/glf.c + M /trunk/samtools/glf.h + M /trunk/samtools/misc/Makefile + + * samtools-0.1.2-14 + * implemented GLFv3 + +------------------------------------------------------------------------ +r159 | lh3lh3 | 2009-03-03 11:26:08 +0000 (Tue, 03 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-13 + * fixed a minor bug in displaying pileup + +------------------------------------------------------------------------ +r158 | lh3lh3 | 2009-03-03 11:24:16 +0000 (Tue, 03 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-12 + * optionally print SAM header + +------------------------------------------------------------------------ +r153 | lh3lh3 | 2009-03-02 10:45:28 +0000 (Mon, 02 
Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/glf.c + + * samtools-0.1.2-11 + * use "GLF\3" as the magic for GLFv3 files + +------------------------------------------------------------------------ +r152 | lh3lh3 | 2009-03-02 10:39:09 +0000 (Mon, 02 Mar 2009) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/glf.c + M /trunk/samtools/glf.h + + * samtools-0.1.2-10 + * fixed a bug in import: core.bin is undefined for unmapped reads + * this bug can be alleviated (not completely solved) in bam_index.c + * update to GLFv3: pos is changed to offset for better compression + +------------------------------------------------------------------------ +r151 | lh3lh3 | 2009-03-01 15:18:43 +0000 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/wgsim.c + + * wgsim-0.2.3 + * fixed a bug in simulating indels + +------------------------------------------------------------------------ +r145 | lh3lh3 | 2009-02-26 19:43:57 +0000 (Thu, 26 Feb 2009) | 4 lines +Changed paths: + M /trunk/samtools/misc/wgsim.c + + * wgsim-0.2.2 + * allow to print mismatch information as fastq comment. MAQ does + not like long read names. 
+ +------------------------------------------------------------------------ +r141 | lh3lh3 | 2009-02-26 14:53:03 +0000 (Thu, 26 Feb 2009) | 6 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/misc/wgsim.c + M /trunk/samtools/misc/wgsim_eval.pl + + * wgsim-0.2.1 + * fixed a bug about color read coordinates + * fixed a bug in read names + * wgsim_eval.pl-0.1.3 + * make the script work with color reads + +------------------------------------------------------------------------ +r140 | lh3lh3 | 2009-02-26 14:02:57 +0000 (Thu, 26 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/Makefile + M /trunk/samtools/misc/wgsim.c + + * wgsim: added a note + +------------------------------------------------------------------------ +r139 | lh3lh3 | 2009-02-26 11:39:08 +0000 (Thu, 26 Feb 2009) | 7 lines +Changed paths: + M /trunk/samtools/misc/wgsim.c + M /trunk/samtools/misc/wgsim_eval.pl + + * wgsim-0.2.0 + * considerable code clean up + * print number of substitutions/indels/errors on each read + * potentially support SOLiD simulation, though not tested at the moment + * wgsim_eval.pl-0.1.2 + * change in accordance with wgsim + +------------------------------------------------------------------------ +r129 | lh3lh3 | 2009-02-18 22:23:27 +0000 (Wed, 18 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-9 + * fixed a bug in bam_fetch, caused by completely contained adjacent chunks + +------------------------------------------------------------------------ +r128 | bhandsaker | 2009-02-18 19:06:57 +0000 (Wed, 18 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + +Fix annoying segv when invalid region specified.
+ +------------------------------------------------------------------------ +r127 | lh3lh3 | 2009-02-17 10:49:55 +0000 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + D /trunk/samtools/misc/indel_filter.pl + A /trunk/samtools/misc/samtools.pl + + * move indel_filter.pl to samtools.pl + +------------------------------------------------------------------------ +r126 | lh3lh3 | 2009-02-14 21:22:30 +0000 (Sat, 14 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_mate.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-7 + * fixed a bug in fixmate: SE reads are flagged as BAM_FMUNMAP + +------------------------------------------------------------------------ +r125 | lh3lh3 | 2009-02-13 09:54:45 +0000 (Fri, 13 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-7 + * fixed a minor bug in flagstat + +------------------------------------------------------------------------ +r124 | lh3lh3 | 2009-02-12 11:15:32 +0000 (Thu, 12 Feb 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/indel_filter.pl + + * samtools-0.1.2-6 + * improve indel caller by setting maximum window size + +------------------------------------------------------------------------ +r123 | lh3lh3 | 2009-02-12 10:30:29 +0000 (Thu, 12 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * output max mapping quality in indel line + +------------------------------------------------------------------------ +r122 | lh3lh3 | 2009-02-11 10:59:10 +0000 (Wed, 11 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/misc/maq2sam.c + +fixed a bug in generating tag AM + +------------------------------------------------------------------------ +r121 | lh3lh3 | 2009-02-03 10:43:11 +0000 (Tue, 03 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + +fixed a potential memory problem in indexing + 
+------------------------------------------------------------------------ +r120 | bhandsaker | 2009-02-02 15:52:52 +0000 (Mon, 02 Feb 2009) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + +Pass LIBS to recursive targets to facilitate building at Broad. + +------------------------------------------------------------------------ +r119 | lh3lh3 | 2009-02-02 10:12:15 +0000 (Mon, 02 Feb 2009) | 4 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-3 + * fixed a bug in generating GLFv2 for indels + * improve flagstat report a little bit + +------------------------------------------------------------------------ +r118 | lh3lh3 | 2009-01-29 12:33:23 +0000 (Thu, 29 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + A /trunk/samtools/bam_stat.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.2-1 + * added flagstat command + +------------------------------------------------------------------------ +r116 | lh3lh3 | 2009-01-28 13:31:12 +0000 (Wed, 28 Jan 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + +Release SAMtools-0.1.2 + +------------------------------------------------------------------------ +r115 | lh3lh3 | 2009-01-28 12:54:08 +0000 (Wed, 28 Jan 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/indel_filter.pl + +Script for filtering indel results + +------------------------------------------------------------------------ +r114 | lh3lh3 | 2009-01-25 11:45:37 +0000 (Sun, 25 Jan 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/zoom2sam.pl + +convert ZOOM to SAM + +------------------------------------------------------------------------ +r113 | lh3lh3 | 2009-01-24 14:25:07 +0000 (Sat, 24 Jan 2009) | 2 lines +Changed paths: + A /trunk/samtools/misc/novo2sam.pl + +add a script to convert novo alignment to SAM + 
+------------------------------------------------------------------------ +r112 | lh3lh3 | 2009-01-23 20:57:39 +0000 (Fri, 23 Jan 2009) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/ChangeLog.old + M /trunk/samtools/samtools.1 + +update documentation and ChangeLog + +------------------------------------------------------------------------ +r111 | lh3lh3 | 2009-01-23 19:22:59 +0000 (Fri, 23 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.1-19 + * fixed a bug in "merge" command line + +------------------------------------------------------------------------ +r110 | lh3lh3 | 2009-01-22 15:36:48 +0000 (Thu, 22 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/misc/Makefile + A /trunk/samtools/misc/bowtie2sam.pl (from /branches/dev/samtools/misc/bowtie2sam.pl:108) + M /trunk/samtools/misc/export2sam.pl + A /trunk/samtools/misc/soap2sam.pl (from /branches/dev/samtools/misc/soap2sam.pl:108) + A /trunk/samtools/misc/wgsim.c (from /branches/dev/samtools/misc/wgsim.c:108) + A /trunk/samtools/misc/wgsim_eval.pl (from /branches/dev/samtools/misc/wgsim_eval.pl:108) + + * merge from branches/dev/ + * all future development will happen here + +------------------------------------------------------------------------ +r109 | lh3lh3 | 2009-01-22 15:14:27 +0000 (Thu, 22 Jan 2009) | 3 lines +Changed paths: + M /trunk/samtools/COPYING + M /trunk/samtools/ChangeLog + A /trunk/samtools/INSTALL (from /branches/dev/samtools/INSTALL:108) + M /trunk/samtools/Makefile + A /trunk/samtools/Makefile.lite (from /branches/dev/samtools/Makefile.lite:108) + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + M /trunk/samtools/bam_import.c + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_lpileup.c + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + A /trunk/samtools/bam_mate.c (from /branches/dev/samtools/bam_mate.c:108) + M /trunk/samtools/bam_pileup.c + M 
/trunk/samtools/bam_plcmd.c + A /trunk/samtools/bam_rmdup.c (from /branches/dev/samtools/bam_rmdup.c:108) + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bgzf.h + M /trunk/samtools/examples/00README.txt + A /trunk/samtools/examples/Makefile (from /branches/dev/samtools/examples/Makefile:108) + D /trunk/samtools/examples/ex1.fa.fai + M /trunk/samtools/examples/ex1.sam.gz + M /trunk/samtools/faidx.c + A /trunk/samtools/glf.c (from /branches/dev/samtools/glf.c:108) + M /trunk/samtools/glf.h + M /trunk/samtools/misc/Makefile + M /trunk/samtools/misc/maq2sam.c + M /trunk/samtools/razf.c + M /trunk/samtools/source.dot + + * Merge from branches/dev/ + * all future development will happen here at trunk/ + +------------------------------------------------------------------------ +r79 | bhandsaker | 2009-01-07 21:42:15 +0000 (Wed, 07 Jan 2009) | 2 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_tview.c + +Fix problem with compiling without curses. 
+ +------------------------------------------------------------------------ +r63 | lh3lh3 | 2008-12-22 15:58:02 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /trunk/samtools (from /branches/dev/samtools:62) + +Create trunk copy + +------------------------------------------------------------------------ +r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/NEWS + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + +Release samtools-0.1.1 + +------------------------------------------------------------------------ +r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines +Changed paths: + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-66 + * fixed a bug in razf.c: reset z_eof when razf_seek() is called + * fixed a memory leak in parsing a region + * changed pileup a little bit when -s is in use: output ^ and $ + * when a bam is not indexed, output more meaningful error message + * fixed a bug in indexing for small alignment + * fixed a bug in the viewer when we come to the end of a reference file + * updated documentation + * prepare to release 0.1.1 + +------------------------------------------------------------------------ +r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/examples + A /branches/dev/samtools/examples/00README.txt + A /branches/dev/samtools/examples/ex1.fa + A /branches/dev/samtools/examples/ex1.fa.fai + A /branches/dev/samtools/examples/ex1.sam.gz + +example + +------------------------------------------------------------------------ +r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + 
+update ChangeLog + +------------------------------------------------------------------------ +r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/misc/export2sam.pl + + * added comments + * fixed several bugs + +------------------------------------------------------------------------ +r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/export2sam.pl + +convert Export format to SAM; not thoroughly tested + +------------------------------------------------------------------------ +r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/source.dot + + * samtools-0.1.0-65 + * pileup: generate maq-like simple output + * pileup: allow to output pileup at required sites + * source.dot: source file relationship graph + * tview: fixed a minor bug + +------------------------------------------------------------------------ +r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines +Changed paths: + D /branches/dev/samtools/misc/all2sam.pl + +remove all2sam.pl + +------------------------------------------------------------------------ +r54 | lh3lh3 | 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/COPYING + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/faidx.h + M /branches/dev/samtools/khash.h + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/ksort.h + M /branches/dev/samtools/samtools.1 + +Added copyright information and a bit more documentation. No code change. 
+ +------------------------------------------------------------------------ +r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-64 + * improved efficiency of the indel caller for spliced alignments + +------------------------------------------------------------------------ +r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-63 + * a bit code cleanup: reduce the dependency between source files + +------------------------------------------------------------------------ +r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-62 + * fixed a memory leak + +------------------------------------------------------------------------ +r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/samtools.1 + +update documentation, ChangeLog and a comment + +------------------------------------------------------------------------ +r49 | lh3lh3 | 2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-61 + 
* moved pileup command to a separate source file + * added indel caller + * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!) + * updated documentation + +------------------------------------------------------------------------ +r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-60 + * fixed another bug in maqcns when there is a nearby deletion + +------------------------------------------------------------------------ +r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-59 + * pileup: outputting consensus is now optional + * fixed a bug in glfgen. This bug also exists in maq's glfgen. However, + I am not quite sure why the previous version may have problem. + +------------------------------------------------------------------------ +r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-58 + * add maq consensus to pileup. However, I will move this part to a new + command as strictly speaking, consensus calling is not part of pileup, + and imposing it would make it harder to generate for other language + bindings. + +------------------------------------------------------------------------ +r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +Fix bug in tell() after reads that consume to the exact end of a block.
+ +------------------------------------------------------------------------ +r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-57 + * fixed a bug in parser when there is auxiliary fields + * made the parser a bit more robust + +------------------------------------------------------------------------ +r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-56 + * fixed a bug in bgzf (only reading is affected) + * fixed a typo in bam_index.c + * in bam_index.c, check potential bugs in the underlying I/O library + +------------------------------------------------------------------------ +r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-55 + * tried to make pileup work with clipping (previously not), though NOT tested + * removed -v from pileup + * made pileup take the reference sequence + +------------------------------------------------------------------------ +r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-54 + * in 
parser, recognize "=", rather than ",", as a match + * in parser, correctly parse "=" at the MRNM field. + +------------------------------------------------------------------------ +r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/maq2sam.c + +fixed a bug in handling maq flag 64 and 192 + +------------------------------------------------------------------------ +r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +also calculate unordered md5sum check + +------------------------------------------------------------------------ +r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a minor bug when there are spaces in the sequence + +------------------------------------------------------------------------ +r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a potential memory leak + +------------------------------------------------------------------------ +r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * fixed a bug in import: bin is wrongly calculated + +------------------------------------------------------------------------ +r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/all2sam.pl + +nothing, really + +------------------------------------------------------------------------ +r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/md5.c + A 
/branches/dev/samtools/misc/md5.h + A /branches/dev/samtools/misc/md5fa.c + + * fixed two warnings in kseq.h + * added md5sum utilities + +------------------------------------------------------------------------ +r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/kseq.h + D /branches/dev/samtools/kstream.h + + * samtools-0.1.0-52 + * replace kstream with kseq. kseq is a superset of kstream. I need the + extra functions in kseq.h. + * also compile stand-alone faidx + +------------------------------------------------------------------------ +r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-51 + * sorting by read names is available + +------------------------------------------------------------------------ +r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/misc/maq2sam.c + + * samtools-0.1.0-50 + * format change to meet the latest specification + +------------------------------------------------------------------------ +r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/misc/maq2sam.c + + * minor change in maqcns: special care when n==0 + * change maq2sam to meet the latest specification + +------------------------------------------------------------------------ +r27 | lh3lh3 | 
2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + +considerable code clean up in razf + +------------------------------------------------------------------------ +r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/faidx.c + +make RAZF optional in faidx.c + +------------------------------------------------------------------------ +r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-49 + * added routines for retrieving aux data, NOT TESTED YET! + +------------------------------------------------------------------------ +r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-48 + * bgzf: fixed a potential integer overflow on 32-bit machines + * maqcns: set the minimum combined quality as 0 + * supporting hex strings + +------------------------------------------------------------------------ +r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-47 + * fixed the bug in maqcns + +------------------------------------------------------------------------ +r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + A 
/branches/dev/samtools/bam_maqcns.c + A /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/glf.h + + * samtools-0.1.0-46 + * add MAQ consensus caller, currently BUGGY! + +------------------------------------------------------------------------ +r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-45 + * tview: display padded alignment (but not P operation) + * better coordinates and reference sequence + +------------------------------------------------------------------------ +r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + +new ChangeLog + +------------------------------------------------------------------------ +r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + D /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6) + +Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from +the log of my personal SVN repository. 
+ +------------------------------------------------------------------------ +r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-44 + * declare fseeko and ftello as some Linux may not do this by default and + missing these declarations will make bgzf buggy + * get rid of some harmless warnings + * use BGZF by default, now + +------------------------------------------------------------------------ +r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + + * samtools-0.1.0-43 + * fixed a bug in razf_read() + * give more warnings when the file is truncated (or due to bugs in I/O library) + +------------------------------------------------------------------------ +r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +fixed a bug in bgzf.c at the end of the file + +------------------------------------------------------------------------ +r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-42 + * a lot happened to RAZF, although samtools itself is untouched. Better + also update the version number anyway to avoid confusion + +------------------------------------------------------------------------ +r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +a change from Jue, but I think it should not matter + +------------------------------------------------------------------------ +r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a potential bug in razf. 
However, it seems still buggy, just +rarely happens, very rarely. + +------------------------------------------------------------------------ +r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a bug in razf, with the help of Jue + +------------------------------------------------------------------------ +r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + +remove a comment + +------------------------------------------------------------------------ +r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + + * Jue has updated razf to realize Bob's scheme + +------------------------------------------------------------------------ +r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/samtools.1 + +the manual page + +------------------------------------------------------------------------ +r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/Makefile + A /branches/dev/samtools/bam.c + A /branches/dev/samtools/bam.h + A /branches/dev/samtools/bam_aux.c + A /branches/dev/samtools/bam_endian.h + A /branches/dev/samtools/bam_import.c + A /branches/dev/samtools/bam_index.c + A /branches/dev/samtools/bam_lpileup.c + A /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_sort.c + A /branches/dev/samtools/bam_tview.c + A /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/bgzf.c + A /branches/dev/samtools/bgzf.h + A /branches/dev/samtools/bgzip.c + A /branches/dev/samtools/faidx.c + A /branches/dev/samtools/faidx.h + A /branches/dev/samtools/khash.h + A /branches/dev/samtools/ksort.h + A 
/branches/dev/samtools/kstream.h + A /branches/dev/samtools/misc + A /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/all2sam.pl + A /branches/dev/samtools/misc/maq2sam.c + A /branches/dev/samtools/razf.c + A /branches/dev/samtools/razf.h + A /branches/dev/samtools/razip.c + A /branches/dev/samtools/zutil.h + +The initial version of samtools, replicated from my local SVN repository. +The current version is: 0.1.0-42. All future development will happen here. + +------------------------------------------------------------------------ +r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools + +samtools (C version) + +------------------------------------------------------------------------ diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..f1cf7aa --- /dev/null +++ b/INSTALL @@ -0,0 +1,29 @@ +System Requirements +=================== + +SAMtools depends on the zlib library . The latest +version 1.2.3 is preferred and with the latest version you can compile +razip and use it to compress a FASTA file. SAMtools' faidx is able to +index a razip-compressed FASTA file to save diskspace. Older zlib also +works with SAMtools, but razip cannot be compiled. + +The text-based viewer (tview) requires the GNU ncurses library +, which comes with Mac OS X and +most of the modern Linux/Unix distributions. If you do not have this +library installed, you can still compile the rest of SAMtools by +manually modifying one line in Makefile. + + +Compilation +=========== + +Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can +compile razip with `make razip'. + + +Installation +============ + +Simply copy `samtools' and other executables/scripts in `misc' to a +location you want (e.g. a directory in your $PATH). No further +configurations are required. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7bb4469 --- /dev/null +++ b/Makefile @@ -0,0 +1,69 @@ +CC= gcc +CXX= g++ +CFLAGS= -g -Wall -O2 #-m64 #-arch ppc +CXXFLAGS= $(CFLAGS) +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE #-D_NO_CURSES +LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ + bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \ + bam_sort.o +AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ + bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ + bamtk.o +PROG= samtools +INCLUDES= +SUBDIRS= . misc +LIBPATH= + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +lib:libbam.a + +libbam.a:$(LOBJS) + $(AR) -cru $@ $(LOBJS) + +### For the curses library: comment out `-lcurses' if you do not have curses installed +samtools:lib $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -lcurses -lz -L. 
-lbam + +razip:razip.o razf.o + $(CC) $(CFLAGS) -o $@ razf.o razip.o -lz + +bgzip:bgzip.o bgzf.o + $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz + +razip.o:razf.h +bam.o:bam.h razf.h bam_endian.h kstring.h +sam.o:sam.h bam.h +bam_import.o:bam.h kseq.h khash.h razf.h +bam_pileup.o:bam.h razf.h ksort.h +bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h +bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h +bam_lpileup.o:bam.h ksort.h +bam_tview.o:bam.h faidx.h bam_maqcns.h +bam_maqcns.o:bam.h ksort.h bam_maqcns.h +bam_sort.o:bam.h ksort.h razf.h +bam_md.o:bam.h faidx.h +glf.o:glf.h + +faidx.o:faidx.h razf.h khash.h +faidx_main.o:faidx.h razf.h + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM razip $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..149c090 --- /dev/null +++ b/NEWS @@ -0,0 +1,224 @@ +Beta Release 0.1.5 (7 July, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Support opening a BAM alignment on FTP. Users can now use "tview" to + view alignments at the NCBI ftp site. Please read manual for more + information. + + * In library, propagate errors rather than exit or complain assertion + failure. + + * Simplified the building system and fixed compiling errors caused by + zlib<1.2.2.1. + + * Fixed an issue about lost header information when a SAM is imported + with "view -t". + + * Implemented "samtool.pl varFilter" which filters both SNPs and short + indels. This command replaces "indelFilter". + + * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from + pileup output. + + * In pileup, cap mapping quality at 60. This helps filtering when + different aligners are in use. + + * In pileup, allow to output variant sites only. + + * Made pileup generate correct calls in repetitive region. At the same + time, I am considering to implement a simplified model in SOAPsnp, + although this has not happened yet. + + * In view, added '-u' option to output BAM without compression. 
This + option is preferred when the output is piped to other commands. + + * In view, added '-l' and '-r' to get the alignments for one library or + read group. The "@RG" header lines are now partially parsed. + + * Do not include command line utilities to libbam.a. + + * Fixed memory leaks in pileup and bam_view1(). + + * Made faidx more tolerant to empty lines right before or after FASTA > + lines. + + +Changes in other utilities: + + * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign. + + +This release involves several modifications to the key code base which +may potentially introduce new bugs even though we have tried to minimize +this by testing on several examples. Please let us know if you catch +bugs. + +(0.1.5: 7 July 2009, r373) + + + +Beta Release 0.1.4 (21 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Added the 'rmdupse' command: removing duplicates for SE reads. + + * Fixed a critical bug in the indel caller: clipped alignments are not + processed correctly. + + * Fixed a bug in the tview: gapped alignment may be incorrectly + displayed. + + * Unified the interface to BAM and SAM I/O. This is done by + implementing a wrapper on top of the old APIs and therefore old APIs + are still valid. The new I/O APIs also recognize the @SQ header + lines. + + * Generate the MD tag. + + * Generate "=" bases. However, the indel caller will not work when "=" + bases are present. + + * Enhanced support of color-read display (by Nils Homer). + + * Implemented the GNU building system. However, currently the building + system does not generate libbam.a. We will improve this later. For + the time being, `make -f Makefile.generic' is preferred. + + * Fixed a minor bug in pileup: the first read in a chromosome may be + skipped. + + * Fixed bugs in bam_aux.c. These bugs do not affect other components as + they were not used previously. + + * Output the 'SM' tag from maq2sam. 
+ +(0.1.4: 21 May 2009, r297) + + + +Beta Release 0.1.3 (15 April, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in SAMtools: + + * SAMtools is more consistent with the specification: a) '*' in the + QUAL field is allowed; b) the field separator is TAB only and SPACE + is treated as a character in a field; c) empty header is allowed. + + * Implemented GLFv3 support in pileup. + + * Fixed a severe bug in fixmate: strand information is wrongly + overwritten. + + * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are + not correctly retrieved sometimes. + + * Fixed a bug in rmdup: segfault if unmapped reads are present. + + * Move indel_filter.pl to samtools.pl and improved the filtering by + checking the actual number of alignments containing indels. The indel + pileup line is also changed a little to make this filtration easier. + + * Fixed a minor bug in indexing: the bin number of an unmapped read is + wrongly calculated. + + * Added `flagstat' command to show statistics on the FLAG field. + + * Improved indel caller by setting the maximum window size in local + realignment. + +Changes in other utilities: + + * Fixed a bug in maq2sam: a tag name is obsolete. + + * Improvement to wgsim: a) added support for SOLiD read simulation; b) + show the number of substitutions/indels/errors in read name; c) + considerable code clean up. + + * Various converters: improved functionality in general. + + * Updated the example SAM due to the previous bug in fixmate. + +(0.1.3: 15 April 2009, r227) + + + +Beta Release 0.1.2 (28 January, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in SAMtools: + + * Implemented a Bayesian indel caller. The new caller generate scores + and genotype and is potentially more accurate than Maq's indel + caller. The pileup format is also changed accordingly. + + * Implemented rmdup command: remove potential PCR duplicates. 
Note that + this command ONLY works for FR orientation and requires ISIZE is + correctly set. + + * Added fixmate command: fill in mate coordinates, ISIZE and mate + related flags from a name-sorted alignment. + + * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved. + + * Allow to select reads shown in the pileup output with a mask. + + * Generate GLFv2 from pileup. + + * Added two more flags for flagging PCR/optical duplicates and for QC + failure. + + * Fixed a bug in sort command: name sorting for large alignment did not + work. + + * Allow to completely disable RAZF (using Makefile.lite) as some people + have problem to compile it. + + * Fixed a bug in import command when there are reads without + coordinates. + + * Fixed a bug in tview: clipping broke the alignment viewer. + + * Fixed a compiling error when _NO_CURSES is applied. + + * Fixed a bug in merge command. + +Changes in other utilities: + + * Added wgsim, a paired-end reads simulator. Wgsim was adapted from + maq's reads simulator. Colin Hercus further improved it to allow + longer indels. + + * Added wgsim_eval.pl, a script that evaluates the accuracy of + alignment on reads generated by wgsim. + + * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not + work properly when multiple hits are output. + + * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will + be retained when multiple hits are present. + + * Fixed a bug in export2sam.pl for QC reads. + + * Support RG tag at MAQ->SAM converter. + + * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and + indel are not properly handled, though. + + * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the + default Illumina output. + +(0.1.2: 28 January 2008; r116) + + + +Beta Release 0.1.1 (22 December, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is the first public release of samtools. 
For more information, +please check the manual page `samtools.1' and the samtools website +http://samtools.sourceforge.net \ No newline at end of file diff --git a/bam.c b/bam.c new file mode 100644 index 0000000..1ff4a5a --- /dev/null +++ b/bam.c @@ -0,0 +1,290 @@ +#include +#include +#include +#include "bam.h" +#include "bam_endian.h" +#include "kstring.h" + +int bam_is_be = 0; + +/************************** + * CIGAR related routines * + **************************/ + +int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg) +{ + unsigned k; + int32_t x = c->pos, y = 0; + int state = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; // operation + int l = cigar[k] >> BAM_CIGAR_SHIFT; // length + if (state == 0 && (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CINS) && x + l > pos) { + reg->tbeg = x; reg->qbeg = y; reg->cbeg = k; + state = 1; + } + if (op == BAM_CMATCH) { x += l; y += l; } + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + if (state == 1 && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CREF_SKIP || k == c->n_cigar - 1)) { + reg->tend = x; reg->qend = y; reg->cend = k; + } + } + return state? 
0 : -1; +} + +uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) +{ + uint32_t k, end; + end = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) + end += cigar[k] >> BAM_CIGAR_SHIFT; + } + return end; +} + +int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) +{ + uint32_t k; + int32_t l = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) + l += cigar[k] >> BAM_CIGAR_SHIFT; + } + return l; +} + +/******************** + * BAM I/O routines * + ********************/ + +bam_header_t *bam_header_init() +{ + bam_is_be = bam_is_big_endian(); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); +} + +void bam_header_destroy(bam_header_t *header) +{ + int32_t i; + extern void bam_destroy_header_hash(bam_header_t *header); + if (header == 0) return; + if (header->target_name) { + for (i = 0; i < header->n_targets; ++i) + free(header->target_name[i]); + free(header->target_name); + free(header->target_len); + } + free(header->text); +#ifndef BAM_NO_HASH + if (header->rg2lib) bam_strmap_destroy(header->rg2lib); + bam_destroy_header_hash(header); +#endif + free(header); +} + +bam_header_t *bam_header_read(bamFile fp) +{ + bam_header_t *header; + char buf[4]; + int32_t i, name_len; + // read "BAM1" + if (bam_read(fp, buf, 4) != 4) return 0; + if (strncmp(buf, "BAM\001", 4)) { + fprintf(stderr, "[bam_header_read] wrong header\n"); + return 0; + } + header = bam_header_init(); + // read plain text and the number of reference sequences + bam_read(fp, &header->l_text, 4); + if (bam_is_be) bam_swap_endian_4p(&header->l_text); + header->text = (char*)calloc(header->l_text + 1, 1); + bam_read(fp, header->text, header->l_text); + bam_read(fp, &header->n_targets, 4); + if (bam_is_be) bam_swap_endian_4p(&header->n_targets); + // read reference sequence names and 
lengths + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); + for (i = 0; i != header->n_targets; ++i) { + bam_read(fp, &name_len, 4); + if (bam_is_be) bam_swap_endian_4p(&name_len); + header->target_name[i] = (char*)calloc(name_len, 1); + bam_read(fp, header->target_name[i], name_len); + bam_read(fp, &header->target_len[i], 4); + if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); + } + return header; +} + +int bam_header_write(bamFile fp, const bam_header_t *header) +{ + char buf[4]; + int32_t i, name_len, x; + // write "BAM1" + strncpy(buf, "BAM\001", 4); + bam_write(fp, buf, 4); + // write plain text and the number of reference sequences + if (bam_is_be) { + x = bam_swap_endian_4(header->l_text); + bam_write(fp, &x, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + x = bam_swap_endian_4(header->n_targets); + bam_write(fp, &x, 4); + } else { + bam_write(fp, &header->l_text, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + bam_write(fp, &header->n_targets, 4); + } + // write sequence names and lengths + for (i = 0; i != header->n_targets; ++i) { + char *p = header->target_name[i]; + name_len = strlen(p) + 1; + if (bam_is_be) { + x = bam_swap_endian_4(name_len); + bam_write(fp, &x, 4); + } else bam_write(fp, &name_len, 4); + bam_write(fp, p, name_len); + if (bam_is_be) { + x = bam_swap_endian_4(header->target_len[i]); + bam_write(fp, &x, 4); + } else bam_write(fp, &header->target_len[i], 4); + } + return 0; +} + +static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint8_t *s; + uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); + while (s < data + data_len) { + uint8_t type; + s += 2; // skip key + type = toupper(*s); ++s; // skip type + if (type == 'C' || 
type == 'A') ++s; + else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } + else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } + else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } +} + +int bam_read1(bamFile fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + + assert(BAM_CORE_SIZE == 32); + if ((ret = bam_read(fp, &block_len, 4)) != 4) { + if (ret == 0) return -1; // normal end-of-file + else return -2; // truncated + } + if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3; + if (bam_is_be) { + bam_swap_endian_4p(&block_len); + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->data_len = block_len - BAM_CORE_SIZE; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; + b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + return 4 + block_len; +} + +inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y; + int i; + assert(BAM_CORE_SIZE == 32); + x[0] = c->tid; + x[1] = c->pos; + x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; + x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + x[4] = c->l_qseq; + x[5] = c->mtid; + x[6] = c->mpos; + x[7] = c->isize; + if (bam_is_be) { + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + y = block_len; + bam_write(fp, bam_swap_endian_4p(&y), 4); + swap_endian_data(c, data_len, data); + } else bam_write(fp, &block_len, 
4); + bam_write(fp, x, BAM_CORE_SIZE); + bam_write(fp, data, data_len); + if (bam_is_be) swap_endian_data(c, data_len, data); + return 4 + block_len; +} + +int bam_write1(bamFile fp, const bam1_t *b) +{ + return bam_write1_core(fp, &b->core, b->data_len, b->data); +} + +char *bam_format1(const bam_header_t *header, const bam1_t *b) +{ + uint8_t *s = bam1_seq(b), *t = bam1_qual(b); + int i; + const bam1_core_t *c = &b->core; + kstring_t str; + str.l = str.m = 0; str.s = 0; + + ksprintf(&str, "%s\t%d\t", bam1_qname(b), c->flag); + if (c->tid < 0) kputs("*\t", &str); + else ksprintf(&str, "%s\t", header->target_name[c->tid]); + ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); + if (c->n_cigar == 0) kputc('*', &str); + else { + for (i = 0; i < c->n_cigar; ++i) + ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); + } + kputc('\t', &str); + if (c->mtid < 0) kputs("*\t", &str); + else if (c->mtid == c->tid) kputs("=\t", &str); + else ksprintf(&str, "%s\t", header->target_name[c->mtid]); + ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); + for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); + kputc('\t', &str); + if (t[0] == 0xff) kputc('*', &str); + else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); + s = bam1_aux(b); + while (s < b->data + b->data_len) { + uint8_t type, key[2]; + key[0] = s[0]; key[1] = s[1]; + s += 2; type = *s; ++s; + ksprintf(&str, "\t%c%c:", key[0], key[1]); + if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } + else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } + else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; } + else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } + else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } + else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } + else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } + else if (type == 'f') { 
ksprintf(&str, "f:%g", *(float*)s); s += 4; } + else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } + else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } + } + return str.s; +} + +void bam_view1(const bam_header_t *header, const bam1_t *b) +{ + char *s = bam_format1(header, b); + printf("%s\n", s); + free(s); +} diff --git a/bam.h b/bam.h new file mode 100644 index 0000000..83c03ad --- /dev/null +++ b/bam.h @@ -0,0 +1,714 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef BAM_BAM_H +#define BAM_BAM_H + +/*! + @header + + BAM library provides I/O and various operations on manipulating files + in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) + format. 
It now supports importing from or exporting to TAM, sorting, + merging, generating pileup, and quick retrieval of reads overlapping + a specified region. + + @copyright Genome Research Ltd. + */ + +#include +#include +#include +#include + +#define _IOLIB 2 + +#if _IOLIB == 1 && !defined(_NO_RAZF) +#define BAM_TRUE_OFFSET +#include "razf.h" +/*! @abstract BAM file handler */ +typedef RAZF *bamFile; +#define bam_open(fn, mode) razf_open(fn, mode) +#define bam_dopen(fd, mode) razf_dopen(fd, mode) +#define bam_close(fp) razf_close(fp) +#define bam_read(fp, buf, size) razf_read(fp, buf, size) +#define bam_write(fp, buf, size) razf_write(fp, buf, size) +#define bam_tell(fp) razf_tell(fp) +#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir) +#elif _IOLIB == 2 +#define BAM_VIRTUAL_OFFSET16 +#include "bgzf.h" +/*! @abstract BAM file handler */ +typedef BGZF *bamFile; +#define bam_open(fn, mode) bgzf_open(fn, mode) +#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) +#define bam_close(fp) bgzf_close(fp) +#define bam_read(fp, buf, size) bgzf_read(fp, buf, size) +#define bam_write(fp, buf, size) bgzf_write(fp, buf, size) +#define bam_tell(fp) bgzf_tell(fp) +#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) +#elif _IOLIB == 3 +#define BAM_VIRTUAL_OFFSET16 +#include "razf.h" +/*! @abstract BAM file handler */ +typedef RAZF *bamFile; +#define bam_open(fn, mode) razf_open2(fn, mode) +#define bam_dopen(fd, mode) razf_dopen2(fd, mode) +#define bam_close(fp) razf_close(fp) +#define bam_read(fp, buf, size) razf_read(fp, buf, size) +#define bam_write(fp, buf, size) razf_write(fp, buf, size) +#define bam_tell(fp) razf_tell2(fp) +#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir) +#endif + +/*! @typedef + @abstract Structure for the alignment header. 
+ @field n_targets number of reference sequences + @field target_name names of the reference sequences + @field target_len lengths of the reference sequences + @field hash hash table for fast name lookup + @field rg2lib hash table for @RG-ID -> LB lookup + @field l_text length of the plain text in the header + @field text plain text + + @discussion Field hash points to null by default. It is a private + member. + */ +typedef struct { + int32_t n_targets; + char **target_name; + uint32_t *target_len; + void *hash, *rg2lib; + int l_text; + char *text; +} bam_header_t; + +/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ +#define BAM_FPAIRED 1 +/*! @abstract the read is mapped in a proper pair */ +#define BAM_FPROPER_PAIR 2 +/*! @abstract the read itself is unmapped; conflicts with BAM_FPROPER_PAIR */ +#define BAM_FUNMAP 4 +/*! @abstract the mate is unmapped */ +#define BAM_FMUNMAP 8 +/*! @abstract the read is mapped to the reverse strand */ +#define BAM_FREVERSE 16 +/*! @abstract the mate is mapped to the reverse strand */ +#define BAM_FMREVERSE 32 +/*! @abstract this is read1 */ +#define BAM_FREAD1 64 +/*! @abstract this is read2 */ +#define BAM_FREAD2 128 +/*! @abstract not primary alignment */ +#define BAM_FSECONDARY 256 +/*! @abstract QC failure */ +#define BAM_FQCFAIL 512 +/*! @abstract optical or PCR duplicate */ +#define BAM_FDUP 1024 + +/*! @abstract default mask for pileup */ +#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) + +#define BAM_CORE_SIZE sizeof(bam1_core_t) + +/** + * Describes how CIGAR operation/length is packed in a 32-bit integer. + */ +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +/* + CIGAR operations. + */ +/*! @abstract CIGAR: match */ +#define BAM_CMATCH 0 +/*! @abstract CIGAR: insertion to the reference */ +#define BAM_CINS 1 +/*! @abstract CIGAR: deletion from the reference */ +#define BAM_CDEL 2 +/*! 
@abstract CIGAR: skip on the reference (e.g. spliced alignment) */ +#define BAM_CREF_SKIP 3 +/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ +#define BAM_CSOFT_CLIP 4 +/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ +#define BAM_CHARD_CLIP 5 +/*! @abstract CIGAR: padding */ +#define BAM_CPAD 6 + +/*! @typedef + @abstract Structure for core alignment information. + @field tid chromosome ID, defined by bam_header_t + @field pos 0-based leftmost coordinate + @field strand strand; 0 for forward and 1 otherwise + @field bin bin calculated by bam_reg2bin() + @field qual mapping quality + @field l_qname length of the query name + @field flag bitwise flag + @field n_cigar number of CIGAR operations + @field l_qseq length of the query sequence (read) + */ +typedef struct { + int32_t tid; + int32_t pos; + uint32_t bin:16, qual:8, l_qname:8; + uint32_t flag:16, n_cigar:16; + int32_t l_qseq; + int32_t mtid; + int32_t mpos; + int32_t isize; +} bam1_core_t; + +/*! @typedef + @abstract Structure for one alignment. + @field core core information about the alignment + @field l_aux length of auxiliary data + @field data_len current length of bam1_t::data + @field m_data maximum length of bam1_t::data + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux + + @discussion Notes: + + 1. qname is NUL-terminated and core.l_qname includes the trailing '\0'. + 2. l_qseq is calculated from the total length of an alignment block + on reading or from CIGAR. + */ +typedef struct { + bam1_core_t core; + int l_aux, data_len, m_data; + uint8_t *data; +} bam1_t; + +#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) +#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) + +/*! @function + @abstract Get the CIGAR array + @param b pointer to an alignment + @return pointer to the CIGAR array + + @discussion In the CIGAR array, each element is a 32-bit integer. 
The + lower 4 bits give a CIGAR operation and the higher 28 bits keep the + length of a CIGAR. + */ +#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) + +/*! @function + @abstract Get the name of the query + @param b pointer to an alignment + @return pointer to the name string, null terminated + */ +#define bam1_qname(b) ((char*)((b)->data)) + +/*! @function + @abstract Get query sequence + @param b pointer to an alignment + @return pointer to sequence + + @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, + 8 for T and 15 for N. Two bases are packed in one byte with the base + at the higher 4 bits having smaller coordinate on the read. It is + recommended to use bam1_seqi() macro to get the base. + */ +#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) + +/*! @function + @abstract Get query quality + @param b pointer to an alignment + @return pointer to quality string + */ +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2) + +/*! @function + @abstract Get a base on read + @param s Query sequence returned by bam1_seq() + @param i The i-th position, 0-based + @return 4-bit integer representing the base. + */ +#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) + +/*! @function + @abstract Get auxiliary data + @param b pointer to an alignment + @return pointer to the concatenated auxiliary data + */ +#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) + +#ifndef kroundup32 +/*! @function + @abstract Round an integer to the next closest power-2 integer. + @param x integer to be rounded (in place) + @discussion x will be modified. + */ +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +/*! + @abstract Whether the machine is big-endian; modified only in + bam_header_init(). + */ +extern int bam_is_be; + +/*! 
@abstract Table for converting a nucleotide character to the 4-bit encoding. */ +extern unsigned char bam_nt16_table[256]; + +/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */ +extern char *bam_nt16_rev_table; + +extern char bam_nt16_nt4_table[]; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! @abstract TAM file handler */ + typedef struct __tamFile_t *tamFile; + + /*! + @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. + @param fn SAM file name + @return SAM file handler + */ + tamFile sam_open(const char *fn); + + /*! + @abstract Close a SAM file handler + @param fp SAM file handler + */ + void sam_close(tamFile fp); + + /*! + @abstract Read one alignment from a SAM file handler + @param fp SAM file handler + @param header header information (ordered names of chromosomes) + @param b read alignment; all members in b will be updated + @return 0 if successful; otherwise negative + */ + int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); + + /*! + @abstract Read header information from a TAB-delimited list file. + @param fn_list file name for the list + @return a pointer to the header structure + + @discussion Each line in this file consists of chromosome name and + the length of chromosome. + */ + bam_header_t *sam_header_read2(const char *fn_list); + + /*! + @abstract Read header from a SAM file (if present) + @param fp SAM file handler + @return pointer to header struct; 0 if no @SQ lines available + */ + bam_header_t *sam_header_read(tamFile fp); + + /*! + @abstract Parse @SQ lines and update a header struct + @param h pointer to the header struct to be updated + @return number of target sequences + + @discussion bam_header_t::{n_targets,target_len,target_name} will + be destroyed in the first place. + */ + int sam_header_parse(bam_header_t *h); + + /*! 
+ @abstract Parse @RG lines and update a header struct + @param h pointer to the header struct to be updated + @return number of @RG lines + + @discussion bam_header_t::rg2lib will be destroyed in the first + place. + */ + int sam_header_parse_rg(bam_header_t *h); + +#define sam_write1(header, b) bam_view1(header, b) + + int bam_strmap_put(void *strmap, const char *rg, const char *lib); + const char *bam_strmap_get(const void *strmap, const char *rg); + void *bam_strmap_dup(const void*); + void *bam_strmap_init(); + void bam_strmap_destroy(void *strmap); + + /*! + @abstract Initialize a header structure. + @return the pointer to the header structure + + @discussion This function also modifies the global variable + bam_is_be. + */ + bam_header_t *bam_header_init(); + + /*! + @abstract Destroy a header structure. + @param header pointer to the header + */ + void bam_header_destroy(bam_header_t *header); + + /*! + @abstract Read a header structure from BAM. + @param fp BAM file handler, opened by bam_open() + @return pointer to the header structure + + @discussion The file position indicator must be placed at the + beginning of the file. Upon success, the position indicator will + be set at the start of the first alignment. + */ + bam_header_t *bam_header_read(bamFile fp); + + /*! + @abstract Write a header structure to BAM. + @param fp BAM file handler + @param header pointer to the header structure + @return always 0 currently + */ + int bam_header_write(bamFile fp, const bam_header_t *header); + + /*! + @abstract Read an alignment from BAM. + @param fp BAM file handler + @param b read alignment; all members are updated. + @return number of bytes read from the file + + @discussion The file position indicator must be + placed right before an alignment. Upon success, this function + will set the position indicator to the start of the next + alignment. This function is not affected by the machine + endianness. + */ + int bam_read1(bamFile fp, bam1_t *b); + + /*! 
+ @abstract Write an alignment to BAM. + @param fp BAM file handler + @param c pointer to the bam1_core_t structure + @param data_len total length of variable size data related to + the alignment + @param data pointer to the concatenated data + @return number of bytes written to the file + + @discussion This function is not affected by the machine + endianness. + */ + int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data); + + /*! + @abstract Write an alignment to BAM. + @param fp BAM file handler + @param b alignment to write + @return number of bytes written to the file + + @discussion It is equivalent to: + bam_write1_core(fp, &b->core, b->data_len, b->data) + */ + int bam_write1(bamFile fp, const bam1_t *b); + + /*! @function + @abstract Allocate a zero-initialized bam1_t struct + */ +#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) + + /*! @function + @abstract Free the memory allocated for an alignment. + @param b pointer to an alignment + */ +#define bam_destroy1(b) do { \ + free((b)->data); free(b); \ + } while (0) + + /*! + @abstract Format a BAM record in the SAM format + @param header pointer to the header structure + @param b alignment to print + @return a pointer to the SAM string; release it with free(), as bam_view1() does + */ + char *bam_format1(const bam_header_t *header, const bam1_t *b); + + /*! @typedef + @abstract Structure for one alignment covering the pileup position. + @field b pointer to the alignment + @field qpos position of the read base at the pileup site, 0-based + @field indel indel length; 0 for no indel, positive for ins and negative for del + @field is_del 1 iff the base on the padded read is a deletion + @field level the level of the read in the "viewer" mode + + @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The + difference between the two functions is that the former does not + set bam_pileup1_t::level, while the latter does. Level helps the + implementation of alignment viewers, but calculating this has some + overhead. 
+ */ + typedef struct { + bam1_t *b; + int32_t qpos; + int indel, level; + uint32_t is_del:1, is_head:1, is_tail:1; + } bam_pileup1_t; + + struct __bam_plbuf_t; + /*! @abstract pileup buffer */ + typedef struct __bam_plbuf_t bam_plbuf_t; + + void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + + /*! @typedef + @abstract Type of function to be called by bam_plbuf_push(). + @param tid chromosome ID as is defined in the header + @param pos start coordinate of the alignment, 0-based + @param n number of elements in pl array + @param pl array of alignments + @param data user provided data + @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t. + */ + typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); + + /*! + @abstract Reset a pileup buffer for another pileup process + @param buf the pileup buffer to be reset + */ + void bam_plbuf_reset(bam_plbuf_t *buf); + + /*! + @abstract Initialize a buffer for pileup. + @param func function to be called by bam_pileup_core() + @param data user provided data + @return pointer to the pileup buffer + */ + bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); + + /*! + @abstract Destroy a pileup buffer. + @param buf pointer to the pileup buffer + */ + void bam_plbuf_destroy(bam_plbuf_t *buf); + + /*! + @abstract Push an alignment to the pileup buffer. + @param b alignment to be pushed + @param buf pileup buffer + @see bam_plbuf_init() + @return always 0 currently + + @discussion If all the alignments covering a particular site have + been collected, this function will call the user defined function + as is provided to bam_plbuf_init(). The coordinate of the site and + all the alignments will be transferred to the user defined + function as function parameters. + + When all the alignments are pushed to the buffer, this function + needs to be called with b equal to NULL. This will flush the + buffer. 
A pileup buffer can only be reused when bam_plbuf_reset() + is called. + */ + int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); + + struct __bam_lplbuf_t; + typedef struct __bam_lplbuf_t bam_lplbuf_t; + + void bam_lplbuf_reset(bam_lplbuf_t *buf); + + /*! @abstract bam_plbuf_init() equivalent with level calculated. */ + bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data); + + /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */ + void bam_lplbuf_destroy(bam_lplbuf_t *tv); + + /*! @abstract bam_plbuf_push() equivalent with level calculated. */ + int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); + + /*! @abstract bam_plbuf_file() equivalent with level calculated. */ + int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); + + struct __bam_index_t; + typedef struct __bam_index_t bam_index_t; + + /*! + @abstract Build index for a BAM file. + @discussion Index file "fn.bai" will be created. + @param fn name of the BAM file + @return always 0 currently + */ + int bam_index_build(const char *fn); + + /*! + @abstract Load index from file "fn.bai". + @param fn name of the BAM file (NOT the index file) + @return pointer to the index structure + */ + bam_index_t *bam_index_load(const char *fn); + + /*! + @abstract Destroy an index structure. + @param idx pointer to the index structure + */ + void bam_index_destroy(bam_index_t *idx); + + /*! @typedef + @abstract Type of function to be called by bam_fetch(). + @param b the alignment + @param data user provided data + */ + typedef int (*bam_fetch_f)(const bam1_t *b, void *data); + + /*! + @abstract Retrieve the alignments that are overlapped with the + specified region. + + @discussion A user defined function will be called for each + retrieved alignment ordered by its start position. 
+ + @param fp BAM file handler + @param idx pointer to the alignment index + @param tid chromosome ID as is defined in the header + @param beg start coordinate, 0-based + @param end end coordinate, 0-based + @param data user provided data (will be transferred to func) + @param func user defined function + */ + int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); + + /*! + @abstract Parse a region in the format: "chr2:100,000-200,000". + @discussion bam_header_t::hash will be initialized if empty. + @param header pointer to the header structure + @param str string to be parsed + @param ref_id the returned chromosome ID + @param begin the returned start coordinate + @param end the returned end coordinate + @return 0 on success; -1 on failure + */ + int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + + /*! + @abstract Retrieve data of a tag + @param b pointer to an alignment struct + @param tag two-character tag to be retrieved + + @return pointer to the type and data. The first character is the + type that can be 'iIsScCdfAZH'. + + @discussion Use bam_aux2?() series to convert the returned data to + the corresponding type. + */ + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); + + int32_t bam_aux2i(const uint8_t *s); + float bam_aux2f(const uint8_t *s); + double bam_aux2d(const uint8_t *s); + char bam_aux2A(const uint8_t *s); + char *bam_aux2Z(const uint8_t *s); + + void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); + + uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() + + /*! + @abstract Calculate the rightmost coordinate of an alignment on the + reference genome. 
+ + @param c pointer to the bam1_core_t structure + @param cigar the corresponding CIGAR array (from bam1_t::cigar) + @return the rightmost coordinate, 0-based + */ + uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar); + + /*! + @abstract Calculate the length of the query sequence from CIGAR. + @param c pointer to the bam1_core_t structure + @param cigar the corresponding CIGAR array (from bam1_t::cigar) + @return length of the query sequence + */ + int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar); + + typedef struct { + int32_t qbeg, qend; + int32_t tbeg, tend; + int32_t cbeg, cend; + } bam_segreg_t; + + int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg); + +#ifdef __cplusplus +} +#endif + +/*! + @abstract Calculate the minimum bin that contains a region [beg,end). + @param beg start of the region, 0-based + @param end end of the region, 0-based + @return bin + */ +static inline int bam_reg2bin(uint32_t beg, uint32_t end) +{ + --end; + if (beg>>14 == end>>14) return 4681 + (beg>>14); + if (beg>>17 == end>>17) return 585 + (beg>>17); + if (beg>>20 == end>>20) return 73 + (beg>>20); + if (beg>>23 == end>>23) return 9 + (beg>>23); + if (beg>>26 == end>>26) return 1 + (beg>>26); + return 0; +} + +/*! + @abstract Copy an alignment + @param bdst destination alignment struct + @param bsrc source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) +{ + uint8_t *data = bdst->data; + int m_data = bdst->m_data; // backup data and m_data + if (m_data < bsrc->m_data) { // double the capacity + m_data = bsrc->m_data; kroundup32(m_data); + data = (uint8_t*)realloc(data, m_data); + } + memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data + *bdst = *bsrc; // copy the rest + // restore the backup + bdst->m_data = m_data; + bdst->data = data; + return bdst; +} + +/*! 
+ @abstract Duplicate an alignment + @param src source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_dup1(const bam1_t *src) +{ + bam1_t *b; + b = bam_init1(); + *b = *src; + b->m_data = b->data_len; + b->data = (uint8_t*)calloc(b->data_len, 1); + memcpy(b->data, src->data, b->data_len); + return b; +} + +#endif diff --git a/bam_aux.c b/bam_aux.c new file mode 100644 index 0000000..7482500 --- /dev/null +++ b/bam_aux.c @@ -0,0 +1,232 @@ +#include +#include "bam.h" +#include "khash.h" +typedef char *str_p; +KHASH_MAP_INIT_STR(s, int) +KHASH_MAP_INIT_STR(r2l, str_p) + +void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) +{ + int ori_len = b->data_len; + b->data_len += 3 + len; + b->l_aux += 3 + len; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; + b->data[ori_len + 2] = type; + memcpy(b->data + ori_len + 3, data, len); +} + +uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) +{ + return bam_aux_get(b, tag); +} + +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) +{ + uint8_t *s; + int y = tag[0]<<8 | tag[1]; + s = bam1_aux(b); + while (s < b->data + b->data_len) { + int type, x = (int)s[0]<<8 | s[1]; + s += 2; + if (x == y) return s; + type = toupper(*s); ++s; + if (type == 'C') ++s; + else if (type == 'S') s += 2; + else if (type == 'I' || type == 'F') s += 4; + else if (type == 'D') s += 8; + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } + return 0; +} + +void bam_init_header_hash(bam_header_t *header) +{ + if (header->hash == 0) { + int ret, i; + khiter_t iter; + khash_t(s) *h; + header->hash = h = kh_init(s); + for (i = 0; i < header->n_targets; ++i) { + iter = kh_put(s, h, header->target_name[i], &ret); + kh_value(h, iter) = i; + } + } +} + +void bam_destroy_header_hash(bam_header_t *header) 
+{ + if (header->hash) + kh_destroy(s, (khash_t(s)*)header->hash); +} + +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) +{ + khint_t k; + khash_t(s) *h = (khash_t(s)*)header->hash; + k = kh_get(s, h, seq_name); + return k == kh_end(h)? -1 : kh_value(h, k); +} + +int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + khiter_t iter; + khash_t(s) *h; + + bam_init_header_hash(header); + h = (khash_t(s)*)header->hash; + + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the ref_id */ + if (iter == kh_end(h)) { // name not found + *ref_id = -1; free(s); + return -1; + } + *ref_id = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return -1; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) { + fprintf(stderr, "[bam_parse_region] invalid region.\n"); + return -1; + } + return 0; +} + +int32_t bam_aux2i(const uint8_t *s) +{ + int type; + if (s == 0) return 0; + type = *s++; + if (type == 'c') return (int32_t)*(int8_t*)s; + else if (type == 'C') return (int32_t)*(uint8_t*)s; + else if (type == 's') return (int32_t)*(int16_t*)s; + else if (type == 'S') return (int32_t)*(uint16_t*)s; + else if (type == 'i' || type == 'I') return *(int32_t*)s; + else return 0; +} + +float bam_aux2f(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0.0; + if (type == 'f') return *(float*)s; + else return 0.0; +} + +double bam_aux2d(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0.0; + if (type == 'd') return *(double*)s; + 
else return 0.0; +} + +char bam_aux2A(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0; + if (type == 'A') return *(char*)s; + else return 0; +} + +char *bam_aux2Z(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0; + if (type == 'Z' || type == 'H') return (char*)s; + else return 0; +} + +/****************** + * rg2lib related * + ******************/ + +int bam_strmap_put(void *rg2lib, const char *rg, const char *lib) +{ + int ret; + khint_t k; + khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; + char *key; + if (h == 0) return 1; + key = strdup(rg); + k = kh_put(r2l, h, key, &ret); + if (ret) kh_val(h, k) = strdup(lib); + else { + fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg); + free(key); + } + return 0; +} + +const char *bam_strmap_get(const void *rg2lib, const char *rg) +{ + const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; + khint_t k; + if (h == 0) return 0; + k = kh_get(r2l, h, rg); + if (k != kh_end(h)) return (const char*)kh_val(h, k); + else return 0; +} + +void *bam_strmap_dup(const void *rg2lib) +{ + const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; + khash_t(r2l) *g; + khint_t k, l; + int ret; + if (h == 0) return 0; + g = kh_init(r2l); + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + char *key = strdup(kh_key(h, k)); + l = kh_put(r2l, g, key, &ret); + kh_val(g, l) = strdup(kh_val(h, k)); + } + } + return g; +} + +void *bam_strmap_init() +{ + return (void*)kh_init(r2l); +} + +void bam_strmap_destroy(void *rg2lib) +{ + khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; + khint_t k; + if (h == 0) return; + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free((char*)kh_key(h, k)); free(kh_val(h, k)); + } + } + kh_destroy(r2l, h); +} diff --git a/bam_color.c b/bam_color.c new file mode 100644 index 0000000..75aedd6 --- /dev/null +++ b/bam_color.c @@ -0,0 +1,127 @@ +#include +#include "bam.h" + +/*! 
+ @abstract Get the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color + + @discussion Returns 0 if no color information is found. + */ +char bam_aux_getCSi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + + // return 0 if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) i = strlen(cs) - 1 - i; + else i++; + return cs[i]; +} + +/*! + @abstract Get the color quality of the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color quality + + @discussion Returns 0 if no color information is found. + */ +char bam_aux_getCQi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CQ"); + char *cq = NULL; + + // return 0 if the tag was not found + if(0 == c) return 0; + + cq = bam_aux2Z(c); + // adjust for strandedness + if(bam1_strand(b)) i = strlen(cq) - 1 - i; + return cq[i]; +} + +char bam_aux_nt2int(char a) +{ + switch(toupper(a)) { + case 'A': + return 0; + break; + case 'C': + return 1; + break; + case 'G': + return 2; + break; + case 'T': + return 3; + break; + default: + return 4; + break; + } +} + +char bam_aux_ntnt2cs(char a, char b) +{ + a = bam_aux_nt2int(a); + b = bam_aux_nt2int(b); + if(4 == a || 4 == b) return '4'; + return "0123"[(int)(a ^ b)]; +} + +/*! + @abstract Get the color error profile at the given position + @param b pointer to an alignment + @return the original color if the color was an error, '-' (dash) otherwise + + @discussion Returns 0 if no color information is found. 
+ */ +char bam_aux_getCEi(bam1_t *b, int i) +{ + int cs_i; + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + char prev_b, cur_b; + char cur_color, cor_color; + + // return the base if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) { //reverse strand + cs_i = strlen(cs) - 1 - i; + // get current color + cur_color = cs[cs_i]; + // get previous base + prev_b = (0 == cs_i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + else { + cs_i=i+1; + // get current color + cur_color = cs[cs_i]; + // get previous base + prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + + // corrected color + cor_color = bam_aux_ntnt2cs(prev_b, cur_b); + + if(cur_color == cor_color) { + return '-'; + } + else { + return cur_color; + } +} diff --git a/bam_endian.h b/bam_endian.h new file mode 100644 index 0000000..0fc74a8 --- /dev/null +++ b/bam_endian.h @@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 
16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif diff --git a/bam_import.c b/bam_import.c new file mode 100644 index 0000000..fccaa02 --- /dev/null +++ b/bam_import.c @@ -0,0 +1,475 @@ +#include +#include +#include +#include +#include +#include +#include +#include "kstring.h" +#include "bam.h" +#include "kseq.h" +#include "khash.h" + +KSTREAM_INIT(gzFile, gzread, 8192) +KHASH_MAP_INIT_STR(ref, uint64_t) + +void bam_init_header_hash(bam_header_t *header); +void bam_destroy_header_hash(bam_header_t *header); +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + +unsigned char bam_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"; + +struct __tamFile_t { + gzFile fp; + kstream_t *ks; + kstring_t *str; + uint64_t n_lines; + int is_first; +}; + +char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only +{ + char **list = 0, *s; + int n = 0, dret, m = 0; + 
gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + kstream_t *ks; + kstring_t *str; + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, '\n', str, &dret) > 0) { + if (n == m) { + m = m? m << 1 : 16; + list = (char**)realloc(list, m * sizeof(char*)); + } + if (str->s[str->l-1] == '\r') + str->s[--str->l] = '\0'; + s = list[n++] = (char*)calloc(str->l + 1, 1); + strcpy(s, str->s); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + *_n = n; + return list; +} + +static bam_header_t *hash2header(const kh_ref_t *hash) +{ + bam_header_t *header; + khiter_t k; + header = bam_header_init(); + header->n_targets = kh_size(hash); + header->target_name = (char**)calloc(kh_size(hash), sizeof(char*)); + header->target_len = (uint32_t*)calloc(kh_size(hash), 4); + for (k = kh_begin(hash); k != kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + int i = (int)kh_value(hash, k); + header->target_name[i] = (char*)kh_key(hash, k); + header->target_len[i] = kh_value(hash, k)>>32; + } + } + bam_init_header_hash(header); + return header; +} +bam_header_t *sam_header_read2(const char *fn) +{ + bam_header_t *header; + int c, dret, ret; + gzFile fp; + kstream_t *ks; + kstring_t *str; + kh_ref_t *hash; + khiter_t k; + hash = kh_init(ref); + fp = (strcmp(fn, "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + assert(fp); + ks = ks_init(fp); + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + while (ks_getuntil(ks, 0, str, &dret) > 0) { + char *s = strdup(str->s); + int len, i; + i = kh_size(hash); + ks_getuntil(ks, 0, str, &dret); + len = atoi(str->s); + k = kh_put(ref, hash, s, &ret); + kh_value(hash, k) = (uint64_t)len<<32 | i; + if (dret != '\n') + while ((c = ks_getc(ks)) != '\n' && c != -1); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); + header = hash2header(hash); + kh_destroy(ref, hash); + return header; +} +static inline uint8_t *alloc_data(bam1_t *b, int size) +{ + if (b->m_data < size) { + b->m_data = size; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + return b->data; +} +static inline void parse_error(int64_t n_lines, const char * __restrict msg) +{ + fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg); + abort(); +} +static inline void append_text(bam_header_t *header, kstring_t *str) +{ + int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null + kroundup32(x); kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. 
+ header->l_text += str->l + 1; + header->text[header->l_text] = 0; +} + +int sam_header_parse_rg(bam_header_t *h) +{ + kstring_t *rgid, *rglib; + char *p, *q, *s, *r; + int n = 0; + + // free + if (h == 0) return 0; + bam_strmap_destroy(h->rg2lib); h->rg2lib = 0; + if (h->l_text < 3) return 0; + // parse @RG lines + h->rg2lib = bam_strmap_init(); + rgid = calloc(1, sizeof(kstring_t)); + rglib = calloc(1, sizeof(kstring_t)); + s = h->text; + while ((s = strstr(s, "@RG")) != 0) { + if (rgid->l && rglib->l) { + bam_strmap_put(h->rg2lib, rgid->s, rglib->s); + ++n; + } + rgid->l = rglib->l = 0; + s += 3; + r = s; + if ((p = strstr(s, "ID:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + kputsn(q, p - q, rgid); + } else { + fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n"); + break; + } + if (r < p) r = p; + if ((p = strstr(s, "LB:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + kputsn(q, p - q, rglib); + } else { + fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n"); + break; + } + if (r < p) r = p; + s = r + 3; + } + if (rgid->l && rglib->l) { + bam_strmap_put(h->rg2lib, rgid->s, rglib->s); + ++n; + } + free(rgid->s); free(rgid); + free(rglib->s); free(rglib); + if (n == 0) { + bam_strmap_destroy(h->rg2lib); + h->rg2lib = 0; + } + return n; +} + +int sam_header_parse(bam_header_t *h) +{ + int i; + char *s, *p, *q, *r; + + // free + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + if (h->l_text < 3) return 0; + // count number of @SQ + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + ++h->n_targets; + s += 3; + } + if (h->n_targets == 0) return 0; + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + // parse @SQ lines + i = 0; + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + s += 3; + r = s; + if ((p = strstr(s, 
"SN:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + h->target_name[i] = (char*)calloc(p - q + 1, 1); + strncpy(h->target_name[i], q, p - q); + } else goto header_err_ret; + if (r < p) r = p; + if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10); + else goto header_err_ret; + if (r < p) r = p; + s = r + 3; + ++i; + } + sam_header_parse_rg(h); + return h->n_targets; + +header_err_ret: + fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n"); + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + return 0; +} + +bam_header_t *sam_header_read(tamFile fp) +{ + int ret, dret; + bam_header_t *header = bam_header_init(); + kstring_t *str = fp->str; + while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header + str->s[str->l] = dret; // note that str->s is NOT null terminated!! + append_text(header, str); + if (dret != '\n') { + ret = ks_getuntil(fp->ks, '\n', str, &dret); + str->s[str->l] = '\n'; // NOT null terminated!! 
+ append_text(header, str); + } + ++fp->n_lines; + } + sam_header_parse(header); + bam_init_header_hash(header); + fp->is_first = 1; + return header; +} + +int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) +{ + int ret, doff, doff0, dret, z = 0; + bam1_core_t *c = &b->core; + kstring_t *str = fp->str; + kstream_t *ks = fp->ks; + + if (fp->is_first) { + fp->is_first = 0; + ret = str->l; + } else { + do { // special consideration for empty lines + ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); + if (ret >= 0) z += str->l + 1; + } while (ret == 0); + } + if (ret < 0) return -1; + ++fp->n_lines; + doff = 0; + + { // name + c->l_qname = strlen(str->s) + 1; + memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname); + doff += c->l_qname; + } + { // flag, tid, pos, qual + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s); + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); + if (c->tid < 0 && strcmp(str->s, "*")) { + if (header->n_targets == 0) { + fprintf(stderr, "[sam_read1] missing header? Abort!\n"); + exit(1); + } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s); + } + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? 
atoi(str->s) : 0; + if (ret < 0) return -2; + } + { // cigar + char *s, *t; + int i, op; + long x; + c->n_cigar = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; + z += str->l + 1; + if (str->s[0] != '*') { + for (s = str->s; *s; ++s) { + if (isalpha(*s)) ++c->n_cigar; + else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); + } + b->data = alloc_data(b, doff + c->n_cigar * 4); + for (i = 0, s = str->s; i != c->n_cigar; ++i) { + x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M') op = BAM_CMATCH; + else if (op == 'I') op = BAM_CINS; + else if (op == 'D') op = BAM_CDEL; + else if (op == 'N') op = BAM_CREF_SKIP; + else if (op == 'S') op = BAM_CSOFT_CLIP; + else if (op == 'H') op = BAM_CHARD_CLIP; + else if (op == 'P') op = BAM_CPAD; + else parse_error(fp->n_lines, "invalid CIGAR operation"); + s = t + 1; + bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; + } + if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); + c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); + doff += c->n_cigar * 4; + } else c->bin = bam_reg2bin(c->pos, c->pos + 1); + } + { // mtid, mpos, isize + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? 
atoi(str->s) : 0; + if (ret < 0) return -4; + } + { // seq and qual + int i; + uint8_t *p; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq + z += str->l + 1; + c->l_qseq = strlen(str->s); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; + bzero(p, (c->l_qseq+1)/2); + for (i = 0; i < c->l_qseq; ++i) + p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual + z += str->l + 1; + if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) + parse_error(fp->n_lines, "sequence and quality are inconsistent"); + p += (c->l_qseq+1)/2; + if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff; + else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; + doff += c->l_qseq + (c->l_qseq+1)/2; + } + doff0 = doff; + if (dret != '\n' && dret != '\r') { // aux + while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) { + uint8_t *s, type, key[2]; + z += str->l + 1; + if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') + parse_error(fp->n_lines, "missing colon in auxiliary data"); + key[0] = str->s[0]; key[1] = str->s[1]; + type = str->s[3]; + s = alloc_data(b, doff + 3) + doff; + s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility + s = alloc_data(b, doff + 2) + doff; + *s++ = 'A'; *s = str->s[5]; + doff += 2; + } else if (type == 'I' || type == 'i') { + long long x; + s = alloc_data(b, doff + 5) + doff; + x = (long long)atoll(str->s + 5); + if (x < 0) { + if (x >= -127) { + *s++ = 'c'; *(int8_t*)s = (int8_t)x; + s += 1; doff += 2; + } else if (x >= -32767) { + *s++ = 's'; *(int16_t*)s = (int16_t)x; + s += 2; doff += 3; + } else { + *s++ = 'i'; *(int32_t*)s = (int32_t)x; + s += 4; doff += 5; + if (x < 
-2147483648ll)
+							fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+									(long long)fp->n_lines, x);
+					}
+				} else {
+					// non-negative values: pick the narrowest unsigned type that holds x
+					if (x <= 255) {
+						*s++ = 'C'; *s++ = (uint8_t)x;
+						doff += 2;
+					} else if (x <= 65535) {
+						*s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+						s += 2; doff += 3;
+					} else {
+						*s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+						s += 4; doff += 5;
+						if (x > 4294967295ll)
+							fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+									(long long)fp->n_lines, x);
+					}
+				}
+			} else if (type == 'f') {
+				s = alloc_data(b, doff + 5) + doff;
+				*s++ = 'f';
+				*(float*)s = (float)atof(str->s + 5);
+				s += 4; doff += 5;
+			} else if (type == 'd') {
+				// 1 type byte + 8 value bytes are reserved and consumed below, so the
+				// value must be stored as a full double. As in every other branch the
+				// text of the value starts at offset 5, right after "XX:d:".
+				s = alloc_data(b, doff + 9) + doff;
+				*s++ = 'd';
+				*(double*)s = atof(str->s + 5);
+				s += 8; doff += 9;
+			} else if (type == 'Z' || type == 'H') {
+				int size = 1 + (str->l - 5) + 1; // type byte + payload + terminating null
+				if (type == 'H') { // check whether the hex string is valid
+					int i;
+					if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+					for (i = 0; i < str->l - 5; ++i) {
+						int c = toupper(str->s[5 + i]);
+						if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
+							parse_error(fp->n_lines, "invalid hex character");
+					}
+				}
+				s = alloc_data(b, doff + size) + doff;
+				*s++ = type;
+				memcpy(s, str->s + 5, str->l - 5);
+				s[str->l - 5] = 0;
+				doff += size;
+			} else parse_error(fp->n_lines, "unrecognized type");
+			if (dret == '\n' || dret == '\r') break; // last field of the line
+		}
+	}
+	b->l_aux = doff - doff0;
+	b->data_len = doff;
+	return z;
+}
+
+tamFile sam_open(const char *fn)
+{
+	tamFile fp;
+	fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+	fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
+	fp->fp = (strcmp(fn, "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + fp->ks = ks_init(fp->fp); + return fp; +} + +void sam_close(tamFile fp) +{ + if (fp) { + ks_destroy(fp->ks); + gzclose(fp->fp); + free(fp->str->s); free(fp->str); + free(fp); + } +} diff --git a/bam_index.c b/bam_index.c new file mode 100644 index 0000000..72ef270 --- /dev/null +++ b/bam_index.c @@ -0,0 +1,551 @@ +#include +#include +#include "bam.h" +#include "khash.h" +#include "ksort.h" +#include "bam_endian.h" +#include "knetfile.h" + +/*! + @header + + Alignment indexing. Before indexing, BAM must be sorted based on the + leftmost coordinate of alignments. In indexing, BAM uses two indices: + a UCSC binning index and a simple linear index. The binning index is + efficient for alignments spanning long distance, while the auxiliary + linear index helps to reduce unnecessary seek calls especially for + short alignments. + + The UCSC binning scheme was suggested by Richard Durbin and Lincoln + Stein and is explained by Kent et al. (2002). In this scheme, each bin + represents a contiguous genomic region which can be fully contained in + another bin; each alignment is associated with a bin which represents + the smallest region containing the entire alignment. The binning + scheme is essentially another representation of R-tree. A distinct bin + uniquely corresponds to a distinct internal node in a R-tree. Bin A is + a child of Bin B if region A is contained in B. + + In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin + 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp, + 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to + find the alignments overlapped with a region [rbeg,rend), we need to + calculate the list of bins that may be overlapped the region and test + the alignments in the bins to confirm the overlaps. If the specified + region is short, typically only a few alignments in six bins need to + be retrieved. 
The overlapping alignments can be quickly fetched. + + */ + +#define BAM_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. +#define BAM_LIDX_SHIFT 14 + +typedef struct { + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) +KSORT_INIT(off, pair64_t, pair64_lt) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} bam_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bam_lidx_t; + +KHASH_MAP_INIT_INT(i, bam_binlist_t) + +struct __bam_index_t { + int32_t n; + khash_t(i) **index; + bam_lidx_t *index2; +}; + +// requirement: len <= LEN_MASK +static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + bam_binlist_t *l; + int ret; + k = kh_put(i, h, bin, &ret); + l = &kh_value(h, k); + if (ret) { // not present + l->m = 1; l->n = 0; + l->list = (pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; l->list[l->n++].v = end; +} + +static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) +{ + int i, beg, end; + beg = b->core.pos >> BAM_LIDX_SHIFT; + end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + for (i = beg + 1; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + index2->n = end + 1; +} + +static void merge_chunks(bam_index_t *idx) +{ +#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) + khash_t(i) *index; + int i, l, m; + khint_t k; + for (i = 0; i < idx->n; ++i) { + index = idx->index[i]; + for (k = kh_begin(index); k != kh_end(index); ++k) { + bam_binlist_t *p; + if (!kh_exist(index, k)) continue; + p = &kh_value(index, k); + m = 0; + for (l = 1; l < p->n; ++l) { +#ifdef 
BAM_TRUE_OFFSET + if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; +#else + if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; +#endif + else p->list[++m] = p->list[l]; + } // ~for(l) + p->n = m + 1; + } // ~for(k) + } // ~for(i) +#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) +} + +bam_index_t *bam_index_core(bamFile fp) +{ + bam1_t *b; + bam_header_t *h; + int i, ret; + bam_index_t *idx; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; + bam1_core_t *c; + uint64_t save_off, last_off; + + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + h = bam_header_read(fp); + c = &b->core; + + idx->n = h->n_targets; + bam_header_destroy(h); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); + idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); + + save_bin = save_tid = last_tid = last_bin = 0xffffffffu; + save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; + while ((ret = bam_read1(fp, b)) >= 0) { + if (last_tid != c->tid) { // change of chromosomes + last_tid = c->tid; + last_bin = 0xffffffffu; + } else if (last_coor > c->pos) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", + bam1_qname(b), last_coor, c->pos, c->tid+1); + exit(1); + } + if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->bin != last_bin) { // then possibly write the binning index + if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + save_off = last_off; + save_bin = last_bin = c->bin; + save_tid = c->tid; + if (save_tid < 0) break; + } + if (bam_tell(fp) <= last_off) { + fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", + (unsigned long long)bam_tell(fp), (unsigned 
long long)last_off); + exit(1); + } + last_off = bam_tell(fp); + last_coor = b->core.pos; + } + if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + merge_chunks(idx); + if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); + free(b->data); free(b); + return idx; +} + +void bam_index_destroy(bam_index_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) + free(kh_value(index, k).list); + } + kh_destroy(i, index); + free(index2->offset); + } + free(idx->index); free(idx->index2); + free(idx); +} + +void bam_index_save(const bam_index_t *idx, FILE *fp) +{ + int32_t i, size; + khint_t k; + fwrite("BAI\1", 1, 4, fp); + if (bam_is_be) { + uint32_t x = idx->n; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&idx->n, 4, 1, fp); + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + // write binning index + size = kh_size(index); + if (bam_is_be) { // big endian + uint32_t x = size; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&size, 4, 1, fp); + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) { + bam_binlist_t *p = &kh_value(index, k); + if (bam_is_be) { // big endian + uint32_t x; + x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + fwrite(p->list, 16, p->n, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } else { + fwrite(&kh_key(index, k), 4, 1, fp); + fwrite(&p->n, 4, 1, fp); + fwrite(p->list, 16, p->n, fp); + } + } + } + // write 
linear index (index2)
+		if (bam_is_be) {
+			int x = index2->n;
+			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+		} else fwrite(&index2->n, 4, 1, fp);
+		if (bam_is_be) { // big endian
+			int x;
+			for (x = 0; (int)x < index2->n; ++x)
+				bam_swap_endian_8p(&index2->offset[x]);
+			fwrite(index2->offset, 8, index2->n, fp);
+			for (x = 0; (int)x < index2->n; ++x)
+				bam_swap_endian_8p(&index2->offset[x]);
+		} else fwrite(index2->offset, 8, index2->n, fp);
+	}
+	fflush(fp);
+}
+
+/*
+ * Read a BAM index from an already opened stream.
+ *
+ * Returns a newly allocated index, or 0 when fp is null or the stream does
+ * not start with the 4-byte magic "BAI\1". Every integer read is byte-swapped
+ * when bam_is_be is set.
+ *
+ * The stream is never closed here: it stays owned by the caller, which
+ * closes it after this function returns (closing it here as well would
+ * close the same FILE twice).
+ */
+static bam_index_t *bam_index_load_core(FILE *fp)
+{
+	int i;
+	char magic[4];
+	bam_index_t *idx;
+	if (fp == 0) {
+		fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+		return 0;
+	}
+	// a short read leaves magic[] partly uninitialised, so treat it as a mismatch
+	if (fread(magic, 1, 4, fp) != 4 || strncmp(magic, "BAI\1", 4)) {
+		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+		return 0;
+	}
+	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+	fread(&idx->n, 4, 1, fp);
+	if (bam_is_be) bam_swap_endian_4p(&idx->n);
+	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+	for (i = 0; i < idx->n; ++i) {
+		khash_t(i) *index;
+		bam_lidx_t *index2 = idx->index2 + i;
+		uint32_t key, size;
+		khint_t k;
+		int j, ret;
+		bam_binlist_t *p;
+		index = idx->index[i] = kh_init(i);
+		// load binning index
+		fread(&size, 4, 1, fp);
+		if (bam_is_be) bam_swap_endian_4p(&size);
+		for (j = 0; j < (int)size; ++j) {
+			fread(&key, 4, 1, fp);
+			if (bam_is_be) bam_swap_endian_4p(&key);
+			k = kh_put(i, index, key, &ret);
+			p = &kh_value(index, k);
+			fread(&p->n, 4, 1, fp);
+			if (bam_is_be) bam_swap_endian_4p(&p->n);
+			p->m = p->n;
+			p->list = (pair64_t*)malloc(p->m * 16);
+			fread(p->list, 16, p->n, fp);
+			if (bam_is_be) {
+				int x;
+				for (x = 0; x < p->n; ++x) {
+					bam_swap_endian_8p(&p->list[x].u);
+					bam_swap_endian_8p(&p->list[x].v);
+				}
+			}
+		}
+		// load linear index
+		fread(&index2->n, 4, 1, fp);
+		if (bam_is_be) bam_swap_endian_4p(&index2->n);
+		index2->m = index2->n;
+		index2->offset = 
(uint64_t*)calloc(index2->m, 8); + fread(index2->offset, index2->n, 8, fp); + if (bam_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +bam_index_t *bam_index_load_local(const char *_fn) +{ + FILE *fp; + char *fnidx, *fn; + + if (strstr(_fn, "ftp://") == _fn) { + const char *p; + int l = strlen(_fn); + for (p = _fn + l - 1; p >= _fn; --p) + if (*p == '/') break; + fn = strdup(p + 1); + } else fn = strdup(_fn); + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + fp = fopen(fnidx, "r"); + if (fp == 0) { // try "{base}.bai" + char *s = strstr(fn, "bam"); + if (s == fn + strlen(fn) - 3) { + strcpy(fnidx, fn); + fnidx[strlen(fn)-1] = 'i'; + fp = fopen(fnidx, "r"); + } + } + free(fnidx); free(fn); + if (fp) { + bam_index_t *idx = bam_index_load_core(fp); + fclose(fp); + return idx; + } else return 0; +} + +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} + +bam_index_t *bam_index_load(const char *fn) +{ + bam_index_t *idx; + idx = bam_index_load_local(fn); + if (idx == 0 && strstr(fn, "ftp://") == fn) { + char *fnidx = calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bai"); + fprintf(stderr, 
"[bam_index_load] attempting to download the remote index file.\n"); + download_from_remote(fnidx); + idx = bam_index_load_local(fn); + } + if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); + return idx; +} + +int bam_index_build2(const char *fn, const char *_fnidx) +{ + char *fnidx; + FILE *fpidx; + bamFile fp; + bam_index_t *idx; + if ((fp = bam_open(fn, "r")) == 0) { + fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n"); + return -1; + } + idx = bam_index_core(fp); + bam_close(fp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + } else fnidx = strdup(_fnidx); + fpidx = fopen(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + bam_index_save(idx, fpidx); + bam_index_destroy(idx); + fclose(fpidx); + free(fnidx); + return 0; +} + +int bam_index_build(const char *fn) +{ + return bam_index_build2(fn, 0); +} + +int bam_index(int argc, char *argv[]) +{ + if (argc < 2) { + fprintf(stderr, "Usage: samtools index []\n"); + return 1; + } + if (argc >= 3) bam_index_build2(argv[1], argv[2]); + else bam_index_build(argv[1]); + return 0; +} + +#define MAX_BIN 37450 // =(8^6-1)/7+1 + +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +{ + int i = 0, k; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; +} + +static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) +{ + uint32_t rbeg = b->core.pos; + uint32_t rend = b->core.n_cigar? 
bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; + return (rend > beg && rbeg < end); +} + +int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +{ + uint16_t *bins; + int i, n_bins, n_off; + pair64_t *off; + khint_t k; + khash_t(i) *index; + uint64_t min_off; + + bins = (uint16_t*)calloc(MAX_BIN, 2); + n_bins = reg2bins(beg, end, bins); + index = idx->index[tid]; + min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) + n_off += kh_value(index, k).n; + } + if (n_off == 0) { + free(bins); return 0; + } + off = (pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { + int j; + bam_binlist_t *p = &kh_value(index, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + free(bins); + { + bam1_t *b; + int l, ret, n_seeks; + uint64_t curr_off; + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + ks_introsort(off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + { // merge adjacent blocks +#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) + for (i = 1, l = 0; i < n_off; ++i) { +#ifdef BAM_TRUE_OFFSET + if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; +#else + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; +#endif + else off[++l] = off[i]; + } + n_off = l + 1; +#endif + } + // retrive alignments + n_seeks = 0; i = -1; curr_off = 0; + for (;;) { + if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk + if (i == n_off - 
1) break; // no more chunks + if (i >= 0) assert(curr_off == off[i].v); // otherwise bug + if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek + bam_seek(fp, off[i+1].u, SEEK_SET); + curr_off = bam_tell(fp); + ++n_seeks; + } + ++i; + } + if ((ret = bam_read1(fp, b)) > 0) { + curr_off = bam_tell(fp); + if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed + else if (is_overlap(beg, end, b)) func(b, data); + } else break; // end of file + } +// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); + bam_destroy1(b); + } + free(off); + return 0; +} diff --git a/bam_lpileup.c b/bam_lpileup.c new file mode 100644 index 0000000..425290e --- /dev/null +++ b/bam_lpileup.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include "bam.h" +#include "ksort.h" + +#define TV_GAP 2 + +typedef struct __freenode_t { + uint32_t level:28, cnt:4; + struct __freenode_t *next; +} freenode_t, *freenode_p; + +#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) +KSORT_INIT(node, freenode_p, freenode_lt) + +/* Memory pool, similar to the one in bam_pileup.c */ +typedef struct { + int cnt, n, max; + freenode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + return (mempool_t*)calloc(1, sizeof(mempool_t)); +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) free(mp->buf[k]); + free(mp->buf); free(mp); +} +static inline freenode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, freenode_t *p) +{ + --mp->cnt; p->next = 0; p->cnt = TV_GAP; + if (mp->n == mp->max) { + mp->max = mp->max? 
mp->max<<1 : 256; + mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* core part */ +struct __bam_lplbuf_t { + int max, n_cur, n_pre; + int max_level, *cur_level, *pre_level; + mempool_t *mp; + freenode_t **aux, *head, *tail; + int n_nodes, m_aux; + bam_pileup_f func; + void *user_data; + bam_plbuf_t *plbuf; +}; + +void bam_lplbuf_reset(bam_lplbuf_t *buf) +{ + freenode_t *p, *q; + bam_plbuf_reset(buf->plbuf); + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; + buf->max_level = 0; + buf->n_cur = buf->n_pre = 0; + buf->n_nodes = 0; +} + +static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + bam_lplbuf_t *tv = (bam_lplbuf_t*)data; + freenode_t *p; + int i, l, max_level; + // allocate memory if necessary + if (tv->max < n) { // enlarge + tv->max = n; + kroundup32(tv->max); + tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); + tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); + } + tv->n_cur = n; + // update cnt + for (p = tv->head; p->next; p = p->next) + if (p->cnt > 0) --p->cnt; + // calculate cur_level[] + max_level = 0; + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) { + if (tv->head->next && tv->head->cnt == 0) { // then take a free slot + freenode_t *p = tv->head->next; + tv->cur_level[i] = tv->head->level; + mp_free(tv->mp, tv->head); + tv->head = p; + --tv->n_nodes; + } else tv->cur_level[i] = ++tv->max_level; + } else { + tv->cur_level[i] = tv->pre_level[l++]; + if (p->is_tail) { // then return a free slot + tv->tail->level = tv->cur_level[i]; + tv->tail->next = mp_alloc(tv->mp); + tv->tail = tv->tail->next; + ++tv->n_nodes; + } + } + if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; + ((bam_pileup1_t*)p)->level = tv->cur_level[i]; + } + assert(l == tv->n_pre); + tv->func(tid, pos, n, pl, tv->user_data); + // 
sort the linked list + if (tv->n_nodes) { + freenode_t *q; + if (tv->n_nodes + 1 > tv->m_aux) { // enlarge + tv->m_aux = tv->n_nodes + 1; + kroundup32(tv->m_aux); + tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); + } + for (p = tv->head, i = l = 0; p->next;) { + if (p->level > max_level) { // then discard this entry + q = p->next; + mp_free(tv->mp, p); + p = q; + } else { + tv->aux[i++] = p; + p = p->next; + } + } + tv->aux[i] = tv->tail; // add a proper tail for the loop below + tv->n_nodes = i; + if (tv->n_nodes) { + ks_introsort(node, tv->n_nodes, tv->aux); + for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; + tv->head = tv->aux[0]; + } else tv->head = tv->tail; + } + // clean up + tv->max_level = max_level; + memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); + // squeeze out terminated levels + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!p->is_tail) + tv->pre_level[l++] = tv->pre_level[i]; + } + tv->n_pre = l; +/* + fprintf(stderr, "%d\t", pos+1); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) fprintf(stderr, "^"); + if (p->is_tail) fprintf(stderr, "$"); + fprintf(stderr, "%d,", p->level); + } + fprintf(stderr, "\n"); +*/ + return 0; +} + +bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) +{ + bam_lplbuf_t *tv; + tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); + tv->mp = mp_init(); + tv->head = tv->tail = mp_alloc(tv->mp); + tv->func = func; + tv->user_data = data; + tv->plbuf = bam_plbuf_init(tview_func, tv); + return (bam_lplbuf_t*)tv; +} + +void bam_lplbuf_destroy(bam_lplbuf_t *tv) +{ + freenode_t *p, *q; + free(tv->cur_level); free(tv->pre_level); + bam_plbuf_destroy(tv->plbuf); + free(tv->aux); + for (p = tv->head; p->next;) { + q = p->next; + mp_free(tv->mp, p); p = q; + } + mp_free(tv->mp, p); + assert(tv->mp->cnt == 0); + mp_destroy(tv->mp); + free(tv); +} + +int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) +{ + return 
bam_plbuf_push(b, tv->plbuf); +} + +int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) +{ + bam_lplbuf_t *buf; + int ret; + bam1_t *b; + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + buf = bam_lplbuf_init(func, func_data); + bam_plbuf_set_mask(buf->plbuf, mask); + while ((ret = bam_read1(fp, b)) >= 0) + bam_lplbuf_push(b, buf); + bam_lplbuf_push(0, buf); + bam_lplbuf_destroy(buf); + free(b->data); free(b); + return 0; +} diff --git a/bam_maqcns.c b/bam_maqcns.c new file mode 100644 index 0000000..464288a --- /dev/null +++ b/bam_maqcns.c @@ -0,0 +1,526 @@ +#include +#include "bam.h" +#include "bam_maqcns.h" +#include "ksort.h" +KSORT_INIT_GENERIC(uint32_t) + +#define MAX_WINDOW 33 + +typedef struct __bmc_aux_t { + int max; + uint32_t *info; +} bmc_aux_t; + +typedef struct { + float esum[4], fsum[4]; + uint32_t c[4]; + uint32_t rms_mapQ; +} glf_call_aux_t; + +char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +/* + P() = \theta \sum_{i=1}^{N-1} 1/i + P(D|) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2] + p_k = i/k / \sum_{i=1}^{N-1} 1/i + */ +static void cal_het(bam_maqcns_t *aa) +{ + int k, n1, n2; + double sum_harmo; // harmonic sum + double poly_rate; + double p1 = 0.0, p3 = 0.0; // just for testing + + free(aa->lhet); + aa->lhet = (double*)calloc(256 * 256, sizeof(double)); + sum_harmo = 0.0; + for (k = 1; k <= aa->n_hap - 1; ++k) + sum_harmo += 1.0 / k; + for (n1 = 0; n1 < 256; ++n1) { + for (n2 = 0; n2 < 256; ++n2) { + long double sum = 0.0; + double lC = lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1} + for (k = 1; k <= aa->n_hap - 1; ++k) { + double pk = 1.0 / k / sum_harmo; + double log1 = log((double)k/aa->n_hap); + double log2 = log(1.0 - (double)k/aa->n_hap); + sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2)); + } + aa->lhet[n1<<8|n2] = lC + logl(sum); + if (n1 == 17 && n2 == 3) p3 = lC + logl(expl(logl(0.5) * 20)); + if (n1 
== 19 && n2 == 1) p1 = lC + logl(expl(logl(0.5) * 20)); + } + } + poly_rate = aa->het_rate * sum_harmo; + aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate)); +} + +/** initialize the helper structure */ +static void cal_coef(bam_maqcns_t *aa) +{ + int k, n, q; + long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256]; + double *lC; + + lC = (double*)calloc(256 * 256, sizeof(double)); + // aa->lhet will be allocated and initialized + free(aa->fk); free(aa->coef); + aa->fk = (double*)calloc(256, sizeof(double)); + aa->coef = (double*)calloc(256*256*64, sizeof(double)); + aa->fk[0] = fk2[0] = 1.0; + for (n = 1; n != 256; ++n) { + aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta; + fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands + } + for (n = 1; n != 256; ++n) + for (k = 1; k <= n; ++k) + lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); + for (q = 1; q != 64; ++q) { + double e = pow(10.0, -q/10.0); + double le = log(e); + double le1 = log(1.0-e); + for (n = 1; n != 256; ++n) { + double *coef = aa->coef + (q<<16|n<<8); + sum_a[n+1] = 0.0; + for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k} + sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1); + b[k] = sum_a[k+1] / sum_a[k]; + if (b[k] > 0.99) b[k] = 0.99; + } + for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k}) + q_c[k] = -4.343 * fk2[k] * logl(b[k] / e); + for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i + for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9 + tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k]))); + coef[k] = (k? 
q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk} + } + } + } + free(lC); +} + +bam_maqcns_t *bam_maqcns_init() +{ + bam_maqcns_t *bm; + bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t)); + bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t)); + bm->het_rate = 0.001; + bm->theta = 0.85; + bm->n_hap = 2; + bm->eta = 0.03; + bm->cap_mapQ = 60; + return bm; +} + +void bam_maqcns_prepare(bam_maqcns_t *bm) +{ + cal_coef(bm); cal_het(bm); +} + +void bam_maqcns_destroy(bam_maqcns_t *bm) +{ + if (bm == 0) return; + free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); + free(bm->aux); free(bm); +} + +glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm) +{ + glf_call_aux_t *b; + int i, j, k, w[8], c, n; + glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t)); + float p[16], min_p = 1e30; + uint64_t rms; + + g->ref_base = ref_base; + if (_n == 0) return g; + + // construct aux array + if (bm->aux->max < _n) { + bm->aux->max = _n; + kroundup32(bm->aux->max); + bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max); + } + for (i = n = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + uint32_t q, x = 0, qq; + if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; + q = (uint32_t)bam1_qual(p->b)[p->qpos]; + x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; + if (p->b->core.qual < q) q = p->b->core.qual; + x |= q << 24; + qq = bam1_seqi(bam1_seq(p->b), p->qpos); + q = bam_nt16_nt4_table[qq? 
qq : ref_base]; + if (!p->is_del && q < 4) x |= 1 << 21 | q << 16; + bm->aux->info[n++] = x; + } + ks_introsort(uint32_t, n, bm->aux->info); + // generate esum and fsum + b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t)); + for (k = 0; k != 8; ++k) w[k] = 0; + rms = 0; + for (j = n - 1; j >= 0; --j) { // calculate esum and fsum + uint32_t info = bm->aux->info[j]; + int tmp; + if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); + k = info>>16&7; + if (info>>24 > 0) { + b->esum[k&3] += bm->fk[w[k]] * (info>>24); + b->fsum[k&3] += bm->fk[w[k]]; + if (w[k] < 0xff) ++w[k]; + ++b->c[k&3]; + } + tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ; + rms += tmp * tmp; + } + b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499); + // rescale ->c[] + for (j = c = 0; j != 4; ++j) c += b->c[j]; + if (c > 255) { + for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5); + for (j = c = 0; j != 4; ++j) c += b->c[j]; + } + // generate likelihood + for (j = 0; j != 4; ++j) { + // homozygous + float tmp1, tmp3; + int tmp2, bar_e; + for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) { + if (j == k) continue; + tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.5); + if (bar_e < 4) bar_e = 4; // should not happen + if (bar_e > 63) bar_e = 63; + p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; + } else p[j<<2|j] = 0.0; // all the bases are j + // heterozygous + for (k = j + 1; k < 4; ++k) { + for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) { + if (i == j || i == k) continue; + tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.5); + if (bar_e < 4) bar_e = 4; + if (bar_e > 63) bar_e = 63; + p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; + } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k + } + // + for 
(k = 0; k != 4; ++k) + if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0; + } + + { // fix p[k<<2|k] + float max1, max2, min1, min2; + int max_k, min_k; + max_k = min_k = -1; + max1 = max2 = -1.0; min1 = min2 = 1e30; + for (k = 0; k < 4; ++k) { + if (b->esum[k] > max1) { + max2 = max1; max1 = b->esum[k]; max_k = k; + } else if (b->esum[k] > max2) max2 = b->esum[k]; + } + for (k = 0; k < 4; ++k) { + if (p[k<<2|k] < min1) { + min2 = min1; min1 = p[k<<2|k]; min_k = k; + } else if (p[k<<2|k] < min2) min2 = p[k<<2|k]; + } + if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) + p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0; + } + + // convert necessary information to glf1_t + g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ; + g->depth = n > 16777215? 16777215 : n; + for (j = 0; j != 4; ++j) + for (k = j; k < 4; ++k) + if (p[j<<2|k] < min_p) min_p = p[j<<2|k]; + g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5); + for (j = c = 0; j != 4; ++j) + for (k = j; k < 4; ++k) + g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5); + + free(b); + return g; +} + +uint32_t glf2cns(const glf1_t *g, int q_r) +{ + int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1; + uint32_t x = 0; + for (i = k = 0; i < 4; ++i) + for (j = i; j < 4; ++j) { + tmp[j<<2|i] = -1; + tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r); + } + for (i = 0; i < 16; ++i) { + if (tmp[i] < 0) continue; + if (tmp[i] < min) { + min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i; + } else if (tmp[i] < min2) { + min3 = min2; min2 = tmp[i]; min_g2 = i; + } else if (tmp[i] < min3) min3 = tmp[i]; + } + x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xf << 28; + x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xf << 24; + x |= (uint32_t)g->max_mapQ << 16; + x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8; + x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? 
min3 - min2 : 255) : 0xff; + return x; +} + +uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) +{ + glf1_t *g; + uint32_t x; + if (n) { + g = bam_maqcns_glfgen(n, pl, 0xf, bm); + x = glf2cns(g, (int)(bm->q_r + 0.5)); + free(g); + } else x = 0xfU<<28 | 0xfU<<24; + return x; +} + +/************** *****************/ + +bam_maqindel_opt_t *bam_maqindel_opt_init() +{ + bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); + mi->q_indel = 40; + mi->r_indel = 0.00015; + // + mi->mm_penalty = 3; + mi->indel_err = 4; + mi->ambi_thres = 10; + return mi; +} + +void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) +{ + if (mir == 0) return; + free(mir->s[0]); free(mir->s[1]); free(mir); +} + +#define MINUS_CONST 0x10000000 + +bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types) +{ + int i, j, n_types, *types, left, right; + bam_maqindel_ret_t *ret = 0; + // if there is no proposed indel, check if there is an indel from the alignment + if (_n_types == 0) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; + } + if (i == n) return 0; // no indel + } + { // calculate how many types of indels are available (set n_types and types) + int m; + uint32_t *aux; + aux = (uint32_t*)calloc(n + _n_types + 1, 4); + m = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) + aux[m++] = MINUS_CONST + p->indel; + } + if (_n_types) // then also add this to aux[] + for (i = 0; i < _n_types; ++i) + if (_types[i]) aux[m++] = MINUS_CONST + _types[i]; + ks_introsort(uint32_t, m, aux); + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + types = (int*)calloc(n_types, sizeof(int)); + j = 0; + 
types[j++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) { + if (aux[i] != aux[i-1]) + types[j++] = aux[i] - MINUS_CONST; + } + free(aux); + } + { // calculate left and right boundary + bam_segreg_t seg; + left = 0x7fffffff; right = 0; + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP)) { + bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg); + if (seg.tbeg < left) left = seg.tbeg; + if (seg.tend > right) right = seg.tend; + } + } + if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW; + if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW; + } + { // the core part + char *ref2, *inscns = 0; + int k, l, *score, *pscore, max_ins = types[n_types-1]; + ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1); + if (max_ins > 0) { // get the consensus of inserted sequences + int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); + // count occurrences + for (i = 0; i < n_types; ++i) { + if (types[i] <= 0) continue; // not insertion + for (j = 0; j < n; ++j) { + const bam_pileup1_t *p = pl + j; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) { + for (k = 1; k <= p->indel; ++k) { + int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)]; + if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c]; + } + } + } + } + // construct the consensus of inserted sequence + inscns = (char*)calloc(n_types * max_ins, sizeof(char)); + for (i = 0; i < n_types; ++i) { + for (j = 0; j < types[i]; ++j) { + int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4; + for (k = 0; k < 4; ++k) { + if (ia[k] > max) { + max = ia[k]; + max_k = k; + } + } + inscns[i*max_ins + j] = max? 
1<b->core; + int s, ps; + bam_segreg_t seg; + if (c->flag&BAM_FUNMAP) continue; + cigar = bam1_cigar(p->b); + bam_segreg(pos, c, cigar, &seg); + for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { + int cq = bam1_seqi(bam1_seq(p->b), l), ct; + // in the following line, "<" will happen if reads are too long + ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15; + if (cq < 15 && ct < 15) { + s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } + } + score[i*n + j] = s; pscore[i*n + j] = ps; + if (types[i] != 0) { // then try the other way to calculate the score + for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { + int cq = bam1_seqi(bam1_seq(p->b), l), ct; + ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15; + if (cq < 15 && ct < 15) { + s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } + } + } + if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores + if (pscore[i*n+j] > ps) pscore[i*n+j] = ps; + if (types[i] != 0) score[i*n+j] -= mi->indel_err; + //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j, + // score[i*n+j], pscore[i*n+j]); + } + } + { // get final result + int *sum, max1, max2, max1_i, max2_i; + // pick up the best two score + sum = (int*)calloc(n_types, sizeof(int)); + for (i = 0; i < n_types; ++i) + for (j = 0; j < n; ++j) + sum[i] += -pscore[i*n+j]; + max1 = max2 = -0x7fffffff; max1_i = max2_i = -1; + for (i = 0; i < n_types; ++i) { + if (sum[i] > max1) { + max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i; + } else if (sum[i] > max2) { + max2 = sum[i]; max2_i = i; + } + } + free(sum); + // write ret + ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t)); + ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; + ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); + ret->s[1] = 
(char*)calloc(abs(ret->indel2) + 2, 1); + // write indel sequence + if (ret->indel1 > 0) { + ret->s[0][0] = '+'; + for (k = 0; k < ret->indel1; ++k) + ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; + } else if (ret->indel1 < 0) { + ret->s[0][0] = '-'; + for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) + ret->s[0][k+1] = ref[pos + k + 1]; + } else ret->s[0][0] = '*'; + if (ret->indel2 > 0) { + ret->s[1][0] = '+'; + for (k = 0; k < ret->indel2; ++k) + ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; + } else if (ret->indel2 < 0) { + ret->s[1][0] = '-'; + for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k) + ret->s[1][k+1] = ref[pos + k + 1]; + } else ret->s[1][0] = '*'; + // write count + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel == ret->indel1) ++ret->cnt1; + else if (p->indel == ret->indel2) ++ret->cnt2; + else ++ret->cnt_anti; + } + // write gl[] + ret->gl[0] = ret->gl[1] = 0; + for (j = 0; j < n; ++j) { + int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; + //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2); + if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel; + else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel; + } + } + free(score); free(pscore); free(ref2); free(inscns); + } + { // call genotype + int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); + int min1, min2, min1_i; + q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; + q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; + q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel; + min1 = min2 = 0x7fffffff; min1_i = -1; + for (i = 0; i < 3; ++i) { + if (q[i] < min1) { + min2 = min1; min1 = q[i]; min1_i = i; + } else if (q[i] < min2) min2 = q[i]; + } + ret->gt = min1_i; + ret->q_cns = min2 - min1; + // set q_ref + if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 
0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3; + else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2]; + if (ret->q_ref < 0) ret->q_ref = 0; + } + free(types); + return ret; +} diff --git a/bam_maqcns.h b/bam_maqcns.h new file mode 100644 index 0000000..36704d7 --- /dev/null +++ b/bam_maqcns.h @@ -0,0 +1,55 @@ +#ifndef BAM_MAQCNS_H +#define BAM_MAQCNS_H + +#include "glf.h" + +struct __bmc_aux_t; + +typedef struct { + float het_rate, theta; + int n_hap, cap_mapQ; + + float eta, q_r; + double *fk, *coef; + double *lhet; + struct __bmc_aux_t *aux; +} bam_maqcns_t; + +typedef struct { + int q_indel; + float r_indel; + // hidden parameters, unchangeable from command line + int mm_penalty, indel_err, ambi_thres; +} bam_maqindel_opt_t; + +typedef struct { + int indel1, indel2; + int cnt1, cnt2, cnt_ambi, cnt_anti; + char *s[2]; + // + int gt, gl[2]; + int q_cns, q_ref; +} bam_maqindel_ret_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bam_maqcns_t *bam_maqcns_init(); + void bam_maqcns_prepare(bam_maqcns_t *bm); + void bam_maqcns_destroy(bam_maqcns_t *bm); + glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); + uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); + // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2 + uint32_t glf2cns(const glf1_t *g, int q_r); + + bam_maqindel_opt_t *bam_maqindel_opt_init(); + bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types); + void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bam_mate.c b/bam_mate.c new file mode 100644 index 0000000..61f808a --- /dev/null +++ b/bam_mate.c @@ -0,0 +1,70 @@ +#include +#include +#include "bam.h" + +// currently, this function ONLY works if each read has one hit +void bam_mating_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b[2]; + int curr, 
has_prev; + + header = bam_header_read(in); + bam_header_write(out, header); + + b[0] = bam_init1(); + b[1] = bam_init1(); + curr = 0; has_prev = 0; + while (bam_read1(in, b[curr]) >= 0) { + bam1_t *cur = b[curr], *pre = b[1-curr]; + if (has_prev) { + if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name + cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; + pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; + if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) + { + uint32_t cur5, pre5; + cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; + cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; + } else cur->core.isize = pre->core.isize = 0; + if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; + else cur->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; + else pre->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } + if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_write1(out, pre); + bam_write1(out, cur); + has_prev = 0; + } else { // unpaired or singleton + pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; + if (pre->core.flag & BAM_FPAIRED) { + pre->core.flag |= BAM_FMUNMAP; + pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; + } + bam_write1(out, pre); + } + } else has_prev = 1; + curr = 1 - curr; + } + if (has_prev) bam_write1(out, b[1-curr]); + bam_header_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); +} + +int bam_mating(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "samtools fixmate \n"); + return 1; + } + in = 
(strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + bam_mating_core(in, out); + bam_close(in); bam_close(out); + return 0; +} diff --git a/bam_md.c b/bam_md.c new file mode 100644 index 0000000..a20f9b3 --- /dev/null +++ b/bam_md.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include "faidx.h" +#include "bam.h" +#include "kstring.h" + +void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +{ + uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, u = 0; + kstring_t *str; + uint8_t *old_md; + + old_md = bam_aux_get(b, "MD"); + if (c->flag & BAM_FUNMAP) return; + if (old_md && !is_equal) return; // no need to add MD + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { + if (is_equal) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; + ++u; + } else { + ksprintf(str, "%d", u); + kputc(ref[x+j], str); + u = 0; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL) { + ksprintf(str, "%d", u); + kputc('^', str); + for (j = 0; j < l; ++j) { + if (ref[x+j] == 0) break; + kputc(ref[x+j], str); + } + u = 0; + if (j < l) break; + x += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + } else if (op == BAM_CREF_SKIP) { + x += l; + } + } + ksprintf(str, "%d", u); + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } else is_diff = 1; + if (is_diff) + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' != '%s'\n", bam1_qname(b), old_md+1, str->s); + } + free(str->s); free(str); +} + +int bam_fillmd(int argc, char *argv[]) +{ + int c, is_equal = 0, tid = -2, ret, len; + bamFile fp, fpout = 0; + bam_header_t *header; + faidx_t *fai; + char *ref = 0; + bam1_t *b; + + while ((c = getopt(argc, argv, "e")) >= 0) { + switch (c) { + case 'e': is_equal = 1; break; + default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; + } + } + if (optind + 1 >= argc) { + fprintf(stderr, "Usage: bam fillmd [-e] \n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + fpout = bam_dopen(fileno(stdout), "w"); + bam_header_write(fpout, header); + fai = fai_load(argv[optind+1]); + + b = bam_init1(); + while ((ret = bam_read1(fp, b)) >= 0) { + if (b->core.tid >= 0) { + if (tid != b->core.tid) { + free(ref); + ref = fai_fetch(fai, header->target_name[b->core.tid], &len); + tid = b->core.tid; + } + bam_fillmd1(b, ref, is_equal); + } + bam_write1(fpout, b); + } + bam_destroy1(b); + + free(ref); + fai_destroy(fai); + bam_header_destroy(header); + bam_close(fp); bam_close(fpout); + return 0; +} diff --git a/bam_pileup.c b/bam_pileup.c new file mode 100644 index 0000000..3ffd528 --- /dev/null +++ b/bam_pileup.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include "sam.h" + +typedef struct __linkbuf_t { + bam1_t b; + uint32_t beg, end; + struct __linkbuf_t *next; +} lbnode_t; + +/* --- BEGIN: Memory pool */ + +typedef struct { + int cnt, n, max; + lbnode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + mempool_t *mp; + mp = (mempool_t*)calloc(1, sizeof(mempool_t)); + return mp; +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) { + free(mp->buf[k]->b.data); + free(mp->buf[k]); + } + free(mp->buf); + free(mp); +} +static inline lbnode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, lbnode_t *p) +{ + --mp->cnt; p->next = 0; // clear lbnode_t::next here + if (mp->n == mp->max) { + mp->max = mp->max? 
mp->max<<1 : 256; + mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* --- END: Memory pool */ + +/* --- BEGIN: Auxiliary functions */ + +static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) +{ + unsigned k; + bam1_t *b = p->b; + bam1_core_t *c = &b->core; + uint32_t x = c->pos, y = 0; + int ret = 1, is_restart = 1; + + if (c->flag&BAM_FUNMAP) return 0; // unmapped read + assert(x <= pos); // otherwise a bug + p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation + int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length + if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip + if (x + l > pos) { // overlap with pos + p->indel = p->is_del = 0; + p->qpos = y + (pos - x); + if (x == pos && is_restart) p->is_head = 1; + if (x + l - 1 == pos) { // come to the end of a match + if (k < c->n_cigar - 1) { // there are additional operation(s) + uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR + int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation + if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del + else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins + if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP) + p->is_tail = 1; // tail + } else p->is_tail = 1; // this is the last operation; set tail + } + } + x += l; y += l; + } else if (op == BAM_CDEL) { // then set ->is_del + if (x + l > pos) { + p->indel = 0; p->is_del = 1; + p->qpos = y + (pos - x); + } + x += l; + } else if (op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); + if (x > pos) { + if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all + break; + } + } + 
assert(x > pos); // otherwise a bug + return ret; +} + +/* --- END: Auxiliary functions */ + +struct __bam_plbuf_t { + mempool_t *mp; + lbnode_t *head, *tail, *dummy; + bam_pileup_f func; + void *func_data; + int32_t tid, pos, max_tid, max_pos; + int max_pu, is_eof; + bam_pileup1_t *pu; + int flag_mask; +}; + +void bam_plbuf_reset(bam_plbuf_t *buf) +{ + lbnode_t *p, *q; + buf->max_tid = buf->max_pos = -1; + buf->tid = buf->pos = 0; + buf->is_eof = 0; + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; +} + +void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) +{ + if (mask < 0) buf->flag_mask = BAM_DEF_MASK; + else buf->flag_mask = BAM_FUNMAP | mask; +} + +bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +{ + bam_plbuf_t *buf; + buf = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t)); + buf->func = func; buf->func_data = data; + buf->mp = mp_init(); + buf->head = buf->tail = mp_alloc(buf->mp); + buf->dummy = mp_alloc(buf->mp); + buf->max_tid = buf->max_pos = -1; + buf->flag_mask = BAM_DEF_MASK; + return buf; +} + +void bam_plbuf_destroy(bam_plbuf_t *buf) +{ + mp_free(buf->mp, buf->dummy); + mp_free(buf->mp, buf->head); + if (buf->mp->cnt != 0) + fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", buf->mp->cnt); + mp_destroy(buf->mp); + free(buf->pu); + free(buf); +} + +int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +{ + if (b) { // fill buffer + if (b->core.tid < 0) return 0; + if (b->core.flag & buf->flag_mask) return 0; + bam_copy1(&buf->tail->b, b); + buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); + if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted. 
Abort!\n"); + abort(); + } + buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg; + if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { + buf->tail->next = mp_alloc(buf->mp); + buf->tail = buf->tail->next; + } + } else buf->is_eof = 1; + while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) { + int n_pu = 0; + lbnode_t *p, *q; + buf->dummy->next = buf->head; + for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list + q->next = p->next; mp_free(buf->mp, p); p = q; + } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup + if (n_pu == buf->max_pu) { // then double the capacity + buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256; + buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu); + } + buf->pu[n_pu].b = &p->b; + if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP + } + } + buf->head = buf->dummy->next; // dummy->next may be changed + if (n_pu) { // then call user defined function + buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data); + } + // update tid and pos + if (buf->head->next) { + if (buf->tid > buf->head->b.core.tid) { + fprintf(stderr, "[bam_plbuf_push] unsorted input. 
Pileup aborts.\n"); + return 1; + } + } + if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence + buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference + } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid + buf->pos = buf->head->beg; // jump to the next position + } else ++buf->pos; // scan contiguously + if (buf->is_eof && buf->head->next == 0) break; + } + return 0; +} diff --git a/bam_plcmd.c b/bam_plcmd.c new file mode 100644 index 0000000..5d5506f --- /dev/null +++ b/bam_plcmd.c @@ -0,0 +1,385 @@ +#include +#include +#include +#include +#include "sam.h" +#include "faidx.h" +#include "bam_maqcns.h" +#include "khash.h" +#include "glf.h" +#include "kstring.h" + +typedef int *indel_list_t; +KHASH_MAP_INIT_INT64(64, indel_list_t) + +#define BAM_PLF_SIMPLE 0x01 +#define BAM_PLF_CNS 0x02 +#define BAM_PLF_INDEL_ONLY 0x04 +#define BAM_PLF_GLF 0x08 +#define BAM_PLF_VAR_ONLY 0x10 +#define BAM_PLF_2ND 0x20 + +typedef struct { + bam_header_t *h; + bam_maqcns_t *c; + bam_maqindel_opt_t *ido; + faidx_t *fai; + khash_t(64) *hash; + uint32_t format; + int tid, len, last_pos; + int mask; + char *ref; + glfFile fp_glf; // for glf output only +} pu_data_t; + +char **__bam_get_lines(const char *fn, int *_n); +void bam_init_header_hash(bam_header_t *header); +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + +static khash_t(64) *load_pos(const char *fn, bam_header_t *h) +{ + char **list; + int i, j, n, *fields, max_fields; + khash_t(64) *hash; + bam_init_header_hash(h); + list = __bam_get_lines(fn, &n); + hash = kh_init(64); + max_fields = 0; fields = 0; + for (i = 0; i < n; ++i) { + char *str = list[i]; + int chr, n_fields, ret; + khint_t k; + uint64_t x; + n_fields = ksplit_core(str, 0, &max_fields, &fields); + if (n_fields < 2) continue; + chr = bam_get_tid(h, str + fields[0]); + if (chr < 0) { + fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + 
fields[0]); + continue; + } + x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1); + k = kh_put(64, hash, x, &ret); + if (ret == 0) { + fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]); + continue; + } + kh_val(hash, k) = 0; + if (n_fields > 2) { + // count + for (j = 2; j < n_fields; ++j) { + char *s = str + fields[j]; + if ((*s != '+' && *s != '-') || !isdigit(s[1])) break; + } + if (j > 2) { // update kh_val() + int *q, y, z; + q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int)); + q[0] = j - 2; z = j; y = 1; + for (j = 2; j < z; ++j) + q[y++] = atoi(str + fields[j]); + } + } + free(str); + } + free(list); free(fields); + return hash; +} + +// an analogy to pileup_func() below +static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) +{ + pu_data_t *d = (pu_data_t*)data; + bam_maqindel_ret_t *r = 0; + int rb, *proposed_indels = 0; + glf1_t *g; + glf3_t *g3; + + if (d->fai == 0) { + fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n"); + exit(1); + } + if (d->hash) { // only output a list of sites + khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); + if (k == kh_end(d->hash)) return 0; + proposed_indels = kh_val(d->hash, k); + } + g3 = glf3_init1(); + if (d->fai && (int)tid != d->tid) { + if (d->ref) { // then write the end mark + g3->rtype = GLF3_RTYPE_END; + glf3_write1(d->fp_glf, g3); + } + glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference + free(d->ref); + d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->tid = tid; + d->last_pos = 0; + } + rb = (d->ref && (int)pos < d->len)? 
d->ref[pos] : 'N'; + g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); + memcpy(g3, g, sizeof(glf1_t)); + g3->rtype = GLF3_RTYPE_SUB; + g3->offset = pos - d->last_pos; + d->last_pos = pos; + glf3_write1(d->fp_glf, g3); + if (proposed_indels) + r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + if (r) { // then write indel line + int het = 3 * n, min; + min = het; + if (min > r->gl[0]) min = r->gl[0]; + if (min > r->gl[1]) min = r->gl[1]; + g3->ref_base = 0; + g3->rtype = GLF3_RTYPE_INDEL; + memset(g3->lk, 0, 10); + g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255; + g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255; + g3->lk[2] = het - min < 255? het - min : 255; + g3->offset = 0; + g3->indel_len[0] = r->indel1; + g3->indel_len[1] = r->indel2; + g3->min_lk = min < 255? min : 255; + g3->max_len = (abs(r->indel1) > abs(r->indel2)? abs(r->indel1) : abs(r->indel2)) + 1; + g3->indel_seq[0] = strdup(r->s[0]+1); + g3->indel_seq[1] = strdup(r->s[1]+1); + glf3_write1(d->fp_glf, g3); + bam_maqindel_ret_destroy(r); + } + free(g); + glf3_destroy1(g3); + return 0; +} + +static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) +{ + pu_data_t *d = (pu_data_t*)data; + bam_maqindel_ret_t *r = 0; + int i, j, rb, rms_mapq = -1, *proposed_indels = 0; + uint64_t rms_aux; + uint32_t cns = 0; + + // if GLF is required, suppress -c completely + if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data); + // if d->hash is initialized, only output the sites in the hash table + if (d->hash) { + khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); + if (k == kh_end(d->hash)) return 0; + proposed_indels = kh_val(d->hash, k); + } + // update d->ref if necessary + if (d->fai && (int)tid != d->tid) { + free(d->ref); + d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->tid = tid; + } + rb = (d->ref && (int)pos < d->len)? 
d->ref[pos] : 'N'; + // when the indel-only mode is asked for, return if no reads mapped with indels + if (d->format & BAM_PLF_INDEL_ONLY) { + for (i = 0; i < n; ++i) + if (pu[i].indel != 0) break; + if (i == n) return 0; + } + // call the consensus and indel + if (d->format & BAM_PLF_CNS) // call consensus + cns = bam_maqcns_call(n, pu, d->c); + if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref) { // call indels + if (proposed_indels) // the first element gives the size of the array + r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + } + // when only variant sites are asked for, test if the site is a variant + if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) { + if (!(bam_nt16_table[rb] != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP + if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel + if (r) bam_maqindel_ret_destroy(r); + return 0; + } + } + } + // print the first 3 columns + printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb); + // print consensus information if required + if (d->format & BAM_PLF_CNS) { + int ref_q, rb4 = bam_nt16_table[rb]; + ref_q = 0; + if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP + ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff); + if (ref_q > 255) ref_q = 255; + } + rms_mapq = cns>>16&0xff; + printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq); + } + // print pileup sequences + printf("%d\t", n); + rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; + rms_aux += tmp * tmp; + if (p->is_head) printf("^%c", p->b->core.qual > 93? 
126 : p->b->core.qual + 33); + if (!p->is_del) { + int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + putchar(c); + if (p->indel > 0) { + printf("+%d", p->indel); + for (j = 1; j <= p->indel; ++j) { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } else if (p->indel < 0) { + printf("%d", p->indel); + for (j = 1; j <= -p->indel; ++j) { + c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N'; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } + } else putchar('*'); + if (p->is_tail) putchar('$'); + } + // finalize rms_mapq + rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); + if (rms_mapq < 0) rms_mapq = rms_aux; + putchar('\t'); + // print quality + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + int c = bam1_qual(p->b)[p->qpos] + 33; + if (c > 126) c = 126; + putchar(c); + } + if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities + const unsigned char *q; + putchar('\t'); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + q = bam_aux_get(p->b, "E2"); + putchar(q? q[p->qpos + 1] : 'N'); + } + putchar('\t'); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + q = bam_aux_get(p->b, "U2"); + putchar(q? q[p->qpos + 1] : '!'); + } + } + // print mapping quality if -s is flagged on the command line + if (d->format & BAM_PLF_SIMPLE) { + putchar('\t'); + for (i = 0; i < n; ++i) { + int c = pu[i].b->core.qual + 33; + if (c > 126) c = 126; + putchar(c); + } + } + putchar('\n'); + // print the indel line if r has been calculated. 
This only happens if: + // a) -c or -i are flagged, AND b) the reference sequence is available + if (r) { + printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1); + if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]); + else printf("%s/%s\t", r->s[0], r->s[1]); + printf("%d\t%d\t", r->q_cns, r->q_ref); + printf("%d\t%d\t", rms_mapq, n); + printf("%s\t%s\t", r->s[0], r->s[1]); + //printf("%d\t%d\t", r->gl[0], r->gl[1]); + printf("%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_anti); + bam_maqindel_ret_destroy(r); + } + return 0; +} + +int bam_pileup(int argc, char *argv[]) +{ + int c, is_SAM = 0; + char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; + pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); + d->tid = -1; d->mask = BAM_DEF_MASK; + d->c = bam_maqcns_init(); + d->ido = bam_maqindel_opt_init(); + while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2")) >= 0) { + switch (c) { + case 's': d->format |= BAM_PLF_SIMPLE; break; + case 't': fn_list = strdup(optarg); break; + case 'l': fn_pos = strdup(optarg); break; + case 'f': fn_fa = strdup(optarg); break; + case 'T': d->c->theta = atof(optarg); break; + case 'N': d->c->n_hap = atoi(optarg); break; + case 'r': d->c->het_rate = atof(optarg); break; + case 'M': d->c->cap_mapQ = atoi(optarg); break; + case 'c': d->format |= BAM_PLF_CNS; break; + case 'i': d->format |= BAM_PLF_INDEL_ONLY; break; + case 'v': d->format |= BAM_PLF_VAR_ONLY; break; + case 'm': d->mask = strtol(optarg, 0, 0); break; + case 'g': d->format |= BAM_PLF_GLF; break; + case '2': d->format |= BAM_PLF_2ND; break; + case 'I': d->ido->q_indel = atoi(optarg); break; + case 'G': d->ido->r_indel = atof(optarg); break; + case 'S': is_SAM = 1; break; + default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; + } + } + if (fn_list) is_SAM = 1; + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools pileup [options] |\n\n"); + fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); + 
fprintf(stderr, " -S the input is in SAM\n"); + fprintf(stderr, " -2 output the 2nd best call and quality\n"); + fprintf(stderr, " -i only show lines/consensus with indels\n"); + fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ); + fprintf(stderr, " -t FILE list of reference sequences (assume the input is in SAM)\n"); + fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); + fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); + fprintf(stderr, " -c output the maq consensus sequence\n"); + fprintf(stderr, " -v print variants only (for -c)\n"); + fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n"); + fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta); + fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap); + fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate); + fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel); + fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel); + fprintf(stderr, "\n"); + free(fn_list); free(fn_fa); free(d); + return 1; + } + if (fn_fa) d->fai = fai_load(fn_fa); + if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling + if (d->format & BAM_PLF_GLF) { // for glf output + glf3_header_t *h; + h = glf3_header_init(); + d->fp_glf = bgzf_fdopen(fileno(stdout), "w"); + glf3_header_write(d->fp_glf, h); + glf3_header_destroy(h); + } + if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY))) + fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n"); + { + samfile_t *fp; + fp = is_SAM? 
samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0); + if (fp == 0 || fp->header == 0) { + fprintf(stderr, "[bam_pileup] fail to read the header: non-exisiting file or wrong format.\n"); + return 1; + } + d->h = fp->header; + if (fn_pos) d->hash = load_pos(fn_pos, d->h); + sampileup(fp, d->mask, pileup_func, d); + samclose(fp); // d->h will be destroyed here + } + + // free + if (d->format & BAM_PLF_GLF) bgzf_close(d->fp_glf); + if (fn_pos) { // free the hash table + khint_t k; + for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k) + if (kh_exist(d->hash, k)) free(kh_val(d->hash, k)); + kh_destroy(64, d->hash); + } + free(fn_pos); free(fn_list); free(fn_fa); + if (d->fai) fai_destroy(d->fai); + bam_maqcns_destroy(d->c); + free(d->ido); free(d->ref); free(d); + return 0; +} diff --git a/bam_rmdup.c b/bam_rmdup.c new file mode 100644 index 0000000..1fa6cad --- /dev/null +++ b/bam_rmdup.c @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include "bam.h" + +typedef bam1_t *bam1_p; +#include "khash.h" +KHASH_SET_INIT_STR(name) +KHASH_MAP_INIT_INT64(pos, bam1_p) + +#define BUFFER_SIZE 0x40000 + +typedef struct { + int n, max; + bam1_t **a; +} tmp_stack_t; + +static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) +{ + if (stack->n == stack->max) { + stack->max = stack->max? 
stack->max<<1 : 0x10000; + stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); + } + stack->a[stack->n++] = b; +} + +static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out) +{ + int i; + for (i = 0; i != stack->n; ++i) { + bam_write1(out, stack->a[i]); + bam_destroy1(stack->a[i]); + } + stack->n = 0; + if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash); +} + +static void clear_del_set(khash_t(name) *del_set) +{ + khint_t k; + for (k = kh_begin(del_set); k < kh_end(del_set); ++k) + if (kh_exist(del_set, k)) + free((char*)kh_key(del_set, k)); + kh_clear(name, del_set); +} + +void bam_rmdup_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b; + int last_tid = -1, last_pos = -1; + uint64_t n_checked = 0, n_removed = 0; + tmp_stack_t stack; + khint_t k; + khash_t(pos) *best_hash; + khash_t(name) *del_set; + + best_hash = kh_init(pos); + del_set = kh_init(name); + b = bam_init1(); + memset(&stack, 0, sizeof(tmp_stack_t)); + header = bam_header_read(in); + bam_header_write(out, header); + + kh_resize(name, del_set, 4 * BUFFER_SIZE); + kh_resize(pos, best_hash, 3 * BUFFER_SIZE); + while (bam_read1(in, b) >= 0) { + bam1_core_t *c = &b->core; + if (c->tid != last_tid || last_pos != c->pos) { + dump_best(&stack, best_hash, out); // write the result + if (c->tid != last_tid) { + kh_clear(pos, best_hash); + if (kh_size(del_set)) { // check + fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); + clear_del_set(del_set); + } + if ((int)c->tid == -1) { // append unmapped reads + bam_write1(out, b); + while (bam_read1(in, b) >= 0) bam_write1(out, b); + break; + } + last_tid = c->tid; + fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]); + } + } + if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { + bam_write1(out, b); + } else if (c->isize > 0) { // paired, head + uint64_t 
key = (uint64_t)c->pos<<32 | c->isize; + int ret; + ++n_checked; + k = kh_put(pos, best_hash, key, &ret); + if (ret == 0) { // found in best_hash + bam1_t *p = kh_val(best_hash, k); + ++n_removed; + if (p->core.qual < c->qual) { // the current alignment is better + kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed + bam_copy1(p, b); // replaced as b + } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed + if (ret == 0) + fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b)); + } else { // not found in best_hash + kh_val(best_hash, k) = bam_dup1(b); + stack_insert(&stack, kh_val(best_hash, k)); + } + } else { // paired, tail + k = kh_get(name, del_set, bam1_qname(b)); + if (k != kh_end(del_set)) { + free((char*)kh_key(del_set, k)); + kh_del(name, del_set, k); + } else bam_write1(out, b); + } + last_pos = c->pos; + } + dump_best(&stack, best_hash, out); + + bam_header_destroy(header); + clear_del_set(del_set); + kh_destroy(name, del_set); + kh_destroy(pos, best_hash); + free(stack.a); + bam_destroy1(b); + fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked, + (double)n_removed/n_checked); +} +int bam_rmdup(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "Usage: samtools rmdup \n"); + return 1; + } + in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? 
bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + if (in == 0 || out == 0) { + fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); + return 1; + } + bam_rmdup_core(in, out); + bam_close(in); + bam_close(out); + return 0; +} diff --git a/bam_rmdupse.c b/bam_rmdupse.c new file mode 100644 index 0000000..df03717 --- /dev/null +++ b/bam_rmdupse.c @@ -0,0 +1,177 @@ +#include +#include "sam.h" +#include "khash.h" + +typedef struct { + int n, m; + int *a; +} listelem_t; + +KHASH_MAP_INIT_INT(32, listelem_t) + +#define BLOCK_SIZE 65536 + +typedef struct { + bam1_t *b; + int rpos, score; +} elem_t; + +typedef struct { + int n, max, x; + elem_t *buf; +} buffer_t; + +static int fill_buf(samfile_t *in, buffer_t *buf) +{ + int i, ret, last_tid, min_rpos = 0x7fffffff, capacity; + bam1_t *b = bam_init1(); + bam1_core_t *c = &b->core; + // squeeze out the empty cells at the beginning + for (i = 0; i < buf->n; ++i) + if (buf->buf[i].b) break; + if (i < buf->n) { // squeeze + if (i > 0) { + memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i)); + buf->n = buf->n - i; + } + } else buf->n = 0; + // calculate min_rpos + for (i = 0; i < buf->n; ++i) { + elem_t *e = buf->buf + i; + if (e->b && e->rpos >= 0 && e->rpos < min_rpos) + min_rpos = buf->buf[i].rpos; + } + // fill the buffer + buf->x = -1; + last_tid = buf->n? buf->buf[0].b->core.tid : -1; + capacity = buf->n + BLOCK_SIZE; + while ((ret = samread(in, b)) >= 0) { + elem_t *e; + uint8_t *qual = bam1_qual(b); + int is_mapped; + if (last_tid < 0) last_tid = c->tid; + if (c->tid != last_tid) { + if (buf->x < 0) buf->x = buf->n; + } + if (buf->n >= buf->max) { // enlarge + buf->max = buf->max? 
buf->max<<1 : 8; + buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max); + } + e = &buf->buf[buf->n++]; + e->b = bam_dup1(b); + e->rpos = -1; e->score = 0; + for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1; + e->score = (double)e->score / sqrt(c->l_qseq + 1); + is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1; + if (!is_mapped) e->score = -1; + if (is_mapped && (c->flag & BAM_FREVERSE)) { + e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b)); + if (min_rpos > e->rpos) min_rpos = e->rpos; + } + if (buf->n >= capacity) { + if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE; + else break; + } + } + if (ret >= 0 && buf->x < 0) buf->x = buf->n; + bam_destroy1(b); + return buf->n; +} + +static void rmdupse_buf(buffer_t *buf) +{ + khash_t(32) *h; + uint32_t key; + khint_t k; + int mpos, i, upper; + listelem_t *p; + mpos = 0x7fffffff; + mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff; + upper = (buf->x < 0)? 
buf->n : buf->x; + // fill the hash table + h = kh_init(32); + for (i = 0; i < upper; ++i) { + elem_t *e = buf->buf + i; + int ret; + if (e->score < 0) continue; + if (e->rpos >= 0) { + if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1; + else continue; + } else { + if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1; + else continue; + } + k = kh_put(32, h, key, &ret); + p = &kh_val(h, k); + if (ret == 0) { // present in the hash table + if (p->n == p->m) { + p->m <<= 1; + p->a = (int*)realloc(p->a, p->m * sizeof(int)); + } + p->a[p->n++] = i; + } else { + p->m = p->n = 1; + p->a = (int*)calloc(p->m, sizeof(int)); + p->a[0] = i; + } + } + // rmdup + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + int max, maxi; + p = &kh_val(h, k); + // get the max + for (i = max = 0, maxi = -1; i < p->n; ++i) { + if (buf->buf[p->a[i]].score > max) { + max = buf->buf[p->a[i]].score; + maxi = i; + } + } + // mark the elements + for (i = 0; i < p->n; ++i) { + buf->buf[p->a[i]].score = -1; + if (i != maxi) { + bam_destroy1(buf->buf[p->a[i]].b); + buf->buf[p->a[i]].b = 0; + } + } + // free + free(p->a); + } + } + kh_destroy(32, h); +} + +static void dump_buf(buffer_t *buf, samfile_t *out) +{ + int i; + for (i = 0; i < buf->n; ++i) { + elem_t *e = buf->buf + i; + if (e->score != -1) break; + if (e->b) { + samwrite(out, e->b); + bam_destroy1(e->b); + e->b = 0; + } + } +} + +int bam_rmdupse(int argc, char *argv[]) +{ + samfile_t *in, *out; + buffer_t *buf; + if (argc < 3) { + fprintf(stderr, "Usage: samtools rmdupse \n"); + return 1; + } + buf = calloc(1, sizeof(buffer_t)); + in = samopen(argv[1], "rb", 0); + out = samopen(argv[2], "wb", in->header); + while (fill_buf(in, buf)) { + rmdupse_buf(buf); + dump_buf(buf, out); + } + samclose(in); samclose(out); + free(buf->buf); free(buf); + return 0; +} diff --git a/bam_sort.c b/bam_sort.c new file mode 100644 index 0000000..402792a --- /dev/null +++ b/bam_sort.c @@ -0,0 +1,257 @@ +#include +#include +#include 
+#include +#include +#include +#include "bam.h" +#include "ksort.h" + +static int g_is_by_qname = 0; + +static inline int strnum_cmp(const char *a, const char *b) +{ + char *pa, *pb; + pa = (char*)a; pb = (char*)b; + while (*pa && *pb) { + if (isdigit(*pa) && isdigit(*pb)) { + long ai, bi; + ai = strtol(pa, &pa, 10); + bi = strtol(pb, &pb, 10); + if (ai != bi) return aibi? 1 : 0; + } else { + if (*pa != *pb) break; + ++pa; ++pb; + } + } + if (*pa == *pb) + return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; + return *pa<*pb? -1 : *pa>*pb? 1 : 0; +} + +#define HEAP_EMPTY 0xffffffffffffffffull + +typedef struct { + int i; + uint64_t pos; + bam1_t *b; +} heap1_t; + +static inline int heap_lt(const heap1_t a, const heap1_t b) +{ + if (g_is_by_qname) { + int t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); + return (t > 0 || (t == 0 && a.pos > b.pos)); + } else return (a.pos > b.pos); +} + +KSORT_INIT(heap, heap1_t, heap_lt) + +/*! + @abstract Merge multiple sorted BAM. + @param is_by_qname whether to sort by query name + @param out output BAM file name + @param n number of files to be merged + @param fn names of files to be merged + + @discussion Padding information may NOT correctly maintained. This + function is NOT thread safe. + */ +void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) +{ + bamFile fpout, *fp; + heap1_t *heap; + bam_header_t *hout = 0; + int i, j; + + g_is_by_qname = by_qname; + fp = (bamFile*)calloc(n, sizeof(bamFile)); + heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + for (i = 0; i != n; ++i) { + heap1_t *h; + bam_header_t *hin; + assert(fp[i] = bam_open(fn[i], "r")); + hin = bam_header_read(fp[i]); + if (i == 0) hout = hin; + else { // validate multiple baf + if (hout->n_targets != hin->n_targets) { + fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. 
Abort!\n", fn[i]); + exit(1); + } + for (j = 0; j < hout->n_targets; ++j) { + if (strcmp(hout->target_name[j], hin->target_name[j])) { + fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n", + hout->target_name[j], hin->target_name[j], fn[i]); + exit(1); + } + if (hout->target_len[j] != hin->target_len[j]) + fprintf(stderr, "[bam_merge_core] different target sequence length: %d != %d in file '%s'. Continue.\n", + hout->target_len[j], hin->target_len[j], fn[i]); + } + bam_header_destroy(hin); + } + h = heap + i; + h->i = i; + h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); + if (bam_read1(fp[i], h->b) >= 0) + h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); + else h->pos = HEAP_EMPTY; + } + fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); + assert(fpout); + bam_header_write(fpout, hout); + bam_header_destroy(hout); + + ks_heapmake(heap, n, heap); + while (heap->pos != HEAP_EMPTY) { + bam1_t *b = heap->b; + bam_write1_core(fpout, &b->core, b->data_len, b->data); + if ((j = bam_read1(fp[heap->i], b)) >= 0) + heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); + else if (j == -1) heap->pos = HEAP_EMPTY; + else fprintf(stderr, "[bam_merge_core] '%s' is truncated. 
Continue anyway.\n", fn[heap->i]); + ks_heapadjust(heap, 0, n, heap); + } + + for (i = 0; i != n; ++i) { + bam_close(fp[i]); + free(heap[i].b->data); + free(heap[i].b); + } + bam_close(fpout); + free(fp); free(heap); +} +int bam_merge(int argc, char *argv[]) +{ + int c, is_by_qname = 0; + while ((c = getopt(argc, argv, "n")) >= 0) { + switch (c) { + case 'n': is_by_qname = 1; break; + } + } + if (optind + 2 >= argc) { + fprintf(stderr, "Usage: samtools merge [-n] [...]\n"); + return 1; + } + bam_merge_core(is_by_qname, argv[optind], argc - optind - 1, argv + optind + 1); + return 0; +} + +typedef bam1_t *bam1_p; + +static inline int bam1_lt(const bam1_p a, const bam1_p b) +{ + if (g_is_by_qname) { + int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); + return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)))); + } else return (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)); +} +KSORT_INIT(sort, bam1_p, bam1_lt) + +static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h) +{ + char *name; + int i; + bamFile fp; + ks_mergesort(sort, k, buf, 0); + name = (char*)calloc(strlen(prefix) + 20, 1); + if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); + else sprintf(name, "%s.bam", prefix); + assert(fp = bam_open(name, "w")); + free(name); + bam_header_write(fp, h); + for (i = 0; i < k; ++i) + bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data); + bam_close(fp); +} + +/*! + @abstract Sort an unsorted BAM file based on the chromosome order + and the leftmost position of an alignment + + @param is_by_qname whether to sort by query name + @param fn name of the file to be sorted + @param prefix prefix of the output and the temporary files; upon + sucessess, prefix.bam will be written. 
+ @param max_mem approxiate maximum memory (very inaccurate) + + @discussion It may create multiple temporary subalignment files + and then merge them by calling bam_merge_core(). This function is + NOT thread safe. + */ +void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) +{ + int n, ret, k, i; + size_t mem; + bam_header_t *header; + bamFile fp; + bam1_t *b, **buf; + + g_is_by_qname = is_by_qname; + n = k = 0; mem = 0; + fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); + // write sub files + for (;;) { + if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); + b = buf[k]; + if ((ret = bam_read1(fp, b)) < 0) break; + mem += ret; + ++k; + if (mem >= max_mem) { + sort_blocks(n++, k, buf, prefix, header); + mem = 0; k = 0; + } + } + if (ret != -1) + fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); + if (n == 0) sort_blocks(-1, k, buf, prefix, header); + else { // then merge + char **fns, *fnout; + fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); + sort_blocks(n++, k, buf, prefix, header); + fnout = (char*)calloc(strlen(prefix) + 20, 1); + sprintf(fnout, "%s.bam", prefix); + fns = (char**)calloc(n, sizeof(char*)); + for (i = 0; i < n; ++i) { + fns[i] = (char*)calloc(strlen(prefix) + 20, 1); + sprintf(fns[i], "%s.%.4d.bam", prefix, i); + } + bam_merge_core(is_by_qname, fnout, n, fns); + free(fnout); + for (i = 0; i < n; ++i) { + unlink(fns[i]); + free(fns[i]); + } + free(fns); + } + for (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) { + if (buf[k]) { + free(buf[k]->data); + free(buf[k]); + } + } + free(buf); + bam_header_destroy(header); + bam_close(fp); +} + +int bam_sort(int argc, char *argv[]) +{ + size_t max_mem = 500000000; + int c, is_by_qname = 0; + while ((c = getopt(argc, argv, "nm:")) >= 0) { + switch (c) { + case 'n': is_by_qname = 1; break; + 
case 'm': max_mem = atol(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: samtools sort [-n] [-m ] \n"); + return 1; + } + bam_sort_core(is_by_qname, argv[optind], argv[optind+1], max_mem); + return 0; +} diff --git a/bam_stat.c b/bam_stat.c new file mode 100644 index 0000000..c1c4a43 --- /dev/null +++ b/bam_stat.c @@ -0,0 +1,78 @@ +#include +#include +#include "bam.h" + +typedef struct { + long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; + long long n_sgltn, n_read1, n_read2; + long long n_qcfail, n_dup; + long long n_diffchr, n_diffhigh; +} bam_flagstat_t; + +#define flagstat_loop(s, c) do { \ + ++(s)->n_reads; \ + if ((c)->flag & BAM_FPAIRED) { \ + ++(s)->n_pair_all; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ + if ((c)->flag & BAM_FMUNMAP) ++(s)->n_sgltn; \ + if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ + ++(s)->n_pair_map; \ + if ((c)->mtid != (c)->tid) { \ + ++(s)->n_diffchr; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh; \ + } \ + } \ + } \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ + if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ + } while (0) + +bam_flagstat_t *bam_flagstat_core(bamFile fp) +{ + bam_flagstat_t *s; + bam1_t *b; + bam1_core_t *c; + int ret; + s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); + b = bam_init1(); + c = &b->core; + while ((ret = bam_read1(fp, b)) >= 0) + flagstat_loop(s, c); + bam_destroy1(b); + if (ret != -1) + fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + return s; +} +int bam_flagstat(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *header; + bam_flagstat_t *s; + if (argc == optind) { + fprintf(stderr, "Usage: samtools flagstat \n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + s = bam_flagstat_core(fp); + printf("%lld in total\n", s->n_reads); + printf("%lld QC failure\n", s->n_qcfail); + printf("%lld duplicates\n", s->n_dup); + printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); + printf("%lld paired in sequencing\n", s->n_pair_all); + printf("%lld read1\n", s->n_read1); + printf("%lld read2\n", s->n_read2); + printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); + printf("%lld with itself and mate mapped\n", s->n_pair_map); + printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); + printf("%lld with mate mapped to a different chr\n", s->n_diffchr); + printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); + free(s); + bam_header_destroy(header); + bam_close(fp); + return 0; +} diff --git a/bam_tview.c b/bam_tview.c new file mode 100644 index 0000000..be2579c --- /dev/null +++ b/bam_tview.c @@ -0,0 +1,379 @@ +#ifndef _NO_CURSES +#include +#ifdef NCURSES_VERSION +#include +#include +#include +#include "bam.h" +#include "faidx.h" +#include "bam_maqcns.h" + +char bam_aux_getCEi(bam1_t *b, int i); +char bam_aux_getCSi(bam1_t *b, int i); +char bam_aux_getCQi(bam1_t *b, int i); + +#define TV_MIN_ALNROW 2 +#define TV_MAX_GOTO 40 +#define TV_LOW_MAPQ 10 + +#define TV_COLOR_MAPQ 0 +#define TV_COLOR_BASEQ 1 +#define TV_COLOR_NUCL 2 +#define TV_COLOR_COL 3 +#define TV_COLOR_COLQ 4 + +#define TV_BASE_NUCL 0 +#define TV_BASE_COLOR_SPACE 1 + +typedef struct { + int mrow, mcol; + WINDOW *wgoto, *whelp; + + bam_index_t *idx; + bam_lplbuf_t *lplbuf; + bam_header_t *header; + bamFile fp; + int curr_tid, left_pos; + faidx_t *fai; + bam_maqcns_t *bmc; + + int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins; + char *ref; +} tview_t; + +int tv_pl_func(uint32_t tid, uint32_t pos, int n, 
const bam_pileup1_t *pl, void *data) +{ + tview_t *tv = (tview_t*)data; + int i, j, c, rb, attr, max_ins = 0; + uint32_t call = 0; + if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen + // print reference + rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; + for (i = tv->last_pos + 1; i < pos; ++i) { + if (i%10 == 0) mvprintw(0, tv->ccol, "%-d", i+1); + c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; + mvaddch(1, tv->ccol++, c); + } + if (pos%10 == 0) mvprintw(0, tv->ccol, "%-d", pos+1); + // print consensus + call = bam_maqcns_call(n, pl, tv->bmc); + attr = A_UNDERLINE; + c = ",ACMGRSVTWYHKDBN"[call>>28&0xf]; + i = (call>>8&0xff)/10+1; + if (i > 4) i = 4; + attr |= COLOR_PAIR(i); + if (c == toupper(rb)) c = '.'; + attron(attr); + mvaddch(2, tv->ccol, c); + attroff(attr); + if(tv->ins) { + // calculate maximum insert + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel; + } + } + // core loop + for (j = 0; j <= max_ins; ++j) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + int row = TV_MIN_ALNROW + p->level - tv->row_shift; + if (j == 0) { + if (!p->is_del) { + if (tv->base_for == TV_BASE_COLOR_SPACE && + (c = bam_aux_getCSi(p->b, p->qpos))) { + c = bam_aux_getCSi(p->b, p->qpos); + // assume that if we found one color, we will be able to get the color error + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.'; + } + else { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + } + } else c = '*'; + } else { // padding + if (j > p->indel) c = '*'; + else { // insertion + if (tv->base_for == TV_BASE_NUCL) { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? 
',' : '.'; + } + else { + c = bam_aux_getCSi(p->b, p->qpos + j); + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.'; + } + } + } + if (row > TV_MIN_ALNROW && row < tv->mrow) { + int x; + attr = 0; + if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) + || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE; + if (tv->color_for == TV_COLOR_BASEQ) { + x = bam1_qual(p->b)[p->qpos]/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } else if (tv->color_for == TV_COLOR_MAPQ) { + x = p->b->core.qual/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } else if (tv->color_for == TV_COLOR_NUCL) { + x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5; + attr |= COLOR_PAIR(x); + } else if(tv->color_for == TV_COLOR_COL) { + x = 0; + switch(bam_aux_getCSi(p->b, p->qpos)) { + case '0': x = 0; break; + case '1': x = 1; break; + case '2': x = 2; break; + case '3': x = 3; break; + case '4': x = 4; break; + default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break; + } + x+=5; + attr |= COLOR_PAIR(x); + } else if(tv->color_for == TV_COLOR_COLQ) { + x = bam_aux_getCQi(p->b, p->qpos); + if(0 == x) x = bam1_qual(p->b)[p->qpos]; + x = x/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } + attron(attr); + mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); + attroff(attr); + } + } + c = j? 
'*' : rb; + if (c == '*') { + attr = COLOR_PAIR(8); + attron(attr); + mvaddch(1, tv->ccol++, c); + attroff(attr); + } else mvaddch(1, tv->ccol++, c); + } + tv->last_pos = pos; + return 0; +} + +tview_t *tv_init(const char *fn, const char *fn_fa) +{ + tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t)); + tv->is_dot = 1; + tv->idx = bam_index_load(fn); + if (tv->idx == 0) exit(1); + tv->fp = bam_open(fn, "r"); + bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); + assert(tv->fp); + tv->header = bam_header_read(tv->fp); + tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); + if (fn_fa) tv->fai = fai_load(fn_fa); + tv->bmc = bam_maqcns_init(); + tv->ins = 1; + bam_maqcns_prepare(tv->bmc); + + initscr(); + keypad(stdscr, TRUE); + clear(); + noecho(); + cbreak(); +#ifdef NCURSES_VERSION + getmaxyx(stdscr, tv->mrow, tv->mcol); +#else + tv->mrow = 80; tv->mcol = 40; +#endif + tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); + tv->whelp = newwin(27, 40, 5, 5); + tv->color_for = TV_COLOR_MAPQ; + start_color(); + init_pair(1, COLOR_BLUE, COLOR_BLACK); + init_pair(2, COLOR_GREEN, COLOR_BLACK); + init_pair(3, COLOR_YELLOW, COLOR_BLACK); + init_pair(4, COLOR_WHITE, COLOR_BLACK); + init_pair(5, COLOR_GREEN, COLOR_BLACK); + init_pair(6, COLOR_CYAN, COLOR_BLACK); + init_pair(7, COLOR_YELLOW, COLOR_BLACK); + init_pair(8, COLOR_RED, COLOR_BLACK); + init_pair(9, COLOR_BLUE, COLOR_BLACK); + return tv; +} + +void tv_destroy(tview_t *tv) +{ + delwin(tv->wgoto); delwin(tv->whelp); + endwin(); + + bam_lplbuf_destroy(tv->lplbuf); + bam_maqcns_destroy(tv->bmc); + bam_index_destroy(tv->idx); + if (tv->fai) fai_destroy(tv->fai); + free(tv->ref); + bam_header_destroy(tv->header); + bam_close(tv->fp); + free(tv); +} + +int tv_fetch_func(const bam1_t *b, void *data) +{ + tview_t *tv = (tview_t*)data; + bam_lplbuf_push(b, tv->lplbuf); + return 0; +} + +int tv_draw_aln(tview_t *tv, int tid, int pos) +{ + // reset + clear(); + tv->curr_tid = tid; tv->left_pos = pos; + tv->last_pos = tv->left_pos - 1; + tv->ccol 
= 0; + // print ref and consensus + if (tv->fai) { + char *str; + if (tv->ref) free(tv->ref); + str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); + sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); + tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); + free(str); + } + // draw aln + bam_lplbuf_reset(tv->lplbuf); + bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func); + bam_lplbuf_push(0, tv->lplbuf); + return 0; +} + +static void tv_win_goto(tview_t *tv, int *tid, int *pos) +{ + char str[256]; + int i, l = 0; + wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(tv->wgoto, 1, 2, "Goto: "); + for (;;) { + int c = wgetch(tv->wgoto); + wrefresh(tv->wgoto); + if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { + --l; + } else if (c == KEY_ENTER || c == '\012' || c == '\015') { + int _tid = -1, _beg, _end; + bam_parse_region(tv->header, str, &_tid, &_beg, &_end); + if (_tid >= 0) { + *tid = _tid; *pos = _beg; + return; + } + } else if (isgraph(c)) { + if (l < TV_MAX_GOTO) str[l++] = c; + } else if (c == '\027') l = 0; + else if (c == '\033') return; + str[l] = '\0'; + for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); + mvwprintw(tv->wgoto, 1, 8, "%s", str); + } +} + +static void tv_win_help(tview_t *tv) { + int r = 1; + WINDOW *win = tv->whelp; + wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(win, r++, 2, " -=- Help -=- "); + r++; + mvwprintw(win, r++, 2, "? 
This window"); + mvwprintw(win, r++, 2, "Arrows Small scroll movement"); + mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); + mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); + mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); + mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); + mvwprintw(win, r++, 2, "space Scroll one screen"); + mvwprintw(win, r++, 2, "backspace Scroll back one screen"); + mvwprintw(win, r++, 2, "g Go to specific location"); + mvwprintw(win, r++, 2, "m Color for mapping qual"); + mvwprintw(win, r++, 2, "n Color for nucleotide"); + mvwprintw(win, r++, 2, "b Color for base quality"); + mvwprintw(win, r++, 2, "c Color for cs color"); + mvwprintw(win, r++, 2, "z Color for cs qual"); + mvwprintw(win, r++, 2, ". Toggle on/off dot view"); + mvwprintw(win, r++, 2, "N Turn on nt view"); + mvwprintw(win, r++, 2, "C Turn on cs view"); + mvwprintw(win, r++, 2, "i Toggle on/off ins"); + mvwprintw(win, r++, 2, "q Exit"); + r++; + mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); + mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); + mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); + wrefresh(win); + wgetch(win); +} + +void tv_loop(tview_t *tv) +{ + int tid, pos; + tid = tv->curr_tid; pos = tv->left_pos; + while (1) { + int c = getch(); + //if(256 < c) {c = 1 + (c%256);} // Terminal was displaying ctrl-H as 263 via ssh from Mac OS X 10.5 computer + switch (c) { + case '?': tv_win_help(tv); break; + case '\033': + case 'q': goto end_loop; + case 'g': tv_win_goto(tv, &tid, &pos); break; + case 'm': tv->color_for = TV_COLOR_MAPQ; break; + case 'b': tv->color_for = TV_COLOR_BASEQ; break; + case 'n': tv->color_for = TV_COLOR_NUCL; break; + case 'c': tv->color_for = TV_COLOR_COL; break; + case 'z': tv->color_for = TV_COLOR_COLQ; break; + case KEY_LEFT: + case 'h': --pos; break; + case KEY_RIGHT: + case 'l': ++pos; break; + case KEY_SLEFT: + case 'H': pos -= 20; break; + case KEY_SRIGHT: + case 'L': pos += 20; break; + case '.': tv->is_dot = 
!tv->is_dot; break; + case 'N': tv->base_for = TV_BASE_NUCL; break; + case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; + case 'i': tv->ins = !tv->ins; break; + case '\010': pos -= 1000; break; + case '\014': pos += 1000; break; + case ' ': pos += tv->mcol; break; + case KEY_UP: + case 'j': --tv->row_shift; break; + case KEY_DOWN: + case 'k': ++tv->row_shift; break; + case KEY_BACKSPACE: + case '\177': pos -= tv->mcol; break; +#ifdef KEY_RESIZE + case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; +#endif + default: continue; + } + if (pos < 0) pos = 0; + if (tv->row_shift < 0) tv->row_shift = 0; + tv_draw_aln(tv, tid, pos); + } +end_loop: + return; +} + +int bam_tview_main(int argc, char *argv[]) +{ + tview_t *tv; + if (argc == 1) { + fprintf(stderr, "Usage: bamtk tview [ref.fasta]\n"); + return 1; + } + tv = tv_init(argv[1], (argc == 2)? 0 : argv[2]); + tv_draw_aln(tv, 0, 0); + tv_loop(tv); + tv_destroy(tv); + return 0; +} +#else // #ifdef NCURSES_VERSION +#warning "The ncurses library is unavailable; tview is disabled." 
+int bam_tview_main(int argc, char *argv[]) +{ + fprintf(stderr, "[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n"); + return 1; +} +#endif +#endif // #ifndef _NO_CURSES diff --git a/bamtk.c b/bamtk.c new file mode 100644 index 0000000..3386836 --- /dev/null +++ b/bamtk.c @@ -0,0 +1,118 @@ +#include +#include +#include +#include "bam.h" + +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "0.1.5c (r385)" +#endif + +int bam_taf2baf(int argc, char *argv[]); +int bam_pileup(int argc, char *argv[]); +int bam_merge(int argc, char *argv[]); +int bam_index(int argc, char *argv[]); +int bam_sort(int argc, char *argv[]); +int bam_tview_main(int argc, char *argv[]); +int bam_mating(int argc, char *argv[]); +int bam_rmdup(int argc, char *argv[]); +int bam_rmdupse(int argc, char *argv[]); +int bam_flagstat(int argc, char *argv[]); +int bam_fillmd(int argc, char *argv[]); + +int main_samview(int argc, char *argv[]); +int main_import(int argc, char *argv[]); + +int faidx_main(int argc, char *argv[]); +int glf3_view_main(int argc, char *argv[]); + +int bam_tagview(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *header; + bam1_t *b; + char tag[2]; + int ret; + if (argc < 3) { + fprintf(stderr, "Usage: samtools tagview \n"); + return 1; + } + fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + if (header == 0) { + fprintf(stderr, "[bam_view] fail to read the BAM header. 
Abort!\n"); + return 1; + } + tag[0] = argv[2][0]; tag[1] = argv[2][1]; + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + while ((ret = bam_read1(fp, b)) >= 0) { + uint8_t *d = bam_aux_get(b, tag); + if (d) { + printf("%s\t%d\t", bam1_qname(b), b->core.flag); + if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d)); + else if (d[0] == 'f') printf("%f\n", bam_aux2f(d)); + else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d)); + else if (d[0] == 'A') printf("%c\n", bam_aux2A(d)); + else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d)); + else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d)); + else printf("\n"); + } + } + if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. (%d)\n", ret); + free(b->data); free(b); + bam_header_destroy(header); + bam_close(fp); + return 0; +} + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); + fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: samtools [options]\n\n"); + fprintf(stderr, "Command: import import from SAM (obsolete; use `view')\n"); + fprintf(stderr, " view export to the text format\n"); + fprintf(stderr, " sort sort alignment file\n"); + fprintf(stderr, " merge merge multiple sorted alignment files\n"); + fprintf(stderr, " pileup generate pileup output\n"); + fprintf(stderr, " faidx index/extract FASTA\n"); +#ifndef _NO_CURSES + fprintf(stderr, " tview text alignment viewer\n"); +#endif + fprintf(stderr, " index index alignment\n"); + fprintf(stderr, " fixmate fix mate information\n"); + fprintf(stderr, " rmdup remove PCR duplicates\n"); + fprintf(stderr, " glfview print GLFv3 file\n"); + fprintf(stderr, " flagstat simple stats\n"); + fprintf(stderr, " fillmd fill the MD tag and change identical base to =\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) return usage(); + if 
(strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); + else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); + else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); + else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); + else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); + else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); + else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); + else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); + else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1); + else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); + else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); + else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); + else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); +#ifndef _NO_CURSES + else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); +#endif + else { + fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); + return 1; + } + return 0; +} diff --git a/bgzf.c b/bgzf.c new file mode 100644 index 0000000..fe4e31d --- /dev/null +++ b/bgzf.c @@ -0,0 +1,634 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. + 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. 
+ 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); + +typedef int8_t byte; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +inline +int +min(int x, int y) +{ + return (x < y) ? 
x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); + return fp; +} + +static +BGZF* +open_read(int fd) +{ +#ifdef _USE_KNETFILE + knetFile *file = knet_dopen(fd, "r"); +#else + FILE* file = fdopen(fd, "r"); +#endif + BGZF* fp; + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = fd; + fp->open_mode = 'r'; +#ifdef _USE_KNETFILE + fp->x.fpr = file; +#else + fp->file = file; +#endif + return fp; +} + +static +BGZF* +open_write(int fd, bool is_uncompressed) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp; + if (file == 0) return 0; + fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; +#ifdef _USE_KNETFILE + fp->x.fpw = file; +#else + fp->file = file; +#endif + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. 
*/ +#ifdef _USE_KNETFILE + knetFile *file = knet_open(path, mode); + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = -1; + fp->open_mode = 'r'; + fp->x.fpr = file; +#else + int oflag = O_RDONLY; + int fd = open(path, oflag); + if (fd == -1) return 0; + fp = open_read(fd); +#endif + } else if (mode[0] == 'w' || mode[0] == 'W') { + int oflag = O_WRONLY | O_CREAT | O_TRUNC; + int fd = open(path, oflag, 0644); + if (fd == -1) return 0; + fp = open_write(fd, strstr(mode, "u")? 1 : 0); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (fd == -1) return 0; + if (mode[0] == 'r' || mode[0] == 'R') { + return open_read(fd); + } else if (mode[0] == 'w' || mode[0] == 'W') { + return open_write(fd, strstr(mode, "u")? 1 : 0); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + byte* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + int compress_level = fp->is_uncompressed? 
0 : Z_DEFAULT_COMPRESSION; + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* 
fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const byte* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (byte) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + 
khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + +static +int +read_block(BGZF* fp) +{ + byte header[BLOCK_HEADER_LENGTH]; + int size = 0; +#ifdef _USE_KNETFILE + int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; + int count = knet_read(fp->x.fpr, header, sizeof(header)); +#else + int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; + int count = fread(header, 1, sizeof(header), fp->file); +#endif + if (count == 0) { + fp->block_length = 0; + return 0; + } + size = count; + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + byte* compressed_block = (byte*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; +#ifdef _USE_KNETFILE + count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); +#else + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); +#endif + if (count != remaining) { + report_error(fp, "read failed"); + 
return -1; + } + size += count; + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + byte* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = min(length-bytes_read, available); + byte* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } +#ifdef _USE_KNETFILE + int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + return -1; + } + + if (fp->uncompressed_block == NULL) { + 
fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const byte* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = min(block_length - fp->block_offset, length - bytes_written); + byte* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } +#ifdef _USE_KNETFILE + if (fflush(fp->x.fpw) != 0) { +#else + if (fflush(fp->file) != 0) { +#endif + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { +#ifdef _USE_KNETFILE + int ret; + if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); + else ret = knet_close(fp->x.fpr); + if (ret != 0) return -1; +#else + if (fclose(fp->file) != 0) { + return -1; + } +#endif + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +int64_t +bgzf_tell(BGZF* fp) +{ + return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { +#else + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { +#endif + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates 
current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} + diff --git a/bgzf.h b/bgzf.h new file mode 100644 index 0000000..d5eeafe --- /dev/null +++ b/bgzf.h @@ -0,0 +1,120 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include +#include +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. 
+ */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +int64_t bgzf_tell(BGZF* fp); + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. + */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bgzip.c b/bgzip.c new file mode 100644 index 0000000..c58d55d --- /dev/null +++ b/bgzip.c @@ -0,0 +1,166 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. 
+ */ +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static int bgzip_main_usage() +{ + printf("\n"); + printf("Usage: bgzip [options] [file] ...\n\n"); + printf("Options: -c write on standard output, keep original files unchanged\n"); + printf(" -d decompress\n"); + // printf(" -l list compressed file contents\n"); + printf(" -b INT decompress at virtual file pointer INT\n"); + printf(" -s INT decompress INT bytes in the uncompressed file\n"); + printf(" -h give this help\n"); + printf("\n"); + return 0; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644)) < 0 && errno == EEXIST) { + printf("bgzip: %s already exists; do you wish to overwrite (y or n)? ", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + printf("bgzip: not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) { + fprintf(stderr, "bgzip: %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +static +void +fail(BGZF* fp) +{ + printf("Error: %s\n", fp->error); + exit(1); +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + BGZF *rz; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ + switch(c){ + case 'h': return bgzip_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + // case 'l': compress = 2; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if(end >= 0 && end < start){ + fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); + return 1; + } + if(compress == 1){ + int f_src, f_dst = -1; + if(argc > optind){ + 
if((f_src = open(argv[optind], O_RDONLY)) < 0){ + fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); + return 1; + } + if(pstdout){ + f_dst = fileno(stdout); + } else { + char *name = malloc(sizeof(strlen(argv[optind]) + 5)); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } else if(pstdout){ + f_src = fileno(stdin); + f_dst = fileno(stdout); + } else return bgzip_main_usage(); + rz = bgzf_fdopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) { + if (bgzf_write(rz, buffer, c) < 0) { + fail(rz); + } + } + // f_dst will be closed here + if (bgzf_close(rz) < 0) { + fail(rz); + } + if (argc > optind) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + if(argc <= optind) return bgzip_main_usage(); + int f_dst; + if (argc > optind && !pstdout) { + char *name; + if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) { + printf("bgzip: %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } else f_dst = fileno(stdout); + rz = bgzf_open(argv[optind], "r"); + if (rz == NULL) { + printf("Could not open file: %s\n", argv[optind]); + return 1; + } + buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(rz, start, SEEK_SET) < 0) { + fail(rz); + } + while(1){ + if(end < 0) c = bgzf_read(rz, buffer, WINDOW_SIZE); + else c = bgzf_read(rz, buffer, (end - start > WINDOW_SIZE)? 
WINDOW_SIZE:(end - start)); + if(c == 0) break; + if (c < 0) fail(rz); + start += c; + write(f_dst, buffer, c); + if(end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(rz) < 0) { + fail(rz); + } + if (!pstdout) unlink(argv[optind]); + return 0; + } +} + diff --git a/examples/00README.txt b/examples/00README.txt new file mode 100644 index 0000000..dbb276f --- /dev/null +++ b/examples/00README.txt @@ -0,0 +1,23 @@ +File ex1.fa contains two sequences cut from the human genome +build36. They were exatracted with command: + + samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 + +Sequence names were changed manually for simplicity. File ex1.sam.gz +contains MAQ alignments exatracted with: + + (samtools view NA18507_maq.bam 2:2044001-2045500; + samtools view NA18507_maq.bam 20:68001-69500) + +and processed with `samtools fixmate' to make it self-consistent as a +standalone alignment. + +To try samtools, you may run the following commands: + + samtools faidx ex1.fa # index the reference FASTA + samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM + samtools index ex1.bam # index BAM + samtools tview ex1.bam ex1.fa # view alignment + samtools pileup -cf ex1.fa ex1.bam # pileup and consensus + samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz + diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 0000000..3fe3e5a --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,27 @@ +all:../libbam.a ../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz calDepth + @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo; + +ex1.fa.fai:ex1.fa + ../samtools faidx ex1.fa +ex1.bam:ex1.sam.gz ex1.fa.fai + ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam +ex1.bam.bai:ex1.bam + ../samtools index ex1.bam +ex1.pileup.gz:ex1.bam ex1.fa + ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz +ex1.glf:ex1.bam ex1.fa + ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf +ex1.glfview.gz:ex1.glf + 
../samtools glfview ex1.glf | gzip > ex1.glfview.gz + +../samtools: + (cd ..; make samtools) + +../libbam.a: + (cd ..; make libbam.a) + +calDepth:../libbam.a calDepth.c + gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam + +clean: + rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM \ No newline at end of file diff --git a/examples/calDepth.c b/examples/calDepth.c new file mode 100644 index 0000000..7a3239c --- /dev/null +++ b/examples/calDepth.c @@ -0,0 +1,62 @@ +#include +#include "sam.h" + +typedef struct { + int beg, end; + samfile_t *in; +} tmpstruct_t; + +// callback for bam_fetch() +static int fetch_func(const bam1_t *b, void *data) +{ + bam_plbuf_t *buf = (bam_plbuf_t*)data; + bam_plbuf_push(b, buf); + return 0; +} +// callback for bam_plbuf_init() +static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + tmpstruct_t *tmp = (tmpstruct_t*)data; + if ((int)pos >= tmp->beg && (int)pos < tmp->end) + printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n); + return 0; +} + +int main(int argc, char *argv[]) +{ + tmpstruct_t tmp; + if (argc == 1) { + fprintf(stderr, "Usage: calDepth [region]\n"); + return 1; + } + tmp.beg = 0; tmp.end = 0x7fffffff; + tmp.in = samopen(argv[1], "rb", 0); + if (tmp.in == 0) { + fprintf(stderr, "Fail to open BAM file %s\n", argv[1]); + return 1; + } + if (argc == 2) { // if a region is not specified + sampileup(tmp.in, -1, pileup_func, &tmp); + } else { + int ref; + bam_index_t *idx; + bam_plbuf_t *buf; + idx = bam_index_load(argv[1]); // load BAM index + if (idx == 0) { + fprintf(stderr, "BAM indexing file is not available.\n"); + return 1; + } + bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region + if (ref < 0) { + fprintf(stderr, "Invalid region %s\n", argv[2]); + return 1; + } + buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup + bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func); + 
bam_plbuf_push(0, buf); // finalize pileup + bam_index_destroy(idx); + bam_plbuf_destroy(buf); + } + samclose(tmp.in); + return 0; +} diff --git a/examples/ex1.fa b/examples/ex1.fa new file mode 100644 index 0000000..ef611b4 --- /dev/null +++ b/examples/ex1.fa @@ -0,0 +1,56 @@ +>seq1 +CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT +GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC +GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG +TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC +AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA +CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC +AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT +CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA +ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC +AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC +AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC +ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC +CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT +TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT +TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT +GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT +ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA +ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG +TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA +CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG +TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC +TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC +TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG +TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG +AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA +TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC +TCCCTCGTCTTCTTA +>seq2 +TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG 
+CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT +TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT +CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA +AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT +AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC +ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG +GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT +CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT +TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA +AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA +ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT +TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA +AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC +TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA +GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT +AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA +AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT +AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT +AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT +ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT +GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG +CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA +GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA +AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA +TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC +CAGAAAAAAATATTTACAGTAACT diff --git a/examples/ex1.sam.gz b/examples/ex1.sam.gz new file mode 100644 index 0000000000000000000000000000000000000000..44c07ee131fa9a231881ae3fdd3a549aa1367c87 GIT binary patch literal 114565 zcmV(nK=QvIiwFR^vD`@j1FT%hj^j#pUDMyusZ1rMEXF)hub~#)*d}fMo6W#5`Md*?fB*c{P2z2R~ z)TjStUc&dcvcw`vAENOthQ^KnZDkc(E9R;DivBkJgFkj2}^ksTemzh|d-@F}XpkSrN_K zOcCYgGTR?3qQ(CawRMoE7;R;yh%4xBBw~Q9Mrdv8%af1E{<7X4(uc6PrU+Fi2t`3? 
zCNdO+Pze1aNe?CLS?LBg9Y!g3Angv2*A8SS$5bT9QS5L!aEmfZX`*dB#5sB$>1Wom z3J|V)68^FoA4q<-#ZuR@u0P&P_?TG>{4if?{ahIpOhv)8m9QhkFo(D*LQ;b%M0+W# zOa!8Cp-1mn>X{U6*cYMU7Y*W{g<5(!atJZE=na71R}L|?4;Q2v+E*=FQ*hByXoY_d zNJL)S0?<~$XMh*DAA=taeY~#V!>1r)#k=6r{^GTU->EO=a``mwBx`dxnR;f`$`q7x zF-i{+MoD5i2s2(qO##OCl5~~R9Z-ptm(o>Ba=bB|_iQ)5lOpKOS+q66;UPt1Ofu(B z=YAP61L`&-;Sbm>Vl+6T0Gm)!jWG2PtN~MLz_itKK>{G&-ilrxq@cBGp!u!$wS9fbGlw)LvkTbEC`&n8 zauB0!jE`J*7ANZ%&h5;p<7+qk5f=03?>E&511=tM6Iupns%C~{0+T8-M{-W9NXhc& zSz_#E9FLK7_AJb?p4q$g6UPMhuqaj&HCz*DuW7`v^tPExHPgzc7y$8sLg zyLq|fJ_Wc26qOvki(E^ssJtI69@^ktwTkk38fDr_T@hCeLyT zQ;JE>2oqk)8FNO4Km%8gs2SWSPHyrVCJ(K7ZP{}QVUd(suaruy|L`Yq`paKt?$d5^ zxQ+`W1xIifO$_kmqdX_k$Y!fwMLD=8#ylLmN8NsWv~Q`Me3z|Sw*y1zPIOje%1+8b^5uGppmb4MBN?LT!2=y znB*pggw!aQ>iL4xC)b%9^_^^+57L4YNmjo(E9cJM?5kb*=n&kAkTfH!1O3#TEC~?% zImBxwC3($n#hQw_3;LUY2RPr~>vVp92Y<&yI@GCW0+B#^sfUBAb#_hn9}yF5cZDQ|!(Md@{vuzbPw zbAl8f5*>L&`rRSDcSti$9*?Z8{{2$d%l@;Ir`3;0dqbj-sbNXhjm&N; zQJcBQbBSjAA|;sTziAY(C(`88d6xgm?QrD%k0Y4ENAFzJo|%z{8dY-5?qvsSIGt;q9aSuy1c_ic@sSk$tY8C4U1{#7vJ=-KjBugLPZm;lwj? 
z^+)01cJBgFE*vctsw1dztJ)Y<`<~)^`i+vz9HLMo#s=x-01#;Ixo(7H^Lo8zQ>RK7 zUJBWpIHPPs^rG_JIcH2=_u(?gzD`i8YE7>6BEhwN$*O6(Z!b?&J-uc@kSwmn45omp zE}9G>#^)Ne?y?Fn;5Kc2{aAOzw_>rn1hy%K@U_*UL~QCo`p1&l_)?sbzqdpxgLI$_ zin(LasHywNQDXz+b)9@i3Vg`CU+s7Z0hUO*B(#XWfoYl%r{K{I5eWa7Y7HP+!~J@F zf-{!f+t)6g-K%Vt=LpDS!El76^iVgW7r{piFJhb&y4pnhXrY|~m`ut-PHks>8n7d?H2&sIaSpbUCyBn#g^T1jbZ zbykuwY)h%smdoYnX6Y^R>HKUjG0I*{`>dp^&P%WdnJG&E4IgC`1{sByWc1){;lxq{ zUt;0RMj0)BJCji8D|J1BRoPH%TFB$l#i&LN_~;CEpaedJKn#CMYguCsQiB}D09 zL>*P=fHEUejVuMwEtreFqPYhAo_aRKZ%9c6glZWHR6PYPPdk0LTf|oMQ@Nxj@ofYycedp1|qH&IU99+d(zM~*Hb#t~nZ=d+*J2KEj$DXIl(W#A_T z6Q|=aJz&~u9U*-%f!Qy6DgS$cHzGMd&y_ZZa!RZtmqX=B3VBSNhC63aAJCv`6(KcF z@At8UF~Ur?lWqdyx3_jViac+Umx>Tg*+Dajj0u+08O?BlK1u|V4VW~#F&QCZPpNpu zB;xCRAM)WEL=hSigMk}YyjU%$<=xpruDJ|48Z5o%wSU386iPR_ow5n@ zX?Y~nB_0JtNAs#{Dgvd!RU)?(zqxPXT?&O*MK=Ew<+Y&veu&<~WS&jOHijf^Qek=A zt?&ET<#vjp2u~GEQ1YP0wxn>}lBlQ|bk(E0F3FOLQTKEkKJkbE z*vJ;nnJvYk{};f}MjZK2MWzpSM@(r1>cuq9Q^jQ+W1efSy4w2VrEP1q zPw9P5n>OhkG}H+iW0r{aT2eja8BpVjME3w)yNH?i*JdBbKo3z;-382^NZb>owzSXp zAWi&&LrM-W_!Atxc)DILVwI`pGBjgJv@9@rHZ+sHleg72%%={2(Aj*wT+hg46doUz zM!6NuYx#GrfxzLkpI5a zA?oHasr!H+W1DC+W9q>$M`LV`F*m}ohVeS(LFo}~d7Kd;w}%Kh8YI&?>Hw~;%>*f& zO)pQEY0wU&)ntc9C}Z_>m8cz4tL7HGfpbvLTn~hPdY)1QRyk5gCuj&}eZ}dM>q64Y z6ZPDnen~yM;Xz45!qEchB;zG=#@5|sb=?H{y`KuTRgaLD8{_j{BqmNf@LzuAUc`$U%I7QSuJhpAGoZdB>MsCY za`ngZiuC$*W8JWQcpK}Ns|Z8NOIwqahx4*rpd2VQ zcl~QXvrZ0slX@M<^Z2i>{K_Dcu{OZ=<-gFuQu{&-w= zxm`TW1W!kaOj26BDlu|p0y<7aW4@V!G`* zVp-f{eEnd6+@Z30utjMP`t%g2QMr!!0?@<^cW^dTSdZ>ymQhSNSJ}J^q*PyVkZ%y*0Xi1qlc(8CnFra?M zE~z)U1#>~3{p3UJrTHMw^*YPawz<)B^MKt{XPO$>#(1r0!@W;^hz^Wg4#9DG+uVk+ zLz2jMeW02ZCzg^Y6uvYmwqZ_Z>f7$dUehJo*At3z_JW|^Y1DkANlL+Tv=Qb8%8epm z5%IiSxm~L6%9UjjTp4&MHi;l5t5<44yJ$qU1L+#nvh?=VtfEs;k^AP31`+}nshYVZ zFrb|6loBTNdudHss}sqZ3XPp~9 zj=BBiimZH6k$tjtasaAWya)R`Q!RO0o+a;1b+P%hoT-cwcy^)#?0#poy>WY&2;I9HN3U{1pQ z#NTJvdce!q%iCWDiPAUIBIe9IsDXKb|l$ni0$8wIX}B)h5Ur!aYr&A$7Oi z`A{j%Ql)vav_m-;P8=8=y<$mn=1-#Vr*)j)m1q5yPOkIb)^g~4ZIk>cP?mjKRS48J 
z%-qtVQ#(@VPMPSkdX9B@ll-O?*;w4C(25t1IZ*1xmh#9BpbwX4ciJU2cT3xONTE-N zNiZhssAWXEJ%*wjphC+Ffuc9f9-w)UyrR*)Kzl+znN!Mqgx5>ltL3>Th&@Bx73lKQ zZGLr=aF__+y3}^8 zxa)j$R4yv6*h<;j=Q{7>=tQ`IvmZe|>ekh&ew!C*rJlBe?NdjjA$uC$0s{%WbfdW^ zQr*6}EvUEKf+s4M$#><{Z+N2abP1}~QX1b_CQ)ulvdrtQqRd^Gd&MG89!{XYeMtJO z)92x9Hm7auZ-;guaO&u<0Yxp-g#d?=sw$L8|@2q2A-tt*UJxuB;4Ee_K zbkHP8cX05b$K#huF3=|5p1}17AN94+&gF}RQ>QzqKFWOD744MgRR(^Dm_|gUAi5^d zeDda#Z7;KH=AO0(%NGtW8JC}LccT5txpK3Kla7l_({yMX>5k(|7LI8eoD@e|6gBJu ziriPy5f@*S1s`}H@kt|bPs;gt&I%JL%Z=&Om2-E}$~sP%l>OBvug8>lzCr6jq&wVH z$&6HH`9jfY%A7viZ2Tpa>(z7{==C-nv{E|oefIX%!_4WkRyCAuUJ>0U(o2WQjatBH zpQVbZ>W+?F2VAx8Q@OsnR2cPm6-9n^-`m))^YB{jyGI!Ac^0DWx#=3ay653Ow8vkI?)Ncki~1rMH8izI;aa`+u{|H@<@ zu)>;P%Z}pd!s>39Ec-#tX_7o+R=>uG0g%siAYIsxSRevy@XmG}09$_~hF}>AXt#bl zntz)@!@-^Y>Z3=}Tu<)@kL}5k$kFDk7QwI{F(j$p&~1AN5a)pY+MQl8D_kiwhuu9F zeBt!wM~>oykT3~s&w>#fl>m4J-iS+}2=k}8&)xhsDeKdEBUS7RII~Vc%8Q3gJ(f&rZLNp;W$^0E@xfe5W3U=o!N+KetD-9Xx7h4H@`=%8I(zexc9aP z!RT#_OGrd6?lzTRI@@}`ADXH;a{~;rSm7DLKD2E$B}nmB`S64fIMO49(s`Pa^G zRf5;Z4A1$3v9+ZRqXNz^e(Tj@n%vjkUwZd>z20K&qbud!GPmmr5jHD1MnEyO(CaFo zeD${5WF{JforRdk%PJc=#;omGKRHMU2Oj0Q7m!u4UB>kPSQ{El6fLOv$a2 zpI|7XT<+3o0^oXK91}p0ff>+AECsKRS``~7Ow*NNrgxrwh@b&~X|L|OPpCkzZPf1d z6rDK4IV^l97jQ-&-xD{2iY`5!l?p&8pCFKSp?}{V0mYnts~-c30nae5^g4Fa^Da6q zkN)g=!b}b(IK$p*H(qNK1gQ=WYMw=;w4m^pEOGi-#p`B}5LlgNcl%+L{!_&H+x+u- zf4Og;<>!}|XZQ2V%l+x)C5cDlZo&+o8HQ&i`JiAukUY4vyz|Yd#us=B7=MUEWE-F% z#WbW)1UYu*of@Mqw-seLMz?T@0|nzKwUpK%rQV18^t9}<-tYYbrCnqriA1C`;6Gs6 zMJ-SDV*UnG=p~e^1vQ@N!1*GpzWJ8eK3wMD+)9wCJgL)%?I zx-(46{RdyigOrmoH|Dze;KO9Me(-Y+Wuz_w9mk7V>8fXm`pVp;^3*}w@YTS{zsb38 zk(4gMJT5~9YT-wCS-6MSVa&S|?n}>R2!@vVBof^xgJ{ZNiv*Q+-gR|UjkstEnEDo6 zf&7?8F3Cc3R06qHr0{v?@YL*F9z7eAk_1-BzlA3hq}#o~of2#{BbqRucPN_u*>+%_Sg-v&~>%&A=P-T0^SDl49aNz% z4~J&xAsJHs(mQ`TBt}a>;gC{q*+vw_&ubQG)m$)=;=Qtz9@8DWOnk15k`8FlJ%Pyy z2W7dQ?S~@M7Bo0C1 zPo5k{zaY>Y-~Wc2?tcB9)$mu{|2i+{<Wy_Y{YPvE8jQ*iKUzTSn&&VL_=fjcD8^c9U_l#qUXN1^i8 zKu%vm!{Ia!*Yyrac(_NGnKUU(UU4RVe%Q?ne+FQY=deY}h 
z+U;XdPh^Ry+qQL%GoKD7q>Dn7gtVgj_xMk7J-?cN-p@Yw_otLqyAUd`M3z`e^hJ$A zGv}U<3Q6>BQW>m-a=Y@{x0rdASEc4ozI<5w;{J_OnaWB%xR_{lyGwk!rSo%lQ2H(4 zq=#vo%#|!&IZ2WuUECp6p8T!{^@yrL>T8Q*xa_J5YdI;tsyny+2>ve9vd!nQQa5<{ zp(4yYxg0WNotcdo-L*<7hbku8*0wnJezX=8IgH6NMoQ*scnm3hf>ULnO_UBZR+0GZ zNWfP|55DOt!y`klVG*Z$LzRAqVEs7KsGU1o(&e=+W$zo{%OH155Swp5O~U=jx}Q=u{feApD1$&FT{*u*seXr4_%qtmD09Vf9$oc2 z^>4rLFMUl@`Hr!A+jh;ZQ|q14iUn}HV_Fn~>AzEd|i326$WPKlW z38?)u_s{)%Kl)OApRob?(T4-t|11ymx}JRI1$1siUQj%p@bO&eW(`~<6&$AdLj>A| z?0S}YpmUt+d7#v+hc4&}porb4F=QTm>+|_EL2bMwsH<$iRPew_eB~E1&{1#q7nK8MWEk}ors!~1~dF}jx4m#G(c9+0f$?!l`n)z!&kJ6 zUnSJOYjB3-9ik;8LcnamD)=mVi@!FPspXdRy)%#Q)}z@;9RcmU#CsWN&WbLig0nxh z=9WLC$GpY9BI5je902#?3&9+c2q&E8&Qvbw{$p0S?x(xFd_*&(>f5d79pz@}$1A~H zCc<%xXh?Y0@t|<#F4bw~BqA?)oRWsEWfe|z2~BItE;GS=y-j;bJE4~xmD(<{O_6`6 zY1F9YTTSWAc3vh&Gd4!+^xL6~eUvBYWcSu8H>xUIKM z=TE#vq+~$;!(@3|&*1eBc6wi4$Y?hbt$iJ9gNi5+}q zE%c6|q$)~MBiv87p6*RjeA7)5B5zr-6O@4mU^ZC|~l^spbci#KEiL`TE+ZZ&b=+Z{@^p_DZ!Ppl`( zOVDw1A@bFX{nm&@*}G8=(fQ;Hidiz~QB*D{d(hd6Lefc3b9?zY$`N^*98NISydg7I z1=PZq|A?2fpO3@NigKbKYEkHBM9aO2S5$0MSQ6BJ?q8!~?}7Cfe!q&sBSx({|cU0WU6URKmI{sLjlx;wqpu+D@rj zzy_1$oDk0YHn)lc(7f3N=Ze?oAK(MLWr3FIjx$=fvS z?v#(}FevJLJgLUW=yUH0(p09eS6O!Z_MoK$}YPbY5_@XF!5^=L;VwiaKH;_UT zCrIs$Kf6P(4K50@c(3U*cu-b%pF`?6)nrB#(%>D~9FcKg`Xz(v`M&onaq~IMV@ppy zz8U7dXlHF@!P!j#)@G!}*e&7{*VH^rrTG%h6ewDkwIBZi!5esdb^?CI1x5PK=NUmj z+Kp24U?~EMVMP)LxC)dVO*G*7VEK`5hk>|p1rd_5xdrf-sI}}V1Z{i?<@aKhCYD`7 zhHSjw<$dg)rf&K_^HbuA#^V)7axg8%>YPWSr8}XPC-0O5S;-R18=|i`0Sc<3AJUj4 z@eki}D?)xmn^u|e>CZg87}dR=@=kWcJ_yN4t!%c5t4OzM@kGvcoyaqr&{n_fd#;t} zk><9YFkTj^e>VHR3aQpsyQtk_{xg)j#$}#znliFQ6~cHKt(DEY(>r?~)K*S$x^w#J ztbB&E;ABlJTXG`(uXRN=Plel%3NOQS_vu4VAIsx(I<{WzF)eKbLXE<>x8ATS9dL^E zGPPa@HHP_>g_-}T0DCkkYfX*(7d5)zfZ9=Du+H$l49#CA)zheAc|BJ848h@sq9o!UeIkg0J&!G#&_Zsx~*^R+~4Qt z50gx(mX%g~?$(`6?7qPR#e5R3fGdSr`q&FFk9(Oz^xVzq(^Ms}lq%FQ+oU&%I*L3I z&UyNW1m*XJl6VRJyvUX&R4JXe%JZAQtdP^yyNph`?7hs>nx0ySoF^BZQWJZmXvHf7Jr?Pnss8oX zEwss}9Eud2$4lyb{xt*OM&N 
za8O1*09x688wk^=Qc%c%{xTIT1-xy;MprfQ(0z4arpnx@jtSPqS6K zd7z3iLuk#wF?FHx5WVzVrCDNVZ5U3FelJL~wrQfv^lUqFik_mi8V=dE1Z3awp1& z&tv~`y;E>6O}AMkBlv1|fegE}mfzp*K&wwRZRruuF&^=YUrvboSZM&kgL@Z861Lt> zr{aF7p_j7=^ZS?#JN#_4xkf>oRS()uC(qgTT^O<`qO4>2yH)~Qu2C$g*CTUg*0sEuvBA~wqOZpoer%mCCzfZ)OMmOKZ z(W&=Tikh;TwCq;#x4xBr9ISi>!bJ71!{zJ@RJCX91kMhiy8qTNzf=MBfB(4pemsMy z$pOR{9*%INAp81nfbv4)e$c9_7zzzO6EyfZE5%@=XLOT^J-YEonXaF1| z1E&^fY$veI3#3jGXS6zDyPE*Ey{tlUq`ZSkZ`G z9a4NoWE!*pG6-9Zlg~VxDYNO?V zsz$_@?;9x z63UVpUk}YLX9Wc{9(<}aj6;WqpGA6tQ_{fd6GL7J zIoR4^&n;Dq3gf_&c!7SHo+=# z1XL|G4?7U;VHsz%1BcXk#N{RZ z24$C@@?3^I|F@1Q8UzPSB6kidx1+qX!E}`Utid@SGL0BG9m4%{ISyJ%UP2&GgX#xs zwvCYzSzB9$bm0=3gR~8LY-fu`!O{Jw-3y-(92b;oLn@i1A;G+@B)jx|n>LT=Hw+>e3{0f;^lQf~dU-zi3n)E+##j$+4E^6;WMf`_NE7qrLTqTe>Y%cLHV4BSSR<&a_tIc{q(3{Pd&{(NPAr!y zX|-KgCzcFyW)ff_5gjCk05I{{blq}Dhhd4>=%*jjfTpkwSHeDW$-q5VO`Q?FrCA$efPH`-1?#dYcxl0MKGNa|RX{cIJ8I z(EUUl4<(SNlI6>?U2ba1iR!I(SG3GEH8RlP6>SepC4?se+x}ynLl6vzTahcBm*`+~cCH&0pR5Q?ZHxQVQ{Qa? z;+MB`-45K~laKC(cCYi8j(o=AZ(T+Qe2wqGhvlwNrxXkAW2DN5cjrjIy`$$dkK~bh zhnc1nG3x7l53O_}Q&%BllGD~Uxg~|pHaX9ANa-^kc8grs)?SgC-uYVapjk6(vZNiw z_7{LgAC7s~ep7Kqb2`w;k_Jypsp=*5d@1T~mGj)@n|tZB9aE;P^@(|e;2}|e-27P1 zj|GWE(k-t*+}x~~#m&4!3D_Omc66crNJLKaOBU#ZLeLB<1d^c!5Enx$=~kI73D zKFu_@sQ4Mkf~`(NA`M0QzqaM}ijz92PLV`V71Gm>HdZE+V09Cqv8M8RUL}cEhZQ9T z)DF@s#iL68BV0!jvL;ze@W|Y^vs7a2aZ(KxBKVR<5ws3ZK>1n~T3X#Bn_7WNg*_MJ zLm~+76tn-574nn_Q?+#~x|~PjZ4>$CMEWpkJGb_JImaVBUnkf|(2ijYccUVpvZc|b z8leA1pJ}FV^GrXxX!~QDx~uD%Q3Bk~C@(i!Q z3=eEALxyKBLAh@RVv_gK<%{e5*kl{7BZOBz(WtIv!e;S_iJ$UIK zIiOT=a$mt!h0WdGP8|gnEe&>+asD(*#$3-!&yWv)je-P-^O4nczSxYMCtK2fcS!p> zr0WSa*?HbwKYDpdv%-!HDhnO1p3>kR@n(vvSVi3M>^3Kr>qi2<7ohB1Od{(of=(n53_&oT;Loks?Ui_T1A% zbp;{F9y~T82d~~xA8X&T^J&tponE{e){JKJ3T!$>@b|*a+FdlG7n%3)UAW4h`u*2uOPNY@CY& zrIbUfs&MXC(W4wk(tmoy8S&Yso|v|Kt5N3Ux4lo^P2*XG&T)P(G}hpX41EVcr}f)3 z$$?O(?`BQnI1$EOO(#0-mq;EWxr;>RIY?*X#h?4#nCXocZ@wFEkj!|v?|sZ2m(g~E 
z=>++zCVfoF@(rB$sJGUi&hIcBcZiYd#oJlD`(17mUo<~uy0qJ=Y@p;-o$DD_(uWeHf^|ITlD2!kZ13Mv+?Rr+6^pEmvenFfwY5EV$1SOoJU5P-V<{b zK>qHDp{I{+DJNf=7mv`!#mc?wuG@C=lc0_3NsNmqm*NWr9i1mQTDN;?QX|F2O7M4nZH-D&p*5Xbq>ZV23@V7|LE<=jA_pGoa0U&> zd6)h?wDaY#lbW}P9M~e^&~3smf-?vyBNpnox}E2jDL$~6LCN+f!Ek@xT{@4DI zOqiEGcR`YFU7%$6jaQ9Xvmtcj+Zv#&-b-xdClqeRX|T>XiT3glT?9(p_FD)*u&CJT zBzo_@kyNNhpKRKoeSXgTga)0!0B4R_XX2BlKC zi)oUaItD7PUQeL_l1eo0ifb>2yN%1LnG0ZFZg!{fDNSrV&Sj7M&=_cMg0fQ$P%+1; zoAA=xBcPRvX*_ud70iQm-K%t@2T6Jii)TThRmh7D@$I3cxI zh8jW;w)eA--SJf$X{ISH%~mKC*#G%|zK53D)Ib;33HPPvBj^!alyTg9FDmh_CTX)x zvHic=T$@oMV?fz^4_&f1(ju!k0!Gm|!ss(j;YP(l$pe81aae=N)M>?eLQ-|lXyr>_x|CpTnK^miKS4iF?t%&RQ`r-^I4?=|r`Nk4VVwwS; zkVXFxR5V`f>}M(F4pQU8eOFAZD$~(Ot8<^wGbXrk;xryajAE)#Cd)^e2>+uD<%R=2 zjWZ+arN{-*U;Api&@ zdz!%C@1GNx#O*>(zWdjIr~KWnt~i|pq#~Al5^Z+4o~rXbR9(?|%_ROLr|_Xbj~JW^ zD~s#@e%e8hGgOtXo`w^pRXc`!xX0P^y0`H|K1Gl`z&F0njn)&sN>9&Js_j@Evf{yt3R-B2oG@}y0hmN)86 zm+fgNI6$gmKv$OJereC^%GSruHA}i6T5pt`N0@1@ZWF?kQD6#}|~z%quiS7*T^1 z<2B3W-vTKA%@T99^8)_Nq)eB#x>e*%ZT~+5$tvklAvO`*P3gtXqMAd@Uu<=qMZyAS z+cEPNpnR=DHd0Z)hO&Y>9k+uZH)53m^pZ5~TSWRxWcwJGo3y@T#(P=g=@9BrhPrg? 
zbe*3_l$>>2MxiKcaK`kqiW)!{I{ww|hbX(u}IZ(H#!6u!E>0L+a7ZqBc zJq=LGXiXVZ4I(#c$}HZk2g1{0l6MWGTbf2TlE3{5;sA?P71WJg1(QFg?~r#Tu1R-p z1v!8H5tB}-TgL?`gL46(qbWOfop_?hUnhF|3mgB_BuSpw-^)Izl{*i^sto!^`%$yJ znP;2}W)+jF77uFKr2g!(XZ+gZ8}hwkvUVp7QkO~TdGCn~6IdGBkDbw| z>vE)#V-*?;%Rv1Rn8c~-=k){YpDNrf=UR26Rjep6ut!yQnqt-RQT=IxTmdNnzR-`a zpWWJupvQ4c0)Vcd4=wsmG%C9;MzyRM2@cBDUsZAufh=Qf zY2-$P*{1jT?kuaA#Ic()RNK{93y@tYW?2W%@zK>_h0Mu!qdv?yzSxhK;pH!egh%@_ zFB}E@e4H+@Jz^(-7w9e+#I!kIqIA~O>g94d>qYGdWxxrsS=?PNdUKymdpQFbZkLO# zM@a%43C{Iy5iyZz&To}p*yi7VKmYyr{^`C@1Jdm5!M`9#{czXnL%LY=t-sv=9yH;4 zNT|D5(6CWqx5CU?^=;6EE91KNFx4(@$v z91{%*f{rThi5B6w#);K#Zg(`6yO3eJ->k)oPNE;?Uinpib)jnj ziUbJV`-H1B)_!LE5TvZDP)3t%K_xd(zE7@b^baI3BI*t26spVFKeX&S<0MIyURxfc ziYQXCo>BGZKP8-CDD@tB#p(^fga)%Wok>8>0$pdW&+pORhWxTjEnssf5KX~Ba}G0 zqu>Xjt2MnJ#LfWqN#y@c*eAv5&AQm^`S{6!u;_DG`0 z5y2ivHrB1@98_AM8(uZV_2eZKoh;~UU?m)rxI~o?C{2o^lg}+C@)KJ12S9t{rw6Y9 z-3}<5d>f}e1zM*mG9Y;dFAwTGmc8`S{oJjVY499*3(TM21DN06AgR;0`|b8|`}oj< z_yDP`bM$cp25OxdlAT}1vlgOEo4iVEV~_Jki5a9PKabrep6COM$$(nV(Q^J0PWK$F zlTXw1iXQHMg5KWqv*_nre|!4q+-r~oP(y^&9y<7_@hfMa+$(B1P|d?58)Di8hi{r$os35(`!0ETA_H93fHk~&kQMK?A~aE@Hen|9pe0M{{Gc}Kc>Ik zpH`3}*b8NMUOe%R%3OOHyLp+Rx1fdKyyw=vOVHLdtisEY4w@P~bW>i@ZxE4b_s`VM zY;jKwRE?XbB#}JqE(Hopw>kiKX{e~-@L9BO8qQ&H9RkaBV@2(0>k^CC>_y%{s(FDL6+#jZuM2-4?* z0Ox@YMDUNHvI?Mg-Rhy8+@lYYNdgw~r#27tF>&%dP`fmnSFNZlwRR*cTCK0Bbnu`o zqu7d&YLH|0wLlL)`6;NXu4t*@lY$=vV>=ydl2klg?tQp?3KOw3T+&rEmp&4|=2vOw z&+c=+O}Cv)k4p5JvW^J}9BPV~Wba}T%L-8(=|F>|5K=C#=uacYx3^pC_KB_)yY^et z+|@(s3wr{SdR$) zdA^jyIeL2A;g+NlVs%e_sz|=AC=cpwP?9R+1UagiNHB6co#~2R_wn~yp;@5N4W)EG z0d@QTbprG@==b3^4S!#}Pj7i&pUnUr5;FXZO3q4#uQ3RvN3?C_m((cl=G82zAYp3l zr!xgTP0n`$+ozA8cH|pnE~u4U4f4c2n!0!}!in`Xk*iU;;T-14Aq_`idSJ>RB=I{1 zF}@*beVWf{ypzSBZbW^3qA5`ui1w7oRj~P*d(Ipx#dn>o zLK;>u_DJvZyW9Hr_t`y7Y4WKsK_2?fdf#l)yWmdq=uW3EeK#~^lu=|KIHt!5$%o0| zGEl9)r^*sl1n7HZOSbs~zxZYm6e&Udf)2&dfy)4t*pq+=AASfQ`YFL$cJKpzZjUiY zF^ZP=@Nl*|Eyb#VaSrn7h)4K>@-A-Ehex!C 
z^QU=I_j$T25av36XSNyIrOTTiSPl%-yNFhCv!k3+Z?HkCz;^KXGosT&;|8Pz$}klo(uZ}ar|(cM39gG@-NcjGWUn6QJQo& z#zmnW>w+}V)2WEjy||wF_J=6%B*$w(NSM@6xbO;{T86j42Yxl55=Ro!s-}qXH!rum zz#QkG!+O<(W~6M{m1i#!gl*SgsQ|T+w;N_10=j1}^A}ENK%EakPD){GlLjiF@J5Yh z)mp;?WvP7(QnRMh!WKZ^1q8z&A47c)l!;6!m&&gFA}Hzt5ZBG`Z_|8FS&b_A>*!jc`N|$0YB*{EGm}=2X%!5!)35dA9GpeaMraAi0d1P{I#yFS$fw@N4Pzh(3OM`JO8HRa(zRzb@RjN0v5*Od zUylZaF$2qsbNV_>ukE{fDUme0?Nu7px3v( zl06xs6QE!bWI?xw6hPJLJ4?oyy&(Nskp4zUU99;eKAlPb&?ACSuV?x}-=-;pw2#M! zyIg0%GVqG_OsTN8@@P>l*z)_wcR!K~ydF0#-HynXfG$SzyA(NleFOD@GE6Eku2<*H z{lWRmA4Iz~Ne~*%!<1;n$%?)LY(3;=pcL0jK0IPs3!xy6Ab8J3n275EF|941HxyNn zTu_IWguIzsPWE_$8BVO1-s^-0lwYSP*AJze+%t5y&qzLv_KuDy+uHo&dDc3dk?mf? z5gR$BI&g*%y7Tq?n5aD~UeU?u{%ARuaOBu2^~T;%O~m#hRu_|;#7-;kla}{A6OP22 zF89)_Me0INl;63>6W+dzY!k1D^sTo|Wt2ZniOT5e@nj@Hu%a5(n<#MNNHtOIsY;-< zY|A z)}{>ek$KUYrWa``r2~;EMMSl1>uuuNFFD)-P%gUa4iJ)| zxqT1)L*qQ>s!$z*fCXv|R0aSoF_aI1j;*=(g*`cy_u9W$}DIqIF5vNRa-39W+0lO!S@7BGi;q z!1$Ge$tV?PpUO#7jFkkngCDS01AELmnAg&xftJ{1dfKg$3>aV*0r-u39DBr=j-XWeX^W*f1!6MEOijsc5A{^9gmQSwuHpDOE|yX+PkpIFPzs7B#bXavK6Ae2d1hq zTi*?GlmZP|EnCmXsn=eVvw-pI*Pqg9%Yt&&ipgL7bbzUvioP3MROd`7+U&RU*+=)@ zH@KB?K4R^wgg<+e{iDOpdm*p2(@ueg&3+( zt4Cjix*;0l@8c2F3Ykl&qF^;ui)t-j(S8TBYsx6U8&X@R#_-Sgr%(GSBZiy?@GS72@?IZZcS0GU=wmHY{)h&~sG6lO|0r<91Dx5KJ-r+`}8ON-m6H)Gs? 
z)nq2)4un!DDjRDDL`zCf{l0OqPTWJ`b#gmHoNCI=zS-F7ozZ07DD*Rtr&__eonB=)VdWOGziZt2@%KYMn7>tc;YSVEhy@ zOau0GeHErVsP8^83O!cxp><~4t8*moSh`ye2Wr2ZVu|ihS}Nl#oK9=i(As6-gUH?{1Ev=krQ%8!6C}twUv6dhPm@Hyi?DXyr; zt9zA-Bvq;=plD~D!>24O*eT9TCWx7)r3uRt-lo^W{`~KEf zvC^a>vg`9Xu<#B9fPMnJd`=OD_R1%k0vEAOne-mvq}%Jn=b>t~)IiHnjj`?lnkTC# z8y-x|GA%XHh@fzdMi)W=mD?wtsh9exL8RfA}d4_bF-65OIS> zOqq71g~*|93yLXVDEXAv^Vj)!#<_V)k)-5Ej&nhBs!B5f5|~tCQ11G%1Sx0DG6Dn@ zqA8-Ue0Cq+}Tz+^XH!b$5Lgn{Q73@IcF9W#C=ok+C$*k zK6P&AYmz1YS)%qb!;?_lVK{?TpGO1T{gUw+Qv1 z7t?-zj_=bM@} zJlo*GM~6>XMcZ%I#*o;7fNFiO`SsxWSsI-KXHUgC&2TACdHJt%%uqv^L4Uw#yWV+E zhF^}<8**M?CD26df`5?scu3Axpwt1H^Oi_)n72q(K(3@Dnz-@uqu zgLDe>{CZcGIl5Sti8|-GJ3HW43W@xAo=6CnsedL0z@XA#Cn-S;|mEpk>{C$3T{_B1{KR@4J-4FM_ zFRuv=Vi(IZwMQ&Iv1};A%A!n&h*EB;2sEDrOP_%KZj#*0mx&iou;&5OHr8 zOGl|t5(}?G+MC{NgfrtvU#IBvNDsS^UYcjqs?UO`l-SktOYMS(H=r-8gE`MLjG4dwli4MHX7R}cLrc%pQg zWF)Sf3Yg2$+^*E_fqjI>4gNm@70?Pt>&bCi?ft2GoD&0E8eFN^`5v8kNZo)M&)p@x zE~jXQg;y4TXqyz`?+)TxOcay$oxws+H~aqHP3k6QE#=fr+30wYW5{wJrx?3|fehg> z{q##}5x=BxM**=zibeu_`@t`Fe?GMf7(Q+Psr&cOyL=kOr)_Qj^z`1}o^F3y{pWoP zkJpx3gen-}MZ~%7q@-~^m{9=prON;dT+uZ5%f1aN3h_la){hT;Q z@jISK*0LsL++mDO@tmP%Y#4^{+$$)F# z3CMwB{M`4@S0Md(H^TJRPPYjL{L`JnH5Z$mHm*@s*W_wK4FEi1+r}IIa0^RaYh{cl zEGVO&liKG*Drfecd_N+czk*-Z(R#A!=nvd4R=Oyomdx;I|02~4`|zXMYTOf z0ADfwM$Gk;Sip1hKHt3G{xnVM{eU8N2ysxCiGzYlac5kRf!SzI#iz`$+jyCU8ZV~{ z+TW)aw|F{3!IwXx6`jwvTaAqOfaClx5jphA5_BW{EJEQHcxh^R;{wj>G##M zbT?{{UEBf&inexnmQaU<0Lx`|Jt|;Q=-bqJ?K=?Mmw^@sqr}-wkRIaF6qIjV8y0{1u!3^A77h%S? 
zXkL5jFP9W(fFJ4#=&r)J4{HbFj8{hbQJ+JYU+eBS0_}*Sg5 zdCDzmVR~jwC82d?fcn#Rk!*C~m7kd_04yrJ=jiJz5=nuj$33IasEV4^I85mYugr?` z=LF}IrtxiRVCeEHpLssUg-%H2vNRMe6K{h=g8q0xji2qSSkS`;pwG2A=<+Cv_wpc5 zMs({C&st8W<}4WD9@(Y{GwD`Sc^Q~XLx`+`Uib*6e~)Y( z&QrsbJ)X6^H?c@AgUU-=VhH2W9OlmsCKPm=_8o1;Vk>DL`o%77yE;I`fDgJzvs)(VkFFYBrxJWW$mteJMf8t%QNhp7VU z!Tv|BS(o=eBayvrf7kQv4_#w5Yuf%9=5Bh|?KBK(n1*iJjzIRAFGR~5cSq}|CscIt zJq2&bKv+fNJ!O<%KMT3c#41IBbs3N9`#g@ay?XUmc`BovkLS*-9ynCq@Qra!$bg*S z^s?E9<6YF|;%vUYt5mWgZo}wvJkeg1&IKVFjo&6(Qr0kF+ty106qYmL3!TjNV`u}O zrr{~=QLUZil`f_;lpa^n8D&7j#W&P|RYdyqu0Q1;bkL6a^Yi}ssgrK%gUR!-qWUN~ zj%TzTrG($SrOBY^89k%EQS1yQ`bfy?2i5_1Sz$lX|E&()Es&?VSIgw{nOhR+{rPB!7~Qzx1lq@U_@AcFn0 zL-6s6_FfvD;pC5#kiJWx43n3Bz>;peFGlMQDcISzz3+Ptv<;W3jPtwkoHu7J zZP(@V7CuI_Utr`zvJTNu7+oGw5g4}*fqnz%_mXtp`u*3#swQ(AO*`q7d0E7=z9*hZ z@4S4AB*`cQN6N~!w(XZZ$?19?i5P?n=4X`jZcXdPc~j-_ZTq<`B{rOnRA`Gxl{4%B z74!&m2H>&>ozux?SBgi?G?zjI4ke^t4Tg>BmWT0LjF{~sHbjg>PQQtm^m#fL5e zbx5_D%p^3`9ePa8uKPdJtmk8bl*BaymLXWHDA40bkvk=^WCrx>04QI(x024>N089_{`orD`S0fb=Qe)apM?4p`lh=NYKHLsRBzm+ z+`UhWkrHuVNp1ucLO?xyQ{Et|Ip2hBH3`obkLdT_FWm-Zk&x;pYf~DuS}OB_$Q;mg zE7So{@&+K5`~c7?Xj1xXl~FG#JEj>m((T$0yp36Df9;DfzkeI8+^al+Ykl$|^oICR z1GPvi;d#PXZ)gV&KzVuxF@Uj68D=(3`tJsH%;2mfhvy0#>s1nr_Yi6uZ+P!Lz^^<& zbc<2P#{G8x_}E0hC}1|r54iAG8z+v`_L`UvPgimQnzx+W_8YC= zE_tP#ae%3jrfcetsBVK)av5r4=07vV^jkS3?y!Dfb|oX^zFj5}U8c?pydJ>(U02i{ zQl(TUK5ov8C^fY1PdBC3ug3kORNmqrYY_XC2}(E)sU9%edxX@Fo~t9I-yG@u_Vjcg z?^AE={PyM+_6;+WzWp+PN-Nt}6*TII5J|QCm`eWG4mUJ=~S~-!vsID zyPPrEE5ES?<=IlA8gVa54N&FHr*rkpyMy<<-u>-!wE=3URJL6u=Ho*P>oU;R*- z)=2WrS|M^8a{L`+pU?w?^XV>B$CFSHE?Nqrwr>}aIzqZ%(ywn1G(1gD^217t&9`Lk z`}rpA?dH7n(WIKWOFcCBd7Vy>X0?6S6_t@4?S1aKwDG@7o3;A;{toWD8*RQ#4(Zi) z*H!ilj-HG0I?Ko8W2nT~)L(#l{<%&k2?`IlrYMQFJ>{OaHRVNni5i}Brpq`(O4p_d z{1c$VV*=E9q`F&34?WVK6QnNsbg`X5+F^FT%(&LZjr%D>7%s|h69}2XxbNb7ckO$Y ze&plI{w@toX*t&>y=5VnWrYRsc5D3ee9-rJEbY5UXc{yVDiVb_fTKx#ywIxH@z3+JcNv)AkV9oldGgQ}@1*3d#~ 
zwC{hZ!0P_W0e(Yv(1>mpQ+S1{P*+V4z$qoU?4Fx^UTOqoQ>U5=u(32XqbMkmZKO^KW$y-!;IapA~qB+Y4jRB{;de%{F&n$`F{!0OL{ z+*bAL7^|6;dJUpqYqw4e_HyUOrobS_0fh(2OqEYM3vW}E5ADk*wYfbF!(Y`-Z-4K9 zX>R0Ycyn9c8Diz!&*8{o-qEM?Xe7CFNL(z+<51P@Y(H_v7w=Q*(U-4Mt?~jE5|ekB zap9r8;y%=1so+!wy19D1$rbL%2Tgeo`(gh=3L-?z13jiomPS+$}rUiCE60m zub+cN8SjE?r^`v_oIK%rMpbn2!)5v4}F1}d5&NQ^aMhxhXIQ5v2dN1N-SooeOK_-1DS^%3XS@3#jEe7Iml+v$M5i`l-5`}_Ux4V<&{eHLgp z=-U_NUi~LK|4ArQ7WP;jayo6Ae#&XQvL)>{kymMh@CL2u&&i77E+Kdigjs6wpL*~8 zhe4uzSRY(dk!j1$Nf@n{DKI-R$0Y(0l0Bq6En z5=aNrz3l=l9;V$MrXY{Ln5+jQ`&gH9Th9oo_3m>G(uvc9Z|~8x{qeE>+*o6YEh!7_ zp}0iU)@7WexDa?55ADAS!_)b+z?t#P-HCG&I?JXxP|F2Y1d**Hn!jkICg|I22L zoCe&6Qg8kU^x)pmk6#A)yRGN4>A7XSJU9^I`@SX3M;)za7w@cb;Uc1T5$f_dIf-7FAL`RNk zr9gA936V7D5i7PmI)@(WoB#fXAKpfla`;q=CC94H+qUygB1_JQkCe}Gg5q}icsa^G zTL8g-%U&%MxlQyLT%T!i0XIHT3+R{K-0?D$? zo}%P)6x-hn6Wym|L2KM5KPY58&6w?3nvAm+fWT9Vri{{EBR^cXXco1r^Q>C;^(C^^8st0s!y{8RcTCzo|hs{wBf(Ycem}U=(LEA>9jwjRrI1e<*)6Oa-=;C zwDna;oqqrsO?OB~C@#e$)YR;$>GN6nF69Rz26>V@l$ zCao84l?-ufP6>>GwyE`p7FcO*(UPsDp@e-Hx(pmu|BY1*5nd1KK4UbLH`saQKwIj7 z5h>8098e&U^E8HPUPSzq1uEh^exhL0ZRsbjoPDsOtzL5D5y`rpr^eaHP!WzGxWH z=q8yoxbJlp%!PWi9_B+zJ9(h8^H-?`DBT`&`Z!(fcEmgX|9lTEYV|}6AbRBU#c%+( zBah!!!L$Ew9Ml3IWdq-TQJiVP{y<2V3MBX{P}4p^Qi6T z=liGpEAtUl9O;#^ChL7NEi?H@PQ1Br52${Q*YXE5qHojv{;cK~bDwVWZD}kDN_&d_ zblReq>n_SEv@AOxfChhGQj>ez6m}g*Nfx0Q@8dYikIAC^aC1@VTfX1ma*i*=QXmMx zbTjMIH0{7&eX~i*oa4>E`s1Av=*GOLNu^@XwGN-arIOH-u>BEG)A^G4BS6U*aGIFQ z>gh`&5*!9*G_7DLc~ZiJ8R6pQuZc+enKbLk&&hi2wOjt?A2%3y?h;LnMZ&<@W4$FY zQ0s+xDo!b7_|ykqOfs)-J)Lu5W~1ieYEq)_+y9|$x8M}-I(f97&htaT=>X<8hk0P^ z`}F|K3C%`2%6c>%PK0b0GlC-s)7XNtRU^yO!KO^ zc@(G*@G!t`sqL{Jw%VBKxM8wO`%?*KLQ>|xh0*X3?eF*~E7(2vN|kKuxtA|n?>(1+ zel-@YgSyUMRn!|bwG9;(}qcq4G069r)|S-uzdP(zgE-bOWjJnH=Z? 
zq_p$xe~EPR+5H{TGC_dx0~+pxLYQ7{;#LOzx{~?%G>%mS`5WnmrXWe;EG5(Is;Xr~ zV1dY~cgEB*erLmpi@d|viF%~$0=T(6exgc#S?l#{F|(l_-JFNXlp zs);u8u_ZGPQDQHJvL7RQth+3dDc&9@&Im$T9_lPC;{0yV&>j?tr+DUi{pk@8+R&V+ zMrwc9*mKRGBU?WZ#mXtbuh7MjU`pD_%`je%L^B%d+s5<9UKr0>Q^rXb-UW4%pdJhk zUGLf={ats3x>(WB4Dr)xJ)e6HtpL%d=W`A)-SyGIzmTZOdE;0sP1h8KLwI!!^^qd&w2Qv zCr)cp>w`ic+y5xD2MTasb3tV6)phGeqLunS1B6vOK_rHh+b0v+Hp@VNnx`jg%>72ZDyEMp5T?CRiuWt;#9E*rToKG z0C|wnnF0-v^Q4KK{1M*2M74f>mpD#E6q-=2ufRzQcW|TfC$zeh1M_4c;lqofYDfb- zv_Br;{918(mo%|RRCS=KaZPQ|rwye!3aCJ)*W;xZ*nNGB)Vq4+72P?{B(1JyGvbh1 zGbv?Kn_uOVV*C~p;seMO1N922?B~aJ-g8xN8JC#aOq1e9 zlt!}9QC`A`vZyo3=+ca+%KkTwD5;g>Dh-;MCKdUjAX#a370&%K&f(F2a>r9o<4(_^ zyzi7O8HmAPa4SpgBip{4$Sd(WeMOvcpQ>Y_$0PMW{7atlzxd~0{QmSm zxzFV~L#l?gttWfqwP#wOnk}&0yT9F#N^`236qzRO@j;Nv6ngS^Si{~+ zA1Y_Dey>=+e!M^8pCcSjnsiVf?)x~)yK_$7`uU?j9dGJwcD#xD^Lp@CC>qH!IiN~g zpChSQ)aL!~=er$3dSqfrl4pwWs>OwitNKr3Fmk7^k*IgqBv)ex;GbiXr+Si$Q_=^E>K=%gHPv16p(g^ z>;RabP#O6ntuKEU*E93bhwzT#PPf3;K9tOHDBI4(opS8C+oYq|6QXj1Y&M8LIfy@= z`pJ-*jK*XCJWpa)nm}strl;4*ydG$8(0=XR7l-Km6CgSIvT*Q=K`x^0e#K}VoEflYeu;GkyIjZyB zBt@7$;TU*zMt?}@qb#T)+^hOzk~OU$D6`%9 zckpAE?(MYT$(Jlpj)6k$)}7J+9_WZT|9ZR>rUc2&Q=@GUSCn+(^*PY*L!iN^j$?Ow z>c;V0NixStNRk+!eCtS|Gm_;^msI(3NX3$_=I9wF2~3^Lvo5eo9|={M~cy4W1AvQbQR1Y zF?yUGePphU#Be~1@3tRW$J=d+k~!f1>vR>+C@C6&)x=5c!jT7h$f(9ZD#surZMmCg zY?5|DVYv*nrvKrgd`_xzy`EA&ZVi=5D%%ucx<_;vU+350-!BX%Nh{gq$*Yx}4r-#9 z72-2Z4ylrxg=%R@Ymm}5qvoZ}s;s@jZLT&zX@lrL| z>1X#F(Jl_98j*-n!NPjx|I4&cl#XyuDsB}-Wm#Bi)A9%@WgO!CS}%X*%_qfHF+G#1 z-Ip5U7%B~+R!Jn3?{M)MCkOczYAhMW8}JwZ4P_nV z#rlD7eNbtC@!aa;gy-3FXmaNP5#0^LR_KjaRf#h+Pn}PtRsH^YBzh_bq#2Ft^#pW0 zdID8p5E3ombPrRs?Ncs-zEr|vq*_d#bHmB3C?qf`NnS+h{H6~VAHC1wedYTbcNz0X zqCBWu2YquJHN9Cr9qxkvo~pOUK)(af-)6F`ZMa+V&iHiG=RADw*ppK-y&O+|hKzbM zR2?@|?jXH^j=H#{zbJ^GWlVCXw5rctc)Rb;sfOHC#}+kCnK{c~L>1DB2H7wquzo_X z)-?5ZEkJU7y?~q0C$kmh29?BFb&Kot`*Vi!_p5pPxXZWq+ijHh(e|Gzsy*Za5fuxg zERyk=Yy>zT6;UooeoUkMj`5^Lh1LAcO7SvtG z#gv9Q)*7J0efooDcISlNr@<#qC#n6+nEw~JIvMQklOG$f?v&brxqE$D$D`8yf3fr1 
zBELUCP3c2Wi~c<4KdVEWA-KDY(}vTz)y1dz^?^`?THmms29k|R9sz}uUwK4}cYYR^ z7Bu+Js5E2dl2Rn>#z&<_(CFK)DWg1sNjZm4di_&+_qsU@$e~!0YNVmd4d=l6iaHux zM?YBs6C3g{?0|pTvjx>7eK?LDF{lc1lk*-`Ib)Nzo@kA+|dF~MPzqJRA z8>RjdcSt3sE_DP`MQ)y^u)d6Pdq4NRq}cU7!$w|wmOTH5i|;Tq;HU%B<_z8sG@1d1dT?dp;(rzkK%i)JZo~*=4U>79%#Z16dChJzWeRG zXNTN8=Z+GoyX4m??kiIbC{)Wh!-{IMOp1OH9l-O=t)+Ty@ALH3CP2?uG~6ZKUd$MO)Ui_#-(5ThFuDc}u#yMwyer`FF3m7#``ICbW# z)o)VT{b_pY-9kUvG_<==g6v4R=(p)cx!mbv zx*dce2O~{6>890)4m?c)QC13;N}K-m)1YhEVVUQl!@jRoX1(70%Zs^a^tpwCDl?H? zY|H7hBqA)Q^}64;3?iBoZq{}%;4M^Qhk)hJL-1L z2wQItCuh=uBC{9l`u;iD@h?bVHsknrdNa=hlMxWCjt^5Rr4pchmZx`&<fO<8I$@n=DeK*ud zG)?XzIpe8yVw7aLCpV|Tycs6miPl2B>`%#Sn6#nl2$?bjD8J2et(G%ZU z-;-Nh(p~TH$i9Vn`_vaHPu8zI?mdaJhWcy(%A@42_vF|mls`X9=Qs_r$T)2~QJ1Ft zybLs_Yt1&F_hu^Ndvi)K5!G4Zd>F?u77sn{&8f?g#G2;owGR{}KB!Y-Y0f*x2+8wi zx!;Eo-5zLV=3WuuQ|GRe;TJz5Dd{@jomU2$KB3z}51DDncbtLO z*ZFl8L9}gh-!0hrBA|=3wFm|(Rkq&ZWM~T7(F(L^wug%(>)L_ z5xFCZA}vi0MB5+L7yIBj7l9tQdgE$I5#n97Q5ju9lfwy(K^hLN!%0kpnnu{(Lli1D zOR#fED}@^riHw*Lf;VhAb+qw=ryVa~{8?v1Dg69cE|OnLqNkajcVl-IV@*4KiOu3m z=IcSkfkL}7&aY;iq`VQqOBaYJcct5_>*)z{owG;JJwiLnLS-4b&+hQjY)b{w%<&wV zSLKdE%2ZKRw@HM$*phOm)OL&fVF~9iONtMB%(*44C96RB8z}n`3`E;0Rc+#!P)BwV z=x#~lotJc8uct<#fi%*@KszeVHf>e(h(b)XMWfSZqyJDq@EOw-oZ09XeZ+<8a<%SSox>E zg!7c$kqErUUB__M_?5yLb*_`UR0A)ADjAtkuYps5Nx}Uvn?;F}8XbpuFrare?d<+} ze$##0@|H}8W_n51k%ktUQX_nhBcv`Ar54xXg6_}gazZ7MYmj4l**f{=EShKly9NlG z*Ow_`*65U<3_iyQIz<=XnGnnw7U?0flf+r@KJ}?mS51$GqH;*Xr`*E6Isf@O{cLzc4d~t6qpw(kR7Yl1Niz6lnCDcf@A#E+de-wj^({#A zFe<>Wy^aj2^!hSopo_s@s}1gh?jdKOZZlLy)R^*PSB%~*O}RpN-b9ux(UEyl+CQO1 zn1xiX3-|YN=p~K5NLpkw`(YMCuNJX1P)wrSgUF z6~-dYG^rc?XrXt$16p{Wva$`^fJ>UPa zwsqjhfl~AQkzdfIJ!=T5e36fud*c6rbBgwHKGkA4u}-%TEu5|9n;G-v#O+ zS35{^vUkzC#_cU6L2 z)TlMMCly}5W@HQ2#K|4-d*YEg0g9oLgGXweYMn^QDUbAoqbyu0iMqW}W>h`9P*F!2 z!Y69YEUoCl+`zcpttX9vANxS*$e`GY&8c*)Nl3%8dr<5m zLGI#`A|GL9;#D}QKO{H5GHI5mN*y5W7pq@J>h{_$4^!RfWfj|_Z*>+h09^LW+Fc0M zdV;z$kUJWv^<%?VJ@-2|x^8-%{AdN`Lp=(?E>w(EhMZXy(p*;WKF^&dVI*r)8mJmg 
z6g?1as--3q#2#!-;~=VSLLTzSgpveym}}JAFTeY*)AV}(JKsO<*8Um)<}Z{u8E)IF z$Zl`G-M#rp?Uq(F?EbOpW#*KYb0@y|0FqDNOggm8+qZEPWqcQL3FdlC`K{-Da3k!( zBhHDTaWS{~9ok>{DUHp3OXDqNoZoxug8BJ#7IXWF+>h=i(cMn7JXxh!ytJ zs}a-m5in)hV0PEOz*e>jc6Hv*7f}({f=q9_^dcn9ILhy(M{h%kT!bFTC(-X<%)P(7 zPXKQ~kiRdtSy-Xxo6zF+VqX@m%SEtj37vu=9WoW?oY*n5q^$O2h+zZisM3NuPrpRu zzhBa_fm&E#&=ez(L*uR4)+Vo$X9L811%{f`(NC`iewBc>3r=(=5^Xj zh;FBpFKHio%xaeOIhgoyq8j5GHytl&SweIt$-2NK5!O|b75c1iXouRt{7qqg!3d)+ zc5_zKxoB@v{h5rq5muk-Nz>(tBx`0pGa6_NC{LL{RgPL>O3O)alirk#_YYMj!mrKi zN22izY2=rn7Y?HFMWF7Cc5|mNFF^5pl;8JO#ZR#IgWvuttg!2*GaUmO zr=(Ic>Z*}@g!s^DG+m7^s)#YKJsb}`P>kRkWWQi4GCqj!_tc!$Fsag{3nEgqMs(4N8lE(> zpi*s{BFf+2w3MF@PpuWi!84dpW9yi|P% z70Er~_pgHLRoDHiG)13r(k?hl-_ja+IhCrP@)MB0rk+Z<02J4MoT1^Q$N5m(;}KpM zB+bwY>z*_N{vvS*wzI42R|Gqn`N@E*(3c#`U@ICp{S1tB|1T()fKu7=BcPJEAI7ST z^z`iy)bD92qKZPF6y!USZR|~CtD+80jFVg26k#4Ed6`|^j}2G3)2YBDv;XYzH^jA8 zhZIfbVbgLLtO}R`*piUUWl=7%8vK728O)A`GMy1Zy9D3+ z$VV}j=5cF~etH6{36n3@-2M0YmdbVNn7rzMBP5X(Wj3EP#6V?~!oA{ZVrE>qM|qr%KI1OMTL_^y zrC!u^U`C?@+RG?3QC>*=9OLhCuPOHa=7UwUmlmxgTN1<#BztE``z`}EI4HenKx0|~ zHjHY;-Np3LlkYI!-KL6>GVQx!EsD>w<7ZSwew6gXgpV%i>{HV>zpxL?m87Dw{FE}J z&~Eh|(G^G+vpGfI@G+kuuXcSwJ947k`{cq?EBURD$Rf}cg+RZ6kvQ#`Dnb0yv=w+# zZJ$cA2>7NL&o$1Nc6+Q#WGcL8QpOsasPGZ}NG?Eds~=%mOEu5g_H)JvNTAcOdwEeo zxuhpC2Wg5`p@Q-g2eC=ni3%l1_efGcdQf~#qsW9v4n$MN$@P-Z7Cmj})cvIOAJL^# zuN7H7D(z`XNLq|P_S}mahSZDdZ57gT&Ez48?nD6j0H_+gc4r;XWw1H1rAI(T#dHac z=UQTl;VsdmcML^}<$W@N1Tq2np9CbYm9CY?e=xH*-9f5s=6nBYQfu1BW!+pWPe!TP zaIEQ)>OIzULEW2gM)X5XBltHaF^%_CF6(ITEByRKY?8rb%=r*WeDiHwC6vF**3U9L zNn>vJFLRYDC6wS~Fg=7CF{Kk8(B1WaunwiDcUxd?W8il` zB4t3sQD}-ih3YqddD|xlPp+1Lc($Dfec$Q%C1u@%hIoVN0)^K7$mWa4*B8F!sJx=T zY{1}f#^7^NaLo9%{|ar{`(tEK#bPj=NEU7xsGLNvb25 zzk8TbO`kQNu@<)+n=T7mI| zH|I;+%zr=_1bT~kp;oWEU**OwaGneun0Cz(QWYfTD-JEun#A+4I;OT=^7!y50t30f5uNQZY#8I#$Qgbl^KWQ9Dk&-9|74;@p^ z5D3X_mWjgDmL9BH8<2j411@xMvOZjFb(MP&B{ZT3wYX6#YMYSJ)7oo4xm(1!<|W6# zyy_g+p8AozqXiC>v;(nBW|@r zZ#QA4;Wpe}oP`xIvTyU-lAv08=9cJBh5H{r1qb@W94Zf}O&iKmrA2i?6`IBHoVqCd 
zKFasZad?E5ZKvYC-Tw%wUPAv0q{`hRXHDjp8!p!C>kAZex9rNeYwWDZ_J^4hp{ZBa zm?#1SQnfh(s7ekg((e`N_lA`0=zZ+=Nq*{}5^n~4Cv}!WbhD%4c zM(wnPdTO6yWge>`UhtY#&FptnmfkVS)I@-tr!SXUx)?n2tfk_Hsy9U0P{4S~&@_W% zA518>*``_FawgsL91ieSwL1eQhc&DzrjCZTi1p`}WmGb+m7b^W@6J?J36f&_K_(@I zdjce{NybP!Ly|f*NRJlP(6t&LYw_C&5wpwBqiRolq?zN5IUTn7oCF2x zI&TovDFMawrq)VzK5!cGD-_er7|Lhv>(wL`_&5b+{9TlyJcD2i)tKnnM|mO-^_xSb z3HWKvn@>b5BIgf%bWT$qlw7}bM%kMkRh3?S;@U5)sq;{^%{QPNU;RpI88h0^I8QZx zp_GnJ>frkji!cu!DC+5-$XfMNI?PkacBi};L`!|$he*NNK!x)X(!}Q<)9>z-jt5#G zr!Oh-NjVFEfit6r=$P_Iir5YZKg0)$3Fki_Ea|tqe(zS_tg!a>*1cHs(pmR01HQi7 zo7HKr^wQP8@HPyGDV10?c?Z_#R3I2Yq4e3R{(7*!!FO!OQFO!TZvQy8^j?nxrL3Kc z;wL=XZl4J{=QUpw6+|>Lh!$~mbQElKXw8yRIX|-BzJV^dDJEEgjn>=_cWs}N%T=RD zi=-;rCq|2YUm2yVIn-bWY$en&J+tR_pn{_@qesu6a!Mp zvCQI7LwgE$1*OyucPvT+MvO>37&<$$S%)|=mIQv*70zpDNa~4nLVvcOpSREZ@cEXy z0ZF8y51i^AoLvSf0CR#=fl^YRA`Lfdvsu24*1rjOzB(jXjInohIj2G-c#7gwBXPC( zd!@THqV82SQh@ZS)rvA@Ri0`m1;)=Qg9;(5H;92%K{bX+KGs03(5{kxYC;e@D=(?( zKE3&VK)QhPr*TV(3G*&Ef4SR#Jw_$b4b{4HLc1Rw(kPfFI;={d+0W|=`gKRrS)P(T z&{NGXE5ucO!`DT$2Nv~_N+lgs_rPMj%c!W0D&iR2Sbt7dHDJ{*D_m{h{b^v9Qq50#c*^Ig0#=(m z{%ePlkhG>lVo_ZP(5@Mk!MFJzEGd2j$%Bun`FaMZq{pCo-QS$ka-To`@DU=v6o%Hr6(EAs!>T#vW{94!QTu;z{1@J($B#z{l^pi zV?#F`zP^3lKi^VWo8t8k0}1Y0VYJ&tEF@mY9xC3aVx@PjYp~7{1zL@%DyJ(bvX)f) zae=$@SK@0GEvxhlqm@u+Mk`03zw_tpZo<2ZukD<2xm09U_M%(tq$++=QBN~I3OvUd zTz_Brty|L?(XB+dlEKs*Ce>2>c;7inP~c8MASV0ycfa+OVRa-}$`7g8U47L5OF*61 zKUpmXy-JiaBXw{{%dM%N46*G-(>v=QEhvwpG|l$Nri}8pX{YvYyO3dpGcXOoRW%uy z6?K%rfa?zGD2UIa<)fV@s`*LJi&RFsPpP1IHj2qT`O`F-2`WODYkoda(JR#YozEY& zewa9p>gX*k<2)k6n~AE~q5>>Z5!l+<9z{uC&^8*mM;cyzV#BjFNV5gskx66Qs(sIQ${#`x@($?D|25D8XxEEMF|wceW+Hh;|K+mzfJf4 zM@pmh-Q0~at^2L-xLxSZ2X*6ZN)}WOm0?K-T3Mme>V%1N{?=D|OUs!)rQy+E|Ky&+ zX-=yXRWtqAdQ^Oz0Wgt)ig1Ga++C*2IPvLcLrQ#RLInjAf7hG*ap|Q2(JLM^_A;5> zzMG~D^0$#UOmN+;ob+Em&i%eDtPLG0m!~@@bpqo$dWP* zQ?{s4T=M7Z38W9uqtq#L;K9cTT93!Hjk0_Zza;e8D05y!ntPt)JiX3u#6>bG z2nsw>jIz|`0Y-6e270De&ap-*DebRS 
z8R}P~yFrg*W(M`JgZ?4Cat<}hNthrWRYdyL+-^g!?)^Y4v-gqmC=Tn%WK{WLASR92ZWORCXR`Kn%vQa-Wragub z^PS=R{cTV(4^t`JsGdxDm62wocliO}PrzWpv_}P0G2K7z+68{*e#691Pt0>>?vFyg zviPny0ce_s&e_<<6BIUjSM}hj+AENro7Cs2zy7YSgg$6YLbq=#4z}NVZoE3AlbYM! z&(GO4F=ZK%^G=Pgs)dR=r>Qf=`KyX^BU;OmN+0n5W`KT0K);*YWVPGrn?v`yjQ!lK z2aOcfaFm)_p4Hssq>rSL;KgW?8R^RN6A<-IxXX_sN_kno8_(FjvbM7iC{r~)50DSh zq+d5E(hMDX?gVsg%db@%MrVsizngbCzAB~WXLpq_^raXFkuOeN*NyZ?KzXSacMf{X z2B#ya&a5h5mZ*Swc%|HNc=0Fq`hY*_Ux462X%ewt&uSMKCl$L@vMHj(v%1hrjvK(8 z%X+=q>wQ4ojRx%Pf(L(H8`Y()Z%D7edSA9Zxu9C2&x>2ssM4o=GOcb&Z^)~-d;dPk`LBcduGPF4 zfDF(=VJg(m1S%-j{h*}aM>KN$z&}LOJtdU8CLhbcp-N%Qb;7w&i|$(xWVurswRixk zHk&auzQyAZ>Te2F1Urx4`u%-$*7Rl6?|Q!PRIlQ$iEJwhD$x?;sOX@LO;qGb8AD_# zXDj+mFtfLQYW#Ek2= zf2f8Ue}ZX6n>XcDFN;g81P_>(?2t$2&S`?QrSjQhNZA5YE;U)yzIGag%)UFI&YOxa zRg4uYu5A_8JdK+TzaI)ud?QMSfvH|%zW?x~5&;pK`?5fPl11&sD~-yJu%F`~A=B~d zm)2Ru-APTg5W0LD8uAk)bvfYek9@v}aG3lPVs|MlX-2W7Uw)v6%4S~uijD+y%u)9_ z=^r6JVU~(RrNl4wcGp1V{t1_8Jv0DS1Jmw#k~A>gftMuW9*nVeVwv+OV|9_v=lIUY z(FzUo1-EY*^FYt+;4YUt0n&=&69Cq9=r=rY0y3Y=9x8SDU|<&HnA-A7wR5IOKKcwH zk}AJ6gSrv-SHP?i>H?t`Mvx^Kvj)vgs(Vno&7BgBe^3iBqxJjJW;KX*<$&gMu`SZfFJ;2@%5 zwg{5FQXs;_A$XyaKqO?L7d>e~>??f~Ln=5fFd0!SVJpf>N393~vei(hlU03`d=8a< zC^bP6flbWDNELPWsVb!OZl2$$%UvD|xsQAGQ9&2H87UwQ+&;Y>8m}Q+MEVQiZnm(? 
zkSG3t%IfM3PNV&kG1brCi>XqP^=YYxrfy+RK1h~aFH$S;5W)j&X>3OF6U(iY z453O1Ch2s}D`WEE)-1#4PcFHqQnx=^IXM3wo=FlFT?}u^| zByT~OIVM_}}SuqabLQtly< zA?TuLPU7AToF{7_I_It_qdXWun%u4<1r&(hh6k8EbkIcTH4RJho1d1@slUGrwUNeb z(-JGQ+xUU3$~aLH@N#|<#uS%Hk3w6--`wikXzS58OO!qjE$KL_g^A8xG<7-c6T*I| z44~x)l_!%$YSkiw_#o2%#z=*j&v~g}a#jibXeXj~6vR_oMfwor5AvuV?jV}y^#aY) zyRj~s()0TSiMe4&8>q^4KB65&l^0XJW)y}ydQ;2&9e_(zw}VS!O*$zX*A7Sd^*nJX|)RNXho#I zjehn+rYiqL4ST4S^6@>clMDEf?6;8@C3x$cG=B2D4AljFU_+VRJ@vxD6&Jrs@tWEZ z1~1b)=z$*q?nfCCYcx$6=vTA4P4+O7zKqo8&|yXGXa#5zc_b}#>O}XL)Ya%a|>7%9)BcrLfjFY4Fv zivIroG*8oTpKtD-FKkn>!~fyTSiT?atjV(v`>%|P`J9^~jpDg&JU!Z(*pr+`HyU>sR3DuK z!TLQ+5K5?7+6EP~X`7YyX#jn85-NyNmv&$GWLftjblkj3mAWiFmD5URpmC1#q1nnG z*&2OeLsH0?^0F=jl)QyHf=W>pW)o_l`pS;liae7iB$%ZCd3rz#JCrEV?Q5Zj+(&mS zLfUtY*B~e(Jzvju-jUDNj!(H0ih1eYed(hf>W*_|TC0N1D%*euqLp%*g4(^&mF)a$ zgBf3bQhB6GQWfM*p#QE;mOidF_ zuBM8b!16{TGznJgZ+{I|FnWy?roKmK^_bbSg4RO@whVz2Yk4YNrJfaWI}~Ap&j0u- z<#)f1G66gU(41-^loUmQ5menSwe&L7jS}%b(P;(uu!d$*zgUG!Pyb4h4=LO}p?1K# z%HF6!`QDTg$a0oy-$zqZou(c-_m(P@d^+AlrI;_}-cm)H17(=I#_8-QZTKqY)3Bg^ zcGAVvA_`O$isk{%LFeIKE9)k`qLeJV0F*T9YQePO8x= zdP_Z#e}MXZ0QIMNQ@v2rWHq_eRxgcd?$365zD z%7Jas=%1`=lvJVnKQv{i8UJ*Oej4|r^Ho?ukMCYo&=0$?IdY{rR7;bSt*wH3*pza( ztd)vGnv};hN2s_kJJc@E{gH!Mb2c7g~ zYGsM&=NM{hgU?a{6$@(CdtU>!!U;sZ0=YDcNzbT}T5~;@0n~0CNqU~ZbbremKjypg zSO;&h6v4l}d`vHGa#&-P)ZEZV&`5LUp9Fn+PC?j3#C}!K6h27;cGqKS`DMQO^2+pqN;I=(&a z6epPS-{sWLqjB4qXD;S9RvfA{l z20B8wu1Hzsfnu2A?_d2g-!SWmbQNp-ywmkEyF|be3I=_TE|&pbhzSfdXnP3z@nyB? 
zVMVu6VNGf7gGVXS@4lYz4rqUSaeLU`4|-IKNg!cJ(nDz8S!3ID`@x}6gt?EdGK~vY zumAE`b=Z00PK;bY8$SUmNP1LYeYSJkKS$Vz=aZ&@ZO*DzWvrCXRigEur++P@dT=H{SqbbURRV`>kU-VlYN$$3xoHAn);s+$e$Mt2+wXksl#;MgDIIcIn%K#PxVa)?CToW_ z*#9C_I4kZvK2zJ*JKd?fOs~~LO_i9VreTUC5on_TplkhEEdzBg)$QHw^Dqx;bxm+` z$sHJSNNX4r$W=v21y3q>Q!MxY+de4zm0|w=G*pmq4#oHujOzThs|H@~0SxsY_X za&b~gZx$)*1C%bNvg)SXW2(o0{+VCL3Bw9%JEnTpQ<||}cnL+eq8d*rF$wYr?bfXV zX>YIRyshc?iuN}~3+;sF+1#FOtDoN{dwX@49#?E`se2{l-h4-Rp7>!=`N}m}F4=49 z|2I%g}XP;08X3@Qw-NZ9x$#XtnQle7nzM_eSDQ5Td_RC*w4(8RIlJSZ=#q=9fTI@6IMT z>-gF2qAUgmLeV2$0XB`Y(Yy9JF5NaA^mHE?W1${K(?ab9JW`}IetbNR_bgy+f z8dRtdtr}Vb=<3{~U}YaI>A=tG5QKEgK;3g7pPX5B|5dZ;<`?t!=B58EOzm_J%C!mB z>X}dhc`~$!_N?V*0V_9`3y)a^*hamPGd0$eLKG8jOO+Vq0Z?oY(VgONtY~{*$f{V> z9ju9E2Q3&Z(;@4s7DljFpg@4&P)9)Hhs#p7rHpfb`RBFE&P*bgY^#?3=+VpoCE0Em z1iw=eRA;7*I`K^kdE9IpLOF-ec zy8ZMTT4U|pO%1so>8YUg#@PWsX%&F>Lv&6DPgi&f@fY;9C%IPt+1dUI-kL=~k6b}e z7IU|ykGxjfH-74>1T>QuTJ!9_rA60h*0u`MC>FMiAaDOTquG6TyJZtY$!_d#QZgG|33#+4B z6-X<~Fljb~g6jSl=_My{O}B!&2s9g#$AUedwWiw-|FsO$_l9E6$2rq=G3Aj~`c$zn zC9_q1WUJ76g;GHU+D7&t=FMx9)vap9&B`b~GORY`sZAJPO z9_EngyeTHKLJ`BgsRoqgh9@o}LJ-lxr|z0E%+ni%Ljfo2k)9jUriA|2;B4zK93ds| zzx}U}!gNVODpIC}syh{_y>55PVcx`E{VBDw*~9!vVA7qs2ZO5TSNiGxp+jH$(%+oV zvLzXosGF50n9OKL(~?wC1eG09rF{!Sc}3C7D5nv`RpT^fS8T=T3@xquH-%1+n&0Z5hC`Xq~6RXRZ2}Xv2B|r~*uCW1kIt6dq{V;wGG!CX_605!HQ~5Q8!aG%34QGJwhsg3P z>%*xs(BDQmt7klOjU3S&#?L@@E})!XqgRSjmc**wMUPZOwb9a~rPrt{ApL2ep(Q9Q zp+8w80E<~JPueZh!N{+*&R5|xVZCUXKp|zSh^xK`l%~TKz4{2G*G{(gu&M?fLf0u) zDZMU;xlp$*28*tg;H3b8feYOJ3aE$oNz&>6R8-V?QO= z7fO*TwB9W_RVc5q{f{D5nqX|f30Ydx?`C@S;mLFhcx7q-Yt)JqXbVYLrv-uKAyV?G zh!e5`DUI0Jvw7RcXtoUee)sRi)4qXjMTM=mK~t#&@>HyDngiqd_2$JuTMObkOJWsW zbWY{&oK}Ga)~!?3GZzL%0HR=+U!O0l)sgk@!LVjKLyVPd@1wAv$lyA(gqPOzxA|;8 zZ?|Dsd?|e1lLzYoHi%TDlp&406z3NBv|kA)rTxdyU7@_qA9qbxm)+@%;4^!rWYh*= z!c1@(st=|4C|L1ERdobm%zEeyk(UjIud0mReoCi6r=|$>yOAT!)@gmn3786gmrZ|8 z@SQJf?5KpX;2N;^>E5DsN893bFf@YJDdS%KryVpIjrB!o7Sz1 z!XW^56a6&yO9P1e^5K6ton*|muLdm;m@BOzTgI(`?9Hu 
z^A*13V)BEcQ}{P{Xk~ZeYJ6#Vsv25rlmRR+tP8JbIDp3P>W)EeAP{NnvSrmwVz>3A z3z!cn0fo!*4i-o&eEaIee8TWaqN~1^(Ja;iWmuz3fDQGkGSqK`EPuyY>xjhczgX4K z69g;Z;F1_r^cF-fte+G2Q=;cMv8XR%ia;6;c(1Qjt62x;J@nWZx?9}AQc-vfYS`9W ztaGS8D}^yW_j!IB{ES$SHYOjy-t+HPLJzZ_WpnZ9sA&M3@d9I75L5xdZAIrWiI(PI z{%$aHgDkY3qaYuqohQ@Y(}!T;FAq!sLxw4UoI!M4Q+c)P*eF}C;*eGNL{Mo-{Ybmb z4K?HMmh{zQjCLKV)^-*x3q!oBZ2N`p0V!^N+ylTal=XcpA9xK8nirSt?+!ey9{ivN zwY4a>9`SB?2iu*XGSKfv4}<*^UMC+>?N&>QgMB-5cCCIP)5{EG1aYA`nu>O6i}d31t04 zz7kx)5wbki9 zhzN%Py>;UY`P4(ZI4|aCSyQ^0+|^nnVWA0b-Gk8cBx}UMAqEtKA!BmB{XWK859Mn7 zLpV6jf99!6-GDllKpMY?=8AH5OGk}sI&!X*B=OoZ(*G+YfjPyk?Epk{A&}pC^NL&N zyT3s7G2nY0)Ee9BOF~csA4Y@0#;AB1F>nagHE3rlF6w?+{oLI=Saex|pmx&t(n#y{^XDD4q1_d)wMefOjkFh*G>rt-l{Eej&J39Gf#Hx?6Pm54Vv^8Z z8;0)|kYA_^=Rp2KaBmiR_&fge;QKA!6xLN$9{62%dQN z__w3xLIEJ+u^rGFD`GuxilH}!54qjd= zk$I^WX((tHD>DkKBAaTWh@~g(^1{XktlKBe1K3=LO}TiH(=lZ}V8jwZNvn{$49%Rs>;d;&tis#oglrD#KQZSrUNuyRsxwkwDG>m_#gKjACnG_b( zlkjCf(bA?@s%G>$6*1$B+A$R~@CVaN^+RC4{hE69trTo?dwzcI-)^FB9jw1^Z&0;( z^SxH1o|W7$YInY!QK}%q~pxF{`N)45*G+V%62nT*X*D zsf}q$2Y|3XSvsZDmi^Pww+AMLsU@hVDRHz~cWrkBwI#+l_W;yD{qyHPC)zEOG zKF^D7<*MMC491`p=ndoeI${bTX9d!m#S@VC0oS2%@v$$q)*JzHR-DJ$x+lQ-Rbn{7 z3U9>)u4uC>Vm+6$%GfI#y;mZAuU4`c={s1!%4vBNV4?;#Ob7-s+uZ^9;z+Y`;IPtj z=GwwicO^{*=498|Sv8Rfuznn@bWpLTr#g1IHLatHP9?&Rpwe@Nv}F}kcmc027(P@A zeXt&+DaF=f(xT-(p%GT$LhagB_1o+VLDTZxSA)j4Nn2&pIhzXQJ&Y~%kO>C8RnMC( z0-;TKS&7ibHl^!yc>Von3$3sF)uGeMc>o3hkKhT5h$*IPLswvNNgv|0uWGb~UX;Uu zK07oruw}K*jH<|}wq7DK1^V;DfqplutG9k}t`-`D;1fK%5cWNZ02lGyr4-O*43%$R z2C*?Jg22p4jrxd!1w2CNvoEb_>alLkOVi1+mOa*ts4a6?ELS&y_Kh!K7tSg?+FvAt z^y|y5H?!)%8~3g$s#0?|J5L_!@p8T)UwBLHRvQP|naaS%96w4lh_}%oV?-Pyj3Q;z z^7^_3W@XkTdogx5c-rAXOEN;`+HgjOo3F26I^0mY4#EKu)Sn3IkB52}t=2{etF>)~ z{d<2gS{N%Ve*ZDniiGTV5#kP!*3j<{?lK=RARTC&QRcI35yy>Ce(18|9Q?1#@vUx) z7vnI_Y`OnWzWp+F1$y}~QZCDMwVf>p2MTmtSoGb=XfVjeZ2`vH*NU6fyt+GtqdM7V zyLO3)$QC@*&D4*U)MAy4?^EOkf+!*=1($HyBtQZ6)uAcIieLQgySc8@a z))Vnn8LJ)6HvNlKhpC!67=1w?cQusPfR$i*hQ7!o!wL!Bl$P7~1sRFWK5L+#sc>>wf{^&wQ0uaGdVfjWW| 
z#J!Xmz|bV?Ouqk51YJUc)U<|7YPITSGP17(YCA0-zz9@=gP zGh~C$K<*FeZ3_UTTi^RU-qVSz%C=icWL8oP#KgEmD%SKmGOT5!h4t(oQMLBoYEMk; zP`84G+Khid8r;>z_@5*F#hQL?4Gf2$KkgXMb{uzH7I7O>hID07yTIb2ejS~Hs2-#Z zF>;p=K|z(}gF0^>T-`+KaC2A&A8{|yfE;`uV-c(O6n+~ux@UWw#GlEIv$oGH zCbizXdlu}z{j##7i6prwmnnHi)6k}VE7b?ekQ+a2hv7T7qTAr5`+4~Me6`B``7x## z4TI1Pm~e%}AwR#d=j!>{Yq5s$z&TD|PM@Eo&~*u$swh*V1l$y*cV#jx^*VDhqwKlb_vF* zfxg{GSiV0%Ih+uX2i&7I6=)yQS)}!AD%Ui@LzS#VqStKal)HkRCkqozg%voLXvt+; z%Vx{Ft%}q{1bP~WA-YqMGw7UHGzV1JtD{vN4`&++gN-XSE2i%tdfRmo&L}fz`h`Ne zWLA&1Kk6Gy8(GTx|36e6mLw+S&MI@ThKPGj80}RgiZ~Ys3#&)Tm3LhEw?$9dd)vvZ zVAVMlC}L5;6~z2%IKkTb=AbI9(~=TgTN&Kr^iG`e*1lVvTK_6#4w4W7r-6>tPKJom z4CK;bUaSBUwtqA(3S27Krl#SdD%+}we#IFo7eI}|a)&UYG?Q&dL}3+JGbi=N5^a=v zzg)ry6Jux)M*gyG6QFwaT#Xm!i-CT7{*4IedQd4$HFr;OzTEoz)9Xi?TZ@(Cx=g=a zPLy{usL_gco_njWTGHurvhSbLz4M+HV%pEQUoxnnF~qs5nyQyw#BVx)BTj4q>!Kb_ z&VNYipoE3Q-~ z6l5dqZ8)tDGsSg~L2PK7BFe=D=)q`g0>4E_Tq!?? zGtDoLoB~J)IX?wFtPnm(p!Q;WuaIR|0jgJH_E3Lmq=lewdeRTTB|g zEx!U(w4BoY3fT)82Jv&1tp6b`Suio;$#NbogW9*zE6@@RUvW);znfivLZ93EH^cZ4 z3*&&qfN+LDQ-6p$!8#DtaShh^P^N8S0=ewVu2s-HcXr;rZ^PL0Y5WAKGYRWitw+jw zt+|lELu{dgr;Na~g?nS$BcyPxh}`ND(mvZJ(hk>Cmp*j?!E2Je>xMY6?xVyIVI}9f z0QHwhE&^cwUM*C-(?WbRn){BOy|RqeQR2c1i()Z<5Be^33W)5WjJlE5__?v^aDw=9 zXM5-r7(mU?yQfz5`vs2d)=&jC3+@zZg{qXL*%lWJ3g>YM)+#piee}js!IdmQ=^!){ z@tds-r@5J|%RMb?@g1BjAn9*yw5UBhwUP*V+ZwF!z}U(%CV}lN;NN5VW5%OXdz{Yn zi6Mm~mK2!Diq<#wVR}E7S8>hU14j6UskhE*$#N7^P;BBt4QOw1#tPwrshYaRWLo+rFFv^>`qBnG&5M%x{)C}^;4 z-HM)=DN-4@N_Cl4ZRc+QSL}f{x|irEKU&UU*;u15_qog>C%hhdhr*k3rZx7mc1hvR z7P0jegVh>;+P;>Snp?vx^>-y^lqUEbHsWVZPm?< z?bN$Xeqo`H{f6^eZ971h<8KYudPnLB(1DEGph@BO5=lErZD0q$7uKkUoY5XhuuL8Qs2=`4u zxLLX1>5ZzQk=4FoT(@p*l{rS*c?R)^Pbz(U7`xy7sw_6xjWC1(xmI4i9^wLqSgw8n zCg1uv{=07iXqTU;&>hmcw+S|izxxr+ZeiuF0V-R%<2E~fp>k)m7AP_--7tc^;%GrT z8E9j!Ea<^UJmjMy7fj(l0>Ai(#-6`>5!~-0PCBCN?Z%pliQV{j{P{XBU$CVb#Y@pD zw6h*UmA&s{u7jF<=yWdZ?BHZJk3>^jIlwq4MPslb&Il%ey-)oR;2r#HObpx8Z+e;$ zh^X#{A$F$+KiXKv`PDrvt0L8F6!VJ 
z!=N5!xGN>vD46c4&R&7@%>I+pH$wyMa&vfSUI4ifnptCSN~C2gU$!{Mng&uawaLdk z_tW&}&w$E={L+WlQ@Iz78DNT`jUpL&|_DbiZ@aOa>3 z^oL_sOf;G!g$Qzxxp04K>rN?M8KuA>tmr18QZ@cv(v)hU3{=A7tWlKS!F1Q1;cS~1 z4)WV;$dXX=@7f3W_4D-~e73h~1bOtU`a45~1uR2Zh(x}Crnxge?w#xg?jpy8@YaxUUGYAj z-e?FJioNNn=Qb9)cm>RF+^K^-TB<&p&@?MDcRfe4#3WcBQkGZDiA9XU>LX zZVBhX0q|U_lNZ>0Pz}sEov4i%MHwf)_mSr@bbhG11eEzm_pL(a>CD|@OW*tJ^wk6v zWYD7nYZUj7MU)!ryRi2MPtFSSmy7IdAN%?FGNJKn`h3*~!iC}T+oQ`5%3?u1X{gyU z7de9JT@TS6V*O!Q$5Eo^*!Z=a*a28SkFb9T;(x<2P|?TNvugEF1>i5Xy1C6JZ>7w)D7u{Ik=8zj$)c3Sy{+2VR4 ze`go6@3C3~uSMs^;R2;*tXE8UP=DE)PxmEJI6bdZUTbA*xO7s6jdWbUhCX z_gGI%AHYQ2cXLMxJ$a6wdXcKYIHG9v$SE{qsGSb79n#bxd(QZtJ*E^r2~sEp(UU@Z>}E$d ztjk+}zU>jrwA=9J@ZJWun92CLh}qs#=V?nWItLE|g`jX@u9J#dplln2G)n84%`TPh zQl03B!f<47omU8M4|xpMEIsEHU_X62VHGF*_w!S`-}c{6?SB3jzY?%DvmCVfwj;Zb zv2G5P2P8<9yn&g#jP$peUdKs}vpV5XznTy64I4<>jYgsuj=!W4C$&V^v;kF031@nv zIC&n<>U_J+xBPs}v6$L29NQsmM%xA#(j8PBIZ5BPSpph%KTOAtpR+p$eK-dF=tGl_ z^ewHcFDc^8(w!dr7^q32(RJ@namHN*Q@XgfH|Inpyt!q0!qB#faMbyFM4Y!nx-@O} zk|bZxEh+u*_4Dh1dof+tjcJ2nGAeWMRt?~y5lBr5-^Cj}15t$159^^l zU*PN{`ZzB&r~C9K#;40c^fD^rR#$Y;s~$LW&JgivqY>tfvQtH@M4O_GcCYI=xecV( zRMBt$uJ%@?b3K*Zl$ex){D+OjUxt`S+YYPtM*%6cf6jFu_ExJ({ae%Usgat*)wXjR zF;jatfS9NXFi(bgMm584vAddhO@k3>6m{^fg!5P;J*60GM^aT#E3UvA4{&7Fe;#Ux zK|eIc4sZ~RKEWCq&TFs^#+}T;n75g_HB6}8z*#j}tmQW2MM^;vCmrDO)MP~Yjouwk zq+0^ISpY`sLt(0Wawu|9-R1XnZQbsA8k8duP`<1pat3-&oKwd7+qjFbz?;*7ga(x3tzqBPZHM z)=GPol5?0}#6a=4gO=FeSH7Jc78?Co~BxLqrDyrvWL* zF;-d!{E6)U_m3Zb`1$(DfOJRo@9Xo+t)Kso#sU31zeP^g2TAN^OHP6Exj$gdOJEuf zbE!Caj1Hm0<10=iQsJOdspsd%56^x#|NNuW-`oA;_Sf9|zkj>?U)6PgZwh~Pf4|+` z&r!u`^+%xXP24TiLnM_Q)k4-dnKGME)i}4g}&QFG5Zi`4UHcRyC#xs36@+L9RSa_}(AvZgJ->+&NL82m>5$I(g zMZvFCMB|2(xfdh+ImPMzqX1SwslT7bet0>zDfH)NIKEHw`&kp7H?V7@(64+PSJ<(idK$hXid!!0Qe*WO#YD!M>b&agRu*fJEk&ty@vGVcT2S%d;fDqYNzR{ea$gw)>(TB4agp zwS)!e_aRVnoM`IhUaOuf@(J{qDm!XIz%$!p5X!`uqa+m(I=Uq;EiE!Ch?C~2-Dqbp z_;k9uHO}5nOMN>CT@*wM8MIzcQ95S@3P14k2RL`EopAe%JE&DOE~fLN<`(=&vOh*j 
z{hECh(#c%{GmYaoeY)>k`FtX+hs;wTYiCNF8(=_@WO!I2y1l`o4){ZazrhP^#3b)( zJFa%x`vi{+l{l0D>O2P(;pAjfe($@sDZ|8)9$u+Kq*csFvCb>Q>K1SldWAqKY0;68 zLy9yc(>&5&7nI!YTD-LL-;a0a8wZ(B{r9i>zzKe*Dn&v%(?xFUFl7Awy?o|&RHI2f zC{$9C`gBF9rG|p&z&9HRv1kxxp`gu7>Xn_rtky5Xsw1qezgA_aU!tJo)-T1SF57+V<+}{gj6Z7pH_o32Q8sSx$L@T zL`nGph^zp0RJ5WgBK;wvkttr*F?F>s->Zy--nOhFiU_pv!n6gLa1@#`y8Kfvi}6Tv zS}IS@mNjahC1)Un+UH1r$#3xkkkj18q3e+NZNOCz;{$PPK_l;i6G_!Gnx`vEbikU1O}i!=|mmI?H(= zV3OVgDWUfJ5&7DAckrUb2VdCeDWLjKOWmpktiO!{`Q)7E#d^>VSnTV?MQ2i`aRKfMeq?XdP0{y-}I$dHCski{jWRuCEjIfY3q zi8+_$d|fNZ=u>SP4s}RvD@{mekp3n}y`3_PlXb_>7*=2|_vxIX%%m`OVn0Vnq5bpb zIp=ED)J&6?AUSXe)4rvuUZ9;g*=AZb=cD$oD-k@Gp#JQe?EW`M-m_{tDcaCB;LkuM z+8^WyS54J7ONaI2p4Wc3o+wP!bogm7(YfuzR8SeOFBzRD3a=WjybdTw8=Kj|R|~2Z z*LPUrPH}^KlFxWhNr9n_)awVT979W?2==Qn%7ro{Q>VRN-TE<;%)Yg?x}`DU?{;=Q z*5&N>i*H!QiryT|Fpgs&O|WNY;t?+~B~(6KVfUX=rK4trJ)B|XC*RfP?~i!j=^Q&- zUmnz8sQnFd>=nz~5I-V45C$t75?w^1ysY$9vX88)(h*kQNYqtf)efseCFjwo=yDy& zY}TcWA_W+Rle-6ygLzvOUL3TUw!7_U{x#TG4^V} zY*pnhlmQ)S1=2Z_c>izj)LyKHFRCK7n&p8Fv{YrJ-+pjR_mSBL@cWU|U6C-{D{Qkx zjS4YVc$V)Pq|Wvkg?u3qsXJ46cR!+TuoSx0!D!1hj@M*W*|pMMtE>jA-Y1AY$E|TB zlP0QBSx#iNI8wcHu-un2(BG!h-P23kzP8hJoAjsFpYW@mrfysawbM42nxmSs=&}8e zkPa}a?yqW)exSmwD3vuFRAnCs2x8@GhBGc}sK7>Fgz1dGety1%_eswJZ6lg1s_mNc z>KZdsBj^??!mlsbPyYImMw=0&23_dl^V}5F(4!nouzcIKPh8mStYvw)1+vxQoNG^pf`bj8=wui!W=G z#1Mm@ZY&)GTGq*P{Y~6ijj1^26RZGNHaIY$w< zYMY2x+T5qejMSisx|<#noEo+6XGA1|{V z&|nz~E|xk;RW!3Hqv;naaQ6cQK+b_;x=>r&*|*eN*$(gn>zWaO+Ez72>W7~ykuH7! 
z72oH61A3sfUO<}VtLAGO<=Uy|84U0xoJ1xxuXq3ObDi-l5wB%y+DA_p1unJ*X@gcF zWP?w}A0FcmjA_+(P(?5RHh5V@sEzhcpQbmq<^iZEvm_L93}s~eG4%opxtg6{v~%h= zz8tjfNkfm23L8=aRY;4Uhls>dJvD|kN?*{`r}(-A6@3_HvZxBPw9wj4e2@&N93+V~ zcKP)eI|qO+RS~TxyS!8ss7qi)x+)y0+&KUpWm=2NN?5peQk>SU$pxfrmxoZjhQ07m ze{DDeK&JP=}aVzw0thLm86WW3X))eyv&8-nrFh zHsF!^Wy%-Mkw&hH5aFpR;Y@pmc796+X~hGumw#e5!0ZwNy)U$%1NBms{K97%i63jo zjR^YKqDI=);Olz>HB%9&dsR`mI%slUq=7F7ggeS`jx_GHFn;^XK)+#E%@ip>6()+y z?c*7}if#M*==mp~%6W$sKxY^KFi^mIKf_rqtdx-5F{rT*VEy(NkHB+6bv1)@>`1F6dyjSnA@iL#(QsEh~1yR(4k) zLe{O~95f&me+zo>249{%NQV4!S$j|0%9l1O5cyN9nq#D2%Za^i43fHAb{Kvycyfr< zlVaEttn2M+Fs}{L+wt@8bG?|gmps`{zz}-{dWBjr1DGyGZ>x3Awmq@~kFd4wBw^XQ zMyjAbF$hsuVEsPB>Wr42CVhLGd}UiL8ipKq9_%q6a*hJ?5UU=RZlTh;;=Gn-;UD3d zNaY$h)M59|((F?SsqMd=&-em8D|HdN!1VLh4GtjS~HtcuYHG>x=Da?L$nOSUD*iD0A{9u_Sr zP3@wqfoArgXod!&{rGd#G(tb7-2qj+h%l%fh;RgiHv0slynr#5JmJTa0z!OASj9!n z9Rig+A_0SW5m1NA6`@iW5#9$qf6njdSb3kcM#iEI_!fBqkI)$s&L3KMKYOzA7R{Ez zt}EClD|lg(>+efVTLoM^_)bzAqPJcE${1h?-H)eYvCA&=Tm3p)&>T-U+h6Vf$Av}> z4P$pg_b!57BqRDpB?yZEi%YnlRvF5B946R6z+~T9LueUYM z?tkOMBdpd<^m%BE(IEPCL5LB8A$(*F(ThYkf{db8{j_pfsY!L_QtD~t2ovZ?w$2~c z+aBx07#86bLrn~RTvzg5I2L-$(_s*_-wd`+u5uY;0b&n#_tiSXl3GAtlti6TssD=*Y54onqZg`mDAr(V7NlBi4HGH4|N113li8UYSsvX| z4kKa*w8caHMJo=`i2qH9IljjVi-?$k`k<8aY(Q2F=XSx^CB8!mVXB-Fe~w~>svdYr zH3WsC{Aeumz|zOLh>$}}a&Y$0Bb(!MTafNz^XvT_Glhy8sW(u!rKE z%Thkj_r6dMDS>9UQEcjs4q|$viwMJM+%+~WEw6AQ9&m9G5)cKsTgBG-I4#3yc3w^a z;<*oDh0PqsI(Z!5;byHIZO4|e7xAS&1@}YnV~8OFF!T@oc@mSjda3<=G4B5o^)^e> z@2fBCd_@yOEFk=9jK{RZDP~^a(YPq|@e+Zt{>*l-bcj7QiIb;8UBWBG)X^y-{E4Kt z$6*dBAP6H3zPHh&R842)prz+lVW$T%4j@75w?!P<#4etr`~E$W<8_~bmNoZASGCs( zZx}&@3)Ulc7>6OXjOPf^X1R`oXFmp?3x}~|>2#k$1?w47{~O4a++a041&sL1pCm*L zf3R^qa2!Eub-A|g2QGW#exq#>Wk9v;J4;aaBw25o3b=8gkfJ7=`F`09`96@2x~c{w z|NS%&$EiF{sELUhoIlw^x~BR<2h(H2mVy*ybnlKUsG8RmwR}Y_A+6v398ynr;U4k} z-deXBb~z$+(7oqCUr-8GrtV48oEq%+Aip?Bo;&wBNzx3u`JW|7@y9VAWFM2T_*Wd= z^`mi~uaND!>s-?w6@O}_EUn~;WeOj;+jCLDcpQx_wJr_=We{<5u8SJMb!NX)kHB3u 
z&cgHLEOyGyTSXn$6D=PqC&oEQpb%VvvIH+UFf;(nz<$oeN!FV2ib=>S>Y33H5qUY<(Fgf81rT^H*_tpAN(CBOcG5@3`VS zuE`vgL4GHW$mYx~R%sWhm0hNVo(4oJ7^d!8D~^Cd9=yK*^z?HHXef(dA`JxxXXd@+ zrU$7T=PCvdq?60LUck6rpq^qg-s3~na%hp4pI-{iab zD^Z?*uI~SOPF<`yU`mcXp3)@lQcKzurL8|t7W)bM2cCFIBL|?>#N*kq0N#Dq8%|3& zi>I@uBXUfZGwy%sLm=1h19*(axow9-(rs?C8dv}iyJahI*0pY?1}bkx`#9q%kr9Cu zFvblz&d5A<^mslFlj&^L<(U$cI(VRvDmH>y`v@jtP;0Nj-1of*cRr_OA${Ja2IH{H zE4lCzy$}+Lt8Bw%^1Qe963l7d{WSR^|9BbZB2z^p=I9_}kETk34rdYoCD=&g){L(&kiStISiDd{jXu> zB)=+PQgQ+O^T>BnNA`k}E0L2+*P{A+ijZdBr9vJZ-;;Nw9-<7%s*FQohK%dTxP@WAqOI`xq#);tCIb-3-8 z1_?w6nGzqVI6|~cJGo$3U_`T(2QXFNc;U1?%|TzMUV%t<%(HN<0SqhQD)UB7LFtpTv7jbf3f*- zobwbCIq;3kI5C88r@(aOb02|g+&`Dr>Rp&w$8>_@3BzO*lw3GWZB`EnF`L@M{Lj~a z{_`IP^FJHR-8%j=|MSm3^nLjRn2R`$4p&`9INRO3j`#xAYZq(SEf>$ZE_-(f@Lm=d zvs{VUU-dfW4kADAKyK3aq(ZzMP;QglA@YOm1)SSpJEm5;K*_9}(S#o5ef;>C^>bP@ zu#rtNM;$cqh=Fqqr0jc?@ay3v6<4#coVg*v3b6+2ejMB-k^?dz>=`oLbfR&^>ZXYC zD}6AzRT={q{xtVjO{eEFQ8c!K279DDm*uqFcna-nFh!XBA@ZHWYEC}tAvOaVSwWM9 zaWFgfU`9A`i+X(#=fTELX&$Lu!zQ9Yq`n-+mgJ=vPHyFGNAG44WuYaKdhD-vge;1v zq8)AT-cHdUz|=?}*TIZw7KUloQ0oYp#Bm<_#|LVNX$J(nh6U?3dW%{rxt3ub<`kyj zf1)w_awlFaJTc@z9g|9J;V|b~?TrsO9>f{XD(zra^_~)8frZRjOMF!;%rv=x9AXjW@p3YI z^&nrJ1b3;6Gux3y=GL9>I1L;m1i=Aw3X*tC{G}I|Q}4o@?|h$WO6`MYf8BrdoSf0} z15fU!TT$F7L!mJTDw8M4>)8}==KH_a4@t3H9!(4WJdqTb4#QO)2dYw_ew^SO=eZe$ z7ixsMY@U35OMw5wUHq1A{c=bH&`{peie9A(pwu&xhMlRL0llRYW^XR@m5~G!QnU)n zx00`{rh2hj(hczP{!@Xs`R88##6Y{)xzF%|E>ef?b?Ovaf?CU#HnLNis_+#nyRGU{ zJzUEN=scRoy<>7|=)XffPJnAu$mniyd!<9eK*hSAo>+ef9==-w;$bc(=@bXbdn>J? 
ze0{}v^+4tPl4B_GpN~t1b=BwXpcFMwGvK!SoC6FqJRarwg$g~jb|cuv2f3yW)nXII z5FTjlN2?XOzI%;&d>$ndB%VT}3gz*~yQY^PDH0Kv&M_IKGgU^g7&Kl@cN&mjeiY}l z(dx{*UaMNYNMAXm4h*)Mk+0|QgY~Bf@&F}%OB~oq@A_%0rGNzTBNlp*X7fxs&{K8m z4YJfzc@FXGw5&~rUla91Qr12zD99*(hoMFxpt2%0L)tY)Mo38_umJR{nI^WwvmL$s z>J+|b5_sKN4qR`x-ivDq2pmhryl3>>t$j?OfQA4s1NxN*nyx9C`Yu`C{F#$6w2h#W$oLDmN1jNrCZ;-$lh2=+=C^LZM$f+;!bJXA0cI{`ICv6C-#%x zd(YYB6dvcGw&q_n*J>y#S$^duN{P$EDSX3s5oFFd5rfvolaZ#!vM6n^O)~OBpiqQn zP_~%DQ043Y*Z+TeqND?XDtKo&CeqaLP!aYa$M^kviHK~?+h={IcK3WjM-fuCx=o-; z4s~cL#H)k4?-7X`Lw8WKFKD}uNRZJ%eCJH5;j=THds1DDpdH?!BfQjhXPP3~Z~D*T za4#Vx3}Y4aR*fn0Y+%BXFaAD;vz_JAo}-)t-@FuoegZ!$S%6$T4|ewk*vJydjKq>Z z4~7N-(A>n&F7Lh^&}vPU<pM{iY#^)QDoN2{Q)v>5``ZV9P@n(vV3 zIl<>S0C{`Umdc-&)9U|lU3!_yIGQ+g2k<`gdT6IQ?w zrc#c3N)?TjvjrWN8ClqXK5Tmdm%`Hr+!=0%Y#V^kwA}@OQAdqQ6A`E zL(|wX(BC7V<48Ug5wB1~b5g!jGJD(iL7s31Y@i9xUa;^LBz-DaBNdWL=X_j}@g z-|Q(Vr|CYW=d;-G;uE`QUiVY`io2!mu&#a-d02j2_8eugd3oBO&$0-CA?*|lCZQh6 z^JF?TNrooU&LPG&ekVl`{^Akh8bPIcXFpL8<)>5g)2V#Hsetk8Y`RnXnGmaX?O0W9 zV}cSonCgYJN9;dy_4h$`bUsTse<*Bv5243EUsY7vQNu25QdKo4UmNQ^Ifov2ii|6L zgJTZKry@=eO_o2Gc$K!~lXd@6hB#-}Yh-%9oN)1cz5Wy;uDgxL(tP-2Jw3Gc+Y9VD$g(?rM0pnx6dg$4y95&kRNZNelWgbq9endH##ggDVF}PGaVI?OZr{x) z2klms8Bi`pjK1NJ!Y$zZVs6N4>LBRaVuBGs8It6d_;HdRpJi5i41)g@?-^+ z&NgKar1L!8m)QrSGVgyDd+{*tC2dc2yNsWfnsqrEW2%KAPJ|m#oQikjipkox$nMv&WUh258(WxO@ z%I9rOL}>gUBCCXvpB=(qj*w!w`>5@^GDi3BqrHu`|9J1Mb-u!fynm>V;p3KgOI>)~ zgG{c-kRq%paifW=1v5V@`;t}oc^Q*djX#?Qi-?#1=QrD&BbJRHM>R`1zF*|_%Yk$c z=Z3OvURH=dJ|#;1);~maMB*La{>A@=mQ(XC@A|p5^YioV?dJX~tsLgJ_mA7{_WblT z-Q8a-*2wm`L2|SHZ!cpXRa~l%Y8Y{IxyRhqg5(`r@uB zuhq3!$BiN<$lZ1w z9nJyml?zN?+bE+vNoXR~;e5-fPi|3H#bq-f+BIf0$Qwz+_I=pM4hAcH@vVfDQXFx^ z%Jvlc@A{Tn`aVZwlwt2w4kadv4PVVD)V1rl=IL4Z&3U;CXN0W^jrL=5WEw}eAIhhrU zT5M6DA5*E=ac!-u$)!t4P{o&COPk-&y~*4r_E(p|tVTaclUk@U$%%(4yV15{9Bju?QV(!INv&0&1gpD8nXSfM>SvWB zUaD)Z#AJe{YY+dP59p&^g*~9YgQ0-*G*v8pJ@ajMK$mxZ5i#!%O%w^icHj@XzVCv@*_o_=mr^u=&2Qc9DXc^xUR-xw>E^;fyjI z=i)|rDZsu8=eP)H_^3`mxuR-4&%IL0$VyUz>|!&;x~l~$!(wPE&S|usl!YC=r6Oyk 
z<+K~6DZBZe3z<362Ql{GNd`38-L{wdOLkMz?L8hu5t?pu|Y(WR##86z% zbVPm%EUOxnmgD5PTCQHGEXk_L0#(@gz7D8&=;U};L%QYtV>ds?Q@r71C3n&!C!XRG zOd9ImQ_-KD&~oV}!w;B+Dam2(s$edRmE&=V=@QH=CFF*Zgq4wgawQdS)7&Y1^k zG5=R7AwA;}Si`zeD|br!$lCKb$@*F88lcDBD&3MVc`WOM*{;!ABy$9FMik3GbejI6 zj52bU|OaRKkYRaCaVx$kb&~0ExBiCbK9K(sz8B%-Y11(ka*I5$+fvvoMwL zlGrsc9gyd}W&f|XJkP6}Aob}c8D1jlJUvHBf7858({PKnp8oLRfBrbYtj~)#2PCT- zV!M&7s=*uk^3pnAbehzOa`QfiTXRlR=F-g<<=V|S0-krExT;MH?a}kkBhLMf_&Y|r zl)h!_Z+;C@^*m@DdMWYg_WT%^kY<_V7)(Z5>ka^kw^tm}@8)$BiD>h*d+#q3AZ2(W zcToM_lRr`gwIK)cRmxQ`MJZ)6`Mut8P95#G+R+zYa;3qZo?ew zyUF47N7Po~#9C(KcpaA0J|=013Zn#hOGl3A>XHmRBMP>E49%5L{%)P0bLx`JccHz9 z`j}rIQG~Js${u@Xj_H7wVWyeJP+hWmVt;e_CS~~Pw|!?gz4W=M!g&&R=8>AZzON?=#XpAFvw#y63F8#(j@L(zjGg6=~hSVVk${WW>#mTF>oWczBuyU<; zy6=Pk$xg325XN2&mG=4x>%MbEEZKWEuME=pPt&T^v~c_c^r;6$hZFsgHg>y>{ z<6(?oiqv6xQ+q%CovQQDQ^DWw%I!vnD65F(q0IoXMt$@`p0@wAm{m!n_Pf(IVsX2wkLm zGav+V6h zqSH21rf(9$>g^&iCq##WeBI8Yk5+{;smOa8A?GNgaY&$Zq~{MjPdD|7 zx^v14y4fbDiIjI|W!-jL%j=yu4^%@=^@6KrVX>mt z-62bx9L=;01`zO_|4ZM!$k&n<&l{ZoKNGyGYErliayOel(c+ zQmd{g}E$SpJV=KmPu~h|Ax}F4*<=-PKgCAJ53?e0B&e^PG!_{Cw1a2 zceqLglqcMF+xB06sr=vegvv} zHbE6gW95?1g2YSO2YQBwmUOXFfu`Xy5&Sy*HF$#cE*k#gY(YZ!X}a=)ezV~(o0K~& z(eEjp2J#i{H!0KZy78*th<3qc8+t8ED1X13rLBI}jA=8eeOw*?$|w*;NTay3Z)<>h zby0vo*;bO4vjCD;N#K=($4}K>zvq&3wT3{m=3NoBu!i&+9+08+S;=lG!^~ z&G5>qN<8$Xm8xnN(u;wlg{_L#H5yPT7=UCQi+`FYfM`ZmZZU z&;{i#5U616II#4zj;1j>8qcVKNg>NC7|q|k&!s#4I=MrGfAFPC>1rwx0e%+4`8Alu z=(+2qMd~8b&}9MTx$X8f>C3Vf$3iyS8D()HL~VRvSVa4ON2N5Uy|lgEnu#w#iHWRz z87W9yzkZwX_|4NgDx=$!G?0F(#v0=Fm_dWj(-oX`wVWoaDZVW81ibIP?)x9^cm z5Id=cdzoRdpyHf1L{q>O1B&Ofie+*y`JBf}6_vgYsxjJ_>2(_4#(oL3$5rqo6L(in zUME5@~zreB(s<@;4;4BBn{a=>@0%JufXPm&xC#+}n-T9aZaGsg}DZCMu&! zBCV@1QTa|O|J~XD?k?WUWp|HoqEj2Q4>8|D1d|AbhwCu+*WLw6^3rq4Ybt2g2e+)? 
zldVDKU2&il7XRRh!c%HSL!uUY($>m8=vZoVyG)#<+3F>IS%Y+%=JRf;%lDp9Y&o9< zgQJ{k$0PNRsb5r8gB0J%RXI+zhU>MLx2hlupuQ=0`K@!TkbJYzIfzOEiiqSZ5T6S& z%*N~*=46c|@J{xfodO2bs8AHB09tC7+n<73;rB>d+vFD1QnFoF;#|99*DuM+UiSO< z8p~`wWfDjAh);?KwedOj{~PO>Xv1zRs<<~z2?-P{i&tu)R;2Qp@>xwWZSyaCsP)vJ zTy2xBspd$vpBb;Z`+u(<(wbLFA?R~&l%Zi~e(8C_U3yC0GRjRZEUcl{x9QE!R?k|^ z#KSVA3tAQH3#1h-EYpbjOtJtU=SZCJ9k*Q^ zG*RPz5oX@Zubl~VM$e}1J)-1i=PYMysTaxQa8kN#7#_xXet~L)wRbYmS)7oBe)H_P zH9(8+(RK5=SKQ-3_2%D;twvq9qAUTmkMOUK3}@u34;EA{UfzYBKbX_mFK893H-ft! z*%IeF*@5wW+9(r+yz|hMK<-+6d?7XJHy_h!w~Db*@8J(LL#Dv(-tz>6e!h%8Ne5!ZSka=oB%Nsz&V`QBdzQCKO5ZA30rgNoBABRslWZ+{O^-L89TG@9ckkLWIq7vH00+^q-wSMQ$9sK>{u#L}Y{SF!0##|B@g z@S_ZC+E25*x|BH$ZIi!=k786i@xfe;p zJ`K#Pv#WNQzeQz)+hP{=FzUU!AZKITcH4fGyffDFy1Q;YH9##&K%_w4v*`8qKF`l? zv*oPcdvP`3$cbpPEw`JBHYBy*f{b<%snOG5{?9yoek7wM_Mp6YDx)3eL?apP2UP4@j+Y=F4Huk(Jhz@%A!W9hVk1w4tg@#iEW-S5#&MihhQBfT zmd9B~UeS`3ckjA;l6Zgwpox`lt8i|HDZAx+-Lk&y*8PtkZ3a1j^FWyv{m}Bq9Z6Cm zn=fEQJ9OIgptyWCvLn~kXcyXXjXoCV)EUi}lLIBSmc9z;<>Ogc&YWyIJDqydP4~f3 zXPYug&`@@o9Hj9Uwtr$7oGg8REm;-l630PU5_jHSO<;AKg4}Y*g;c=AR8%v5jPc0R zK>!-RpIv1~Scg;?$~Ar4b?l&Wr-{TSh8o1z_rE;fypZK0d!tF9mY-9e2_tnGn5Xjo zAEKK=Pv?$yx=vlDr?Au6d<{lQ!xl6GYN&PpL7N*;tr z#WBj-k8>uVy}UwOmeo87}|~MpV&aEPQ)|+%*ixb9sYG2w~n|=*A!w8;a~H zS_8zJJ0X!{4#3&nhu86_S9j_12uMPr|y_}H%s0F=DiJL!DXmk$a^w6a8k&1yevAc zS&5UXMrBKqTiTN(f-P@-*um1xANC5c?lJn8Evf%}#%J_uG_(@)qf{7l4kC=*MT`sA zt<=-ovU=+qG%+!T1!G07)A!b3#tWmtF2ZCOozEHuM7Q>z!k#z!^V9jh{U@AXihuq! 
z&Jq6XKmT<cMr`K0&!|G#K#14tS;0aR3ZMCeG$s>~gR$%bv) z(!j?CoJ%crDzSQ2Uq!3Os_Ur!{ABTH1Z)A)56B(NFcqaWlYC$NA*EhxuleqVob2g; z-icj;oI9}-u;&@#|4=8kH8eHJ&;&=SnOg>6k%^V5JM)U6JghPw4M_eQaw@}GAGmF5 ztE6cP(U~+(#+T5smyk!dXwndIsp~-hAi53O2F3JtC$!6}^8@ZPQ8!I|3^AU@FZ?~F z3=bMjmawklT#UH>OflotgPd+0OKrId^V~#%l7(Vm8OkP$~JWmyf)P6=N2mUX!S=zM#_pQPUQ2At<>PI1~rd!{HHml9$z67J!wjtJDsIj`Pj zmdqK@ZvvXz8%9LCYIJX@xbH^m#>ZRtOFxfm0&HPVeZqf*+FgvsFqmL*FZeJ8VM({# z-|7bax;-et;wgEZtr#C}k}r>!z)tF=&MKZ};Cf5?hIlIG9m?kv2oj0)X`BMmL|Fdp5M(RbXT)x0^*nLsK)G9 zL?a6`_IBT^Z0H>kUM0Cm!=_#Xr(ecTnOkaUKEV04m~&ST1OvYA@sHu-!)(S+xd#82 zy6?1z4{Yc|$Ndn^XAsHmhDk~lV}75}Ino@u9qL}4t2^9=#BDJLBEXL9f7i0Y9~nnGmIZQuLoI@!Q-t;GNUlSY8lVV-A&llac%q;O(Cd$0_J zG^7Q@R-Qlb9e9RB!k`E@DqX#ko^ zEXBd!`@{z|TIh1`?aY8jMO78mHO=18lsi)P)&Tc*f^zy??I0>8O(>%cU6UeXtOLIS zlC*6L+`f!>(ZNh)$uQ+({o1Mx;{ebws4V5Rjo~FBQw&d131xi?qz$0wE#OJMezNpy zJXC9mK3nA-LU~yd!NV(v194+=t#!Nz7){l%QExN1JR8&VJ?owJ;TkD{>jaZ72)I(b zf2=?XjeYV=TA&8!gB3B<3}V2Pv|Pg+l0rm;zV@AMlh!5jNWF@;sllj{!ih4HJ0~W! zy0x5s$N~@36}6xHx}J@Ns4>JDV~B@?G++Ei@f@d1LKanw>#nSI#ZA*M)DMGtNu^6o z7Gqt+#F{e8A_54Hb*H9SnrO{c;R-RLlyien$>o`-^i+ug{=(HcM z{32l!v1%XhGM?@6Y)?(hmc=T z45|?+Vs1ftvhrh+wEvla6_A>ah<&ECsOTn(2AKxpu5f;TQBYUn6tBNtrM%yt?n1oW z-|u(f?1*e~+~ebmec4!4?sJ2A$s@k)O5xsty zS_*?C4FFdkSAVCDa4Ji#{#BgoLuBz%tsTDBwmfK7q0$;nYr+bsE4HX=1NuydsKb}X z_!_a^N6^Fj4&TeSeoZx2b^j^zPBAi1brdc+6J+$VUfT2t5 zcnGh+5jofkkTO!@0QvXPUMR|(#Pjxqd!NXEGN+85eozz z9#c*y5VSo)|D|o3^Z95ii`QC&D?4UPo|90X+H>0u)Y+#WHYJ?4B?l|jCE*n>^%2Z? 
z_(=q`4s%$fiPZlNVRp+E>-%TB_O$No9yY6L@-*EBI zwD?{6`GU!>q{?B>l&2N66GJLw< zhyvAtl9V#|{HhPGqe18kPA>IrEMC!YtHV4HSz4^=x4}rAd_iR~*;D7Z?TumbUj4i$I&Ot+fgMs9g5iLpTyNYpNk#dp&BK=HzKe(03Mc?Fd zFk~gVbV!Qx6m9{NY-#H-4>Kgw7I-)(E_-xSNeq)*0|b68Pm6M(uw{~kX+i4eQA*So zRSInR?dEKlt2FhvJGA8**p>C_Iab0!!+53x5}XsD@!ta#yvJZ4JhtnWuG}Ij-QTHs zJd8ZN^QVd9x|l<&xYDyh7ct3_R!mz<>x4a^4THSXnZg`er-=2mn3g*wB0D8@n4h=d zX&m2Y1HSLI68|LWth$M{Dth?EyFgPaSK2n9_8_i5nxxlqM^Q5|nCWOiaoi_~+Z zZd?-St!bN{t$TXy*CAa?qX)^=#d|75uvUo@=&TBbajLii8h~yo21OvUqY%^p6XCHS zoN%5TAMQ#xJ-hZN->HUkv(EE0#i55M>IQf937uRgtKu6$WXI*a7cem$|lgNoIR4HL`xLg>h>HS!|&ng?d?OoiH|oISKY^j zA2r(ntHk*3y(AM*$#QvusMQ#hVv9_Zmn&yf$R?zl)047GV>f8>M>piW`ty_?Qkp~S zE#)2=akRi`Q+D*99r%9y=`)%J&q%1c2XrQAzVvE+gDMhasNno_vs1n?JVyIErVy{6 zk#2!DR2p=qxSBqo2TdK|%zlY$!uc9K5o;sXSC=^ize~HmHpx0mPJSW*dgvU`7@r;~ z5w=71Dhv_e09in$zv@(QZqg2#f{-4n7nn_mQR$Cg`zt?nzaM*0q(Co}h{sBI+}lgC z-r0vr2jAbSHvOcE{^za7l!$n7J}k*xAd{!ZrD!|fv@ODg{0f+QX&_qYFw})9rYo+R zjXm&+tVsjl8*kY?_^dcvyZQVK)GpzZrNb4yRA)<=YTmCZiAA-*1e1N>?r06B*gaPq z=B)gR8cDnXLdU3O06M!S4jJ*T1?R3$x*4EwLFmM=%p>UIUGFuzz6$TeFGe*s=`XcF z-6cSpKL4pd9_q0|dUmPKNQnqT#Mk)zGYxv+^m1N&ne?7pfO-`Y`z6N6ei+oVrCNxe zi7=bR-_4b&d|9bNdM(&I&qr%tU|lw<$U2~{>+e|gwmO#h6ss>9&b8<@s4zV5?8DNl zgzD348dO@nr{cj6KDzre<(X~QeW1JBw6N*)^ke8NCjW}U*=Jlm%-5yDXammOvtE`+ zFW@Xv5+FjxhsqYh4WXNwdh~mYC4DXT4~d{)xjlx$wiwEJzYpW4i+S4MnonXLUEf-Nv7D-c@pRp zSUi-#0#ZquwVl1yfkLL(h8K6NQQRg|oRPYvwTTjEHab2?>@b|sKb>)cN)GyC73Jsh zHibE2@*ikFXP?#Ew3~_0#g#va*?I7u(JjceWuQrK3fDy1TOH}=Eyv2kJbO3m*AC0^ zYJr+l@0%jV@TsqNiMlb%7(%(Di=GMh^a3@iCI?DR zLfW6`cc8zHK%#UT#{Y^?{%gYICe4Yk^w977&N=^R3!I|hzmdo9Kr>OfM38z`X;XP^F(XSteKz9u~x3W?K5~WdueY_YRc%BqOoAE!a!~EXklMKcF3|DZ9 zrD>izU4#T>$9Z1N;o{$rw0ou0UKMG)N#T!VpQIvR31~JaTCJmhZ_^yq1!X+&b0-!_zROz`ophtMw$FIHDA_|dhLTBPCxt1tQz}8Q8`KB%&F3l57EKrYENV8zJ!yfsdUd(M+j&)FTG4l--vU3mT9z@ev5k&Io-EWcTeNf zU|!$dx(Tm-g=RSd>R^tKJ)eN5xdbrUM;B-G1SM%uD6dXOCu|JbptPFG;W1hYXu9SZ z-2y1p#5&oAv-;_;-u0wDf-63dx`GjkZBTyE%N%~zVMd(8%duwd5A8{Bl1RQ?2Opma zvlU2r||Yx7HyCN 
zAQAn}#p?mS2z%l+*Qu7;QXJxuG4%_zm`)*|xcEmje>2%#ub3r{0xJalYHgi~?Hi6gb0BXV97(2*jJ;v+Ly^K1sQUU_xqhl(w+9nyS~ zx-SpV8q6g?`L+@5=2M)f{=$_nSV*ryaYo(A@?J3%NSOCz)8LdEPU1oI@?!xN1G3ev z|8leeR+7sdui%oOCotK^LHnI+Y-p|nj-wA{teQPP&qgR@qi)#Pm2hdwuzDsxh%qc?a9UAZA7SJ51W3ipIlTdvP=O0Eg$PE|8M%@8= zS`g#Rama1^-p8#mBGY z>^_yOcrP82dEVXXFu#^UQ$Mp{hL4YvWL|xKeDa;6n;)-I3+Gv|CaeN!x=*_%pr^;_ zXPXqMJZft*5o^3nxo09ZFWO>{YIUokoVlb!3Yhhr(TJWOr5%f2*4N1GH{idixJxxZNs6}(1fLR5XDHzaBQM#6RwYqI!)BrKTh4ARbw|72e`~F2}xn&JEJ?c>UR=?xmyaBN|R2&{r>fI zKzLsgq%H={Ho)2EIMoJc(*8PK{Af~W;?wABU5S-VZdO3?h4hs2O}6TOj3+J>^T1FmnYco&5ej zs;600_$^^R6ecuHNa-%^qGQs@xCW9hJwqBdbE4`s0Nv#i;p0A1J7&Lro9aSkxy)n@ zbYO(x-cw;vLo7YnXi(aWb%Ev!OHVvv36PekPsJO`JBIKWkeqN0=AJ#r>FFOoMwoMD z&$fAPk`BGrF>pcEzF2v?>2;*YmzE_kvT%L>hV+(o8)g%%rY#pt5>q6=>Rrn;B^XbM`N*>P$T& zV30y9S&3rtJ-_j7PCwRwE|LBA1x>&5Kj?0u8$U*%5d4dq)Gf_IB~08-7_8F(bMq46 zlM_2oR_Qic1$9+lI`xi*&q*|n8O1OBB9Wg6&ymUHx;R;0;eSbJc0bT{?(RJDU-lcNz4*mKU4ni;ZOmmAtdia;mBbti#l4`81MAX#Yy6xqX zr=t|TsfA|bwlEuBsa$17?BtQ&QN_vUNTUyd>iv(9^3QV*9maaf+C|19SNzA%Hf_!cr=V$ys9#aF( zWAa8{^=ivrmM2RyHKFz9KrrAn<^pFBspimvlaiy0b6@8dw9WS=i()29uv8ga7M|B} zWUAvd;mn^fe_8P6Iq5%0KslPaW+9#zUwa-r88*d32Ae8l4IA{oAmsa&`0 zq?k*3N~qW~kUS1WyFIe_bz^?t;@vTrczWW0Z;^gNLWgK&8-esX=h0J(5OuZqz#TkO zltgTv_j-u&Pr}G$D=2D?fQ=YN%9xPH_hMExrj?S`o`Xkz6DFc`m@3p;CZ|*4R4&~~ zRa@3BPtv8kteuzLCdd>8%$YCFI*&r#6ovj%pu1IWi!RJk$G$Hs_smLTI2?o z2>Szr)>-)TtfmOk=ol``BcQ>|B7Y4)hv6|P^-nvqjO|Z}d1ecAViKW)O=28Q;OTpg z6fRTZy4Hc7GuufA>}SuBf#Q?yeEfglnJ44cXMXNlK7rYAls3;%J*Fd_3MjsrUV&j? 
zcm{%t;^#f8Z*`zC`g~HOk2~~fLWU6mXNSH%H9IMmV3&HtpFC2EFy~HS3zgJp&lZ$p zdd2(nRarVf3Ikf+?5KR->z{}b2AXh)zUopmv9u;7$YMm-f^&!=+f(tbi?!_drOf>j zdV+{e61foQUrxMd-2z@T1O0Ow;(Li+k%j&c*s|}cywy`rOaMkAlhZlB#+v0{(ufJ$ zgcY+TZ=rvVL3csd@oQe@4(58qG_hT~_?71N@N6012v()4SmS=JeXdIAP4!TW;<_h8K25>X!x$o!Y?Z1)Ny$E$CkP23h`u?)yeL~Z z_#(|Xt?z)_+|ICQ4in^byu5zVYnFRms>-Jc)#z+N=)?n1t}$;hYN89bhJE^jg>RYK zf?f<%nc}h+Onl26DAlP<8?PRw?_6l=FK{(lZhecw2p-dD(88E{^@lBz% zQoxk{cF~+W+F7KLj!~zVoxDjRS)tnwvV@1`loS2F8Rz_&#!7{RzTc3BTRAUiw2ax%c0Vv)~A^VvDeWs2@=u#AgH4|$>I>MG_OQvKh%JJq7&RYS1kSZ@Sj$cci;I` zu))m~|8t~9`6ey04}#imsxZG;V_0Ig&OI$3kNA()LT@;EWoA_F4Q-&7U`c&9!V030 zvHOW%q`^%&$HVQJfWmk3YMh#@4?aI>23IuS6zi{L&y%AZsOogqPuc_e$t)Uy4j#3+ z_^#U&G0#&Ompfq96zB`}q<9%_n=ewu1_e7P&w!sjhtlOYA#Ae#pXt z$Up1~{xo?|mSlY;I9kyo<;8EVldKdHq+Z_64-;_4+w?D_p2*R|z>Zfp%xM$)ZsY2$ zxG%d$e;y~KsB~jJyH7b%i)Qdl)2Ra8zeo82yB3RNxr%fj<57BEG-ppzBMyNwNu8(h7^qeuEi}f{dO=pUlNU5Y|QOKln z>d5*7ci{c})9N*5_zZ8fD>bnwDtd+i4 ze$LIpB)*PHcc+Vg=D|}#^4cexCdy+x{g8QBj#H6@@L2?+W>C7J zm@o-WmZ@w~xf|SP2}--LhrxG>o@tsSwp^q<7&WQ=qGn7U;T?Y556)7ONd;|O#ZBsZ zNDs2L_ZyR%Cai-WhC8kLU>(2ekIHv4oMB~(iJ09GL#OCX2fjp3|CzX$@VN$Af)?F~+J)u^!Q&TQLh2l(;#82_Cw>LzBMyFmN8FmR`ta|%6Q zJ-bmd%@k_^>YUT!ceH5;dN9Kf4J=6APfSD5Rwms)`dJ7_3nnK~xF>l{Tm^598F$m+T&*IqD2$~mO*`JH_ z4|Q6Vu_#GYiD4_EwvRIJP$rH|>S;tG6h)m0K$qBArz@&vO(6AB4N}E(Zd|_ZE1d(o zVV<4z+YMJF6uRr3iEe|G%%o<~@wU+Mre>jR!SYTKP1dc*a`O}zBAPK}I< zMH$W%k`Obm4)kS}ckU+*wWCXaALyiyJYV|IEydwcUIDD*jAK!jAhxp3RYN@Bo9`CMOaELeO8*?L~ z)^?UmKJzAT?_-5&lu^&Y`+R zOIA@>s`ZANEdyfag0>OZECi)OvS@YLagVMtc{?U;u&B!xm5d7^s0$-N8GZ3eklUZB z5K6oBfL0ft%H+|m;;D~YiiqOD9|AtA<*n}i9sY);a!Dsw?n!|fuJqD#-2ggh7I2O+=F1rGf+K;aGE=V(H2uhbpP$*Ea5HJmoFw5vzAqo@UVGbhm0!fr_4$wyaCs zKr?%BR9qg4Q$Jp}x99#253l>zEIFS)-Tt{!p;&pj)8;li)9|M}IM9+uQvf?p@Uc)g z3DnqJ#fpwAYGxdkuZURsc%Xdm@VPAhmsP7IrgFzvI@n^9a)tyc`sY0F`RPAie2nzL z>9=b+pr-O3s!{P(_0B|h#T?-b9ch6zT&@2jOpa7I@!;A zhBiSovzWua57e$Ae!Fv%Qa2X32Pl(MhwJ1`mj)%3XZKTsuH}}l8wPkg7c4Y37uj~&#BtcO#z9gZqQR!)c(lE 
zCJLlQql>`&mKgAbP%J&q^!YTetqOAsG%b9Xf7lKmXw%q1;uNhz0Xmf?FoPDkGnaIW zsxTSOum^b0ppEnx5Aj?fHHP5X1Dx=m3}f6hc+6U%IH;Xq~O=t;^=V}iyr8r=3u zbDWy5dV1q4=EP8+9fJ-mv-|RL^`_6!bM!+crpN|#kuXP8FuesuW2K~}1oSz~=rOyo zHp$_rA8Yk^?y;gG6iP-Z9k@-|>*?Y{f&SpSNh=IkOBM3VuP0rh=)VkI{Kq(Ul=zDT z|J0*ES_8S=p~IX+i6E)Ef31T(m&OI;>6DFIKokqlL;(~AP9%|)8cI-2(3^2>;hV~% zxakGxP~OsuH2Wg)zh)NohC~phBi2a&>>z!-%Xjcc@8}tNA0@nyg>8Z?wA%MYjwM0n zkzxuVGQ^03E|Ja-v++~5v~E*!lrEX{{H|xT2+WOzP#HHUt2(8enbfyBO7w;MGu47! zpv{uWG9dgw^^s?FB&Tc!F$MHhq&u>NiE?Ih_S9b&expP=O+RxeId%(uqkyS2RUSkW z&dL<#9fQ_e9f)Iuk=)Uk(QOK*TN!lDq$F;MGFDc=%}Z-aA1*xb9GC_e)EaWIQ9iJ9j|3W0p^jRQ>_ zsMzz6Y%st6c3iWHn`B@nX(a)h3T-q!QGO3C`~a$xT!U-V#6A%Odc_vCLb30Y*2L$j z|1P>GhF$rR^R`F@rt`Nt$h=Fv_lsKDFe_*7u6|qJeV~EiXnTNJ8BR`!QCT~+g7Ne4 zG@V{hO+v&jwTyacR}MZG$)|uGen}PPHkUJJN8={w?H z8I{yaP=~p}$t1`Jn+8u^?jD-DoCK7Lz}06RjYbs@s7o{YsxY@W@nz9~>9?(y4Gp*{ z=6L7faLZ*`db0%csd zS^E+3(mF-bBv6I+VE9fDsO(s|QCV)!F8NM%nBRE>+EB1x3f{)1!>!48LTU_?cwM|q zgDZ9@?|2APD_#xfd37lV`75AxMUXjt8$CP-P%B0+DS?8}K{I2vwS&vf?y{|@E*L!>0`Yub=bpKz$3 zcy;Bc*At9)gSKm<*~0TzNlvd)4Q6G4L#o~V(4q$Adgszev0a=9>##vvaHft%`hstt z`rc=`^rsk?Sml+x>^0@#SLks;7YSE_F=vXi zL&P`|-F5s}nRwV^Kvf>5oF;)5gaOsPw~oC_8VVbeMZHMl%qNtehbXCWD4O$i8173i zR>h*b)S|M>g+%G1a~CkroL^2{&s#a0XR=nUiEPemKXk7Pb)bf_MW_l~ceD@0L$9Eo z=L&-^9KB>v+XQs%91;&(MfVDsupTO*1O|(0-Zx}4Pu?p-qXerczqsQx@Z>$=><7p_ z`l}9JCOt);1f5tO{wO05C7_w9MJEa6&pJ}Dp(&I505u;^=wkssx3^8-{7`5IQ^g`| zP~O=Kl?0O$WY?v5q!IAZ+c`I<*#%%8gz)!>wpX?a%4z7vsd2~&Ph%?*I`E2 z)E2g+(D`)kM4Lg0PYlA7#8@2G*KACYAf-W~6gO9Zrdza9y1h&(nV8QcKpx1EVoT4E zI<4qEZ2@|Yk@WU{Qf;CXlto)}+y#!iQbw&j6q<5f1sY%ez;Y#04nrr?@(+Q&j-$Hh zq09_zwB=aPOBnejiKtGGsG*J=4)bdt=GQ8{LtV{y=a+pN$afXKU$>5~sBdBAqRvGU zmpa{uQ_c%{Tus$Mdqt(8Gv|h_=1zuDt|d6?`O+ye)o64=zTEK(l>=YBm;Mx{g`a zuS0z$ZgsfxCDrrF?Wtw!V-V-7Un8*}FIF$OWx4bU=~7E|s{=)_-yg#>)x0GX>#K%x>du9!Kg_w;ZM#(B(5| z+NYXR;^OzKj`sSfv57v8rhAJCPBt8w)XPaBd7DjO>SNVV=)4eP>Nw86#4kS%cnT(IG@# zcp@yFPh$p2xTFg7`}07xiY`(CK@bdW!~bT#(bc>})k|~Rv?gIFyFotfSCO@po3_IC 
zVoFQ{uu6MmnlE0aiBF7Lnd-I1)Z+pu=fq&ZB;u#Kg?F#hX%h?7`hKdWjC>gddwlw> zv0NmAd}o1dX>}jm$Y&b&yym&H@kgaHvSV+dVxtOs@&KQdRZSDlJ$ufX33C5dSzXym ziP6-N<28uyqOfRC^Lw8a!CH{IRBv96RJsB<@D(BKainh@Bmf@ce4;0JoKty}OsXI#P zrWqu)`xIL+h8KXsX^!@sVXeAN=@P9v$$Rhk^5YlZvLXqP*+@0)p`>$k8qyi z(;H8HVCx>=xEMW8_EZ!5u>?uV$_yl^P)`uPRK!cqUB%Nq_Y0>lWYb01vN-!j(6lhpL#K> zNk(k}Y97NwAshM_-u>vG-$(au!Wdk%Lfp4uqXSqUdzwl)Xc3LEeW&|kK&vU_shv+# zPirtSzHQDr^wZbcaGYvFFAecR;1r{`DQ>BA<*Hr%A}0Qco<_>(fs4Pq^uTH6m)>X3 zsb+>BlJv%z15_Bl8`f411KjEt+N&0h?OR=$#0a`x8lUrBqTjP3x26pEH z#&oh}FK&CA26ES7;{9}ruFO!v{f8S=VtO0gWr6%?VNlK~#lYU6n}F_-A6Klsl|ft8 zG!0Zh0{Vj;>&S3S3XEJ)(S&m~ot2-WG)=)@radnSQVg1-Ph#t!KU%Q z#&|#MeCA(XO_R+nts;D>>lJv87e$zgEol+0eNm}2eSNG|ZfRfNsCZ#IAPr1&! z!#p48>Qz^NeL>MM5#Df~Qb(I~u}CZE`^$(lA@_gj?TTa-IW&(ldVNdow^tMOQu7L_|1Ii)41 zX#0X((xv;p#hk0peDFowxhOiWh0{$_V_QH*FC_0N$ikj%cXqcFhOA-!*}&YTJQd96 zwLMo_O8!Lg3*%1%G^xF142bwfdBbD+F5Kz{{c}rIoA`ldmP4i44WyEsJy#mW5$L(h4!x1A);$Kuj7AT8w&Bri^exX-Yqk1N}Om{-DSrY$d@!Yrwz7{Hz z#M;pIbDQ_s&RZYldPc#cz6C{+j*mVu-ts6Yq%>k3Dc-~YJ|kV*ZeH{gs7;TzrPn6D zIUtJIqN%bF><>Oq2(AMCym@N}*}qd&Ggr6QKJjIwl%6Aj4^YIkFPZb|J}EHBqyPOF-7Em~w4fQo>28CS8YZdw%bo+x_@!$*J~dtZleHfN zbJO$5a6ZSgHIygQ3a35j)-mE)4R98w7LVwu{ z+rhXvVT9{>MhI!Uh%s)hc#5&j0IJ-CYsY))LZHsWu}EL!9XBPVkexv96F-21 z4|9%kPhIbyE_`+u;_<<#*%;I~w&>jJhPEE2BUT}PVQ9uIN1D|ZCOggp{VuZ#)9%Eq z?v8Y}l#mBVmCyH515!CmY^`B=g1Vo!IYx%MEG=#4;9!n1XEeBCL84Q)W*CQygv z(d()}G-H+)o`$6;bpVuNZP^GErRDp<=!rd`s!oKW4XUBFoQ9iWQ4 zp9QtPvlA7z2K<%}(O#*Hyy305Ns|^NEQ|uV?qvM`V7IC$9{ri9Up4R#E~`dCqj>GR z-U_F|D?(aPqvZ#Y{mi+|ZS<;RMAY0OYU-_sO|18)k9(swO^5rFvo)XzB;~hTY?Jk2idLhywIWWkzw6s+)Bz!vUwRhVxIwqTS0c{rmB zSvl3Ep`wxxZ?Ig=-N*fzUY-rkuX5#gFLYgaB!fOp0E)dfYr4?|C=f7S+a1Lptj~>$%G7I-gl}ULY?b z;CpoJFtJG}4tk`Rc)W%AS;G8W)xj}V(xRCFC2&M}P~O>(z>^#jJ(6_8hd{0iEM{1@fDB z$pu+qo+2g;KsH;<>rHUXDoa(0!TN1CPLmYaM25g^#cx!mmhZfzbW!4U|rVd<<>sy7>I_>9@eD z6_4q?Nml*$IaYd){>upz-X?qI?Dcz5I##T7mhKghTx_(Q^-?o+RZr zbOSPr3Z8KgX+p|5($9pn$6Xpi-}g^Le3SGnoQ{+9@Ec-6i#zq7B>ow1Qe}{G;1=a{ 
zy`YnDP1T@#V&voU%{aV%Y$Qrs-+O2RAeYOXQ$BqF1yAs%>LCBDU;F9P$EW^zxZ^rq zc_j@UxLEeQG9$Yg2g_NHiRVx=_!zu}LNKQU z3N;cWZrXqjlL>d%bpy0e7*ezm z@|SSgs!2LUlD3b)d(Jd+t>kn~t)>QsI?_@mWtVb)m9Cjx&G?F2&6p&de)jGG7$}3^CGV*c!%jUxVBu@uLoUtUHp6K03ys*rJ@u zV%*qASFI`)UDuqK5(6k#AbX`no&EVE}()e`hJ)=}=<qG8GRnIMQ@KfmM0|>WomYZl|9Lj)L?aztUhP7O$(qZ=cIWHJo3!HO}gh zZ(s{d(Ipyz9GD^uC-;V(ORYe4nDB<3a-*bPC6=lQ$2gmG2dQSPLdw->KIP1+B40~_ z6ya9G38>v){hJfR&m#_eurkcNXp6?8~#Y zuNPl+nkIuR9kgBwoIIywbs8L2hxvK)I>rKngDCeA>F%m3&7?Do8F!tJE3r-w_51k` zS;m$-SE(g3d0c$Ml?hgh?^9x-#i+luCW*XV5x*ZAl@bdGggw0H!p)1${@k0MF4DO8 zb(mkbKK!+C7h5jUc0FWoAnJwpX?a7_IGJ4e5bBn-T`DMv^Sz{%N~%!V#yG_5<}%+h zH9knpxr=uch?j=NPI{)WKF7GJyEVYA3t~@ZMST(|Wz+CDD$gKCj~msniC31-7V%x6 zR9%BrC!lV@RepxGIU7bFOk(I@G(o^y9^n)@UDJlMGE?GuIysdR1iVtGP4SB5Of+}(iASJl)LjMX7c^XH;>ig=>@tslR7nJt z=0Lc$me*%=5k4*I6g8mdOYgTp+eU6Ha!xQRd{=yzyGVN| zTwd=3!1Rmwj>fy*K>(lgN8(wWHJfz+BQVHY3s5&a#IRiS>A;sS>JN11NC-VfcV0sa zd)c0RpajMCC@$7f#@K%LM|SC;_>V3-`P6rWE&xcSDaYbF!xPD*Jc<9>D$38>eF={K zLi5i@_aUi1hn}CV+XhXps_iokiA9ks9-^SRIrYYz#0jpz{JcF+=LPM-FXPZ{U(?Ia zd{#wtf{1N~jlHmZ=z+)B3s3p63d*xGg4u`JM!dK@68O8w{finXXGovIeB7AWLEo=_I|4qb1lWDDC-8@Uwvu@x%-0 zq{cU@$m{VD2-;<)eFNg#YZ@2YXVxl9@~C~|o-*wPr?{b49QNh?n*?;Cw|J7m_Dwhw zlvAGDN+_?TWd*c+fqBM|O7pDR#YZ$~Z}J55`UW6v@!;c%-^ZiO$q%azm>(D)(JubU zUs20nQB07cnnS}rJvjZ3sch13e7$;GdlHrh^S=+#gLYgi4M^wKT`!|wowU7bNY8!J zVQ=ZrUi+ee2URyf*<`F;Cq?EJfj+x$KqDHQZZHK)#2@ttbqHDO*T6y0?6qc~I88gxcaEAP|F zS5SVn1bAzpqM_K7Reqv_FTAH_Aol1-q<+fn7zlZS(B1;oIqRGsOjaFp{Ukj$rbfk$ zvV5!JhNilMb`~Q!xlP$~>PfSi3ON;=CpI0j2fb1BYpOIFQXq=~9?AW1@0p7s8_2Ds z0!a<%=j|6JoQ8Rf;kYyM?s8g>OzLPpxHB?tRaqjP3->yhR3T|Wx+AZ${#;G$ArO`5 zP{r`gfC2IR&UfpzH%&NehiSY%(0WclA*HKAABxk&1Lh=$j5K)#Cb@h%%q>cO$F>_{ zQ-k}IT0ha8q{YhPgVd~^SH4T-@ES@K@t*1WJn5(0NaEH{6LG~XwfuV3OH;x$^HCCM zHhc@rK};--yiu2-!C@a}JynunDyW}B5z2Q3(D}mGLFN>B15i)3dITWDj%40XPSP|z zNN5aPH9gd1ms`|vu9)p^ImWBN-1AAMCm(MprtMztp3EuIc-$cah4DcJHyGNzp1+|) zgQ?&=&yF7~eN$zO`j>=@yENWXnmeRn*4d~J5x+ZQ~`?%fIls9?7s9R@pV;M)1itcvR~nd~akdFHy0 
zzH5A*n6jEp0ckEZ8k|l_J&EMcHJUN(T?mCn3O`N)Hi(6{fX;A!9oh5xaXYj?CFgE1y8qbu{QWn+{N$~N8zpqyr_jb@D&we> zEKjIILr}QYVQ!z$_l-r@^rt3?98__F?7f4O8;kBa%ycc;to!;kC#+&i3Y`2-lwMDI?V4I&g2^7q)+U3 zSH6kxC_V9W9gSqtieF8fQD)Nsga)8>qSjA(z9(lCt@M~3*YQ4_!ls?gbchCr@8o1QSe7AQyd5d1`t1li`$ zMP5>RNQ2A5cNer?g5fr1!K zo>^a8dsL$~p2=Rp>GJW&R!L6vaDI2r5)HmBA8fG7j~pd zg5u|bRBjGGx%oA-B2)GfK89gQmYI~;6+f=uP6`?>o_y5Dq-T}p71BH~vIS{zJDG7i9vH}B=GT+NWObv?Y4IUt$XNsxe~ho}o4UUExza3DTzKRn zcza`k)QE1HVBd14`hMzkIG}A^?;4!O$Aw;^3Za)P)pa~i3IakP)=mZH*KMM_rTG)= zun%vD_(O<~-^D9+?#cnB*uuD$ThuUiAqin4qgw@LE@@l;w!}Ur+OJiAS}~JYiM5!RrTMO9kd2=1)>g~_)F*v|ajx&~ z+Q1*umB1#&Og z*5ycppvIymq*u3Y!zDXy1Dc?OE`zN}^Uig73SQeLXesnzn{0XtTT%fQWhvD;dKWay z1vsvfY&2ZNqN&)B9{_Blu}3c$x+8vQ}?q)-d{mR z*Y(~o-b4Y^V@A6V%{Yr~$^O_cqO{dc1BCKDdfp-G&kV{hSCSkj-PamU{6&}R5Ymvh zN#^8j6#YDz@F>!5E%#!p_6qaag;RXH^4QSm>0nhcN9)RGNMcKtw_KGef2h#`SGx~T zs+7exV1b5;|U-I|Y6V^nX@{|m67d8CvX!;Z zk*n)b15l7IWs;0^@fd_qwKlQOzxDZE(b5uPzOP+W(LG59%m)2>0kv6N06ok zD*g|S&APyb^4{C!7WkRh9i3RGJo-1edHE|~^CaY!YXCxZ!y{`Q?TfYifZXd56zCA| zSEx&{N1(u|yGcQ+jx)xp?+d+aEqn4K0?kwlK=Bt{+rj7~i>m2eZ9t0F;voALPE}$)DuKb#iYZ>q1 z{qcBy|7*pbJa?-yDN)mm42^Lfg0DS)PVM}h)?j|#9>bu9m)?<>%w)*2A7S77ngP{b<2>n5P{ z5JCE8_|PB2kbh%8gmZ733#x%NOXYMKZ+soYs;)TnoJK)fLTQXk1!fXB^wDm1#4!*Cz2NlsY3>V^>P5_EctJ2H_QHh0^n!rKG}!aKl;BO_wn&p{9E2hPgRb8_4NL` z9nzf4>#h?_B?<}+dzj*xNAITZ%4}7k9T$?4a*I9p*eJ{AubqdfMeB-iMh_a*Dr=dV zh*O2T=Ti~r1m^em*%g{E($wLx|A0=a^zu-aN24vyxULUud`|(?*h%qu2kD=}pyZO+ z-j4U4RsP>f!24zY>1tA2_SDq6>A{rm!0ujFK5mtHbV`6Q4@uP444S1v-H>XP&bh(; zj$Z~5-_&0*>KR_#@BM^N#<@4(Aw5ctq(VI;4*66HmpB0VRU*B}Xwk|B{rs$r?#o`^ z&j@~+`u7y(<}mUcssYT;qe2x0`{y|0HF_WKFQb3U**=f)^J~1T>&eyy#HMkw5p4WN zc@C6kC7jPebj6+s^PCGGM$t^4;iG5%=-*o1t~Y7oO?e1c*~(wqz9} zs6J;MhFy{zxZ1dhXq386Q1gp-6w-Tsg_Om4@#1KJdc#PU%0Qay9j1XeN!sSw)x1Qd z{L&c(U8z^peh+Yda+CfVf9pJ~t1p_ikATXC5#PxrG2ltZ?|Uow7Ue#kqgGhb<*6?I z<3q|1yLpoW`+j9oxl2Rc$RBuwA<<#>H2IcoQ=T#rCb{=joOQA9O&fGucdwbNxUo<}YW6Wms(Bf1us@*fF-MjI(z7uATbR?f1 z1aq2o@z$r7qq;fkqlABA&s0SPHNMw|hQTWaRC1s)i!xPUqQi7Rg{P+&IbMC#u+yUt 
z62(+yUHo6UTt=OMN+Qt+G%YQs9z$|m;g9B;NePr>F3B-LyBv@{t9Sbp-lk?KGYn6F zIQ))ZAHN6p_B7Sd<13$DdsjN7a_f*zLjQtDf>W5p{&KwY5pn&l3NzH8o+Pz@$7DQ zkvqaz8AFhxc|~<-Wg#fRxhMrrAEukBj(+ zF?FDB)bXziiac#nws?~?hN5Bi43Cq|`mQJas0zwYByvn)#`-f~7h6Ue4~0A-dy7Hh z9ZNeHQ9hlx>gyU z^BBF;ap5>L^rF|O3bR&r9{Sf!yj+(FowPTlK~~)No(Zi_IQ3i@lPa-0&w<{ik!Z(_ z2djhMG8WAX5@s=!9K~T;VDb)FFS5i{)tt$meOi4|Hk*sg{Q=O%Mt#y_6JT}4*D1&X z>5_UP8Ii7QMw*X)KWw`sc1&7tEBAo}_I)1=I5G^Sxp^_8)Kj}j4P;8*4=Kfb5aT^c zl;JE2QPazWfiJxWtJ`+`8Z|dJM@yE5$Gc{jiex!LZZ343g%&r$3Z_sEIBDJxl&ylr zd_jKPS>is3kVa}$VK&iB)<~xdKW83{I(nMu?d&ZiPZYmHi#L>{BF(J{>M(U}4a9?3 zK5ieWEJfCh-f=}!=E8&c^`dRUYS~sC4@?E#Ay}vxG!4B3A;kR+p1{wW(7Y6+lhF}L1)|Sn? zRb|WWWSx`dE;Qp}kcZkG%a#~VcFm!^O>R_+S%awbHV zkCzxRwYd&+cMkU7AAh4;Gj(h(O`9{$UH@c@)6yVpjrLu~lDYBLnq~Hb3Tb>_6=zQ4 ztJ7Tm8tATLS^%Bo%Zba0wJ&%^fI8=$rOiq*b*OBOcjN>{M%T2^{Dc zJ{OlN(&OMk{a^k6368h0z<}_!zJy`n@-}<_u0TwcbQP4URg@+2jo&ye$aPn<fpNsBd6b@#(1zg&@pi`&$YE`&#He;u)Y6f_0l~cTJfvhnqp7>SJBp!9F?12zZHy0jU@6m)U8pSy7F zhjWWq=e=b!m@-iCd#;yHn$lp-+qflo<0k-voW#rvSppgUiFw&o;n(+W{jBW{d7quI zc(rI%7+8>e27-nf-mlyE)^%@hYIYe^ymGq~gipU(1Fr9fss1m}{|h!{>ET4g)WkF+ z4kY6%n*bk(I!z>{!4KD8iX<}bNhK4hY$ZiuUkZBI2DX!mdxL$qJ7E1@T=&YR5vP(k zs(oo(Nyb%x>(lL!3NLlAwY1?dh94L=X?vzV1*O4KW_hD{xd5#M+K`Mpja4VK^!Xy9 z)dub7jn>?~!Jcn#Ycj~bam&iviW9=4NQ~@>y8+SwG{9pqumLYbbP=)?WMJc{J*;M3 zFfLFe3S2&K(DCPx^qqt9{_fVV#rbxa^Byu_v|K!MoLB2egcuIG)Qe9xiPN1SW%XBg zrb&DKDW3E`F$)ngK!z2!oQG{4!>jU9Yk`{+3MF50kGt4tTP6-cJz*H{z&TeDtqTlo zaFQO13%1?Dq)7>3+_}bf3kmCYSmR#kRPUqLfLRIbA@)$rU(gy7r55@++>f#6W_{kZ zt!@g~Q~N@6kA2dET5O9x<|zt}TVxvJwZZt6ewjN+Z|2>N3S$QZ2%^D^Z}&U?9`8@) zNjwQR{#pTDcL z&>s6R{bb=aw@t1x_UkgG2!L>WUY7vQ)0Qi zti()jO9Q4s94wlcJ+FG=T%p^l!KRN#u|0d_?(w}zDW=B;XJcsrb2H-qk0(;A<6`Or zbt)|dsDnw-3YC_4$nA^pw?%3eu)U4$)}Z%!cxO=mL)%n|2vQLHngiuL7wd0nCT{gm z-5z_ZU22u@>n}neN_p427*?V#!zD&6D*;(h3x>io&AN8vq#0=9GKV502Mw;3N!>c9(GAgJi$aO9an(Ow|rua9bZVq*0}lmQ!1}K`1GL@ zi`vp}mV0fgQ&u%P&Mxj!p?Uc@sD(CWNv0%PoE!9W7wKo*+!3K0U(peEkLMaqik3u{?Nlh|3D6S~ckz}D}^D+vcbjg!CMT5lWkk>b~1 
zH}%^0uWlS3eMo7C=!L%Y(ILZu%iYMPgg(7mKRWQtl&VMZiBCcRHkhpQ{Up~Xq(2L=+9rT zxc>LsYEL@Xbeb}Yc`1<8#z=7nlqaAjtH(p zuD00i%#~*roWL6DJr`58T}TYvI$YqL7ksVNdgT`E7YgI|R82tpZDc1~UT8O@%G0dWYTU4eTZUfX?u7!1BOp2bvl*Bz6LDcYwW@-dx`xp%&mEzR% z{1c=-gKYx(KXI5KqyPa(KjKsIr=mZi^lR+-y8Pkyvkuf3->O;<;* zGbT93=gEhCpuzijB1Id{QuiT^cJc*1_e zI#N>5gRZx?bUFEaeAW|wh8@femsF-F(}+BQE+)5c+jyqTbf$h_+;Q!uuFh2R`(&{G zu2-5r_{An5{c-hEob$(}`~Pa@Bn_K1G_pG$eQ6~-QhIagyIaE!xaAOJ>^-kDoNU$> z2RaoP1Dfg~mG1bZ&yfb-r9P`BpnIPgFc=3vyQC;r)(2dufqPJbha{k$VxC+5NHx4p zKUHxS@JXG98dc!O-)`1E+#1gQ4NdBl$C;%@?fhkT?dwhT8L30QG(JB)G#i&g&$Eya z^~>7GHJs-b-NAXf^m4lP6>d{Bfht#=c(-16^7LvKX*Cr6v22pWRPo-%*e)2yfrgu( zn=5@P-_`H?&)fgQKI0g_{4c`yzkX)Ul2inu9(*W1Y!X0&oiBTVY>8a%ftD=lo({CtZj)z>b4NGhH3Y7_^owlcpZfoy7 zRC~EGW%+V~q?w8%XAr+P`*Ypuw^JOEXQ>CtbNLQi6En{}Go}LXIp0c_A|~I2bAWh; zC17d3fZH_It~3gE&ymWNJ6{~6PSO}JVLNV8Qzl7Lw`T=sO~(=FFFi@J&_OnaRIqiY zm`M!_LbXbASq(tHks4>ab3aphLFmOy(Qo~t#(Z>D>x4zjRk;nQVJ zPx|SC%8%k(6hwEIq`H@Nx5ZHs*~71!bHUl~ecNxWS;v{v%@$shv2gT@9m*sW@}bX7 zs1=hA2cHoCvV+5Cb(nUM=DSInh3#&rciKxGqX!n-#Vf$6`PiFu?7ya3{UOjl@z{e$ zGL7eGa70Df&uI#qI_8XkLE2jhtu)5V2hy48YMn_<({+UZJ%shZrhTk=r-mh9N*bOHQ@j-S@0Ez$7Bm(59-R2z?XI}Fxp_6KiT4RDIHz*q}Op~#KyVXHfM~tNu)K#w; zI5oA98Dw{yU^tV$Q@->AN%S|sJaS9QUtYkec(a=Y%M}w5c`Y>hp7Fn7_C^hbA=i;^ zQN~-f1bTk#mxOb4O1!b?IA```k7)cqFf{h04szjaWDN?=ChLcMvtAy3HZ?dQN9L)* z%w1Ad#C0J*_*JNcRL-luP>09+D-ojns4f6C>~<1p!Rmb{Xqr5`Nttr9O@gCH=Twav zpC>nXH+(F)M~v+p=d|Na?y+vwG@e5fXD7ZuJ%$G@ST)a*dYxxXR&f47IP$VsGJP{& zt_OoentBIlHj@m{APfj$=7*Ha&03O8BFTvw&hMK^45EIGj(x7G5Ax!zAg&&o2n5)9 z8LXlQDuR8!bvN`j)nHBsU!=D_(4RVlF<2{XyywD<6KU+)!Jnzarl(5rOQ!+yQ33!(UERw>ye_Zn0viMAm_G=IQo*9Bz^{ft()xSB{_ey#1t)gV?TIRr%y`pEr0a$wI z%xiX@xSnrbd72+sJN~!q+#mCauj`LangL+HDIc$Z0x0J}1)FFFl`HCtlp{XCgZiMy ze)xEgKTWDQALH(`-%x9w#)_bTYY?08=@Nj7xFmai$|jKuLUjuD`}X|&e1Ca)dHjBU zd7eIx=cni2@z40r@88c)zguGHqOElrB^!Nd@ZNRDOAm%U0X7M{q4HR>>NhnS+M0N|%N0`%5dALw* zf7Tg$ODAshrUNX>AN~wA1t~!#k@ThmwI4m7f^Fz>6GSUk(FKgYwB{ZlME&)ARYA!w zMlcOP_Y@;$oEF2y(D=Bzsu~9Nq``bB=8B$==^~&@1xkxldgMUAc7T!qOKQOEAA-aJ 
zeBww9Zy49(&F|s&YDT*%+uDWPxl$($4ozYqP`a}7QIhfeJV%u2yFcFm2A?{D&93#I zudhFZ`wQQeRfu2FBSF`KbK`IK(yQP7UG~4b3^B-q|M}?7E&uzouttW^vYK>61d?Hy zLz1||G%C>22~QBLZV8a4E+z|QFxx?;f<;J{M@V7PJg;Fy7?-`Zd4S>5xc*!vizZ65 zCf*nr1o1WJyUrBern@Bxs2b4y!cxg{x#n#yFO5V@EF9LotXlqeYXL0Xau&nhnVwNXPx6Nfoeh6BK>Qm~~VFzDmlP z)GRZ!jmVJRk#)1i@(CaSmKRSTivR~%Z7mI%>BARDy3K|Uvtv?=QxS|rK7BA`)<)2!?E(|-ACtdHi zC7U{$W+!o^1drGq_oISO^`!=^x0eXzC9YHSmTkUF1?OQ8n~u=7k?QNg+6*dxL?t$= z&!q8+RgvnDAGyPdW=5pH(ijQ)=5o)~lw_hYY|)wGsk;O;IVg0ZQjW0-^ouT3xTd|o z8)L?ovlRYMVt6PM#)}{GzzwwyESuIB_o@+D@~w*aL`IKBGmh`W`~Cw^O*cpyALXN+ zAy&L>1TU8J<&wUNxHZ|ToRfOIWA9 z;8MP^1uK=v;sx_~dLA_krB=Cz-M!H2UZc$^-Um>v?Ra}e5(dgiQo0l{+qMmJjIgYI z*l5!Y)95^}Q_Ys8t_Rm+#7Gx|LZcb`z)QY;Ap;rThL$aH2yf~$aO=0BX%2I|Z{ftm zi%heq)hnJT^C@a=u`B*x7ksmSpcc@>58MJ`7JHIf0-d9ful8K7Fj~|RZMOyL;f1hl z;2&v5h(t1#nAxQbW9hGu3gR06bEG?)w2dJ_HU$$JWK$&>(D>NO5-5fV*6)G6gM>{W zM``is4#TwwI}^4YHIjhE1-)-}{);re%hNgcKz_4>Kk2Nmp_UU{_Hj-{=hm6_^N9*9 z^^E!UQA7i$px>WPIv)pGJJ8SkS8TD1!#5J4T&9scfLbStE*+q7##`T|9pLI>W3@x_GjGR9XrqFoI@vn}IUJ23TuDJIMDC|gj5(A;@R|=0&DUL@!#Eyby@Ruv*UK3 z);;ZsCu1#1s%y`Pk%74^(Wvs4Qma*<-=r9Huv2;WM)Z}|J^7W_e`cIYjJ~DzBEAbW zkGCRx8hzS;_Qs*5rowR^ypf7}8=zI^HJz;Jy*fdi`DG@M>smQE>y=)xYsf_I>Mp38 zG^JUB$Jii<+i;r}nLrO&Wg4zpJM2mGiXc9J@X@Cb#$J61Y$Y|K#S!LsAQc|KevC?KV_M83PeF5ku-CS-I9STt0m_X?E`gV9X2!DWU2gt zK67+=e)Q=)!AD4EU-^}BfZ z%pFInI8vP*(YkGK@d z_gOLHaz{i>tr~UH-ah=t@cdyuK89fu14|{;MKk6ndP^8^4V~JEGX%p=R>1JA38+Xw z|4=zPM)b#9;3h`>%Zu^tE!|RyV%)vae)yd<8TP~biCeTci6WS!YY?Ij8h8(BC}oU7 zB5O75)OrSLOA?UPV_&35J;ygB_LSco`39iV)sK=iyd^X8S_O zUw!sTXbTBF83O=MLrdo)&lOi((lH|ag>#f&r1Xw)rTl(6cj+%EE*aHpyir0)Zc@YC zzj&JB>gyJ$|Hj3zzgLHG@&EkzFQJdzKXIzOqd|N|T#tjr3Vx+^jmSFsE!OQB!u_Fy zKM^)b&R?Laujd(v5HB`6cW8=u&~UVBI9c&*@~U7>yZox}?qheFFfCuxX{-~j00oi{ zE(6w^8{&V9U^p#hO7>xXeSLp_kvTu9*_p~Y1N{+3lOBNhVm}^Zc>bk_XH()78K+J- z^}+07Ns?>G@1d`VSJ8o$Wp#Yg)~+r6XUghytBEOPuXel@?4|XeXJ=ydHR=ymuuFg` zEjlxed#K~grz#XXpt4aM8D%4*|P+xn+O&W?9Kum0;28*74hK(mBPuubrMM$)i zg!rcV`1FevQeQm%LR>;Rtnf{rZj&lM^Z90^2yUSWhsGz%AO8X;rh|L)k1Yywy7jb& 
zx`3V~IBFUr@)jYvYzNE;HKl$>X03D`%&Q{;H(N5?o6O+5ikr0OJygaS{c`0Gm94DMj_4osKY#w{?D#W#{-~d4)krvf(#8nK{#@Tsw05*x&wK2 znCf@_qzFkADUgpCTb4|^AnlSOL#s(ohvWUAh?8s@k5h+{?ie~)K=sRXWV9l^rYe2~ zC|&xDd)n0H-o!UR$r%H)oQ7@rI~o>oKxasNt-H10R1uy@DED)dS6|VY77-s%%=`#E z&#Q`J6YAp1-Kr=dYZS&oVfEc%j_*eeGPv^Nk+%-IX1YW=D}_0iCDmJyLft^+(pZAJ zvFnf6oDcv<2RY=T;$^uoje-)zoiFo1sv6R`_}f6l(X^NJd-XFZo-{8_QaZZlhbdxa zhiR(sRAGMJ=C^Vvw0QVNxO|w%gF>eX)YPL3%@{Eu#>sP=VaoEMZePVY-}#Ia!+4QC z8aMpfz4K2-{NynT;BIM4b3EC?>vfW^^S3I@KRYhzP?&QVE@mS$zd#W0+{QD-H9-Rx zV2aL@N#0WpWL;Tz*iOZ4JM&?Fk-OQcH< zvcv43pa|*LjP%dfpFO0s!YGLMqmCyE%-7VtM~0TeT5qu{lw#CD3X*z^4T2{RG8`+I zXtBPe;(uBNkztvn zRAPkNuREceMvy6tr@(z`(Ywq2r-*cZLn$0VAlia7+A@)dqd)q#5BDb=eO#joE`ASv z4Yos>=3WZz!IY_?{8O2skdjcF@pDbXrKt}#Mk8?ZmD@s{o_m{wm+DYc&$gHz_>kez z_i$Bo*R5qDnqpCE-$DX1L=UQ{Kb1(8Om9;I($t`py0-EkbA`PBuzkOw>Nd>J@YJjg z-ie20r9QT)B4yAfo343pec|Vu%{FZ+=u*?Ji4I?Uqj-wEPzDoH-qK9uQ;?sznZVJE zZX`$1yGwt?%ih63dH*%Muc=YhS&mE}#t`SgE#sXgps7CM=lV$PFcDHPOCIAHiQ{tM zG(V#eDJr{c$V)KgE1J}R+mNclS7S$I>Ia$KJb(WerAVIyC7?mq2~YhP}BjuV5QUMHc|D$X8r+fHxO%WqSz;7U!D zcO@Aw${=+=hoNEW*kmx{tG|W$=Vtmx-wn^t_s7_EcM`HRqoj_U(W@Pt(};qJ5woXX zH>s7|_&}kSQzXcZDz+$^i&s#0ExLa0iv!hDpPMs3OqUoqdPtP1;({uX>SM)Euty%ps)HY2(lKz^7Of1+o9a!vCv~-m)AF~z zbVy;)&;+w)&+Wpusw!fF3I-4&W)^{6QH~KdG57%~s1~UAcw7=gi)8;Ha6`c})yUH< zfJ>(-C-tEGkGx~t(#68_1{ZIW&RKnfu3Ir%EZWlBoXGlJA0Sk2i#N1zozdQ?Wfkb> znq4xachl0%+S}LuAs(Bqg1V^ktji<2c<>;~WZ5PykZMIKh!&Hcm@-l0*V6y>O*)PI z&`9c88{JZWXwzAw^dBKTSp3#!w3vuIA12G(5})*Lx?2S#zQ0YC8~myFx9s&K>3YKS z0#kFCZYnd@P)==a36FShf!UPM!)d2haSw@dHiT@2RLo9(6)92OJxJ~`5|bY<(tmvT z_>c5SwKl%`m(pZ&WKS2qP&P#8q$_~x zs-XPZ-Jc=;EijJ&qtVwQ^bHl;%ypGI&-Y;Euo9;Vq+-i)ejebQ_GElC^1R6`#F%@Y z)OxnlO2~4R%Hu?52tjz#u5p2{JR^^%%###NCQaFUW0sRsS@}~+Ht!y?7~v`mmBqq# z`KR_Fm8M`fV2v-;pFPfNSWZ_`GwN|NE^h>S?l-lm7&|2Ki3`3Ve$9V--rF$%8^i zb_w)*y8~`(A8+U0I=}esj{i(*aPcnfh`6CNd-OR_A0MYzytEBl^#>+xDc{nN7t&ru zK7zgZaf7?rpw6d&q~S#2PL21`+$y|Gb~$|W>vGGq^blAg=A-Huo6wb+e49#*^@HDF z&sEK#UdBmBp2|2{m_G3NBf%A+s7a~ErVDDAHPtp7bl6?^Cf!w{H 
zI5f;VWS}_C&mEkfw-Vg@gfGjNS^eHW&R?mu!rI(|9rqqhnl_$wxH!9dO=%`}&77ZQ z$m|opz(f3p9@FPB`jpSBm2XRyKpQw7B#`q1i<-DLEM-g5t6zh;6)vxP&rOBK_2}+> zpfbDpEkFZ#O&?4Xp!kd%psh{V8dLK6xIvBLInoH!+9gNosI9#P=(6tEl|DC$Oz!^s z==4etwt-lZ$IhV%M?r46aP(afkC-Wu-;Q+OjdtUP0M#f0hZocaFdr4vYuAgPp{Mu;BAB>q`nW3F@OZ5V@^H?>V;1IO-&}< zPvsu>IW&muPkeU_9J4Vf;WRy8a9)j}%e3KoTGLAwcZ#?N>izy$CG%T~xA`G;Ho6C@ zM;&yrsEqIDY|({URJ}&wHTUbcb1av!5RFgO%jh7dgqhtF}aNm64 zl0q;j{#Xb46IZW^1e5qWG`u_XdA{h3l&OMv2<}+2DM$JTNI$=(UwnVjZBNvauJ8T$ zYWuOXQolZr_t&6aU;N|ueJ1BlQ&DXxw_!^!pB3(@cxQXVyTc+Ri->X~qfPvZ&vtqu|$LyzLD8?ejz)zyDV4i(zpEY&Me7=bV9cWj!GW^J_=q@e@~;zN4ykC3)HqUdWf9rVCB;WdJu z=SzB}^Jt3t(*KEPiO)+Ic7?EWR1W6g2f=?K?I4 z%yEl)Mf!?TF{e2cWM|`2A*Tj&>y8fP_w@Hq63&y-f;Ff$x+9=Y-O{Y~%Rtw-xBBi4 z6Fw3nA0<3Pah)_=86ZN9KTrFVI2&#c>Yo)<>ScC$FikhcFwQ<`n#yvSZbqnxGMT$0 z@jp%Wh>xhyP_B_IRTj$RV@T~7Ril31_V(yD12rx9x$kR! zZfmvKqPtB{dz-dABngFU3btM*m^9dR{`AE+sB;ZYuY=}hk$)b**MpSiE`j_8PK<(6%<1yKV zx-bub{^3Bw7%&6f=FuMb;MNb}0I2X~wQ;vZ;-W#Jr!1n&=Xqs3gw4vMmAz)qvz*>E z0dh^ztxtE>sU9A$QEFwkhV|ibpnje|&;+!{?o7S6P4frpLo&$WQxubU0vis1svJPI z0JTGyrX)|FSARao-Fu#gw?b6RHwOvTde;+SpH=g zfwlk2-y~l;xP6bWl&4;^OC>0A-ec@g;P(`#nn=o>L1C-5NxyC=*mWlhKOVIVS05>n z*b$J{b!C` z?#V!T9hyjOhZN?+*MHw!RHA2T9F;7BG^M(3NL>RX1(#D-@g2<{0Oj&;^|kQo*2ITj3D8)&1KR$ zR2rdq>mO(oPT*cnfd4E)g+LS(z)jEep#*eN7F}g3@(6UhRuLaqcku-_JGQR3M*Jv9 zRO6rGScRLJnw)XqXgE=WRc{L-SNw5k&5EwY(vln9Q%a;6S3Ibq^i)JPXMOQ2n{@|i zQ^Az9EkKZoZ)!0gcMa7i5svQ$sE-Y@A~n5LQ(x0z*JD)X$Pe*;j8vk1JASO<8Jp#w zy)rp3Re_mrfCG831EaR*rO^t}iQe$1KM#Dc%USAt3(jIX;*55xr`YuOniqNAMLKss z35+N(AexGsFpEhoshCBm&o`Pnmq31&APF9-y8!ulGv@U*{&yMMDP#W2Lmx{&{Y50fGU9Y7mwrWu`r&$X6cFC@-8=z5A9X%0{ zVnAf2xLXyESg`Y}3W*D4Zd;h&1=#wIRkFOR{4A_} zXwT1Cfob_PN$fVN+lU@&b_zU%V*R5FX=n9Ppp-HZ;?IEssrSPek0F9694ISo7=j+} zx)ZO~%DenwMkOwMbg7K(Z5jR*0`T+zbZt<%XSzzP3?Z1Uud3U1!VEY_l&4hex>`Y1 z)N0gb)vKqfo=VN-bA@RN@gHy{xK;H!0nnihvWE05>zg`wHeSl910WaaJ+PxaEq{?z zk?aYe+#lu9t2n==+2eVD?;HdUCpW<9Ue#0lXGsJel?kH?S0w^t^d7h|UbR52Xfq63 zZH9fIDgjkXSA?ci6aG=b1?(QrPj=g^jAt)QgvK?iSP;7hfx+ 
zes#H1nbt|(sq$9Z^v^EV5B-6^Km5l>2p{(k{|Fx0l+@k>=I zP0Du46b_(_p~^y$hpLb#VY}0P`gJ&dcG?0%czs=|u!&Q7{dxX}#kq>UigJ*LReT5V zJhcv!2Uv$XkVsr#9;F{2D#2nvLjth@X~5#GQhfZl$kP?7K+{yPgP1kA2{dSR(MMUU z+?`G>k&F0=M7xb9bhJ$lYu}zO{s9zJTgUj1f(o-mk$feoJR4AZP)*ldZ~XVxGjrK-4P1 z?mk-+SkD!$G0H$e;x_PyOYwA&^=Yts!|`?G^)G}c;o_CFg*u9ya>wAb>d%=-4Cd! z(=;GvQzp-b*?{!KHkkA?UdPKxYU}1c^#*z0Ig}f0ysHc{Ld9lvOXoS5A=EAh4erm+ zq{4l)0p6eEMfmIO_Yl1Tg>o=~_jVoxdSul}gnOrxH=Qd%wJY?%Nyg4>cGZ21s1k0g}nyX*gMslw%tJ{l|0QYDIcst03e0JEp=qUjjpJ=uE#|!E6%j zfw+wIeZZ~SX%@9apsrhDK}~mR3((qe8a9QC*6Hsy2IH8m7_R|c6s>_(FIW{dPqbjA z0q;j2?_5|1gAH-()Y=DGa})$EKT%p@fKkp|(}{wERRxL+EHc;l6vouMCcc<#OLg}6 z5MYpz4wzhDJg-`ge!2mIb7MDd#AwgzPSDdjp3mASTqviH*Vr73bCoOov!_a zG>x$`x;K~ysKg5C6u9~@%>`?*sa4sh>cOgT1JjXD{k~Ursa&!_u2*Q9i|(H+Z`H6M z6`vZ~rjh?FXR&Aq|`sx#*-llYrpe7Z5@{6}$rZKQBNcp3*{WQl_9BE1h z8>$8c4BRqyg>VoaeRis%N4HB5X>u9JMs0h7)q zWzln>r#-2UA9l~CW*VfZVxWjD5NlCdBNd3IWD`<$d#+b~idru7D(FUaXh`;Ot3rAD zIc)`V>Z!U`!+9L11cv^y?@v@96fHW+{g|>4XgH@{qC-t#_*M<+PxPkhgQCdovq^JE zOCI^zuCQq8_^X1f%VhlI>4w;}`1=k!q1T=~|JNkyM~z>LC?}!|?ipJId4i zr3kmfbV$qYeT-go#cLX$lCLxu@&xCfn;B*QUnr<5Kkv#WljbK%9aL6GFF~V2DU?iA zG<=(hTOoCPOoh4CExPJcm^u^C+I`O%ikES?a-H%%Dp1wpxsS%xrXbhZ;yu-g2iSo2 z_B#HuYC<@Un{GPy<9Y?g9K`Dv0{wcu1*UpM=Ipsq)_>0mQWt+s~Lw4(bOv1(iJtz4P9Fl5vQ zu!mTg>f=xJY7A9@Fg6}uD`$tt_s%WMm8?bk53HJj_S@xeQcl$v-l9qVDbKlFa59TR zJ`vB=KVSW=UQNr;%K!qKb3Ms_=^|@YsC!DA$HiCgs)n@1rZK3lJf2I|@qVYlm(q8~ zP$5(s)Q_RE9qPwHTy<0PJ!L>(7a>rRl`nrlC3rqisk%}}x?cW`zSeg68_p93-1Z<@ zqT`llfxTRBQbB4prw~8)j{Oj3L0Xrh?aR*eB>jb9r;z^%+*OH4=LW0frY51sEz&<5 zq+37wUi`H0rXCcj7ruh*U5Ix+`s{USQ@hlup;Z!0lTC#~Z1+fosi4qpN&E#t>`Uid zeCsY=P%|AojTj1LWhGkI*%B#Us2b1%n-<&jCrxV}vjipSdUJCY&pqinqo7*Dcz!)) zuFX%H7SIS3pQY$hLyWp$e|h%eLsyfAHr0{-$tBe}Q!(rk0e{kBtB|llzq%&P+-?gt z9jGo3!fF6}BX4Pe*H=~l8Y0vL)%6POKd2QIPkw~T_Nnoit5k&yRrQ;(sGe+~3c{FteGCEypv8QWcs`-ll{DB)w!77k4fw*2fV z4U_0&9cc{v-R}|g-cb(Hjl0viFDn&Mh$^9`WerkP1y$Mmm1~-6RSif_5C0C*$_22n zY2t}8Xa_HtAD)xz0~GMg-#iyyeIigX=yRyQF1x`j+e 
z%XErqm^gI*CgSp{xwb){G6g2kfKeJRmF8&)E>9R_o_<+JN*nd}=aO=?6IdanC|%+xG$P%6&L7jJDbn<8s>Pk>c?gZ2XNR&mFpB4B3 z)J-i^L8;W2M4?_vNgM)wjQ2;h-%FBq&)~#Mz5?Xo!cUzZrHZ1OGySNYMt9%D9jZ7^ z!3`R6Q&QEQq(wdsOWv7{M;i7s(GAt8JPVDqA3SO5usgqi8t;6U`~K@t%Y}aM_Njq> zVeQI;uKRn$p-O_(Qn_ApPpUMkH0gqBLORF@NuNdgmwKsL&Qx`Ai$=fC+rp;nd=&bp z>OgDUq@CpMg`gCSL}-)Bu3%3TU<1^%ahZO9epkT`kAImJWdagv1A7DZ&6-&WqC{(# zc6tyK1?p%r6lUeZZ_=jvS>38f$>`M(nw4SC&($oahGF-C6bfh8WWt%3KO}*Q2AuyB zLojF6hqzBYG1E0aQVb~+!ARl11t~I*szl~i-(J1==bS{`qrrjc8_Gi!9lnJ;1L<^+ z8L0KeXA0ko^xOQq9o@H6tY*nbO1h-HLFbU*>u+v;Nt!!B`I9cKjAt%^SN71QUe!tYDkXY{W~ z{6|TCrr1Dih#0LiGLlhH&oR4dw?E5=3;9a@^$5YU1g`2ANwoAY37T7Y}yNlfu6jQ zCWKh0R56&Sz6tg_KiC-O;nEm7%{)OG-X7lq$HObZvVZghBsKl|UR@>-&MGj~ZCaNq zr?iTuDPJBn>Oxv|0-G+ZBvIBUnZMpv|*Xj1XYMez#(z=G60F3Kp7iy9F4W^h)^>d^<8?|MwXK<7VK>JVp zj*n4_NaG8o=Og8p>JM;!O*p?-IN<}LZmP`?5u03bP53PhSv+*8~IG&Gzl-0DEF z{6jAFx;ClGh6cs#qj*S_LP^7~tXc#C&Lpl^6V}Q{{)2}qA{J0pE36uLT-D`ylnLkO zA5rf5y;&2R$KHf1cS@{2pwK>kznuu2wL zz)7gKiWEm`icJSDt~=!WV30cxRnxSBxKSmPAJOUoH5L8PL%q!`e5^ER`deO+{*=P~zl!k5-BzuTo&uB6oQ;VD(-(@G15tRpig5W0mE;93(Vo zDq+z*B_yX&X)3LF(kVRyiJaP??o|Hk)374H^Qi8#yiW$IByj??co@uo;S=;qqymj< zL{F-i6G-=U2~HgUdYW?9{sg-p1SL^)DQB!x3ndzCi056!ie~#A2VfIc1qf}e$h59H zL)nCs2t*PPWcuP6;P0xDsZ+TD@U6dZpsw?6KkA?f#88Bk2J!%b-vg>+IV&6HR^nGOUP%RYwr=!W;(Gw4{bK7Dx~ilPLEP^u-V+AkFsiX1!r(7B z*nDmUvh2|rG`7fCaG=?135`IPBGCH|q4o#6AoZrI6t8+uWW^(>D&)B}O;Gz~v|7uQ zTH7=W0Mu`y{s+_bMT%7w9+u0mYkjlcKJG~h8vSU)$A=wPO={cbwCdzvlr2CA=t)$! 
z<0!cgkc3}BSE}DuVLJ3+DN>BJp??3UmC}%BO>>k(Q4CgUm8(SQ@<;dTe4Owv@QT>_ z>dGhOpEL%>k$277K7}e;y+UgBs$WqR>i@7~)5e~(xb!jlgPN-5BLIYQ#oMZrq)HX% z)}+Jsho8_ENKBcSe`u$+RFzXe(P81yiYojSDM+BI z4)bL3m$>Cps%oYwq8Zg_46P~x6|PJhT%J=}#~By@eH@?65TMfGZNUGEDzayI_7v%uz+NJ`tN*)h9Qhk0pV#XIP zomJ}8?VI#OJ|Rg`Kt3TUR3T}?mcPL2Okyj3RZI%34+TP@Eu;U}40wFP59e6rVOSMk zC?7AWBBIu6vrWSde`kN-bLzVy)b>WYI`dmw?+Y{OJ zQJVu@`ct6wWLk9NqEiYLeOs)3QT7Qb@^69KQzON6eTKRyeFId=`&%0K7cb0s1@P*$ z*6WCx)RSsJzxHS<9A?-eZEWvH_Z-Flx=5yIo=izPzbH@EaDLy09$CtLCqI4G9&{s6 z$bCg~DjPjM?nvoEh9Kl&It@rqAnq@ITeBw-)KwR+3CNNbCyLwDx>-B9juaAK^G!b~ zsrAQ58`3EWsR5N<#o$C08R`P5nI@FBAVn5!@=TVt$&&-sluA=HkHW=A3VkVjn{s}d z%k8rX)EUmt+b{Dfhr2Bf(rFXTiccGkg$NLgLXuV#A(SpEe&_*ppeffZTKJ}u-2bqE z-N{b2wavrl^Aba};=h_awy8?xi58^4wD#q+&-?lnxT-=i4bRc=PNPS~`_f92Dv>69 zTr}y9p)lF67r!p@OD5Hc=oMdnYl}O| zub)z>Z*`zwnViV2xJgg1kTxjgh-uWf=t1Y;n>4Uup9>R_z6LYKdNTFN&gH2`HF4II z;X@`ez}1g$l;jzSLWvyLEXt=1NDtP2p?`)kT)p)tojmYoDa|AOj#0C}uM$JO1!@34 z9n7=wWBLlh59g=-ByFY>g*L=nipx1=t>^jRo22b^4iopnZG0<@h|lvgZd8d}+WlGH z_YeMl7X@>oQRCC`>bHF;lKdQM@HW#Y8n)>!vFr3a)$gdRnT?sG(dqMD@j!7Q^0-em zB~R)=s~W$<=8kIMq!ZFK*}-TUqjwAyUUYo*+n}z45+3wHOTk}GoC=9LuOs4|!gQx1 z$IGE&bW95O8lY+(Gru;-{I--!h&%!I463J{D{QJ+rJr(V@T4jzsmc3j+0)m=Prm$| z(xF259N(v|RCb-9ve#6E7O2&-Wa!tiNu!*nB^`PPx@QqZan1DL$(3?R+@ogG-=mD) z)6=+WpnW@9{i(-)cMbt3Bxq>+#D_`-p6Mxpid=I zQ3Z&+C|#T>PDl(gRn3~98rUrm(PM4W-L2YEHOpgOee`HB<+Zq`3daw_r&2=t9OhJt z3R`sl;7p|l)zeTX>m9}r7B(#$KO}mCZGt+8@vpzMAN12g^0&hYRCPg)K+u_Tjk?=?dr8|RcHX%A%r1>3Thk3Vey2<62K zpaUP_0BBtQX}CiJQr88}<{5U%C_fXTLRmjXeR{qA#e3Q%8d`n%KX1SM`0+@UviH0P z)>puO9%XmLJM*VxpXNg*oou^{RA8byveI>nuI~OKEk$T;{{!VwlFCDZLO|QesFg{r zErLF2riQ6gpbr6B{~aLTq9^^HZh<$B_)IxP?zsJvCV>W9I*_Dekw`0+pRRu?n~aK~ zli5k>JI3}tUjDk~Fm)q?H6dvBND#%-?|Bbu{C7n!dL3x$L`g0`!zew;Irgc&Yo+s2 zz5J&{8m4&%*-K1uqzg zP2*5q;ZSLt82&Yz(xKum>U{VIn(s;z)5A{^6nUDnvn#zc^m?!a1tz5ld8?s}e-{ZV zKLcIl^YGPPx>7ULRFQN#UacixY6aCyvmBbBE+>e@k>B`*wO8$F%A{IS_X5MiDDI|2 zq;nUl=%Dk1>S+Vi8wrxcw?Ek8;2*PQ_Os}+V0Z*54+}%ONxdn%`~GP%jVg8zzs(W^cJ`<{ 
z8bFCQbPP)=tQwYaUKFFIj_Eqqno&1Xvc#EH#e7pakWd|s9A^TwxRu-zOJkA`^62Ln zKmQz7CosSphboLiSP>ET=VMPc6@@=7BxyUkqSy;@x@DtUn+ho{P|r&~+c~*%_ewRn zx7QRIDuv4mDJo7zi{ELxT>mu5snx7!&2oEN^=$dWTuISW#aFKRonAAucAd(zv8qtX zuJJkfcp+B*;#r+^BQ+M$AwA!deY>J%zR;hit4jJdr1eirAEGW)D5x|IK|(7-68Bt= zKNQ_<5mLotRR>DzpGkA{rN~yspzz5(J+(al8L2jtQ~m#bsZJP?#B&42qZ_OWl>J^Me3jCB%nx9gL!nX67ok33FD)45CPP}f3)=| zq9vAHJ>Ic$O4s?+i(W_ieH$JueG;a4;3NWU9hRUW^y#{|2h0lT;#PO0oCIc49qFH2 z*FTW)4>P)?r#ED>x4OLi%gbtsm5NNYH{A;EqVPhKGL>zzYj~J(KzJ=Y?6)TmMjFh0Tc6~*F*;_8>vWZ%!~F1?Qr zBvQahj`4q(m^H_}=-QnJRZEc?swok$>!6=p-2Y+Y0wh_|8raR%&&Zzn7eDCjn;PZ> zX8RPX89{8#s&7@C5h>YEbC~~K^&cCTzYoU7&20~%rjZG%4JzcFz)K(cxcq&&QdLn_31zKPPTkTqq~Evc7)?!R!F=nm8Vx&ty{3V zNk5;k{yD{Jw0jOu_q%&ww40mufjYvvDy;1cB!I+=r&$y(0`*;?og0eG8m2i?6GhFY z-|M>C5(|K5Lp+dcB>=8Rg!wKK#p?12u!XO>i04+IerER9e3W*2J~kYHdgF9z`LO8j zkAZdXRY6n>MZGfqyQg_sb*P)I=_coDj!#EGnXdI7HBoLCgY zbY}&6Sl8=cKU9;<3x<3I>;fMCC!|^ArJMBg8=&Hs-_(wdSJaSX!bae;a=PkuOHe_S z^F8(?1MQ1TOXgV$b8?K-{j{)Rv}9=WSC7Y~+EUCmm8Gz2OvKJWjzZj-X6$Ys;~cn3o1nFO6jj*_5*K zQ!N-AP@z^(na`RhyS6}eePUMWwQSL0Ui8me~4A9#I9??>WE_EZVIVsDEw_vc&yc z8kaih5vN;V$nMMvYlG8DMHU-F05nxf7mCY@XRS3RR4C*Ec7XojKv$mN&}aRG{E3mk z!>wzqrf^y1o-8TS7U-iz^Pev3fwGwQpn^)*YR#hG2kPm|$2qs|0cq7wMm9)b)?e24 znUEHlAg_}bxR3L5#(6AC^E(?gI#}znRcX^5lO`|F^-7UR9iLAFSerAI$(1Jt{UXXL zZOm7y5UX8JUF6Mm@*u__INkpXq>v`Khh1}cbil&!l-Pd|*(j`Vg7)qK7H4FY+MZeUFGQT2IXI;&z&+@rj0}SQZq6<&J zv{^~^0nPNm$E($e!l<$^YM>dvb)=_7);`0&}1?$GF z^;VEXT|^qN*A~8DL0u}Lc1c73l3n{XK>E177@$c+3)T!!`t7APg;7=U1Q=?puH$SG z{#b>r7ZTSR)c|mUb#b+-o72nHDyBxjI@S<ZN1qH5{r z%j`@IA4@IsW9_ZTvjV$d306>Sy7*W@37{}5z}m%IRYbIE!pf?v6F@V4|4x5g;jXb! 
zsxC>tK%}VP*-5VXE}#d8f^ZdS?yjy#0bIBMrh$5vhJHo2$_w-^bKZiK1;<8*DPMIC zbg5lVRiItemnuRtB@pB_jc)M52bA{aY-!UZPJa%w{s8Dr-J&HE3?g8>)1u1)n+7&M z3rKlf1xlc~fHcQ?!yNVzad8~Q;NI8L%QiP+Ubv0d;GAkPWl?F1} zg|dd^Lf>)q;r+QCq<=O@*CPLVeC@}FJw|#UO;}WdLJ}*nm`B!ozDpGwpb>F;nn+f) z=O+B0KXbEPTzdNxHVA3{2^*+y#g=*1Lf{*$8K8*ux|^ncCETEhO%3X5{DM-v38-@>%}#AV z`gwb$^7%N=qp?g(Z~x$%vxl@C@&g4Ji013ksLN%lf}8r)T8xSipF)xp>*L$-f&$=h ze;(A!+v>dwU|D*oe(M}M{c*rrWZEnBPhKNoDulGG0RsXkt-7&GAafp( z>SP>As{Vpg1I0)5f{7wJReXeUXVuiLTmy@sDkAQCk}N3qLA@QUp&iu!2PexB(H<&* zLabmf(`maj>%+IezKu_G^v~2lu9jA&_+KKbLT$}=eA^a{>(1vHigg3uAXenL9(4_B z!?-U`6JL_xGp;OdQ(gh+Y;Ig{*a$bKQE+6}9|!;3mmjEn?WIWLC2pN+Aoi zf8A-4f@!!1Zg{FZ_G*_5Nr~(ChaYWps>UID0xP7ZQLs*u3>Y}JqG$yGRJA14j84JI zpFhN1&Yu6D1?!&$>zn%3KgRKSc*dp`qj-LKCbNQWf9E>@<72s5vnGHlb3i4@Cd9wB zCgK`<5BMY{96E%rP50~PsT$zCXZxY=*Gl#EZVRegAR5G|viMIhg@pr~`_WA!6&gU_ z{`$-Qvc#~MpWCO?+n}|J1sXPpwE`E0kvd+sbKbJEG^UKI>{a95f_c!5Wv1r!;jh`S zE?;F=kJv1@;w<@v!dgZR1==!nMM<#brQKcrxksFOnlIe~3EfBG~ zrGDvh{qkpEH=iF0!|Bim)USO(Ueaaks#AYp!w%RUCO;e0``2+$>e3VYf0mi5mMQ`9 zOEI3RgqmMjdl74+Q5!s$nH;Q84d_FWyVjEVv7J{b;A*(EZjb-Y^%txjgyHn@k%d4| z;k!hrc1d1&ISEsoyBD3rQ+p9&qrEJ=(jSoQ? 
zv<~T4w%15%*xp`>x97yQBVu?&ib@Jw{(>9srovSfDCX3MVWL?L8^Alu-@px@lB`<6 zr~$p#<{0cmWN!eQ4m`dNgCQ9k_HpzuTy11bW|bIMVvU86Ek{~Z1H8}m;3`lW{*;8F zMspN+U$_9~QPortj`5*$m{#r%p{5xyrU`2O{9k)mXR%$3{zBn`YZyvxsT`Q|tnt|b zU$~BPrUdaQU}_GbK}!^WUp)Sa4uH{wP_Yz77e02pvX(bht0cMAj&%Q%xB%|*lS*U& zCD3f!MSq18R|=7JVl5!+UWEz8l_XWa;#i+FF<;Py#Arf?Ta`~#C#-WYAV|f~cTHHG z6VAoUO7?W!|HgGc&iB9Dp8d*`ni&=zk{jTI7lI42P}%Oh&J8R;`F(;iO;&O4U;lEs z6`=h%$vvFDuqj3r`RGt_hNUS|ZGMslI=ALm{I|HNSNH7HFreKuL~FY7^>^Y{_e1ET zS=YS9#=*}WB9(afp<@vwQ`(hYoht`-_IveSec_Z(jaFll?1770=JaUG?MByL=z} z_ubuE2H0Be1lSPWh%f_0_e5W^0Zo}<=~9LD+@=KdDR z7lcNYNmT(#m3n=1(I1=1lVdeKf8t%bd{%1x#xc{5R zX`<7vM}xRUOkL(GB0L_bWF3n-)~{RK{(T=$(mQ*qDDuCL!R#dfT5}cP*{T>VI&cE{ zX0@DW3sM_cH&vkD$D?b~-6f&LyGgU1-K`5H9CNfG3=#cR?gCh~PFyFb6`bGErT!Q& z>$kJpy*hm^XIU{+Grp@1Bfv_*6my(gTFBM0{1!UpZFltlkgKAQ?vub z({GUtH1ZfKvTC>jTA)hjeqj*c(`w-lk5xkpV}`2Y4PYh40`S%6NJY|HZZj$!rGBki z{tqtz%CUnBh_)cS`TR%~9stHyKy&H%F!B()7pTqb`?qSfGqZ~cc%grdW`ynBqK_k73p@}8fj>Jj2(1g}J zzx8j13w`wLob+j~Y`2b*x{#+f!;E)M?jyO#SkO&xDz3Zh78KmeU8+uQRA$w|6R( zKFPt_;T=(O4EE#Y5#LUQ>`4ggcW7TsW1BJey{Ddpew8wFMR7+fdWPyoT^#16PKqzD zl#XyCW`SH+eo9#rxFSMnXcd`bsPw#dO;CrO4}4RfzH{-jAKtf&(})`uY>$GR=*lXD zB<=Sq&PpCKoBL`dIt_P6bfzBdDJzP;so@p3HN6mq0@pU(Bdf z|C*cs^M!kLt4`HD%zV=r^z02UnH8GKAZQTTr)pISo2@%-PQTunnEZP~SrdtYuO*Ax z6}c$Wk=tBHy2(Z5Du0mar44ovmsM)g4G_bnjzm>0pZl~1n#umxfiBm;A={^3-6dk0 zf}WbN4y+03PmHQaRR022ruqiEl0?x#DB5ULK~lBGlLkEBs;&s|M50sHkwR5F-c4PK z^7ck&Z=l)+@kbh`6oCd5ctyEU3tvzrU$Bm|*6oXnbg&i-md$YW=F#8Ollh zU4fE@T7(rvDBugP$|)+C*MXia|1@D?Lr`yMOU=VcA@bI4Y{axFXlLDz8 z)Tw~)c=p80RYC$9B@KvAFM?qgXtk3%N6;F{hz<%R+_tK?S~F9p-J#{fGet20$Hb(GAju*cCB0bP9Va_m&4MaCeool)NDqum<$$$(X07 zhrCB$HHOqLoi%Q{0`}ciRTLzJJz{xUq%Lp=z?o*QbtwVt^JEPITEQwG&(eKRo_S7`qVokvwwk;Yg>4Epy2 z8GSvjJ8FMmQ#EybqKOV6fGOgXLZVhAJ!O!4YbfKZ|L*%)g6JQ^lq?mDb01w3G|`vK zUxxpBI{G!J@savHu#|NJ)>B;?xlj>oQvHobza*j;8l^hEP)Sd{f-`Q^bC7UKa2!Sc;KpT!q(=AW~>HiIBd|jVQLrekm2T-o=vPqvJV8kZc%|V8>Nkg)jH%{3a(p5O<(?(q_4~RurOT*fhtL6M zp9DSvJ(r`SeFM%DQh6{+G^F)DiJd7j_TIdo6p&3OO)ScRmni1eg7%=e($7wszJ 
zei8gC;(gw!-lw};%XX!tpOixg!&0JLbW@8vYK3|=_GM3#G}V4AjVJOJ^j`KTsX{)L zVrf#PY&v7QhswhpY8f-=vkSCAlQNZl(9>n&!|y^^W3`LxAJT-9AKnM&m%V6GB{wwp z6MT8oiQ;&1zKjkbbs#ltgchXhJY`)!Gi$wj#CE(GOil z)QO}?DQ?w!;By*MeNVoUODRe-M_L)-&h?+T)V5oKI zbf*dB{pj^Lu02$<-2oK|<$SJ?l%^4NhQ(7#aCI6@l7wkO@u@wJ0~D62=r@O|u1T6F z6rt)HA=`pmql8*#ebcpCGwJ6o^+}F%LULzH*)^Lo0bPOKF31xWBZ`zL5U!L4RMed;Brz-j^Zow(2N?Ebzs#*F-LeE{k{IJKA zKIuj9=lXdE9-u|6PiECLSozSbNk3H`WBpvP#;A0?_QUGipAaf2qvi~i)iK_k-&6%w zP1(P$2`gskJtkcHiIhtoz}zk?!BdKUwd+`4uaaDsP>^~`G-`GE5xDpmnxZkvO$Te| zc>VANNQpEWs8+mIvdE$+pR}GD(8m8a8xXdHVWBLz{|2m(`-N~u+_6gW{=O2c4v7Fz zUH_U7h}U7mRcM-}Wf2cTMpK{yun^IelvBFz`PRk1>JVp_di|nKv|vADj2Gi21;u#q z(S6|P6(}+R5hlC;g-X&n4g0jT;HK;oHcQ{tFRFO@X(&}EXgV~5hhX9Di-~$&n4JaQ5wY^ z9MV-P#7)p??gJKzulZOZaal*29{=M3dWToQNyow@p@tcH#^TE2Q1!;I!hdyT@h4KH zVtaG4Wb^v(^&+QkAc#=}ysVOIyC^8=E!m2sL#1x#7R8S#1#oEtoa)lA#uOt8@SxVo z6;@xq0uWKG7!_EZav_G-3U;+=`lIimT08XgDhz81;98s7?}q{4fTB|ECH40Tvtad8 zhkx<`vN69VkY98iaFCz3-;Y7ue~0H^-T3_b`940Im-ol`a!1dk<*PsW03x3$n^0P{ zz5f&O2kG_b{dFrH0#&FIib?A4v%2^gQ(pb-Q`F8oRaBpJopju>So}k*I!}MEZ`99Q zxA^`2GUKN|!P*c3fcO;xag?;=0e~(pDsOrI?+|0=)Y2rfu)y0q1m_o(+z(X}OU3CJ zwD4N5$`V95RBH1pb02<~J%~~WK#L0c1-M$6wGM0v3 zsx(cz2DU-u@v;ipRPK+@6x7}o@9kH*a8_N%M^cEyYl=_18^8o8@{9BKI z$ItNh-}k>?{|61Az-rhE8@TKE%3t>b^Lz@GRQ`0l zJ>7cG&hP3^Ag(^~$@!^jN*YW<9ze%b%9ZJg6ma~NhZ0%+TpcX=$+2ok0h`oDAqOHEG{dt;>)iS&1Bg(fqP9WvvQz6_sz*Q=|m-0oCM_sOkp| zz!oV0sAog)+XTM&Wk{vw3FQqyPuG5P^+2*C%kQmP}fbo(i|amwpENdFy{e^x+*DD&qOSCUks6#c6Z$ z-t$RENrC}neAk1IR3c@`35rcz6Vi{TG{|>ifB6rEKh&|DQyEA3oeMwBqk*c1jF*1L zx~Ej3I|Y3_s!miuCf(V>!*Aix*GoP`qQ+Fe_&UU`$_3q2A&`S=FO__&IAKyqwsnb0 z=iC@=I;^$bE{7&GiypK;8Bde%T^X8!{}IZ=FLEGt3N?v2|4OHyOlnuso*S#m1~rYH z`M%7qJUWr7#ypm7pGd`V^WN#oy!DSjx++@u;?&V9&0r zW2F!4X^?uzD%q;!eMG=)ze2jequsNL_kTdqKkZjdBt?ZWuRQHjiW892uiUMU^Yex- zG@n}0j@X&xF`4(uLJ^=M$a#k^cfWHDO ztkjXzt`-Q(% zBkfi$wHQdupeoiziFY+C(6P@R(0FJ47nc52B#_T{xGgWazM@q6vhu9Uo?p3ku9m=G z5(5|o6;(LbUKd_Vm@nAF#ZOrGCJ+>#{?M-HL}iD8RFKQ|1@*@lmn(T!QqGU-AXh2nM5j*(|Q7v&(NH3n(?pE 
zzK^pC<&LmkW5miw{;|~$d{zpY=#zg{1)98;O#WE$)8-$q_oNX+0dsPUUS|P{wqHr& z6g1`eGY!u=1o}yhK+hDL*lV)B2R;8e@rR2a!-vrB$IHG&GcEdRh@w!lC*4!SeSMD3 zbxB%;UpCV|xlJ7#_bHP&1x-sh0!k_2Wg>A(`zbLxB3L^-0P-=Uci^;RX$r1Nk~)~b z;O1x&)GLtsTQG~q>atNQ7A1u$8ikS%RL}l7HmhjzuQ$`O7Pz|TphU+KPXrxfou(24 zp3hG}lGt?1`M*3bYiB6+yjxdDCDD{z8&b`36@y;2eeR^JbZePPleoKtd&LK4=uX|t za#_hPGb(4K-}^`-R-GWJQQhMg5wFv@X(!oaiT~(jpHOkqV7Hw~$dDGk3UNO>ajTj1 z52mLZ=o9=`JWROug>!ee=$GofJD?#BAQfth8ALkX1Fk??=7MYz%&!AYdf5&)fS@mQ z8zetKO_sgnRVAAu14L6OjIgM!7pJPouSlCl<|T=mGpL{O4tRQUqlcm>ky1hYbK^9Y za(q^s5L;JMf$RC{94K$0himXUDXN~r1kxB(OgX^v?q@&!MDxk@3bGGitFTM}`WD&y zPbS5ynovee-wD~DKLtv>|M&2S?%)01-A6mVbtml4X&~j~tLl8F$|rzjSX@$kg7gO` zROzCp3QlzdYi>1@CTGR(rd1E0fQCu0dsEce9w@;onddt;fvtY7N|o--(mt}QQc1J8 z4Qf8=c8$>OL#@ox&TTdET?I|0i$_ZqHq|$4S2j(Wx~DMXu`-yD`7+*LrtrGPRc=$) zUXRRSHZATdW7A~6D$ta&@_6-A3uOfAN;Qp*P)S8ocPU$lcmb03PhnHHU3}=N)^Mf_ z>j_7BXYuR&@6EiN#iFt+Eb1nuk8_kXFF`CG-vQq1suVlZlNr_`SW!7xsR2bp#hZiZ z(-}+o`c1Vrh>Gn1SvV^u&mgR0{k-Yd=mEf@x?iSK!%0z759?(YU~eSx25aX3B*zM! 
z6?LrNlwpfcOTX5E-`f|L@7KMt0Bk!HRg6Tiwzp}-^BLfA==xt{AI7|cQ=iq)CAf$U zR>Lz_+4j9;`;Ndivv5HMhu${zSkx?=XzSrnSL>&C-Hy*)tXgovF`$-`!d09I2Vy*Y zvie)3_3^oT(^DDL;x*7}f?5IkhZJ?rU~Zu{T(Wz!Uic0u5|~!!u~c=ie{ShiO=>Gc zKg_x-<58}qDJWcP*B@CnO?&N{EUVF+GUle0p*O@?A94>%`g-Y|-B*F-b}e4oH0(D* zZ7{1{c*||Yp{9XvBnByLAfBz|xz-g_3Si>-YT5zIPHlu*Cj{<{`#FMYO-f+2KwZJg z8FzC6wc#Du418ipeSvfuw^9W$f;>`6fuv4cL`Ob3F>?o16D_b&g|x2haV$gt|- zsp`_KzdF?K?^|}K4)|7kJD^?}J|4Ei_d;y{ZSoj zEf~ka2Ig-=KkgNhadoU1SoisLLnBX5C*S*aLpQu$J0n7tT~ScPrJGcD>zHpk{FPAp z;V87dzJs^Y`wVK~ZF@Rw_SRtB+YnZht^T=HcZY;9tZHLck>z*J_W@nDry=c6hlCW9 zSJ<{Sb_>UV8gXW{J$ak8CSu*K4uif66{6yAXHq+MIk@KzgPUySpd)x>4uI8Lq;r-;=+ zHm#t<>Iaxk?(NUR)89C+yc7xNXp5>Qk#WIotYXFe|1`f;zD3fgo`2nuUtIQT!D>o- z6)#l{XGCfRFAGdshDGJ@hpCoQC4`%`I9M{I++ZCm^p|`{B7A8A&(7z%PRK^lXfbP$ z1=8$SRZu3g4t5C6S}JA72b%(yv%9KI(P*t|#;giv)1qK40?P}OP(MqkU$60?VN%81s%=}WAnSpjp zmpR?BGEA+QN5)N?;qBw??eV|{aB&aZ*(L~KZ!=6T*2KtNi(Bk!+%lxRZ9Lru@CK@e z#nTn2IMAoGkZv(+!+WrnI9#?ZfTK2}{;&AoQnI}p|LYIUI-w;gX-WqXk z2wCu)V*E!`7irbfleAjL zbh2alx%K8R3RtIe^bQWmUEkODKtujD`gsJ~n}gPIj#@154kq7QQq4?C=N zPRgBN&Lr;CvCyICI2WBml^9v}c6K{WXy1HaQZi~K*j=NB*`{;!q9Ab?d>`Vg&n>F4 z`j(yk&9ZM6URxX`r{g(wkGt!FyZ^fDc_3HcZ!Ee$hglXYk7R(5r#>f5s`gp=<_WR6b$iqzjLsT#+v8ZtH zcNWx~Ea=z9Oi7+FrFcnV(fPr#ftBZ1r7H7f%5e?YB&>#fX{aLnlNR!GyO7J~M;Y=r zH88&}NeEv-DD}Qd8A3V;sRgN1f?v57Gm_hxryf4-OZ<@VrcY`O#EANTwCk{an=Ue4^e$Daka2Xzq|?G^9r_)MSf$FD8K z!i-{gjax9w5#C1=Pb@<2i)K_^c}+Wte?aka80l4*(Ht%{y?HygG@XiJ^n-tYk5|mg z@5keDkKepMy3Tvw4a57{nGt#?2Hxs=vNz8#KO%8Z{r39V0ZRT&NN)%AqK2+aino=V zi(7h9c#+ZZmFS!5_<|Esbzsb`53!u=`#8wq{yM&lkE#AlrP5PFmhqTdQs0(l<60DN z1=^wue9x0A76J!)I#B%1VZR2am6{maE?=wH##v#aBkgk1!||+#p;qrgIOa%qkjx2qi|*S@QX z3mGH5ctw+72fqQ&j>WW`^9TP1(_=>;TqjSj1A1PbLVWH5Z+bn53-$z$_f!PAhs{rh zHi<{tax(Oob%Zr4uQWpNLBE-MA(~ExDEyZlO|}y2q9m1L7_n>EHQ01fPv-<*+z%ob zxp7_Nx(900?gBr<_Q-GjleXwK!XlPykD@+K?qBg>Fo08Kskc5UG!|*~+#-`V7Z(mAZ21`4s*kn$)_(&B8 z62*2LWk(Y-Imn&$PM7kZubHav<7V_p2O;gsK{`1&b-zEIt@z4)<{3>iZbsi8BBRBw z)TIvk_pUQIpQ%Z(d6w+S{E>Od-`<{{qU&`Qb81((uF@6FFVeX_y2GS|zBc3*<>&6f 
zNhGY3Pw#)fbiaO|qLe!h_P)jxJ3gLKi6J +#include +#include +#include +#include "faidx.h" +#include "khash.h" + +typedef struct { + uint64_t len:32, line_len:16, line_blen:16; + uint64_t offset; +} faidx1_t; +KHASH_MAP_INIT_STR(s, faidx1_t) + +#ifndef _NO_RAZF +#include "razf.h" +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#define RAZF FILE +#define razf_read(fp, buf, size) fread(buf, 1, size, fp) +#define razf_open(fn, mode) fopen(fn, mode) +#define razf_close(fp) fclose(fp) +#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) +#define razf_tell(fp) ftello(fp) +#endif + +struct __faidx_t { + RAZF *rz; + int n, m; + char **name; + khash_t(s) *hash; +}; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) +{ + khint_t k; + int ret; + faidx1_t t; + if (idx->n == idx->m) { + idx->m = idx->m? 
idx->m<<1 : 16; + idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); + } + idx->name[idx->n] = strdup(name); + k = kh_put(s, idx->hash, idx->name[idx->n], &ret); + t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; + kh_value(idx->hash, k) = t; + ++idx->n; +} + +faidx_t *fai_build_core(RAZF *rz) +{ + char c, *name; + int l_name, m_name, ret; + int len, line_len, line_blen, state; + int l1, l2; + faidx_t *idx; + uint64_t offset; + + idx = (faidx_t*)calloc(1, sizeof(faidx_t)); + idx->hash = kh_init(s); + name = 0; l_name = m_name = 0; + len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; + while (razf_read(rz, &c, 1)) { + if (c == '\n') { // an empty line + if (state == 1) { + offset = razf_tell(rz); + continue; + } else if ((state == 0 && len < 0) || state == 2) continue; + } + if (c == '>') { // fasta header + if (len >= 0) + fai_insert_index(idx, name, len, line_len, line_blen, offset); + l_name = 0; + while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { + if (m_name < l_name + 2) { + m_name = l_name + 2; + kroundup32(m_name); + name = (char*)realloc(name, m_name); + } + name[l_name++] = c; + } + name[l_name] = '\0'; + if (ret == 0) { + fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); + free(name); fai_destroy(idx); + return 0; + } + if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); + state = 1; len = 0; + offset = razf_tell(rz); + } else { + if (state == 3) { + fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 2) state = 3; + l1 = l2 = 0; + do { + ++l1; + if (isgraph(c)) ++l2; + } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); + if (state == 3 && l2) { + fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + ++l1; len += l2; + if (l2 >= 0x10000) { + fprintf(stderr, 
"[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 1) line_len = l1, line_blen = l2, state = 0; + else if (state == 0) { + if (l1 != line_len || l2 != line_blen) state = 2; + } + } + } + fai_insert_index(idx, name, len, line_len, line_blen, offset); + free(name); + return idx; +} + +void fai_save(const faidx_t *fai, FILE *fp) +{ + khint_t k; + int i; + for (i = 0; i < fai->n; ++i) { + faidx1_t x; + k = kh_get(s, fai->hash, fai->name[i]); + x = kh_value(fai->hash, k); + fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); + } +} + +faidx_t *fai_read(FILE *fp) +{ + faidx_t *fai; + char *buf, *p; + int len, line_len, line_blen; + long long offset; + fai = (faidx_t*)calloc(1, sizeof(faidx_t)); + fai->hash = kh_init(s); + buf = (char*)calloc(0x10000, 1); + while (!feof(fp) && fgets(buf, 0x10000, fp)) { + for (p = buf; *p && isgraph(*p); ++p); + *p = 0; ++p; + sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); + fai_insert_index(fai, buf, len, line_len, line_blen, offset); + } + free(buf); + return fai; +} + +void fai_destroy(faidx_t *fai) +{ + int i; + for (i = 0; i < fai->n; ++i) free(fai->name[i]); + free(fai->name); + kh_destroy(s, fai->hash); + if (fai->rz) razf_close(fai->rz); + free(fai); +} + +int fai_build(const char *fn) +{ + char *str; + RAZF *rz; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + rz = razf_open(fn, "r"); + if (rz == 0) { + fprintf(stderr, "[fai_build] fail to open the FASTA file.\n"); + free(str); + return -1; + } + fai = fai_build_core(rz); + razf_close(rz); + fp = fopen(str, "w"); + if (fp == 0) { + fprintf(stderr, "[fai_build] fail to write FASTA index.\n"); + fai_destroy(fai); free(str); + return -1; + } + fai_save(fai, fp); + fclose(fp); + free(str); + fai_destroy(fai); + return 0; +} + +faidx_t *fai_load(const char *fn) +{ + 
char *str; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + fp = fopen(str, "r"); + if (fp == 0) { + fprintf(stderr, "[fai_load] build FASTA index.\n"); + fai_build(fn); + fp = fopen(str, "r"); + if (fp == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); + free(str); + return 0; + } + } + fai = fai_read(fp); + fclose(fp); + fai->rz = razf_open(fn, "r"); + free(str); + if (fai->rz == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); + return 0; + } + return fai; +} + +char *fai_fetch(const faidx_t *fai, const char *str, int *len) +{ + char *s, *p, c; + int i, l, k; + khiter_t iter; + faidx1_t val; + khash_t(s) *h; + int beg, end; + + beg = end = -1; + h = fai->hash; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the ref_id */ + if (iter == kh_end(h)) { + *len = 0; + free(s); return 0; + } + val = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + beg = 0; end = val.len; + } else { + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + beg = atoi(p); + if (i < k) { + p = s + i + 1; + end = atoi(p); + } else end = val.len; + } + if (beg > 0) --beg; + if (beg >= val.len) beg = val.len; + if (end >= val.len) end = val.len; + if (beg > end) beg = end; + free(s); + + // now retrieve the sequence + l = 0; + s = (char*)malloc(end - beg + 2); + razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); + while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg) + if (isgraph(c)) s[l++] = c; + s[l] = '\0'; + *len = l; + return s; +} + +int faidx_main(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "Usage: faidx [ [...]]\n"); + return 1; + } else { + if (argc == 2) fai_build(argv[1]); + else { + 
int i, j, k, l; + char *s; + faidx_t *fai; + fai = fai_load(argv[1]); + if (fai == 0) return 1; + for (i = 2; i != argc; ++i) { + printf(">%s\n", argv[i]); + s = fai_fetch(fai, argv[i], &l); + for (j = 0; j < l; j += 60) { + for (k = 0; k < 60 && k < l - j; ++k) + putchar(s[j + k]); + putchar('\n'); + } + free(s); + } + fai_destroy(fai); + } + } + return 0; +} + +#ifdef FAIDX_MAIN +int main(int argc, char *argv[]) { return faidx_main(argc, argv); } +#endif diff --git a/faidx.h b/faidx.h new file mode 100644 index 0000000..1a52fb7 --- /dev/null +++ b/faidx.h @@ -0,0 +1,82 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef FAIDX_H +#define FAIDX_H + +/*! + @header + + Index FASTA files and extract subsequence. + + @copyright The Wellcome Trust Sanger Institute. 
+ */ + +struct __faidx_t; +typedef struct __faidx_t faidx_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Build index for a FASTA or razip compressed FASTA file. + @param fn FASTA file name + @return 0 on success; or -1 on failure + @discussion File "fn.fai" will be generated. + */ + int fai_build(const char *fn); + + /*! + @abstract Distroy a faidx_t struct. + @param fai Pointer to the struct to be destroyed + */ + void fai_destroy(faidx_t *fai); + + /*! + @abstract Load index from "fn.fai". + @param fn File name of the FASTA file + */ + faidx_t *fai_load(const char *fn); + + /*! + @abstract Fetch the sequence in a region. + @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @param len Length of the region + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *fai_fetch(const faidx_t *fai, const char *reg, int *len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/glf.c b/glf.c new file mode 100644 index 0000000..8d5346a --- /dev/null +++ b/glf.c @@ -0,0 +1,236 @@ +#include +#include +#include "glf.h" + +#ifdef _NO_BGZF +// then alias bgzf_*() functions +#endif + +static int glf3_is_BE = 0; + +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} + +glf3_header_t *glf3_header_init() +{ + glf3_is_BE = bam_is_big_endian(); + return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); +} + +glf3_header_t *glf3_header_read(glfFile fp) +{ + glf3_header_t *h; + char magic[4]; + h = glf3_header_init(); + bgzf_read(fp, magic, 4); 
+ if (strncmp(magic, "GLF\3", 4)) { + fprintf(stderr, "[glf3_header_read] invalid magic.\n"); + glf3_header_destroy(h); + return 0; + } + bgzf_read(fp, &h->l_text, 4); + if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); + if (h->l_text) { + h->text = (uint8_t*)calloc(h->l_text + 1, 1); + bgzf_read(fp, h->text, h->l_text); + } + return h; +} + +void glf3_header_write(glfFile fp, const glf3_header_t *h) +{ + int32_t x; + bgzf_write(fp, "GLF\3", 4); + x = glf3_is_BE? bam_swap_endian_4(h->l_text) : h->l_text; + bgzf_write(fp, &x, 4); + if (h->l_text) bgzf_write(fp, h->text, h->l_text); +} + +void glf3_header_destroy(glf3_header_t *h) +{ + free(h->text); + free(h); +} + +char *glf3_ref_read(glfFile fp, int *len) +{ + int32_t n, x; + char *str; + *len = 0; + if (bgzf_read(fp, &n, 4) != 4) return 0; + if (glf3_is_BE) n = bam_swap_endian_4(n); + if (n < 0) { + fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); + return 0; + } + str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact + x = bgzf_read(fp, str, n); + x += bgzf_read(fp, len, 4); + if (x != n + 4) { + free(str); *len = -1; return 0; // truncated + } + if (glf3_is_BE) *len = bam_swap_endian_4(*len); + return str; +} + +void glf3_ref_write(glfFile fp, const char *str, int len) +{ + int32_t m, n = strlen(str) + 1; + m = glf3_is_BE? bam_swap_endian_4(n) : n; + bgzf_write(fp, &m, 4); + bgzf_write(fp, str, n); + if (glf3_is_BE) len = bam_swap_endian_4(len); + bgzf_write(fp, &len, 4); +} + +void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) +{ + int j; + if (g3->rtype == GLF3_RTYPE_END) return; + printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, + g3->rtype == GLF3_RTYPE_INDEL? 
'*' : "XACMGRSVTWYHKDBN"[g3->ref_base], + g3->depth, g3->rms_mapQ, g3->min_lk); + if (g3->rtype == GLF3_RTYPE_SUB) + for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); + else { + printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], + g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? g3->indel_seq[1] : "*"); + } + printf("\n"); +} + +int glf3_write1(glfFile fp, const glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + c = g3->rtype<<4 | g3->ref_base; + r = bgzf_write(fp, &c, 1); + if (g3->rtype == GLF3_RTYPE_END) return r; + y[0] = g3->offset; + y[1] = g3->min_lk<<24 | g3->depth; + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + r += bgzf_write(fp, y, 8); + r += bgzf_write(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); + else { + int16_t x[2]; + r += bgzf_write(fp, g3->lk, 3); + x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; + x[1] = glf3_is_BE? 
bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; + r += bgzf_write(fp, x, 4); + if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); + if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); + } + return r; +} + +#ifndef kv_roundup32 +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +int glf3_read1(glfFile fp, glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + r = bgzf_read(fp, &c, 1); + if (r == 0) return 0; + g3->ref_base = c & 0xf; + g3->rtype = c>>4; + if (g3->rtype == GLF3_RTYPE_END) return r; + r += bgzf_read(fp, y, 8); + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + g3->offset = y[0]; + g3->min_lk = y[1]>>24; + g3->depth = y[1]<<8>>8; + r += bgzf_read(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); + else { + int16_t x[2], max; + r += bgzf_read(fp, g3->lk, 3); + r += bgzf_read(fp, x, 4); + if (glf3_is_BE) { + x[0] = bam_swap_endian_2(x[0]); + x[1] = bam_swap_endian_2(x[1]); + } + g3->indel_len[0] = x[0]; + g3->indel_len[1] = x[1]; + x[0] = abs(x[0]); x[1] = abs(x[1]); + max = (x[0] > x[1]? 
x[0] : x[1]) + 1; + if (g3->max_len < max) { + g3->max_len = max; + kv_roundup32(g3->max_len); + g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); + g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); + } + r += bgzf_read(fp, g3->indel_seq[0], x[0]); + r += bgzf_read(fp, g3->indel_seq[1], x[1]); + g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; + } + return r; +} + +void glf3_view(glfFile fp) +{ + glf3_header_t *h; + char *name; + glf3_t *g3; + int len; + h = glf3_header_read(fp); + g3 = glf3_init1(); + while ((name = glf3_ref_read(fp, &len)) != 0) { + int pos = 0; + while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { + pos += g3->offset; + glf3_view1(name, g3, pos); + } + free(name); + } + glf3_header_destroy(h); + glf3_destroy1(g3); +} + +int glf3_view_main(int argc, char *argv[]) +{ + glfFile fp; + if (argc == 1) { + fprintf(stderr, "Usage: glfview \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "Fail to open file '%s'\n", argv[1]); + return 1; + } + glf3_view(fp); + bgzf_close(fp); + return 0; +} + +#ifdef GLFVIEW_MAIN +int main(int argc, char *argv[]) +{ + return glf3_view_main(argc, argv); +} +#endif diff --git a/glf.h b/glf.h new file mode 100644 index 0000000..12e5400 --- /dev/null +++ b/glf.h @@ -0,0 +1,56 @@ +#ifndef GLF_H_ +#define GLF_H_ + +typedef struct { + unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + unsigned char max_mapQ; /** maximum mapping quality */ + unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ + unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ +} glf1_t; + +#include +#include "bgzf.h" +typedef BGZF *glfFile; + +#define GLF3_RTYPE_END 0 +#define GLF3_RTYPE_SUB 1 +#define GLF3_RTYPE_INDEL 2 + +typedef struct { + uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the 
reference base */ + uint8_t rms_mapQ; /** RMS mapping quality */ + uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ + uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ + int32_t offset; /** the first base in a chromosome has offset zero. */ + // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) + int16_t indel_len[2]; + int32_t max_len; // maximum indel len; will be modified by glf3_read1() + char *indel_seq[2]; +} glf3_t; + +typedef struct { + int32_t l_text; + uint8_t *text; +} glf3_header_t; + +#ifdef __cplusplus +extern "C" { +#endif + +#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) +#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) + + glf3_header_t *glf3_header_init(); + glf3_header_t *glf3_header_read(glfFile fp); + void glf3_header_write(glfFile fp, const glf3_header_t *h); + void glf3_header_destroy(glf3_header_t *h); + char *glf3_ref_read(glfFile fp, int *len); + void glf3_ref_write(glfFile fp, const char *name, int len); + int glf3_write1(glfFile fp, const glf3_t *g3); + int glf3_read1(glfFile fp, glf3_t *g3); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..1d583ef --- /dev/null +++ b/khash.h @@ -0,0 +1,486 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+ + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.2" + +#include +#include +#include + +typedef uint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + uint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i 
= k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + uint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } 
\ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! 
@function + @abstract Integer hash function + @param key The integer [uint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (uint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [uint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. 
+ @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! 
@function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/knetfile.c b/knetfile.c new file mode 100644 index 0000000..cef197d --- /dev/null +++ b/knetfile.c @@ -0,0 +1,300 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "knetfile.h" + +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); + if (ret == -1) perror("select"); + return ret; +} + +static int kftp_get_response(knetFile *ftp) +{ + unsigned char c; + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (read(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if 
(n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + write(ftp->ctrl_fd, cmd, strlen(cmd)); + return is_get? kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + +static int kftp_pasv_connect(knetFile *ftp) +{ +#define __err_pasv_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + struct addrinfo hints, *res; + struct linger lng = { 0, 0 }; + int on = 1; + char host[80], port[10]; + + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + if (getaddrinfo(host, port, &hints, &res) != 0) { perror("getaddrinfo"); return -1; } + if ((ftp->fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_pasv_connect("socket"); + if (setsockopt(ftp->fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 
-1) __err_pasv_connect("setsockopt"); + if (setsockopt(ftp->fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_pasv_connect("setsockopt"); + if (connect(ftp->fd, res->ai_addr, res->ai_addrlen) != 0) __err_pasv_connect("connect"); + freeaddrinfo(res); + return 0; +} + +int kftp_connect(knetFile *ftp) +{ +#define __err_connect(func) do { perror(func); return -1; } while (0) + + int on = 1; + { // open socket + struct addrinfo hints, *res; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if (getaddrinfo(ftp->host, "21", &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((ftp->ctrl_fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + if (setsockopt(ftp->ctrl_fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (connect(ftp->ctrl_fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + kftp_get_response(ftp); + } + { // login + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + } + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd >= 0) { + close(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + close(ftp->fd); + return kftp_connect(ftp); +} + +// initialize ->type, ->host and ->retr +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + fp->host = calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->seek_offset = -1; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int 
ret; + if (fp->fd >= 0) { + close(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + if (fp->offset) { + char tmp[32]; + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + close(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + if (fp->fd < 0) { + knet_close(fp); + return 0; + } + } else { + int fd = open(fn, O_RDONLY); + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +off_t knet_read(knetFile *fp, void *buf, off_t len) +{ + off_t l = 0; + if (fp->fd < 0) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + off_t rest = len, curr; + while (rest) { + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; + l += curr; rest -= curr; + } + fp->offset += l; + } else { + off_t rest = len, curr; + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + fp->is_ready = 1; + } + while (rest) { + if (socket_wait(fp->fd, 1) <= 0) break; // socket is not ready for reading + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; // FIXME: end of file or bad network? I do not know... 
+ l += curr; rest -= curr; + } + fp->offset += l; + } + return l; +} + +int knet_seek(knetFile *fp, off_t off, int whence) +{ + if (fp->type == KNF_TYPE_LOCAL) { + if (lseek(fp->fd, off, whence) == -1) { + perror("lseek"); + return -1; + } + fp->offset = off; + return 0; + } + if (fp->type == KNF_TYPE_FTP) { + if (whence != SEEK_SET) { // FIXME: we can surely allow SEEK_CUR and SEEK_END in future + fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP. Offset is unchanged.\n"); + return -1; + } + fp->offset = off; + fp->is_ready = 0; + return 0; + } + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd >= 0) close(fp->ctrl_fd); + if (fp->fd >= 0) close(fp->fd); + free(fp->response); free(fp->retr); free(fp->host); + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char buf[256]; + knetFile *fp; +// fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); knet_seek(fp, 2500000000ll, SEEK_SET); + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); knet_seek(fp, 2000, SEEK_SET); +// fp = knet_open("knetfile.c", "r"); knet_seek(fp, 2000, SEEK_SET); + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + knet_close(fp); + return 0; +} +#endif diff --git a/knetfile.h b/knetfile.h new file mode 100644 index 0000000..bf45f3d --- /dev/null +++ b/knetfile.h @@ -0,0 +1,55 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include +#include + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr; + int64_t seek_offset; // for lazy seek +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef 
__cplusplus +extern "C" { +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + int knet_seek(knetFile *fp, off_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/kseq.h b/kseq.h new file mode 100644 index 0000000..bbe0125 --- /dev/null +++ b/kseq.h @@ -0,0 +1,223 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +/* Last Modified: 12APR2009 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_MAX 1 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if 
(delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->l == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ 
+ seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff --git a/ksort.h b/ksort.h new file mode 100644 index 0000000..16a03fd --- /dev/null +++ b/ksort.h @@ -0,0 +1,271 @@ +/* The MIT License + + Copyright (c) 
2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. 
+ + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 
+ return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) 
ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/kstring.c b/kstring.c new file mode 100644 index 0000000..dc20fae --- /dev/null +++ b/kstring.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? 
max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + return 0; +} +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..221ade2 --- /dev/null +++ b/kstring.h @@ -0,0 +1,59 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static 
inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/misc/Makefile b/misc/Makefile new file mode 100644 index 0000000..4404ccc --- /dev/null +++ b/misc/Makefile @@ -0,0 +1,54 @@ +CC= gcc +CXX= g++ +CFLAGS= -g -Wall -O2 -m64 #-arch ppc +CXXFLAGS= $(CFLAGS) +DFLAGS= -D_FILE_OFFSET_BITS=64 +OBJS= +PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim +INCLUDES= -I.. +SUBDIRS= . + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +lib-recur all-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" $$target || exit 1; \ + cd $$wdir; \ + done; + +lib: + +wgsim:wgsim.o + $(CC) $(CFLAGS) -o $@ wgsim.o -lm + +md5fa:md5.o md5fa.o md5.h ../kseq.h + $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz + +md5sum-lite:md5sum-lite.o + $(CC) $(CFLAGS) -o $@ md5sum-lite.o + +md5sum-lite.o:md5.c md5.h + $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c + +maq2sam-short:maq2sam.c + $(CC) $(CFLAGS) -o $@ maq2sam.c -lz + +maq2sam-long:maq2sam.c + $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz + +md5fa.o:md5.h md5fa.c + $(CC) $(CFLAGS) -c -I.. 
-o $@ md5fa.c + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/misc/blast2sam.pl b/misc/blast2sam.pl new file mode 100755 index 0000000..084f018 --- /dev/null +++ b/misc/blast2sam.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; +use Getopt::Std; + +&blast2sam; + +sub blast2sam { + my %opts = (); + getopts('s', \%opts); + die("Usage: blast2sam.pl \n") if (-t STDIN && @ARGV == 0); + my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq); + $show_seq = defined($opts{s}); + @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*'); + while (<>) { + if (@cigar && (/^Query=/ || /Score =.*bits.*Expect/)) { # print + &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); + @cigar = (); + } + if (/^Query= (\S+)/) { + $sam[0] = $1; + } elsif (/\((\S+)\s+letters\)/) { + $qlen = $1; $qlen =~ s/,//g; + } elsif (/^>(\S+)/) { + $sam[2] = $1; + } elsif (/Length = (\d+)/) { + $slen = $1; + } elsif (/Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/) { # the start of an alignment block + my ($as, $ev) = (int($1 + .499), $3); + $ev = "1$ev" if ($ev =~ /^e/); + @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev"); + @cigar = (); $qbeg = 0; + @cmaux = (0, 0, 0, ''); + } elsif (/Strand = (\S+) \/ (\S+)/) { + $sam[1] |= 0x10 if ($2 eq 'Minus'); + } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) { + $q = $2; + unless ($qbeg) { + $qbeg = $1; + push(@cigar, ($1-1) . "H") if ($1 > 1); + } + $qend = $3; + if ($show_seq) { + my $x = $q; + $x =~ s/-//g; $sam[9] .= $x; + } + } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) { + $s = $2; + if ($sam[1] & 0x10) { + $sam[3] = $3; + } else { + $sam[3] = $1 unless ($sam[3]); + } + &aln2cm(\@cigar, \$q, \$s, \@cmaux); + } + } + &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); +} + +sub blast_print_sam { + my ($sam, $cigar, $cmaux, $qrest) = @_; + push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1)); + push(@$cigar, $qrest . 
'H') if ($qrest); + if ($sam->[1] & 0x10) { + @$cigar = reverse(@$cigar); + $sam->[9] = reverse($sam->[9]); + $sam->[9] =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/; + } + $sam->[9] = '*' if (!$sam->[9]); + $sam->[5] = join('', @$cigar); + print join("\t", @$sam), "\n"; +} + +sub aln2cm { + my ($cigar, $q, $s, $cmaux) = @_; + my $l = length($$q); + for (my $i = 0; $i < $l; ++$i) { + my $op; + # set $op + if (substr($$q, $i, 1) eq '-') { $op = 2; } + elsif (substr($$s, $i, 1) eq '-') { $op = 1; } + else { $op = 0; } + # for CIGAR + if ($cmaux->[0] == $op) { + ++$cmaux->[1]; + } else { + push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1)); + $cmaux->[0] = $op; $cmaux->[1] = 1; + } + } +} diff --git a/misc/bowtie2sam.pl b/misc/bowtie2sam.pl new file mode 100755 index 0000000..5dff88d --- /dev/null +++ b/misc/bowtie2sam.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.1 + +use strict; +use warnings; +use Getopt::Std; + +&bowtie2sam; +exit; + +sub bowtie2sam { + my %opts = (); + die("Usage: bowtie2sam.pl \n") if (@ARGV == 0 && -t STDIN); + # core loop + my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k); + $last = ''; + while (<>) { + my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches + if ($name eq $last) { + # I do not know whether the multiple hits are ordered on the + # number of mismatches. I assume they are not and so I have to + # keep all these multiple hits in memory. 
+ @{$staging[$k]} = @s; + if ($best_s > $nm) { + $subbest_s = $best_s; + $best_s = $nm; + $best_k = $k; + } elsif ($subbest_s > $nm) { + $subbest_s = $nm; + } + ++$k; + } else { + if ($last) { + if ($best_s == $subbest_s) { + $staging[$best_k][4] = 0; + } elsif ($subbest_s - $best_s == 1) { + $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15); + } + print join("\t", @{$staging[$best_k]}), "\n"; + } + $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0; + @{$staging[0]} = @s; + $last = $name; + } + } + print join("\t", @{$staging[$best_k]}), "\n" if ($best_k >= 0); +} + +sub bowtie2sam_aux { + my ($line, $s) = @_; + chomp($line); + my @t = split("\t", $line); + my $ret; + @$s = (); + # read name + $s->[0] = $ret = $t[0]; + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + # read & quality + $s->[9] = $t[4]; $s->[10] = $t[5]; + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + $s->[2] = $t[2]; $s->[3] = $t[3] + 1; + $s->[1] |= 0x10 if ($t[1] eq '-'); + # mapQ + $s->[4] = $t[6] == 0? 25 : 0; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + my $nm = @t - 7; + push(@$s, "NM:i:" . (@t-7)); + push(@$s, "X$nm:i:" . ($t[6]+1)); + my $md = ''; + if ($t[7]) { + $_ = $t[7]; + my $a = 0; + while (/(\d+):[ACGTN]>([ACGTN])/gi) { + my ($y, $z) = ($1, $2); + $md .= (int($y)-$a) . 
$z; + $a += $y - $a + 1; + } + $md .= length($s->[9]) - $a; + } else { + $md = length($s->[9]); + } + push(@$s, "MD:Z:$md"); + return ($ret, $nm); +} diff --git a/misc/export2sam.pl b/misc/export2sam.pl new file mode 100755 index 0000000..8e3e280 --- /dev/null +++ b/misc/export2sam.pl @@ -0,0 +1,107 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.2 (03JAN2009) + +use strict; +use warnings; +use Getopt::Std; + +&export2sam; +exit; + +sub export2sam { + my ($fh1, $fh2, $is_paired); + $is_paired = (@ARGV >= 2); + die("export2sam.pl []\n") if (@ARGV == 0); + open($fh1, $ARGV[0]) || die; + if ($is_paired) { + open($fh2, $ARGV[1]) || die; + } + # conversion table + my @conv_table; + for (-64..64) { + $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499)); + } + # core loop + while (<$fh1>) { + my (@s1, @s2); + &export2sam_aux($_, \@s1, \@conv_table, $is_paired); + if ($is_paired) { + $_ = <$fh2>; + &export2sam_aux($_, \@s2, \@conv_table, $is_paired); + if (@s1 && @s2) { # then set mate coordinate + my $isize = 0; + if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize + my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3]; + my $x2 = ($s2[1] & 0x10)? $s2[3] + length($s2[9]) : $s2[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2[2] ne '*') { + @s1[6..8] = (($s2[2] eq $s1[2])? "=" : $s2[2], $s2[3], $isize); + $s1[1] |= 0x20 if ($s2[1] & 0x10); + } else { + $s1[1] |= 0x8; + } + if ($s1[2] ne '*') { + @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize); + $s2[1] |= 0x20 if ($s1[1] & 0x10); + } else { + $s2[1] |= 0x8; + } + } + } + print join("\t", @s1), "\n" if (@s1); + print join("\t", @s2), "\n" if (@s2 && $is_paired); + } + close($fh1); + close($fh2) if ($is_paired); +} + +sub export2sam_aux { + my ($line, $s, $ct, $is_paired) = @_; + chomp($line); + my @t = split("\t", $line); + @$s = (); + return if ($t[21] ne 'Y'); + # read name + $s->[0] = $t[1]? 
"$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<(5 + $t[7]) if ($is_paired); + # read & quality + $s->[9] = $t[8]; $s->[10] = $t[9]; + if ($t[13] eq 'R') { # then reverse the sequence and quality + $s->[9] = reverse($t[8]); + $s->[9] =~ tr/ACGTacgt/TGCAtgca/; + $s->[10] = reverse($t[9]); + } + $s->[10] =~ s/(.)/$ct->[ord($1)]/eg; # change coding + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + my $has_coor = 0; + $s->[2] = "*"; + if ($t[10] eq 'NM' || $t[10] eq 'QC') { + $s->[1] |= 0x4; # unmapped + } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) { + $s->[1] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? + push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") + } else { + $s->[2] = $t[10]; + $has_coor = 1; + } + $s->[3] = $has_coor? $t[12] : 0; + $s->[1] |= 0x10 if ($has_coor && $t[13] eq 'R'); + # mapQ (TODO: should I choose the larger between $t[15] and $t[16]?) + $s->[4] = 0; + $s->[4] = $t[15] if ($t[15] ne ''); + $s->[4] = $t[16] if ($t[16] ne '' && $s->[4] < $t[16]); + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "BC:Z:$t[6]") if ($t[6]); + push(@$s, "MD:Z:$t[14]") if ($has_coor); + push(@$s, "SM:i:$t[15]") if ($is_paired && $has_coor); +} diff --git a/misc/interpolate_sam.pl b/misc/interpolate_sam.pl new file mode 100755 index 0000000..6cd6831 --- /dev/null +++ b/misc/interpolate_sam.pl @@ -0,0 +1,125 @@ +#!/usr/bin/perl +use strict; + +###Builds interpolated pileup from SAM file +##@description counts bases between paired ends and piles up single end reads. +##@output, uses a #header for the RNAME and then the number of reads per base +##@author sm8@sanger.ac.uk, Stephen B. 
Montgomery + +##@caveats +##Requires RNAME to have format as per example +## chromosome:NCBI36:18:1:76117153:1 +## supercontig::NT_113883:1:137703:1 +## clone::AC138827.3:1:149397:1 +##Expects simple CIGAR characters, M, I and D +##Expects SAM file to be sorted. +##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77) + +##Verify and read in SAM file +my $sam_file = $ARGV[0]; +if(!defined($sam_file)) { die("No sam file defined on arg 1"); } +unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); } +open(SAM, $sam_file) || die("Cannot open sam file"); + +##Globals +my $current_location = ""; ##Current RNAME being processed +my $current_size = 0; ##Size of sequence region being processed +my $current_position = 1; ##Current base being processed +my $open = 0; ##Number of open reads (PE reads that have not been closed) +my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the + ##contained value from those open and deletes the indexed position from the hash + +while (my $line = ) { + my @tokens = split /\t/, $line; + + if ($current_location ne $tokens[2]) { ##Start a new sequence region + for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region + if (defined($close{$i})) { + $open = $open - $close{$i}; + delete $close{$i}; + } + print $open . "\n"; + } + if ($current_location ne "") { + print "\n"; + } + + ##Initiate a new sequence region + my @location_tokens = split /:/, $tokens[2]; + $current_position = 1; + $current_location = $tokens[2]; + $current_size = $location_tokens[4]; + $open = 0; + %close = (); + print "#" . $tokens[2] . "\n"; + + ##Print pileup to just before the first read (will be 0) + for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) { + print $open . 
"\n"; + } + $current_position = $tokens[3]; + + } else { ##Sequence region already open + if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position + ##cycle through to catch up to the current position + for (my $i = $current_position; $i < $tokens[3]; $i++) { + if (defined($close{$i})) { + $open = $open - $close{$i}; + delete $close{$i}; + } + print $open . "\n"; + } + $current_position = $tokens[3]; + } + } + $open++; ##Increment the number of open reads + + if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition + $open--; + my $parsed_cig = &parseCigar($tokens[5]); + my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; + if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } + $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; + } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition + my $parsed_cig = &parseCigar($tokens[5]); + my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; + if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } + $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; + } else { + #do nothing + } +} +for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region + if (defined($close{$i})) { + $open = $open - $close{$i}; + delete $close{$i}; + } + print $open . 
"\n"; +} +print "\n"; +close(SAM); +exit(0); + +##reads and tokenizes simple cigarline +sub parseCigar() { + my $cigar_line = shift; + $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g; + my @cigar_tokens = split /\t/, $cigar_line; + my %parsed = ('M' => 0, + 'I' => 0, + 'D' => 0); + my @events = (); + for(my $i = 0; $i < scalar(@cigar_tokens); $i++) { + if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) { + if (!defined($parsed{$2})) { $parsed{$2} = 0; } + my $nt = $2; + if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; } + $parsed{$nt} += $1; + my %event_el = ("t" => $nt, + "n" => $1); + push @events, \%event_el; + } + } + $parsed{'events'} = \@events; + return \%parsed; +} diff --git a/misc/maq2sam.c b/misc/maq2sam.c new file mode 100644 index 0000000..758a698 --- /dev/null +++ b/misc/maq2sam.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include + +#define PACKAGE_VERSION "0.1.2 (20090521)" + +//#define MAQ_LONGREADS + +#ifdef MAQ_LONGREADS +# define MAX_READLEN 128 +#else +# define MAX_READLEN 64 +#endif + +#define MAX_NAMELEN 36 +#define MAQMAP_FORMAT_OLD 0 +#define MAQMAP_FORMAT_NEW -1 + +#define PAIRFLAG_FF 0x01 +#define PAIRFLAG_FR 0x02 +#define PAIRFLAG_RF 0x04 +#define PAIRFLAG_RR 0x08 +#define PAIRFLAG_PAIRED 0x10 +#define PAIRFLAG_DIFFCHR 0x20 +#define PAIRFLAG_NOMATCH 0x40 +#define PAIRFLAG_SW 0x80 + +typedef struct +{ + uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. 
*/ + uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual; + uint32_t seqid, pos; + int dist; + char name[MAX_NAMELEN]; +} maqmap1_t; + +typedef struct +{ + int format, n_ref; + char **ref_name; + uint64_t n_mapped_reads; + maqmap1_t *mapped_reads; +} maqmap_t; + +maqmap_t *maq_new_maqmap() +{ + maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t)); + mm->format = MAQMAP_FORMAT_NEW; + return mm; +} +void maq_delete_maqmap(maqmap_t *mm) +{ + int i; + if (mm == 0) return; + for (i = 0; i < mm->n_ref; ++i) + free(mm->ref_name[i]); + free(mm->ref_name); + free(mm->mapped_reads); + free(mm); +} +maqmap_t *maqmap_read_header(gzFile fp) +{ + maqmap_t *mm; + int k, len; + mm = maq_new_maqmap(); + gzread(fp, &mm->format, sizeof(int)); + if (mm->format != MAQMAP_FORMAT_NEW) { + if (mm->format > 0) { + fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n"); + exit(3); + } + assert(mm->format == MAQMAP_FORMAT_NEW); + } + gzread(fp, &mm->n_ref, sizeof(int)); + mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*)); + for (k = 0; k != mm->n_ref; ++k) { + gzread(fp, &len, sizeof(int)); + mm->ref_name[k] = (char*)malloc(len * sizeof(char)); + gzread(fp, mm->ref_name[k], len); + } + /* read number of mapped reads */ + gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t)); + return mm; +} + +void maq2tam_core(gzFile fp, const char *rg) +{ + maqmap_t *mm; + maqmap1_t mm1, *m1; + int ret; + m1 = &mm1; + mm = maqmap_read_header(fp); + while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) { + int j, flag = 0, se_mapq = m1->seq[MAX_READLEN-1]; + if (m1->flag) flag |= 1; + if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2; + if (m1->flag == 192) flag |= 4; + if (m1->flag == 64) flag |= 8; + if (m1->pos&1) flag |= 0x10; + if ((flag&1) && m1->dist != 0) { + int c; + if (m1->dist > 0) { + if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0; + else if 
(m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1; + else c = m1->pos&1; + } else { + if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0; + else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1; + else c = m1->pos&1; + } + flag |= c; + } + if (flag) { + int l = strlen(m1->name); + if (m1->name[l-2] == '/') { + flag |= (m1->name[l-1] == '1')? 0x40 : 0x80; + m1->name[l-2] = '\0'; + } + } + printf("%s\t%d\t", m1->name, flag); + printf("%s\t%d\t", mm->ref_name[m1->seqid], (m1->pos>>1)+1); + if (m1->flag == 130) { + int c = (int8_t)m1->seq[MAX_READLEN-1]; + printf("%d\t", m1->alt_qual); + if (c == 0) printf("%dM\t", m1->size); + else { + if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c); + else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual); + } + se_mapq = 0; // zero SE mapQ for reads aligned by SW + } else { + if (flag&4) printf("0\t*\t"); + else printf("%d\t%dM\t", m1->map_qual, m1->size); + } + printf("*\t0\t%d\t", m1->dist); + for (j = 0; j != m1->size; ++j) { + if (m1->seq[j] == 0) putchar('N'); + else putchar("ACGT"[m1->seq[j]>>6&3]); + } + putchar('\t'); + for (j = 0; j != m1->size; ++j) + putchar((m1->seq[j]&0x3f) + 33); + putchar('\t'); + if (rg) printf("RG:Z:%s\t", rg); + if (flag&4) { // unmapped + printf("MF:i:%d\n", m1->flag); + } else { + printf("MF:i:%d\t", m1->flag); + if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq); + printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]); + } + } + if (ret > 0) + fprintf(stderr, "Truncated! Continue anyway.\n"); + maq_delete_maqmap(mm); +} + +int main(int argc, char *argv[]) +{ + gzFile fp; + if (argc == 1) { + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: maq2sam []\n"); + return 1; + } + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + maq2tam_core(fp, argc > 2? 
argv[2] : 0); + gzclose(fp); + return 0; +} diff --git a/misc/md5.c b/misc/md5.c new file mode 100644 index 0000000..ccead0e --- /dev/null +++ b/misc/md5.c @@ -0,0 +1,307 @@ +/* + ********************************************************************** + ** md5.c ** + ** RSA Data Security, Inc. MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. 
** + ********************************************************************** + */ + +#include "md5.h" + +/* forward declaration */ +static void Transform (); + +static unsigned char PADDING[64] = { + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* F, G and H are basic MD5 functions: selection, majority, parity */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */ +/* Rotation is separate from addition to prevent recomputation */ +#define FF(a, b, c, d, x, s, ac) \ + {(a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) \ + {(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) \ + {(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) \ + {(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +void MD5Init (mdContext) +MD5_CTX *mdContext; +{ + mdContext->i[0] = mdContext->i[1] = (UINT4)0; + + /* Load magic initialization constants. 
+ */ + mdContext->buf[0] = (UINT4)0x67452301; + mdContext->buf[1] = (UINT4)0xefcdab89; + mdContext->buf[2] = (UINT4)0x98badcfe; + mdContext->buf[3] = (UINT4)0x10325476; +} + +void MD5Update (mdContext, inBuf, inLen) +MD5_CTX *mdContext; +unsigned char *inBuf; +unsigned int inLen; +{ + UINT4 in[16]; + int mdi; + unsigned int i, ii; + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* update number of bits */ + if ((mdContext->i[0] + ((UINT4)inLen << 3)) < mdContext->i[0]) + mdContext->i[1]++; + mdContext->i[0] += ((UINT4)inLen << 3); + mdContext->i[1] += ((UINT4)inLen >> 29); + + while (inLen--) { + /* add new character to buffer, increment mdi */ + mdContext->in[mdi++] = *inBuf++; + + /* transform if necessary */ + if (mdi == 0x40) { + for (i = 0, ii = 0; i < 16; i++, ii += 4) + in[i] = (((UINT4)mdContext->in[ii+3]) << 24) | + (((UINT4)mdContext->in[ii+2]) << 16) | + (((UINT4)mdContext->in[ii+1]) << 8) | + ((UINT4)mdContext->in[ii]); + Transform (mdContext->buf, in); + mdi = 0; + } + } +} + +void MD5Final (mdContext) +MD5_CTX *mdContext; +{ + UINT4 in[16]; + int mdi; + unsigned int i, ii; + unsigned int padLen; + + /* save number of bits */ + in[14] = mdContext->i[0]; + in[15] = mdContext->i[1]; + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* pad out to 56 mod 64 */ + padLen = (mdi < 56) ? 
(56 - mdi) : (120 - mdi); + MD5Update (mdContext, PADDING, padLen); + + /* append length in bits and transform */ + for (i = 0, ii = 0; i < 14; i++, ii += 4) + in[i] = (((UINT4)mdContext->in[ii+3]) << 24) | + (((UINT4)mdContext->in[ii+2]) << 16) | + (((UINT4)mdContext->in[ii+1]) << 8) | + ((UINT4)mdContext->in[ii]); + Transform (mdContext->buf, in); + + /* store buffer in digest */ + for (i = 0, ii = 0; i < 4; i++, ii += 4) { + mdContext->digest[ii] = (unsigned char)(mdContext->buf[i] & 0xFF); + mdContext->digest[ii+1] = + (unsigned char)((mdContext->buf[i] >> 8) & 0xFF); + mdContext->digest[ii+2] = + (unsigned char)((mdContext->buf[i] >> 16) & 0xFF); + mdContext->digest[ii+3] = + (unsigned char)((mdContext->buf[i] >> 24) & 0xFF); + } +} + +/* Basic MD5 step. Transform buf based on in. + */ +static void Transform (buf, in) +UINT4 *buf; +UINT4 *in; +{ + UINT4 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF ( a, b, c, d, in[ 0], S11, 3614090360u); /* 1 */ + FF ( d, a, b, c, in[ 1], S12, 3905402710u); /* 2 */ + FF ( c, d, a, b, in[ 2], S13, 606105819u); /* 3 */ + FF ( b, c, d, a, in[ 3], S14, 3250441966u); /* 4 */ + FF ( a, b, c, d, in[ 4], S11, 4118548399u); /* 5 */ + FF ( d, a, b, c, in[ 5], S12, 1200080426u); /* 6 */ + FF ( c, d, a, b, in[ 6], S13, 2821735955u); /* 7 */ + FF ( b, c, d, a, in[ 7], S14, 4249261313u); /* 8 */ + FF ( a, b, c, d, in[ 8], S11, 1770035416u); /* 9 */ + FF ( d, a, b, c, in[ 9], S12, 2336552879u); /* 10 */ + FF ( c, d, a, b, in[10], S13, 4294925233u); /* 11 */ + FF ( b, c, d, a, in[11], S14, 2304563134u); /* 12 */ + FF ( a, b, c, d, in[12], S11, 1804603682u); /* 13 */ + FF ( d, a, b, c, in[13], S12, 4254626195u); /* 14 */ + FF ( c, d, a, b, in[14], S13, 2792965006u); /* 15 */ + FF ( b, c, d, a, in[15], S14, 1236535329u); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG ( a, b, c, d, in[ 1], S21, 4129170786u); /* 
17 */ + GG ( d, a, b, c, in[ 6], S22, 3225465664u); /* 18 */ + GG ( c, d, a, b, in[11], S23, 643717713u); /* 19 */ + GG ( b, c, d, a, in[ 0], S24, 3921069994u); /* 20 */ + GG ( a, b, c, d, in[ 5], S21, 3593408605u); /* 21 */ + GG ( d, a, b, c, in[10], S22, 38016083u); /* 22 */ + GG ( c, d, a, b, in[15], S23, 3634488961u); /* 23 */ + GG ( b, c, d, a, in[ 4], S24, 3889429448u); /* 24 */ + GG ( a, b, c, d, in[ 9], S21, 568446438u); /* 25 */ + GG ( d, a, b, c, in[14], S22, 3275163606u); /* 26 */ + GG ( c, d, a, b, in[ 3], S23, 4107603335u); /* 27 */ + GG ( b, c, d, a, in[ 8], S24, 1163531501u); /* 28 */ + GG ( a, b, c, d, in[13], S21, 2850285829u); /* 29 */ + GG ( d, a, b, c, in[ 2], S22, 4243563512u); /* 30 */ + GG ( c, d, a, b, in[ 7], S23, 1735328473u); /* 31 */ + GG ( b, c, d, a, in[12], S24, 2368359562u); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH ( a, b, c, d, in[ 5], S31, 4294588738u); /* 33 */ + HH ( d, a, b, c, in[ 8], S32, 2272392833u); /* 34 */ + HH ( c, d, a, b, in[11], S33, 1839030562u); /* 35 */ + HH ( b, c, d, a, in[14], S34, 4259657740u); /* 36 */ + HH ( a, b, c, d, in[ 1], S31, 2763975236u); /* 37 */ + HH ( d, a, b, c, in[ 4], S32, 1272893353u); /* 38 */ + HH ( c, d, a, b, in[ 7], S33, 4139469664u); /* 39 */ + HH ( b, c, d, a, in[10], S34, 3200236656u); /* 40 */ + HH ( a, b, c, d, in[13], S31, 681279174u); /* 41 */ + HH ( d, a, b, c, in[ 0], S32, 3936430074u); /* 42 */ + HH ( c, d, a, b, in[ 3], S33, 3572445317u); /* 43 */ + HH ( b, c, d, a, in[ 6], S34, 76029189u); /* 44 */ + HH ( a, b, c, d, in[ 9], S31, 3654602809u); /* 45 */ + HH ( d, a, b, c, in[12], S32, 3873151461u); /* 46 */ + HH ( c, d, a, b, in[15], S33, 530742520u); /* 47 */ + HH ( b, c, d, a, in[ 2], S34, 3299628645u); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II ( a, b, c, d, in[ 0], S41, 4096336452u); /* 49 */ + II ( d, a, b, c, in[ 7], S42, 1126891415u); /* 50 */ + II ( c, d, a, b, 
in[14], S43, 2878612391u); /* 51 */ + II ( b, c, d, a, in[ 5], S44, 4237533241u); /* 52 */ + II ( a, b, c, d, in[12], S41, 1700485571u); /* 53 */ + II ( d, a, b, c, in[ 3], S42, 2399980690u); /* 54 */ + II ( c, d, a, b, in[10], S43, 4293915773u); /* 55 */ + II ( b, c, d, a, in[ 1], S44, 2240044497u); /* 56 */ + II ( a, b, c, d, in[ 8], S41, 1873313359u); /* 57 */ + II ( d, a, b, c, in[15], S42, 4264355552u); /* 58 */ + II ( c, d, a, b, in[ 6], S43, 2734768916u); /* 59 */ + II ( b, c, d, a, in[13], S44, 1309151649u); /* 60 */ + II ( a, b, c, d, in[ 4], S41, 4149444226u); /* 61 */ + II ( d, a, b, c, in[11], S42, 3174756917u); /* 62 */ + II ( c, d, a, b, in[ 2], S43, 718787259u); /* 63 */ + II ( b, c, d, a, in[ 9], S44, 3951481745u); /* 64 */ + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +/* lh3: the following code is added by me */ + +#ifdef MD5SUM_MAIN +#include +#include +#include +#define HEX_STR "0123456789abcdef" + +static void md5_one(const char *fn) +{ + unsigned char buf[4096]; + MD5_CTX md5; + int l; + FILE *fp; + + fp = strcmp(fn, "-")? fopen(fn, "r") : stdin; + if (fp == 0) { + fprintf(stderr, "md5sum: %s: No such file or directory\n", fn); + exit(1); + } + MD5Init(&md5); + while ((l = fread(buf, 1, 4096, fp)) > 0) + MD5Update(&md5, buf, l); + MD5Final(&md5); + if (fp != stdin) fclose(fp); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[md5.digest[l]>>4&0xf], HEX_STR[md5.digest[l]&0xf]); + printf(" %s\n", fn); +} +int main(int argc, char *argv[]) +{ + int i; + if (argc == 1) md5_one("-"); + else for (i = 1; i < argc; ++i) md5_one(argv[i]); + return 0; +} +#endif diff --git a/misc/md5.h b/misc/md5.h new file mode 100644 index 0000000..678ac27 --- /dev/null +++ b/misc/md5.h @@ -0,0 +1,68 @@ +/* + ********************************************************************** + ** md5.h -- Header file for implementation of MD5 ** + ** RSA Data Security, Inc. 
MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version ** + ** Revised (for MD5): RLR 4/27/91 ** + ** -- G modified to have y&~z instead of y&z ** + ** -- FF, GG, HH modified to add in last register done ** + ** -- Access pattern: round 2 works mod 5, round 3 works mod 3 ** + ** -- distinct additive constant for each step ** + ** -- round 4 added, working mod 7 ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. 
** + ********************************************************************** + */ + +#ifndef MD5_H +#define MD5_H + +#include + +/* typedef a 32 bit type */ +typedef uint32_t UINT4; + +/* Data structure for MD5 (Message Digest) computation */ +typedef struct { + UINT4 i[2]; /* number of _bits_ handled mod 2^64 */ + UINT4 buf[4]; /* scratch buffer */ + unsigned char in[64]; /* input buffer */ + unsigned char digest[16]; /* actual digest after MD5Final call */ +} MD5_CTX; + +#ifdef __cplusplus +extern "C" { +#endif + + void MD5Init(MD5_CTX *mdContext); + void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, unsigned intinLen); + void MD5Final(MD5_CTX *mdContext); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/misc/md5fa.c b/misc/md5fa.c new file mode 100644 index 0000000..c41db2d --- /dev/null +++ b/misc/md5fa.c @@ -0,0 +1,58 @@ +#include +#include +#include "md5.h" +#include "kseq.h" + +#define HEX_STR "0123456789abcdef" + +KSEQ_INIT(gzFile, gzread) + +static void md5_one(const char *fn) +{ + MD5_CTX md5_one, md5_all; + int l, i, k; + gzFile fp; + kseq_t *seq; + unsigned char unordered[16]; + + for (l = 0; l < 16; ++l) unordered[l] = 0; + fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "md5fa: %s: No such file or directory\n", fn); + exit(1); + } + + MD5Init(&md5_all); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + for (i = k = 0; i < seq->seq.l; ++i) { + if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]); + else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; + } + MD5Init(&md5_one); + MD5Update(&md5_one, (unsigned char*)seq->seq.s, k); + MD5Final(&md5_one); + for (l = 0; l < 16; ++l) { + printf("%c%c", HEX_STR[md5_one.digest[l]>>4&0xf], HEX_STR[md5_one.digest[l]&0xf]); + unordered[l] ^= md5_one.digest[l]; + } + printf(" %s %s\n", fn, seq->name.s); + MD5Update(&md5_all, (unsigned char*)seq->seq.s, k); + } + MD5Final(&md5_all); + kseq_destroy(seq); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[md5_all.digest[l]>>4&0xf], HEX_STR[md5_all.digest[l]&0xf]); + printf(" %s >ordered\n", fn); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]); + printf(" %s >unordered\n", fn); +} + +int main(int argc, char *argv[]) +{ + int i; + if (argc == 1) md5_one("-"); + else for (i = 1; i < argc; ++i) md5_one(argv[i]); + return 0; +} diff --git a/misc/novo2sam.pl b/misc/novo2sam.pl new file mode 100755 index 0000000..3d3436c --- /dev/null +++ b/misc/novo2sam.pl @@ -0,0 +1,281 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.3 + +#Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com) + +#use strict; +#use warnings; +use Data::Dumper; +use Getopt::Std; + +&novo2sam; +exit; + +sub mating { + my ($s1, $s2) = @_; + my $isize = 0; + if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize + my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; + my $x2 = ($s2->[1] & 0x10)? 
$s2->[3] + length($s2->[9]) : $s2->[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2->[2] ne '*') { + @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); + $s1->[1] |= 0x20 if ($s2->[1] & 0x10); + } else { + $s1->[1] |= 0x8; + } + if ($s1->[2] ne '*') { + @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); + $s2->[1] |= 0x20 if ($s1->[1] & 0x10); + } else { + $s2->[1] |= 0x8; + } +} + +sub novo2sam { + my %opts = (); + getopts("p", \%opts); + die("Usage: novo2sam.pl [-p] \n") if (@ARGV == 0); + my $is_paired = defined($opts{p}); + # core loop + my @s1 = (); + my @s2 = (); + my ($s_last, $s_curr) = (\@s1, \@s2); + while (<>) { + next if (/^#/); + next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/); + &novo2sam_aux($_, $s_curr, $is_paired); + if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { + &mating($s_last, $s_curr); + print join("\t", @$s_last), "\n"; + print join("\t", @$s_curr), "\n"; + @$s_last = (); @$s_curr = (); + } else { + print join("\t", @$s_last), "\n" if (@$s_last != 0); + my $s = $s_last; $s_last = $s_curr; $s_curr = $s; + } + } + print join("\t", @$s_last), "\n" if (@$s_last != 0); +} + +sub novo2sam_aux { + my ($line, $s, $is_paired) = @_; + + chomp($line); + my @t = split(/\s+/, $line); + my @variations = @t[13 .. $#t]; + @$s = (); + return if ($t[4] ne 'U'); + my $len = length($t[2]); + # read name + $s->[0] = substr($t[0], 1); + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7); + $s->[1] |= 2 if ($t[10] eq '.'); + # read & quality + if ($t[9] eq 'R') { + $s->[9] = reverse($t[2]); + $s->[10] = reverse($t[3]); + $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; + } else { + $s->[9] = $t[2]; $s->[10] = $t[3]; + } + # cigar + my $cigarstring =""; + if (scalar @variations ==0 ) { + $s->[5] = $len . 
"M"; # IMPORTANT: this cigar is not correct for gapped alignment + } else { + #convert to correct CIGAR + my $tmpstr = join" ",@variations ; + if ( $tmpstr=~ /\+|\-/ ) { + $cigarstring = cigar_method($line,\@variations,$len); + $s->[5]=$cigarstring; + } else { + $s->[5]=$len. "M"; + } +} + +# coor + $s->[2] = substr($t[7], 1); $s->[3] = $t[8]; + $s->[1] |= 0x10 if ($t[9] eq 'R'); + # mapQ + $s->[4] = $t[5] > $t[6]? $t[5] : $t[6]; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "NM:i:".(@t-13)); + my $md = ''; + $md = mdtag($md,$line,\@variations,$len); + push(@$s, "MD:Z:$md"); + +} + +sub mdtag { + my $oldmd = shift; + my $line = shift; + my $ref =shift; + my $rdlen = shift; + my @variations = @$ref; + my $string=""; + my $mdtag=""; + my $t=1; + my $q=1; + my $deleteflag=0; + my $len =0; + foreach $string (@variations) { + my ($indeltype,$insert) = indeltype($string); + if ($indeltype eq "+") { + $len = length ($insert); + $q+=$len; + next; + } + my $pos = $1 if $string =~ /^(\d+)/; + $len = $pos - $t; + if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) { + $mdtag.=$len; + } + $t+=$len; + $q+=$len; + if ($indeltype eq ">") { + $mdtag.=$insert; + $deleteflag=0; + $t+=1; + $q+=1; + } + if ($indeltype eq "-") { + my $deletedbase = $2 if $string =~ /(\d+)\-([A-Z]+)/; + if ($deleteflag == 0 ) { + $mdtag.="^"; + } + $mdtag.=$deletedbase; + $deleteflag=1; + $t+=1; + } + } + $len = $rdlen - $q + 1; + if ($len > 0) { + $mdtag.="$len"; + } +# print "In:$line\n"; +# print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n"; + + return $mdtag; +} + +sub indeltype { + my $string = shift; + my $insert=""; + my $indeltype; + if ($string =~ /([A-Z]+)\>/) { + $indeltype=">"; + $insert=$1; + } elsif ($string =~ /\-/) { + $indeltype="-"; + } elsif ($string =~ /\+([A-Z]+)/) { + $indeltype="+"; + $insert=$1; + } + return ($indeltype,$insert); + +} + + +sub cigar_method { + my $line = shift; + my $ref =shift; + my $rdlen = shift; + my @variations = @$ref; + 
my $string=""; + my $type=""; + my $t =1; + my $q=1; + my $indeltype=""; + my $cigar= ""; + my $insert = ""; + my $len=0; + my @cig=(); + foreach $string (@variations) { + next if $string =~ />/; + my $pos = $1 if $string =~ /^(\d+)/; + + if ($string =~ /\+([A-Z]+)/) { + $indeltype="+"; + $insert = $1; + }elsif ($string =~ /\-([A-Z]+)/) { + $indeltype="-"; + $insert = $1; + } +#print "$pos $indeltype $insert $t $q\n"; + $len = $pos - $t; + if ( $len > 0) { + $cigar.=$len."M"; + push(@cig,$len."M"); + } + $t+=$len; + $q+=$len; + + if ($indeltype eq "-") { + $cigar.="D"; + push(@cig,"D"); + $t++; + } + if ($indeltype eq "+") { + $len = length ($insert); + if ($len == 1) { + $cigar.="I"; + push(@cig,"I"); + } + if ($len > 1) { + $cigar.=$len."I"; + push(@cig,$len."I") + } + $q+=$len; + } + $insert=""; + } + $len= $rdlen - $q + 1; + if ($len > 0) { + $cigar.=$len."M"; + push(@cig,$len."M"); + } + + $cigar = newcigar($cigar,'D'); + $cigar = newcigar($cigar,'I'); + + #print "$line\n"; + #print "c CIGAR:\t$cigar\n\n"; + return $cigar; + +} + + + +sub newcigar { + my $cigar = shift; + my $char = shift; + my $new = ""; + my $copy = $cigar; +#print "$cigar\n"; + $copy =~ s/^($char+)/$1;/g; +#print "$copy\n"; + $copy =~ s/([^0-9$char])($char+)/$1;$2;/g; +#print "$copy\n"; + my @parts = split(/;/,$copy); + my $el=""; + foreach $el (@parts) { +#print "$el\n"; + if ($el =~ /^$char+$/) { + $new.=length($el).$char; + }else { + $new.=$el; + } + + } + return $new; +} diff --git a/misc/samtools.pl b/misc/samtools.pl new file mode 100755 index 0000000..c014c52 --- /dev/null +++ b/misc/samtools.pl @@ -0,0 +1,255 @@ +#!/usr/bin/perl -w + +# Author: lh3 + +use strict; +use warnings; +use Getopt::Std; + +my $version = '0.3.2 (r321)'; +&usage if (@ARGV < 1); + +my $command = shift(@ARGV); +my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter); + +die("Unknown command \"$command\".\n") if (!defined($func{$command})); +&{$func{$command}}; +exit(0); + +# +# 
showALEN +# + +sub showALEN { + die(qq/Usage: samtools.pl showALEN \n/) if (@ARGV == 0 && -t STDIN); + while (<>) { + my @t = split; + my $l = 0; + $_ = $t[5]; + s/(\d+)[SMI]/$l+=$1/eg; + print join("\t", @t[0..5]), "\t$l\t", join("\t", @t[6..$#t]), "\n"; + } +} + +# +# varFilter +# + +sub varFilter { # filters pileup variant lines: accepted lines go to STDOUT, rejected ones to STDERR (tagged) when -p is given + my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef); + getopts('pq:d:D:l:Q:w:W:N:G:', \%opts); # 'q:' must be declared: -q is documented below and read as $opts{q} for gaps + die(qq/ +Usage: samtools.pl varFilter [options] + +Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] + -q INT minimum RMS mapping quality for gaps [$opts{q}] + -d INT minimum read depth [$opts{d}] + -D INT maximum read depth [$opts{D}] + + -G INT min indel score for nearby SNP filtering [$opts{G}] + -w INT SNP within INT bp around a gap to be filtered [$opts{w}] + + -W INT window size for filtering dense SNPs [$opts{W}] + -N INT max number of SNPs in a window [$opts{N}] + + -l INT window size for filtering adjacent gaps [$opts{l}] + + -p print filtered variants +\n/) if (@ARGV == 0 && -t STDIN); + + # calculate the window size + my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W}); + my $max_dist = $ol > $ow? 
$ol : $ow; + $max_dist = $oW if ($max_dist < $oW); + # the core loop + my @staging; # (indel_filtering_score, flt_tag) + while (<>) { + my @t = split; + next if ($t[2] eq $t[3] || $t[3] eq '*/*'); # skip non-var sites + # clear the out-of-range elements + while (@staging) { + last if ($staging[0][2] eq $t[0] && $staging[0][3] + $max_dist >= $t[1]); + varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much + } + my ($flt, $score) = (0, -1); + # first a simple filter + if ($t[7] < $opts{d}) { + $flt = 2; + } elsif ($t[7] > $opts{D}) { + $flt = 3; + } + # site dependent filters + if ($flt == 0) { + if ($t[2] eq '*') { # an indel + $flt = 1 if ($t[6] < $opts{q}); + # filtering SNPs + if ($t[5] >= $opts{G}) { + for my $x (@staging) { + next if ($x->[0] >= 0 || $x->[3] + $ow < $t[1]); + $x->[1] = 5 if ($x->[1] == 0); + } + } + # calculate the filtering score (different from indel quality) + $score = $t[5]; + $score += $opts{s} * $t[10] if ($t[8] ne '*'); + $score += $opts{s} * $t[11] if ($t[9] ne '*'); + # check the staging list for indel filtering + for my $x (@staging) { + next if ($x->[0] < 0 || $x->[3] + $ol < $t[1]); + if ($x->[0] < $score) { + $x->[1] = 6; + } else { + $flt = 6; last; + } + } + } else { # a SNP + $flt = 1 if ($t[6] < $opts{Q}); + # check adjacent SNPs + my $k = 1; + for my $x (@staging) { + ++$k if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5)); + } + # filtering is necessary + if ($k > $opts{N}) { + $flt = 4; + for my $x (@staging) { + $x->[1] = 4 if ($x->[0] < 0 && $x->[3] + $oW >= $t[1] && $x->[1] == 0); + } + } else { # then check gap filter + for my $x (@staging) { + next if ($x->[0] < 0 || $x->[3] + $ow < $t[1]); + if ($x->[0] >= $opts{G}) { + $flt = 5; last; + } + } + } + } + } + push(@staging, [$score, $flt, @t]); + } + # output the last few elements in the staging list + while (@staging) { + varFilter_aux(shift @staging, $opts{p}); + } +} + +sub varFilter_aux { + 
my ($first, $is_print) = @_; + if ($first->[1] == 0) { + print join("\t", @$first[2 .. @$first-1]), "\n"; + } elsif ($is_print) { + print STDERR join("\t", substr("UQdDWGgX", $first->[1], 1), @$first[2 .. @$first-1]), "\n"; + } +} + +# +# pileup2fq +# + +sub pileup2fq { + my %opts = (d=>3, D=>255, Q=>25, G=>25, l=>10); + getopts('d:D:Q:G:l:', \%opts); + die(qq/ +Usage: samtools.pl pileup2fq [options] + +Options: -d INT minimum depth [$opts{d}] + -D INT maximum depth [$opts{D}] + -Q INT min RMS mapQ [$opts{Q}] + -G INT minimum indel score [$opts{G}] + -l INT indel filter winsize [$opts{l}]\n +/) if (@ARGV == 0 && -t STDIN); + + my ($last_chr, $seq, $qual, @gaps, $last_pos); + my $_Q = $opts{Q}; + my $_d = $opts{d}; + my $_D = $opts{D}; + + $last_chr = ''; + while (<>) { + my @t = split; + if ($last_chr ne $t[0]) { + &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); + $last_chr = $t[0]; + $last_pos = 0; + $seq = ''; $qual = ''; + @gaps = (); + } + if ($t[1] - $last_pos != 1) { + $seq .= 'n' x ($t[1] - $last_pos - 1); + $qual .= '!' x ($t[1] - $last_pos - 1); + } + if ($t[2] eq '*') { + push(@gaps, $t[1]) if ($t[5] >= $opts{G}); + } else { + $seq .= ($t[6] >= $_Q && $t[7] >= $_d && $t[7] <= $_D)? uc($t[3]) : lc($t[3]); + my $q = $t[4] + 33; + $q = 126 if ($q > 126); + $qual .= chr($q); + } + $last_pos = $t[1]; + } + &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); +} + +sub p2q_post_process { + my ($chr, $seq, $qual, $gaps, $l) = @_; + &p2q_filter_gaps($seq, $gaps, $l); + print "\@$chr\n"; &p2q_print_str($seq); + print "+\n"; &p2q_print_str($qual); +} + +sub p2q_filter_gaps { + my ($seq, $gaps, $l) = @_; + for my $g (@$gaps) { + my $x = $g > $l? 
$g - $l : 0; + substr($$seq, $x, $l + $l) = lc(substr($$seq, $x, $l + $l)); + } +} + +sub p2q_print_str { + my ($s) = @_; + my $l = length($$s); + for (my $i = 0; $i < $l; $i += 60) { + print substr($$s, $i, 60), "\n"; + } +} + +# +# varStats +# + +sub varStats { + my %opts = (d=>'', c=>5); + getopts('d:c:', \%opts); + die("Usage: samtools.pl varStats [-d dbSNP.snp] [-c $opts{c}] \n") if (@ARGV == 0 && -t STDIN); + my (@cnt, %hash); + my $col = $opts{c} - 1; + while (<>) { + my @t = split; + if ($t[2] eq '*') { + } else { + my $q = $t[$col]; + $q = 99 if ($q > 99); + $q = int($q/10); + my $is_het = ($t[3] =~ /^[ACGT]$/)? 0 : 1; + ++$cnt[$q][$is_het]; + $hash{$t[0],$t[1]} = $q; + } + } +} + +# +# Usage +# + +sub usage { + die(qq/ +Program: samtools.pl (helper script for SAMtools) +Version: $version +Contact: Heng Li \n +Usage: samtools.pl []\n +Command: varFilter filtering SNPs and short indels + pileup2fq generate fastq from `pileup -c' + showALEN print alignment length (ALEN) following CIGAR +\n/); +} diff --git a/misc/soap2sam.pl b/misc/soap2sam.pl new file mode 100755 index 0000000..b37135e --- /dev/null +++ b/misc/soap2sam.pl @@ -0,0 +1,109 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.1 + +use strict; +use warnings; +use Getopt::Std; + +&soap2sam; +exit; + +sub mating { + my ($s1, $s2) = @_; + my $isize = 0; + if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize + my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; + my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2->[2] ne '*') { + @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); + $s1->[1] |= 0x20 if ($s2->[1] & 0x10); + } else { + $s1->[1] |= 0x8; + } + if ($s1->[2] ne '*') { + @$s2[6..8] = (($s1->[2] eq $s2->[2])? 
"=" : $s1->[2], $s1->[3], -$isize); + $s2->[1] |= 0x20 if ($s1->[1] & 0x10); + } else { + $s2->[1] |= 0x8; + } +} + +sub soap2sam { + my %opts = (); + getopts("p", \%opts); + die("Usage: soap2sam.pl [-p] \n") if (@ARGV == 0 && -t STDIN); + my $is_paired = defined($opts{p}); + # core loop + my @s1 = (); + my @s2 = (); + my ($s_last, $s_curr) = (\@s1, \@s2); + while (<>) { + s/[\177-\377]|[\000-\010]|[\012-\040]//g; + next if (&soap2sam_aux($_, $s_curr, $is_paired) < 0); + if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { + &mating($s_last, $s_curr); + print join("\t", @$s_last), "\n"; + print join("\t", @$s_curr), "\n"; + @$s_last = (); @$s_curr = (); + } else { + print join("\t", @$s_last), "\n" if (@$s_last != 0); + my $s = $s_last; $s_last = $s_curr; $s_curr = $s; + } + } + print join("\t", @$s_last), "\n" if (@$s_last != 0); +} + +sub soap2sam_aux { + my ($line, $s, $is_paired) = @_; + chomp($line); + my @t = split(/\s+/, $line); + return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]); + @$s = (); + # fix SOAP-2.1.x bugs + @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/); + # read name + $s->[0] = $t[0]; + $s->[0] =~ s/\/[12]$//g; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7); + $s->[1] |= 2 if ($is_paired); + # read & quality + $s->[9] = $t[1]; + $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2]; + # cigar + $s->[5] = length($s->[9]) . "M"; + # coor + $s->[2] = $t[7]; $s->[3] = $t[8]; + $s->[1] |= 0x10 if ($t[6] eq '-'); + # mapQ + $s->[4] = $t[3] == 1? 30 : 0; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "NM:i:$t[9]"); + my $md = ''; + if ($t[9]) { + my @x; + for (10 .. $#t) { + push(@x, sprintf("%.3d,$1", $2)) if ($t[$_] =~ /^([ACGT])->(\d+)/i); + } + @x = sort(@x); + my $a = 0; + for (@x) { + my ($y, $z) = split(","); + $md .= (int($y)-$a) . 
$z; + $a += $y - $a + 1; + } + $md .= length($t[1]) - $a; + } else { + $md = length($t[1]); + } + push(@$s, "MD:Z:$md"); + return 0; +} diff --git a/misc/wgsim.c b/misc/wgsim.c new file mode 100644 index 0000000..1522eee --- /dev/null +++ b/misc/wgsim.c @@ -0,0 +1,502 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* This program is separated from maq's read simulator with Colin + * Hercus' modification to allow longer indels. Colin is the chief + * developer of novoalign. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PACKAGE_VERSION "0.2.3" + +const uint8_t nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; + +/* Simple normal random number generator, copied from genran.c */ + +double ran_normal() +{ + static int iset = 0; + static double gset; + double fac, rsq, v1, v2; + if (iset == 0) { + do { + v1 = 2.0 * drand48() - 1.0; + v2 = 2.0 * drand48() - 1.0; + rsq = v1 * v1 + v2 * v2; + } while (rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0 * log(rsq) / rsq); + gset = v1 * fac; + iset = 1; + return v2 * fac; + } else { + iset = 0; + return gset; + } +} + +/* FASTA parser, copied from seq.c */ + +typedef struct { + int l, m; /* length and maximum buffer size */ + unsigned char *s; /* sequence */ +} seq_t; + +#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 + +static int SEQ_BLOCK_SIZE = 512; + +void seq_set_block_size(int size) +{ + SEQ_BLOCK_SIZE = size; +} + +int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) +{ + int c, l, max; + char *p; + + c = 0; + while (!feof(fp) && fgetc(fp) != '>'); + if 
(feof(fp)) return -1; + p = locus; + while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') + if (c != '\r') *p++ = c; + *p = '\0'; + if (comment) { + p = comment; + if (c != '\n') { + while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); + if (c != '\n') { + *p++ = c; + while (!feof(fp) && (c = fgetc(fp)) != '\n') + if (c != '\r') *p++ = c; + } + } + *p = '\0'; + } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); + l = 0; max = seq->m; + while (!feof(fp) && (c = fgetc(fp)) != '>') { + if (isalpha(c) || c == '-' || c == '.') { + if (l + 1 >= max) { + max += SEQ_BLOCK_SIZE; + seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); + } + seq->s[l++] = (unsigned char)c; + } + } + if (c == '>') ungetc(c,fp); + seq->s[l] = 0; + seq->m = max; seq->l = l; + return l; +} + +/* Error-checking open, copied from utils.c */ + +#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) + +FILE *err_xopen_core(const char *func, const char *fn, const char *mode) +{ + FILE *fp = 0; + if (strcmp(fn, "-") == 0) + return (strstr(mode, "r"))? stdin : stdout; + if ((fp = fopen(fn, mode)) == 0) { + fprintf(stderr, "[%s] fail to open file '%s'. 
Abort!\n", func, fn); + abort(); + } + return fp; +} + +/* wgsim */ + +enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; +typedef unsigned short mut_t; +static mut_t mutmsk = (mut_t)0xf000; + +typedef struct { + int l, m; /* length and maximum buffer size */ + mut_t *s; /* sequence */ +} mutseq_t; + +static double ERR_RATE = 0.02; +static double MUT_RATE = 0.001; +static double INDEL_FRAC = 0.1; +static double INDEL_EXTEND = 0.3; +static int IS_SOLID = 0; +static int SHOW_MM_INFO = 1; + +void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +{ + int i, deleting = 0; + mutseq_t *ret[2]; + + ret[0] = hap1; ret[1] = hap2; + ret[0]->l = seq->l; ret[1]->l = seq->l; + ret[0]->m = seq->m; ret[1]->m = seq->m; + ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); + for (i = 0; i != seq->l; ++i) { + int c; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; + if (deleting) { + if (drand48() < INDEL_EXTEND) { + if (deleting & 1) ret[0]->s[i] |= DELETE; + if (deleting & 2) ret[1]->s[i] |= DELETE; + continue; + } else deleting = 0; + } + if (c < 4 && drand48() < MUT_RATE) { // mutation + if (drand48() >= INDEL_FRAC) { // substitution + double r = drand48(); + c = (c + (int)(r * 3.0 + 1)) & 3; + if (is_hap || drand48() < 0.333333) { // hom + ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c; + } else { // het + ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c; + } + } else { // indel + if (drand48() < 0.5) { // deletion + if (is_hap || drand48() < 0.333333) { // hom-del + ret[0]->s[i] = ret[1]->s[i] = DELETE; + deleting = 3; + } else { // het-del + deleting = drand48()<0.5?1:2; + ret[deleting-1]->s[i] = DELETE; + } + } else { // insertion + int num_ins = 0, ins = 0; + do { + num_ins++; + ins = (ins << 2) | (int)(drand48() * 4.0); + } while (num_ins < 4 && drand48() < INDEL_EXTEND); + + if (is_hap || drand48() < 0.333333) { // hom-ins + ret[0]->s[i] = 
ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } else { // het-ins + ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } + } + } + } + } +} +void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) /* prints one line per mutated A,C,G,T reference position: name, 1-based position, reference base ('-' for an insertion), variant, then '-' for hom or '+' for het */ +{ + int i; + for (i = 0; i != seq->l; ++i) { + int c[3]; + c[0] = nst_nt4_table[(int)seq->s[i]]; + c[1] = hap1->s[i]; c[2] = hap2->s[i]; + if (c[0] >= 4) continue; + if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { /* either haplotype is mutated; the second test must look at c[2] */ + printf("%s\t%d\t", name, i+1); + if (c[1] == c[2]) { // hom + if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); + } else if ((c[1]&mutmsk) == DELETE) { // del + printf("%c\t-\t-\n", "ACGTN"[c[0]]); + } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while(n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; /* step to the next 2-bit inserted base */ + } + printf("\t-\n"); + } else assert(0); + } else { // het + if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]); + } else if ((c[1]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if ((c[2]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if (((c[1] & mutmsk) >> 12) != 0 && ((c[1] & mutmsk) >> 12) <= 4) { // ins1 (length 0 means hap1 is unchanged, fall through to ins2) + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; /* step to the next 2-bit inserted base */ + } + printf("\t+\n"); + } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2 + printf("-\t"); + int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t+\n"); + } else assert(0); + } + } + } +} + +void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +{ + seq_t seq; + mutseq_t rseq[2]; + uint64_t tot_len, ii; + int i, l, n_ref; + char name[256], *qstr; + int 
size[2], Q; + uint8_t *tmp_seq[2]; + mut_t *target; + + INIT_SEQ(seq); + srand48(time(0)); + seq_set_block_size(0x1000000); + l = size_l > size_r? size_l : size_r; + qstr = (char*)calloc(l+1, 1); + tmp_seq[0] = (uint8_t*)calloc(l+2, 1); + tmp_seq[1] = (uint8_t*)calloc(l+2, 1); + size[0] = size_l; size[1] = size_r; + + Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + + tot_len = n_ref = 0; + while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + tot_len += l; + ++n_ref; + } + fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (unsigned long long)tot_len); + rewind(fp_fa); /* second pass: the loop above only counted sequences and their total length */ + + while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); + if (l < dist + 3 * std_dev) { + fprintf(stderr, "[wgsim_core] skip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); + continue; + } + + // generate mutations and print them out + maq_mut_diref(&seq, is_hap, rseq, rseq+1); + maq_print_mutref(name, &seq, rseq, rseq+1); + + for (ii = 0; ii != n_pairs; ++ii) { // the core loop + double ran; + int d, pos, s[2], is_flip = 0; + int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k; + FILE *fpo[2]; + + do { // avoid boundary failure + ran = ran_normal(); + ran = ran * std_dev + dist; + d = (int)(ran + 0.5); + pos = (int)((l - d + 1) * drand48()); + } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); + + // flip or not + if (drand48() < 0.5) { + fpo[0] = fpout1; fpo[1] = fpout2; + s[0] = size[0]; s[1] = size[1]; + } else { + fpo[1] = fpout1; fpo[0] = fpout2; + s[1] = size[0]; s[0] = size[1]; + is_flip = 1; + } + + // generate the read sequences + target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated + n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; + +#define __gen_read(x, start, iter) do { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ + int c = target[i], mut_type = c 
& mutmsk; \ + if (ext_coor[x] < 0) { \ + if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ + ext_coor[x] = i; \ + } \ + if (mut_type == DELETE) ++n_indel[x]; \ + else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \ + tmp_seq[x][k++] = c & 0xf; \ + if (mut_type == SUBSTITUTE) ++n_sub[x]; \ + } else { \ + int n, ins; \ + ++n_indel[x]; \ + tmp_seq[x][k++] = c & 0xf; \ + for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \ + tmp_seq[x][k++] = ins & 0x3; \ + } \ + } \ + if (k != s[x]) ext_coor[x] = -10; \ + } while (0) + + if (!IS_SOLID) { + __gen_read(0, pos, ++i); + __gen_read(1, pos + d - 1, --i); + for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement + } else { + int c1, c2, c; + ++s[0]; ++s[1]; // temporarily increase read length by 1 + if (is_flip) { // RR pair + __gen_read(0, pos + s[0], --i); + __gen_read(1, pos + d - 1, --i); + } else { // FF pair + __gen_read(0, pos, ++i); + __gen_read(1, pos + d - 1 - s[1], ++i); + ++ext_coor[0]; ++ext_coor[1]; + } + // change to color sequence: (0,1,2,3) -> (A,C,G,T) + for (j = 0; j < 2; ++j) { + c1 = tmp_seq[j][0]; + for (i = 1; i < s[j]; ++i) { + c2 = tmp_seq[j][i]; + c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<= 4) c = 4; // actually c should be never larger than 4 if everything is correct + else if (drand48() < ERR_RATE) { + c = (c + (int)(drand48() * 3.0 + 1)) & 3; + ++n_err[j]; + } + tmp_seq[j][i] = c; + } + } + + // print + for (j = 0; j < 2; ++j) { + for (i = 0; i < s[j]; ++i) qstr[i] = Q; + qstr[i] = 0; + if (SHOW_MM_INFO) { + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? is_flip+1 : 2-is_flip); + } else { + fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, + (long long)ii, j==0? 
is_flip+1 : 2-is_flip, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); + } + for (i = 0; i < s[j]; ++i) + fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); + fprintf(fpo[j], "\n+\n%s\n", qstr); + } + } + free(rseq[0].s); free(rseq[1].s); + } + free(seq.s); free(qstr); + free(tmp_seq[0]); free(tmp_seq[1]); +} + +static int simu_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: wgsim (short read simulator)\n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Contact: Heng Li \n\n"); + fprintf(stderr, "Usage: wgsim [options] \n\n"); + fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE); + fprintf(stderr, " -d INT outer distance between the two ends [500]\n"); + fprintf(stderr, " -s INT standard deviation [50]\n"); + fprintf(stderr, " -N INT number of read pairs [1000000]\n"); + fprintf(stderr, " -1 INT length of the first read [70]\n"); + fprintf(stderr, " -2 INT length of the second read [70]\n"); + fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); + fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); + fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); + fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); + fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); + fprintf(stderr, " -h haplotype mode\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + int64_t N; + int dist, std_dev, c, size_l, size_r, is_hap = 0; + FILE *fpout1, *fpout2, *fp_fa; + + N = 1000000; dist = 500; std_dev = 50; + size_l = size_r = 70; + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { + switch (c) { + case 'd': dist = atoi(optarg); break; + case 's': std_dev = atoi(optarg); break; + case 'N': N = atoi(optarg); break; + case '1': size_l = atoi(optarg); break; + case '2': 
size_r = atoi(optarg); break; + case 'e': ERR_RATE = atof(optarg); break; + case 'r': MUT_RATE = atof(optarg); break; + case 'R': INDEL_FRAC = atof(optarg); break; + case 'X': INDEL_EXTEND = atof(optarg); break; + case 'c': IS_SOLID = 1; break; + case 'C': SHOW_MM_INFO = 0; break; + case 'h': is_hap = 1; break; + } + } + if (argc - optind < 3) return simu_usage(); + fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r"); + fpout1 = xopen(argv[optind+1], "w"); + fpout2 = xopen(argv[optind+2], "w"); + wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); + + fclose(fpout1); fclose(fpout2); fclose(fp_fa); + return 0; +} diff --git a/misc/wgsim_eval.pl b/misc/wgsim_eval.pl new file mode 100755 index 0000000..99e2ac9 --- /dev/null +++ b/misc/wgsim_eval.pl @@ -0,0 +1,74 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.3 + +use strict; +use warnings; +use Getopt::Std; + +&wgsim_eval; +exit; + +sub wgsim_eval { + my %opts; + getopts('pc', \%opts); + die("Usage: wgsim_eval.pl [-pc] \n") if (@ARGV == 0 && -t STDIN); + my (@c0, @c1); + my ($max_q, $flag) = (0, 0); + my $gap = 5; + $flag |= 1 if (defined $opts{p}); + $flag |= 2 if (defined $opts{c}); + while (<>) { + my @t = split; + my $line = $_; + my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]); + $max_q = $q if ($q > $max_q); + # right coordinate + $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; + --$rght; + # correct for soft clipping + $left -= $1 if (/^(\d+)S/); + $rght += $1 if (/(\d+)S$/); + # skip unmapped reads + next if (($t[1]&0x4) || $chr eq '*'); + # parse read name and check + if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) { + if ($1 ne $chr) { # different chr + $is_correct = 0; + } else { + if ($flag & 2) { + if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward + $is_correct = 0 if (abs($2 - $left) > $gap); + } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse + $is_correct = 0 if (abs($3 - $rght) > $gap); + } elsif (($t[1]&0x80) && 
!($t[1]&0x10)) { # R3, forward + $is_correct = 0 if (abs($3 - $left) > $gap); + } else { # R3, reverse + $is_correct = 0 if (abs($2 - $rght) > $gap); + } + } else { + if ($t[1] & 0x10) { # reverse + $is_correct = 0 if (abs($3 - $rght) > $gap); # in case of indels that are close to the end of a reads + } else { + $is_correct = 0 if (abs($2 - $left) > $gap); + } + } + } + } else { + warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n"); + next; + } + ++$c0[$q]; + ++$c1[$q] unless ($is_correct); + print STDERR $line if (($flag&1) && !$is_correct && $q > 0); + } + # print + my ($cc0, $cc1) = (0, 0); + for (my $i = $max_q; $i >= 0; --$i) { + $c0[$i] = 0 unless (defined $c0[$i]); + $c1[$i] = 0 unless (defined $c1[$i]); + $cc0 += $c0[$i]; $cc1 += $c1[$i]; + printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0); + } +} diff --git a/misc/zoom2sam.pl b/misc/zoom2sam.pl new file mode 100755 index 0000000..5306bfa --- /dev/null +++ b/misc/zoom2sam.pl @@ -0,0 +1,97 @@ +#!/usr/bin/perl -w + +# Contact: lh3 +# Version: 0.1.0 + +use strict; +use warnings; +use Getopt::Std; + +&zoom2sam; +exit; + +sub mating { + my ($s1, $s2) = @_; + my $isize = 0; + if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize + my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; + my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; + $isize = $x2 - $x1; + } + # update mate coordinate + if ($s2->[2] ne '*') { + @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); + $s1->[1] |= 0x20 if ($s2->[1] & 0x10); + } else { + $s1->[1] |= 0x8; + } + if ($s1->[2] ne '*') { + @$s2[6..8] = (($s1->[2] eq $s2->[2])? 
"=" : $s1->[2], $s1->[3], -$isize); + $s2->[1] |= 0x20 if ($s1->[1] & 0x10); + } else { + $s2->[1] |= 0x8; + } +} + +sub zoom2sam { + my %opts = (); + getopts("p", \%opts); + die("Usage: zoom2sam.pl [-p] +Warnings: This script only supports the default Illumina outputs.\n") if (@ARGV < 2); + my $is_paired = defined($opts{p}); + my $len = shift(@ARGV); + # core loop + my @s1 = (); + my @s2 = (); + my ($s_last, $s_curr) = (\@s1, \@s2); + while (<>) { + &zoom2sam_aux($_, $s_curr, $is_paired, $len); + if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { + &mating($s_last, $s_curr); + print join("\t", @$s_last), "\n"; + print join("\t", @$s_curr), "\n"; + @$s_last = (); @$s_curr = (); + } else { + print join("\t", @$s_last), "\n" if (@$s_last != 0); + my $s = $s_last; $s_last = $s_curr; $s_curr = $s; + } + } + print join("\t", @$s_last), "\n" if (@$s_last != 0); +} + +sub zoom2sam_aux { + my ($line, $s, $is_paired, $len) = @_; + chomp($line); + my @t = split("\t", $line); + @$s = (); + # read name + $s->[0] = $t[0]; + # initial flag (will be updated later) + $s->[1] = 0; + $s->[1] |= 1 | 1<<6 if ($s->[0] =~ /_F$/); + $s->[1] |= 1 | 1<<7 if ($s->[0] =~ /_R$/); + $s->[1] |= 2 if ($is_paired); + # read & quality + $s->[9] = "*"; $s->[10] = "*"; + # cigar + $s->[5] = $len . "M"; + # coor + my @s = split(/\s+/, $t[1]); + $s->[2] = $s[0]; + $t[1] =~ /:(\d+)$/; + $s->[3] = $1 + 1; + if ($s->[0] =~ /_[FR]$/) { + my $u = ($s->[0] =~ /_F$/)? 1 : 0; + my $w = ($t[2] eq '+')? 
1 : 0; + $s->[1] |= 0x10 if ($u ^ $w); + $s->[0] =~ s/_[FR]$//; + } else { + $s->[1] |= 0x10 if ($t[2] eq '-'); + } + # mapQ + $s->[4] = 30; + # mate coordinate + $s->[6] = '*'; $s->[7] = $s->[8] = 0; + # aux + push(@$s, "NM:i:$t[3]"); +} diff --git a/razf.c b/razf.c new file mode 100644 index 0000000..b56065b --- /dev/null +++ b/razf.c @@ -0,0 +1,684 @@ +/* + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan , Heng Li + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NO_RAZF + +#include +#include +#include +#include +#include +#include "razf.h" + +#if ZLIB_VERNUM < 0x1221 +struct _gz_header_s { + int text; + uLong time; + int xflags; + int os; + Bytef *extra; + uInt extra_len; + uInt extra_max; + Bytef *name; + uInt name_max; + Bytef *comment; + uInt comm_max; + int hcrc; + int done; +}; +#warning "zlib < 1.2.2.1; RAZF writing is disabled." +#endif + +#define DEF_MEM_LEVEL 8 + +static inline uint32_t byte_swap_4(uint32_t v){ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint64_t byte_swap_8(uint64_t v){ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} + +static inline int is_big_endian(){ + int x = 0x01; + char *c = (char*)&x; + return (c[0] != 0x01); +} + +#ifndef _RZ_READONLY +static void add_zindex(RAZF *rz, int64_t in, int64_t out){ + if(rz->index->size == rz->index->cap){ + rz->index->cap = rz->index->cap * 1.5 + 2; + rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap); + rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1)); + } + if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out; + rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE]; + rz->index->size ++; +} + +static void save_zindex(RAZF *rz, int fd){ + int32_t i, v32; + int is_be; + is_be = is_big_endian(); + if(is_be) write(fd, &rz->index->size, sizeof(int)); + else { + v32 = byte_swap_4((uint32_t)rz->index->size); + write(fd, &v32, sizeof(uint32_t)); + } + v32 = rz->index->size / RZ_BIN_SIZE + 1; + if(!is_be){ + for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + 
for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } + write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); + write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size); +} +#endif + +static void load_zindex(RAZF *rz, int fd){ + int32_t i, v32; + int is_be; + if(!rz->load_index) return; + if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex)); + is_be = is_big_endian(); + read(fd, &rz->index->size, sizeof(int)); + if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size); + rz->index->cap = rz->index->size; + v32 = rz->index->size / RZ_BIN_SIZE + 1; + rz->index->bin_offsets = malloc(sizeof(int64_t) * v32); + read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); + rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size); + read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size); + if(!is_be){ + for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } +} + +#ifdef _RZ_READONLY +static RAZF* razf_open_w(int fd) +{ + fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n"); + return 0; +} +#else +static RAZF* razf_open_w(int fd){ + RAZF *rz; + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'w'; + rz->filedes = fd; + rz->stream = calloc(sizeof(z_stream), 1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->index = calloc(sizeof(ZBlockIndex), 1); + deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->header = calloc(sizeof(gz_header), 1); + rz->header->os = 0x03; //Unix + rz->header->text = 0; + rz->header->time = 0; + rz->header->extra = malloc(7); + strncpy((char*)rz->header->extra, "RAZF", 4); + rz->header->extra[4] = 1; // 
obsolete field + // block size = RZ_BLOCK_SIZE, Big-Endian + rz->header->extra[5] = RZ_BLOCK_SIZE >> 8; + rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF; + rz->header->extra_len = 7; + rz->header->name = rz->header->comment = 0; + rz->header->hcrc = 0; + deflateSetHeader(rz->stream, rz->header); + rz->block_pos = rz->block_off = 0; + return rz; +} + +static void _razf_write(RAZF* rz, const void *data, int size){ + int tout; + rz->stream->avail_in = size; + rz->stream->next_in = (void*)data; + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_NO_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out) break; + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + if(rz->stream->avail_in == 0) break; + }; + rz->in += size - rz->stream->avail_in; + rz->block_off += size - rz->stream->avail_in; +} + +static void razf_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + if(rz->stream->avail_out){ + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FULL_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out == 0){ + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } + rz->block_pos = rz->out; + rz->block_off = 0; +} + +static void razf_end_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FINISH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out < RZ_BUFFER_SIZE){ + 
write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } +} + +static void _razf_buffered_write(RAZF *rz, const void *data, int size){ + int i, n; + while(1){ + if(rz->buf_len == RZ_BUFFER_SIZE){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_len = 0; + } + if(size + rz->buf_len < RZ_BUFFER_SIZE){ + for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; + rz->buf_len += size; + return; + } else { + n = RZ_BUFFER_SIZE - rz->buf_len; + for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; + size -= n; + data += n; + rz->buf_len += n; + } + } +} + +int razf_write(RAZF* rz, const void *data, int size){ + int ori_size, n; + int64_t next_block; + ori_size = size; + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + while(rz->in + rz->buf_len + size >= next_block){ + n = next_block - rz->in - rz->buf_len; + _razf_buffered_write(rz, data, n); + data += n; + size -= n; + razf_flush(rz); + add_zindex(rz, rz->in, rz->out); + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + } + _razf_buffered_write(rz, data, size); + return ori_size; +} +#endif + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */ +#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define RESERVED 0xE0 /* bits 5..7: reserved */ + +static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){ + int method, flags, n, len; + if(size < 2) return 0; + if(data[0] != 0x1f || data[1] != 0x8b) return 0; + if(size < 4) return 0; + method = data[2]; + flags = data[3]; + if(method != Z_DEFLATED || (flags & RESERVED)) return 0; + n = 4 + 6; // Skip 6 bytes + *extra_off = n + 2; + *extra_len = 0; + if(flags & EXTRA_FIELD){ 
+ if(size < n + 2) return 0; + len = ((int)data[n + 1] << 8) | data[n]; + n += 2; + *extra_off = n; + while(len){ + if(n >= size) return 0; + n ++; + len --; + } + *extra_len = n - (*extra_off); + } + if(flags & ORIG_NAME) while(n < size && data[n++]); + if(flags & COMMENT) while(n < size && data[n++]); + if(flags & HEAD_CRC){ + if(n + 2 > size) return 0; + n += 2; + } + return n; +} + +static RAZF* razf_open_r(int fd, int _load_index){ + RAZF *rz; + int ext_off, ext_len; + int n, is_be, ret; + int64_t end; + unsigned char c[] = "RAZF"; + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'r'; + rz->filedes = fd; + rz->stream = calloc(sizeof(z_stream), 1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL; + n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); + ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len); + if(ret == 0){ + PLAIN_FILE: + rz->in = n; + rz->file_type = FILE_TYPE_PLAIN; + memcpy(rz->outbuf, rz->inbuf, n); + rz->buf_len = n; + free(rz->stream); + rz->stream = NULL; + return rz; + } + rz->header_size = ret; + ret = inflateInit2(rz->stream, -WINDOW_BITS); + if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;} + rz->stream->avail_in = n - rz->header_size; + rz->stream->next_in = rz->inbuf + rz->header_size; + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->file_type = FILE_TYPE_GZ; + rz->in = rz->header_size; + rz->block_pos = rz->header_size; + rz->next_block_pos = rz->header_size; + rz->block_off = 0; + if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz; + if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){ + fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. 
in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__); + return rz; + } + rz->load_index = _load_index; + rz->file_type = FILE_TYPE_RZ; + if(lseek(fd, -16, SEEK_END) == -1){ + UNSEEKABLE: + rz->seekable = 0; + rz->index = NULL; + rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL; + } else { + is_be = is_big_endian(); + rz->seekable = 1; + read(fd, &end, sizeof(int64_t)); + if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end); + else rz->src_end = end; + read(fd, &end, sizeof(int64_t)); + if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end); + else rz->end = end; + if(n > rz->end){ + rz->stream->avail_in -= n - rz->end; + n = rz->end; + } + if(rz->end > rz->src_end){ + lseek(fd, rz->in, SEEK_SET); + goto UNSEEKABLE; + } + if(lseek(fd, rz->end, SEEK_SET) != rz->end){ + lseek(fd, rz->in, SEEK_SET); + goto UNSEEKABLE; + } + load_zindex(rz, fd); + lseek(fd, n, SEEK_SET); + } + return rz; +} + +RAZF* razf_dopen(int fd, const char *mode){ + if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 1); + else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd); + else return NULL; +} + +RAZF* razf_dopen2(int fd, const char *mode) +{ + if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 0); + else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd); + else return NULL; +} + +/* Open `filename` in mode "r" or "w" (compared case-insensitively) and wrap the descriptor in a RAZF handle. Returns NULL for any other mode or when open() fails. */ +static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){ + int fd; + RAZF *rz; + if(strcasecmp(mode, "r") == 0){ + fd = open(filename, O_RDONLY); + if(fd < 0) return NULL; /* razf_open_r() would read() from -1 and memcpy a negative length */ + rz = razf_open_r(fd, _load_index); + } else if(strcasecmp(mode, "w") == 0){ + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if(fd < 0) return NULL; /* do not build a writer around an invalid descriptor */ + rz = razf_open_w(fd); + } else return NULL; + return rz; +} + +RAZF* razf_open(const char *filename, const char *mode){ + return _razf_open(filename, mode, 1); +} + +RAZF* razf_open2(const char *filename, const char *mode){ + return _razf_open(filename, mode, 0); +} + +int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){ + int64_t n; + if(rz->mode != 
'r' && rz->mode != 'R') return 0; + switch(rz->file_type){ + case FILE_TYPE_PLAIN: + if(rz->end == 0x7fffffffffffffffLL){ + if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0; + rz->end = lseek(rz->filedes, 0, SEEK_END); + lseek(rz->filedes, n, SEEK_SET); + } + *u_size = *c_size = rz->end; + return 1; + case FILE_TYPE_GZ: + return 0; + case FILE_TYPE_RZ: + if(rz->src_end == rz->end) return 0; + *u_size = rz->src_end; + *c_size = rz->end; + return 1; + default: + return 0; + } +} + +static int _razf_read(RAZF* rz, void *data, int size){ + int ret, tin; + if(rz->z_eof || rz->z_err) return 0; + if (rz->file_type == FILE_TYPE_PLAIN) { + ret = read(rz->filedes, data, size); + if (ret == 0) rz->z_eof = 1; + return ret; + } + rz->stream->avail_out = size; + rz->stream->next_out = data; + while(rz->stream->avail_out){ + if(rz->stream->avail_in == 0){ + if(rz->in >= rz->end){ rz->z_eof = 1; break; } + if(rz->end - rz->in < RZ_BUFFER_SIZE){ + rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in); + } else { + rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); + } + if(rz->stream->avail_in == 0){ + rz->z_eof = 1; + break; + } + rz->stream->next_in = rz->inbuf; + } + tin = rz->stream->avail_in; + ret = inflate(rz->stream, Z_BLOCK); + rz->in += tin - rz->stream->avail_in; + if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){ + fprintf(stderr, "[_razf_read] inflate error: %d (at %s:%d)\n", ret, __FILE__, __LINE__); + rz->z_err = 1; + break; + } + if(ret == Z_STREAM_END){ + rz->z_eof = 1; + break; + } + if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){ + rz->buf_flush = 1; + rz->next_block_pos = rz->in; + break; + } + } + return size - rz->stream->avail_out; +} + +int razf_read(RAZF *rz, void *data, int size){ + int ori_size, i; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + for(i=0;ioutbuf + rz->buf_off)[i]; + rz->buf_off += size; + rz->buf_len -= size; + data += size; + 
rz->block_off += size; + size = 0; + break; + } else { + for(i=0;i<rz->buf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i]; + data += rz->buf_len; + size -= rz->buf_len; + rz->block_off += rz->buf_len; + rz->buf_off = 0; + rz->buf_len = 0; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof && rz->buf_len == 0) break; + } + rz->out += ori_size - size; + return ori_size - size; +} + +/* Advance the read position by up to `size` uncompressed bytes without copying them out. Returns the number of bytes actually skipped, which is also added to rz->out. */ +int razf_skip(RAZF* rz, int size){ + int ori_size; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + rz->buf_off += size; + rz->buf_len -= size; + rz->block_off += size; + size = 0; + break; + } else { + size -= rz->buf_len; + rz->block_off += rz->buf_len; /* count the drained bytes BEFORE buf_len is cleared, as razf_read() does */ + rz->buf_off = 0; + rz->buf_len = 0; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof) break; + } + rz->out += ori_size - size; + return ori_size - size; +} + +static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){ + lseek(rz->filedes, in, SEEK_SET); + rz->in = in; + rz->out = out; + rz->block_pos = in; + rz->next_block_pos = in; + rz->block_off = 0; + rz->buf_flush = 0; + rz->z_eof = rz->z_err = 0; + inflateReset(rz->stream); + rz->stream->avail_in = 0; + rz->buf_off = rz->buf_len = 0; +} + +int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){ + int64_t pos; + rz->z_eof = 0; + if(rz->file_type == FILE_TYPE_PLAIN){ + rz->buf_off = rz->buf_len = 0; + pos = block_start + block_offset; + pos = lseek(rz->filedes, pos, 
SEEK_SET); + rz->out = rz->in = pos; + return pos; + } + if(block_start == rz->block_pos && block_offset >= rz->block_off) { + block_offset -= rz->block_off; + goto SKIP; // Needn't reset inflate + } + if(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start + _razf_reset_read(rz, block_start, 0); + SKIP: + if(block_offset) razf_skip(rz, block_offset); + return rz->block_off; +} + +int64_t razf_seek(RAZF* rz, int64_t pos, int where){ + int64_t idx; + int64_t seek_pos, new_out; + rz->z_eof = 0; + if (where == SEEK_CUR) pos += rz->out; + else if (where == SEEK_END) pos += rz->src_end; + if(rz->file_type == FILE_TYPE_PLAIN){ + seek_pos = lseek(rz->filedes, pos, SEEK_SET); + rz->buf_off = rz->buf_len = 0; + rz->out = rz->in = seek_pos; + return seek_pos; + } else if(rz->file_type == FILE_TYPE_GZ){ + if(pos >= rz->out) goto SKIP; + return rz->out; + } + if(pos == rz->out) return pos; + if(pos > rz->src_end) return rz->out; + if(!rz->seekable || !rz->load_index){ + if(pos >= rz->out) goto SKIP; + } + idx = pos / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + new_out = (idx + 1) * RZ_BLOCK_SIZE; + if(pos > rz->out && new_out <= rz->out) goto SKIP; + _razf_reset_read(rz, seek_pos, new_out); + SKIP: + razf_skip(rz, (int)(pos - rz->out)); + return rz->out; +} + +uint64_t razf_tell2(RAZF *rz) +{ + /* + if (rz->load_index) { + int64_t idx, seek_pos; + idx = rz->out / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 0)? 
rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off) + fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n", + (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off); + } + */ + return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff); +} + +int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where) +{ + if (where != SEEK_SET) return -1; + return razf_jump(rz, voffset>>16, voffset&0xffff); +} + +void razf_close(RAZF *rz){ + if(rz->mode == 'w'){ +#ifndef _RZ_READONLY + razf_end_flush(rz); + deflateEnd(rz->stream); + save_zindex(rz, rz->filedes); + if(is_big_endian()){ + write(rz->filedes, &rz->in, sizeof(int64_t)); + write(rz->filedes, &rz->out, sizeof(int64_t)); + } else { + uint64_t v64 = byte_swap_8((uint64_t)rz->in); + write(rz->filedes, &v64, sizeof(int64_t)); + v64 = byte_swap_8((uint64_t)rz->out); + write(rz->filedes, &v64, sizeof(int64_t)); + } +#endif + } else if(rz->mode == 'r'){ + if(rz->stream) inflateEnd(rz->stream); + } + if(rz->inbuf) free(rz->inbuf); + if(rz->outbuf) free(rz->outbuf); + if(rz->header){ + free(rz->header->extra); + free(rz->header->name); + free(rz->header->comment); + free(rz->header); + } + if(rz->index){ + free(rz->index->bin_offsets); + free(rz->index->cell_offsets); + free(rz->index); + } + free(rz->stream); + close(rz->filedes); + free(rz); +} + +#endif diff --git a/razf.h b/razf.h new file mode 100644 index 0000000..f7e5097 --- /dev/null +++ b/razf.h @@ -0,0 +1,123 @@ + /*- + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan , Heng Li + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + +#ifndef __RAZF_RJ_H +#define __RAZF_RJ_H + +#include +#include +#include "zlib.h" + +#if ZLIB_VERNUM < 0x1221 +#define _RZ_READONLY +struct _gz_header_s; +typedef struct _gz_header_s _gz_header; +#define gz_header _gz_header +#endif + +#define WINDOW_BITS 15 + +#ifndef RZ_BLOCK_SIZE +#define RZ_BLOCK_SIZE (1<mode from HEAD to TYPE after call inflateReset */ + int buf_off, buf_len; + int z_err, z_eof; + int seekable; + /* Indice where the source is seekable */ + int load_index; + /* set has_index to 0 in mode 'w', then index will be discarded */ +} RAZF; + +#ifdef __cplusplus +extern "C" { +#endif + + RAZF* razf_dopen(int data_fd, const char *mode); + RAZF *razf_open(const char *fn, const char *mode); + int razf_write(RAZF* rz, const void *data, int size); + int razf_read(RAZF* rz, void *data, int size); + int64_t razf_seek(RAZF* rz, int64_t pos, int where); + void razf_close(RAZF* rz); + +#define razf_tell(rz) ((rz)->out) + + RAZF* razf_open2(const char *filename, const char *mode); + RAZF* razf_dopen2(int fd, const char *mode); + uint64_t razf_tell2(RAZF *rz); + int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/razip.c b/razip.c new file mode 100644 index 0000000..2b49883 --- /dev/null +++ b/razip.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include +#include "razf.h" + +#define WINDOW_SIZE 4096 + +static int razf_main_usage() +{ + printf("\n"); + printf("Usage: razip [options] [file] ...\n\n"); + printf("Options: -c write on standard output, keep original files unchanged\n"); + printf(" -d decompress\n"); + printf(" -l list compressed file contents\n"); + printf(" -b INT decompress at INT position in the uncompressed file\n"); + printf(" -s INT decompress INT bytes in the uncompressed file\n"); + printf(" -h give this help\n"); + printf("\n"); + return 0; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + 
if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644)) < 0 && errno == EEXIST) { + printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn); + /* treat a failed read (EOF, closed stdin) as "no" instead of testing an uninitialised char */ + if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) { + printf("razip: not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) { + fprintf(stderr, "razip: %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +/* razip entry point: compresses by default, -d decompresses, -l lists; -c writes to stdout, -f forces overwrite, -b/-s select an uncompressed region. */ +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + RAZF *rz; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ + switch(c){ + case 'h': return razf_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'l': compress = 2; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if(end >= 0 && end < start){ + fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); + return 1; + } + if(compress == 1){ + int f_src, f_dst = -1; + if(argc > optind){ + if((f_src = open(argv[optind], O_RDONLY)) < 0){ + fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); + return 1; + } + if(pstdout){ + f_dst = fileno(stdout); + } else { + /* room for the name, ".rz" and the terminating NUL; sizeof(strlen(...) + 5) was only sizeof(size_t) bytes */ + char *name = malloc(strlen(argv[optind]) + 5); + strcpy(name, argv[optind]); + strcat(name, ".rz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } else if(pstdout){ + f_src = fileno(stdin); + f_dst = fileno(stdout); + } else return razf_main_usage(); + rz = razf_dopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); + razf_close(rz); /* f_dst will be closed here */ + /* -c promises to keep the original file, so only remove it when a .rz file was written */ + if (argc > optind && !pstdout) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + if(argc <= optind) 
return razf_main_usage(); + if(compress == 2){ + rz = razf_open(argv[optind], "r"); + if(rz == 0){ /* defensive: never dereference a null handle */ + fprintf(stderr, "razip: %s: Fail to open\n", argv[optind]); + return 1; + } + if(rz->file_type == FILE_TYPE_RZ) { + printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name"); + printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end, + argv[optind]); + } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]); + } else { + int f_dst; + if (argc > optind && !pstdout) { + char *name; + size_t len = strlen(argv[optind]); + /* require a real ".rz" suffix: strstr() returns NULL when absent and otherwise finds the FIRST ".rz", not the last */ + if (len < 3 || strcmp(argv[optind] + len - 3, ".rz") != 0) { + printf("razip: %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); + name[len - 3] = '\0'; /* output name is the input name without ".rz" */ + f_dst = write_open(name, is_forced); + free(name); + } else f_dst = fileno(stdout); + rz = razf_open(argv[optind], "r"); + if(rz == 0){ /* defensive: never dereference a null handle */ + fprintf(stderr, "razip: %s: Fail to open\n", argv[optind]); + return 1; + } + buffer = malloc(WINDOW_SIZE); + razf_seek(rz, start, SEEK_SET); + while(1){ + if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE); + else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? 
WINDOW_SIZE:(end - start)); + if(c <= 0) break; + start += c; + write(f_dst, buffer, c); + if(end >= 0 && start >= end) break; + } + free(buffer); + if (!pstdout) unlink(argv[optind]); + } + razf_close(rz); + return 0; + } +} + diff --git a/sam.c b/sam.c new file mode 100644 index 0000000..45cb05c --- /dev/null +++ b/sam.c @@ -0,0 +1,151 @@ +#include +#include "sam.h" + +#define TYPE_BAM 1 +#define TYPE_READ 2 + +bam_header_t *bam_header_dup(const bam_header_t *h0) +{ + bam_header_t *h; + int i; + h = bam_header_init(); + *h = *h0; + h->hash = 0; + h->text = (char*)calloc(h->l_text + 1, 1); + memcpy(h->text, h0->text, h->l_text); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + } + if (h0->rg2lib) h->rg2lib = bam_strmap_dup(h0->rg2lib); + return h; +} +static void append_header_text(bam_header_t *header, char* text, int len) +{ + int x = header->l_text + 1; + int y = header->l_text + len + 1; // 1 byte null + if (text == 0) return; + kroundup32(x); + kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. + header->l_text += len; + header->text[header->l_text] = 0; +} + +samfile_t *samopen(const char *fn, const char *mode, const void *aux) +{ + samfile_t *fp; + fp = (samfile_t*)calloc(1, sizeof(samfile_t)); + if (mode[0] == 'r') { // read + fp->type |= TYPE_READ; + if (mode[1] == 'b') { // binary + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? 
bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + if (fp->x.bam == 0) goto open_err_ret; + fp->header = bam_header_read(fp->x.bam); + } else { // text + fp->x.tamr = sam_open(fn); + if (fp->x.tamr == 0) goto open_err_ret; + fp->header = sam_header_read(fp->x.tamr); + if (fp->header->n_targets == 0) { // no @SQ fields + if (aux) { // check if aux is present + bam_header_t *textheader = fp->header; + fp->header = sam_header_read2((const char*)aux); + append_header_text(fp->header, textheader->text, textheader->l_text); + bam_header_destroy(textheader); + } + if (fp->header->n_targets == 0) + fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); + } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } + sam_header_parse_rg(fp->header); + } else if (mode[0] == 'w') { // write + fp->header = bam_header_dup((const bam_header_t*)aux); + if (mode[1] == 'b') { // binary + char bmode[3]; + bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); + if (fp->x.bam == 0) goto open_err_ret; + bam_header_write(fp->x.bam, fp->header); + } else { // text + // open file + fp->x.tamw = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; + if (fp->x.tamw == 0) goto open_err_ret; /* test the member that was just assigned */ + // write header + if (strstr(mode, "h")) { + int i; + bam_header_t *alt; + // parse the header text + alt = bam_header_init(); + alt->l_text = fp->header->l_text; alt->text = fp->header->text; + sam_header_parse(alt); + alt->l_text = 0; alt->text = 0; + // check if there are @SQ lines in the header + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} + if (alt->n_targets != fp->header->n_targets) + fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + } else { // then dump ->target_{name,len} + for (i = 0; i < fp->header->n_targets; ++i) + fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); + } + bam_header_destroy(alt); + } + } + } + return fp; + +open_err_ret: + /* in write mode the header was already duplicated before the open failed; release it with the handle */ + if (fp->header) bam_header_destroy(fp->header); + free(fp); + return 0; +} + +void samclose(samfile_t *fp) +{ + if (fp == 0) return; + if (fp->header) bam_header_destroy(fp->header); + if (fp->type & TYPE_BAM) bam_close(fp->x.bam); + else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); + else fclose(fp->x.tamw); + free(fp); +} + +int samread(samfile_t *fp, bam1_t *b) +{ + if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading + if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); + else return sam_read1(fp->x.tamr, fp->header, b); +} + +int samwrite(samfile_t *fp, const bam1_t *b) +{ + if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing + if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); + else { + char *s = bam_format1(fp->header, b); + int l = strlen(s); + fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); + free(s); + return l + 1; + } +} + +int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data) +{ + bam_plbuf_t *buf; + int ret; + bam1_t *b; + b = bam_init1(); + buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); + while ((ret = 
samread(fp, b)) >= 0) + bam_plbuf_push(b, buf); + bam_plbuf_push(0, buf); + bam_plbuf_destroy(buf); + bam_destroy1(b); + return 0; +} diff --git a/sam.h b/sam.h new file mode 100644 index 0000000..970cf2d --- /dev/null +++ b/sam.h @@ -0,0 +1,94 @@ +#ifndef BAM_SAM_H +#define BAM_SAM_H + +#include "bam.h" + +/*! + @header + + This file provides higher level of I/O routines and unifies the APIs + for SAM and BAM formats. These APIs are more convenient and + recommended. + + @copyright Genome Research Ltd. + */ + +/*! @typedef + @abstract SAM/BAM file handler + @field type type of the handler; bit 1 for BAM and bit 2 for reading + @field bam BAM file handler; valid if (type&1) == 1 + @field tamr SAM file handler for reading; valid if type == 2 + @field tamw SAM file handler for writing; valid if type == 0 + @field header header struct + */ +typedef struct { + int type; + union { + tamFile tamr; + bamFile bam; + FILE *tamw; + } x; + bam_header_t *header; +} samfile_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Open a SAM/BAM file + + @param fn SAM/BAM file name; "-" is recognized as stdin (for + reading) or stdout (for writing). + + @param mode open mode /[rw](b?)(u?)(h?)/: 'r' for reading, 'w' for + writing, 'b' for BAM I/O, 'u' for uncompressed BAM output and 'h' + for outputting header in SAM. If 'b' present, it must immediately + follow 'r' or 'w'. Valid modes are "r", "w", "wh", "rb", "wb" and + "wbu" exclusively. + + @param aux auxiliary data; if mode[0]=='w', aux points to + bam_header_t; if strcmp(mode, "r")==0 and @SQ header lines in SAM + are absent, aux points to the file name of the list of the reference; + aux is not used otherwise. + + @return SAM/BAM file handler; 0 if the file cannot be opened + */ + samfile_t *samopen(const char *fn, const char *mode, const void *aux); + + /*! + @abstract Close a SAM/BAM handler + @param fp file handler to be closed + */ + void samclose(samfile_t *fp); + + /*! 
+ @abstract Read one alignment + @param fp file handler + @param b alignment + @return bytes read + */ + int samread(samfile_t *fp, bam1_t *b); + + /*! + @abstract Write one alignment + @param fp file handler + @param b alignment + @return bytes written + */ + int samwrite(samfile_t *fp, const bam1_t *b); + + /*! + @abstract Get the pileup for a whole alignment file + @param fp file handler + @param mask mask transferred to bam_plbuf_set_mask() + @param func user defined function called in the pileup process + #param data user provided data for func() + */ + int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sam_view.c b/sam_view.c new file mode 100644 index 0000000..02aee3c --- /dev/null +++ b/sam_view.c @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include "sam.h" + +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; +static char *g_library, *g_rg; + +static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) +{ + if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) + return 1; + if (g_library || g_rg) { + uint8_t *s = bam_aux_get(b, "RG"); + if (s) { + if (g_rg && strcmp(g_rg, (char*)(s + 1)) == 0) return 0; + if (g_library) { + const char *p = bam_strmap_get(h->rg2lib, (char*)(s + 1)); + return (p && strcmp(p, g_library) == 0)? 
0 : 1; + } return 1; + } else return 1; + } else return 0; +} + +// callback function for bam_fetch() +static int view_func(const bam1_t *b, void *data) +{ + if (!__g_skip_aln(((samfile_t*)data)->header, b)) + samwrite((samfile_t*)data, b); + return 0; +} + +static int usage(void); + +int main_samview(int argc, char *argv[]) +{ + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0; + samfile_t *in = 0, *out = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0; + + /* parse command-line options */ + strcpy(in_mode, "r"); strcpy(out_mode, "w"); + while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:")) >= 0) { + switch (c) { + case 'S': is_bamin = 0; break; + case 'b': is_bamout = 1; break; + case 't': fn_list = strdup(optarg); is_bamin = 0; break; + case 'h': is_header = 1; break; + case 'H': is_header_only = 1; break; + case 'o': fn_out = strdup(optarg); break; + case 'f': g_flag_on = strtol(optarg, 0, 0); break; + case 'F': g_flag_off = strtol(optarg, 0, 0); break; + case 'q': g_min_mapQ = atoi(optarg); break; + case 'u': is_uncompressed = 1; break; + case 'l': g_library = strdup(optarg); break; + case 'r': g_rg = strdup(optarg); break; + default: return usage(); + } + } + if (is_uncompressed) is_bamout = 1; + if (is_header_only) is_header = 1; + if (is_bamout) strcat(out_mode, "b"); + if (is_bamin) strcat(in_mode, "b"); + if (is_header) strcat(out_mode, "h"); + if (is_uncompressed) strcat(out_mode, "u"); + if (argc == optind) return usage(); + + // open file handlers + if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { + fprintf(stderr, "[main_samview] fail to open file for reading.\n"); + goto view_end; + } + if ((out = samopen(fn_out? 
fn_out : "-", out_mode, in->header)) == 0) { + fprintf(stderr, "[main_samview] fail to open file for writing.\n"); + goto view_end; + } + if (is_header_only) goto view_end; // no need to print alignments + + if (argc == optind + 1) { // convert/print the entire file + bam1_t *b = bam_init1(); + int r; + while ((r = samread(in, b)) >= 0) // read one alignment from `in' + if (!__g_skip_aln(in->header, b)) + samwrite(out, b); // write the alignment to `out' + if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n"); + bam_destroy1(b); + } else { // retrieve alignments in specified regions + int i; + bam_index_t *idx = 0; + if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index + if (idx == 0) { // index is unavailable + fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n"); + ret = 1; + goto view_end; + } + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' + if (tid < 0) { // reference name is not found + fprintf(stderr, "[main_samview] fail to get the reference name. 
Continue anyway.\n"); + continue; + } + bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments + } + bam_index_destroy(idx); // destroy the BAM index + } + +view_end: + // close files, free and return + free(fn_list); free(fn_out); free(g_library); free(g_rg); + samclose(in); + samclose(out); + return ret; +} + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools view [options] | [region1 [...]]\n\n"); + fprintf(stderr, "Options: -b output BAM\n"); + fprintf(stderr, " -h print header for the SAM output\n"); + fprintf(stderr, " -H print header only (no alignments)\n"); + fprintf(stderr, " -S input is SAM\n"); + fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); + fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); + fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); + fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); + fprintf(stderr, " -q INT minimum mapping quality [0]\n"); + fprintf(stderr, " -l STR only output reads in library STR [null]\n"); + fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); + fprintf(stderr, "\n\ +Notes:\n\ +\n\ + 1. By default, this command assumes the file on the command line is in\n\ + the BAM format and it prints the alignments in SAM. If `-t' is\n\ + applied, the input file is assumed to be in the SAM format. The\n\ + file supplied with `-t' is SPACE/TAB delimited with the first two\n\ + fields of each line consisting of the reference name and the\n\ + corresponding sequence length. The `.fai' file generated by `faidx'\n\ + can be used here. This file may be empty if reads are unaligned.\n\ +\n\ + 2. SAM->BAM conversion: `samtools view -bt ref.fa.fai in.sam.gz'.\n\ +\n\ + 3. BAM->SAM conversion: `samtools view in.bam'.\n\ +\n\ + 4. A region should be presented in one of the following formats:\n\ + `chr1', `chr2:1,000' and `chr3:1000-2,000'. 
When a region is\n\ + specified, the input alignment file must be an indexed BAM file.\n\ +\n\ + 5. Option `-u' is preferred over `-b' when the output is piped to\n\ + another samtools command.\n\ +\n"); + return 1; +} + +int main_import(int argc, char *argv[]) +{ + int argc2, ret; + char **argv2; + if (argc != 4) { + fprintf(stderr, "Usage: bamtk import \n"); + return 1; + } + argc2 = 6; + argv2 = calloc(6, sizeof(char*)); + argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; + ret = main_samview(argc2, argv2); + free(argv2); + return ret; +} diff --git a/samtools.1 b/samtools.1 new file mode 100644 index 0000000..45e1612 --- /dev/null +++ b/samtools.1 @@ -0,0 +1,422 @@ +.TH samtools 1 "6 July 2009" "samtools-0.1.5" "Bioinformatics tools" +.SH NAME +.PP +samtools - Utilities for the Sequence Alignment/Map (SAM) format +.SH SYNOPSIS +.PP +samtools view -bt ref_list.txt -o aln.bam aln.sam.gz +.PP +samtools sort aln.bam aln.sorted +.PP +samtools index aln.sorted.bam +.PP +samtools view aln.sorted.bam chr2:20,100,000-20,200,000 +.PP +samtools merge out.bam in1.bam in2.bam in3.bam +.PP +samtools faidx ref.fasta +.PP +samtools pileup -f ref.fasta aln.sorted.bam +.PP +samtools tview aln.sorted.bam ref.fasta + +.SH DESCRIPTION +.PP +Samtools is a set of utilities that manipulate alignments in the BAM +format. It imports from and exports to the SAM (Sequence Alignment/Map) +format, does sorting, merging and indexing, and allows to retrieve reads +in any regions swiftly. + +Samtools is designed to work on a stream. It regards an input file `-' +as the standard input (stdin) and an output file `-' as the standard +output (stdout). Several commands can thus be combined with Unix +pipes. Samtools always output warning and error messages to the standard +error output (stderr). + +Samtools is also able to open a BAM (not SAM) file on a remote FTP +server if the BAM file name starts with `ftp://'. 
Samtools checks the +current working directory for the index file and will download the index +upon absence. Samtools achieves random FTP file access with the `REST' +ftp command. It does not retrieve the entire alignment file unless it is +asked to do so. + +.SH COMMANDS AND OPTIONS + +.TP 10 +.B import +samtools import + +Since 0.1.4, this command is an alias of: + +samtools view -bt -o + +.TP +.B sort +samtools sort [-n] [-m maxMem] + +Sort alignments by leftmost coordinates. File +.I .bam +will be created. This command may also create temporary files +.I .%d.bam +when the whole alignment cannot be fitted into memory (controlled by +option -m). + +.B OPTIONS: +.RS +.TP 8 +.B -n +Sort by read names rather than by chromosomal coordinates +.TP +.B -m INT +Approximately the maximum required memory. [500000000] +.RE + +.TP +.B merge +samtools merge [-n] [...] + +Merge multiple sorted alignments. The header of +.I +will be copied to +.I +and the headers of other files will be ignored. + +.B OPTIONS: +.RS +.TP 8 +.B -n +The input alignments are sorted by read names rather than by chromosomal +coordinates +.RE + +.TP +.B index +samtools index + +Index sorted alignment for fast random access. Index file +.I .bai +will be created. + +.TP +.B view +samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F +skipFlag] [-q minMapQ] [-l library] [-r readGroup] | [region1 [...]] + +Extract/print all or sub alignments in SAM or BAM format. If no region +is specified, all the alignments will be printed; otherwise only +alignments overlapping the specified regions will be output. An +alignment may be given multiple times if it is overlapping several +regions. A region can be presented, for example, in the following +format: `chr2', `chr2:1000000' or `chr2:1,000,000-2,000,000'. The +coordinate is 1-based. + +.B OPTIONS: +.RS +.TP 8 +.B -b +Output in the BAM format. +.TP +.B -u +Output uncompressed BAM. 
This option saves time spent on +compression/decompression and is thus preferred when the output is piped +to another samtools command. +.TP +.B -h +Include the header in the output. +.TP +.B -H +Output the header only. +.TP +.B -S +Input is in SAM. If @SQ header lines are absent, the +.B `-t' +option is required. +.TP +.B -t FILE +This file is TAB-delimited. Each line must contain the reference name +and the length of the reference, one line for each distinct reference; +additional fields are ignored. This file also defines the order of the +reference sequences in sorting. If you run `samtools faidx ', +the resultant index file +.I .fai +can be used as this +.I +file. +.TP +.B -o FILE +Output file [stdout] +.TP +.B -f INT +Only output alignments with all bits in INT present in the FLAG +field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0] +.TP +.B -F INT +Skip alignments with bits present in INT [0] +.TP +.B -q INT +Skip alignments with MAPQ smaller than INT [0] +.TP +.B -l STR +Only output reads in library STR [null] +.TP +.B -r STR +Only output reads in read group STR [null] +.RE + +.TP +.B faidx +samtools faidx [region1 [...]] + +Index reference sequence in the FASTA format or extract subsequence from +indexed reference sequence. If no region is specified, +.B faidx +will index the file and create +.I .fai +on the disk. If regions are specified, the subsequences will be +retrieved and printed to stdout in the FASTA format. The input file can +be compressed in the +.B RAZF +format. + +.TP +.B pileup +samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] +[-iscgS2] [-T theta] [-N nHap] [-r pairDiffRate] | + +Print the alignment in the pileup format. In the pileup format, each +line represents a genomic position, consisting of chromosome name, +coordinate, reference base, read bases, read qualities and alignment +mapping qualities. 
Information on match, mismatch, indel, strand, +mapping quality and start and end of a read are all encoded at the read +base column. At this column, a dot stands for a match to the reference +base on the forward strand, a comma for a match on the reverse strand, +`ACGTN' for a mismatch on the forward strand and `acgtn' for a mismatch +on the reverse strand. A pattern `\\+[0-9]+[ACGTNacgtn]+' indicates +there is an insertion between this reference position and the next +reference position. The length of the insertion is given by the integer +in the pattern, followed by the inserted sequence. Similarly, a pattern +`-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. The +deleted bases will be presented as `*' in the following lines. Also at +the read base column, a symbol `^' marks the start of a read segment +which is a contiguous subsequence on the read separated by `N/S/H' CIGAR +operations. The ASCII of the character following `^' minus 33 gives the +mapping quality. A symbol `$' marks the end of a read segment. + +If option +.B -c +is applied, the consensus base, consensus quality, SNP quality and RMS +mapping quality of the reads covering the site will be inserted between +the `reference base' and the `read bases' columns. An indel occupies an +additional line. Each indel line consists of chromosome name, +coordinate, a star, the genotype, consensus quality, SNP quality, RMS +mapping quality, # covering reads, the first allele, the second allele, +# reads supporting the first allele, # reads supporting the second +allele and # reads containing indels different from the top two alleles. + +.B OPTIONS: +.RS + +.TP 10 +.B -s +Print the mapping quality as the last column. This option makes the +output easier to parse, although this format is not space efficient. + +.TP +.B -S +The input file is in SAM. + +.TP +.B -i +Only output pileup lines containing indels. + +.TP +.B -f FILE +The reference sequence in the FASTA format. 
Index file +.I FILE.fai +will be created if +absent. + +.TP +.B -M INT +Cap mapping quality at INT [60] + +.TP +.B -t FILE +List of reference names and sequence lengths, in the format described +for the +.B import +command. If this option is present, samtools assumes the input +.I +is in SAM format; otherwise it assumes in BAM format. + +.TP +.B -l FILE +List of sites at which pileup is output. This file is space +delimited. The first two columns are required to be chromosome and +1-based coordinate. Additional columns are ignored. It is +recommended to use option +.B -s +together with +.B -l +as in the default format we may not know the mapping quality. + +.TP +.B -c +Call the consensus sequence using MAQ consensus model. Options +.B -T, +.B -N, +.B -I +and +.B -r +are only effective when +.B -c +or +.B -g +is in use. + +.TP +.B -g +Generate genotype likelihood in the binary GLFv3 format. This option +suppresses -c, -i and -s. + +.TP +.B -T FLOAT +The theta parameter (error dependency coefficient) in the maq consensus +calling model [0.85] + +.TP +.B -N INT +Number of haplotypes in the sample (>=2) [2] + +.TP +.B -r FLOAT +Expected fraction of differences between a pair of haplotypes [0.001] + +.TP +.B -I INT +Phred probability of an indel in sequencing/prep. [40] + +.RE + +.TP +.B tview +samtools tview [ref.fasta] + +Text alignment viewer (based on the ncurses library). In the viewer, +press `?' for help and press `g' to check the alignment start from a +region in the format like `chr10:10,000,000'. Note that if the region +shown on the screen contains no mapped reads, a blank screen will be +seen. This is a known issue and will be improved later. + +.RE + +.TP +.B fixmate +samtools fixmate + +Fill in mate coordinates, ISIZE and mate related flags from a +name-sorted alignment. + +.TP +.B rmdup +samtools rmdup + +Remove potential PCR duplicates: if multiple read pairs have identical +external coordinates, only retain the pair with highest mapping quality. 
+This command +.B ONLY +works with FR orientation and requires ISIZE is correctly set. + +.RE + +.TP +.B rmdupse +samtools rmdupse + +Remove potential duplicates for single-ended reads. This command will +treat all reads as single-ended even if they are paired in fact. + +.RE + +.TP +.B fillmd +samtools fillmd [-e] + +Generate the MD tag. If the MD tag is already present, this command will +give a warning if the MD tag generated is different from the existing +tag. + +.B OPTIONS: +.RS +.TP 8 +.B -e +Convert the read base to = if it is identical to the aligned reference +base. Indel caller does not support the = bases at the moment. + +.RE + +.SH SAM FORMAT + +SAM is TAB-delimited. Apart from the header lines, which are started +with the `@' symbol, each alignment line consists of: + +.TS +center box; +cb | cb | cb +n | l | l . +Col Field Description +_ +1 QNAME Query (pair) NAME +2 FLAG bitwise FLAG +3 RNAME Reference sequence NAME +4 POS 1-based leftmost POSition/coordinate of clipped sequence +5 MAPQ MAPping Quality (Phred-scaled) +6 CIGAR extended CIGAR string +7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) +8 MPOS 1-based Mate POSition +9 ISIZE Inferred insert SIZE +10 SEQ query SEQuence on the same strand as the reference +11 QUAL query QUALity (ASCII-33 gives the Phred base quality) +12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE +.TE + +.PP +Each bit in the FLAG field is defined as: + +.TS +center box; +cb | cb +l | l . 
+Flag Description +_ +0x0001 the read is paired in sequencing +0x0002 the read is mapped in a proper pair +0x0004 the query sequence itself is unmapped +0x0008 the mate is unmapped +0x0010 strand of the query (1 for reverse) +0x0020 strand of the mate +0x0040 the read is the first read in a pair +0x0080 the read is the second read in a pair +0x0100 the alignment is not primary +0x0200 the read fails platform/vendor quality checks +0x0400 the read is either a PCR or an optical duplicate +.TE + +.SH LIMITATIONS +.PP +.IP o 2 +Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. +.IP o 2 +CIGAR operation P is not properly handled at the moment. + +.SH AUTHOR +.PP +Heng Li from the Sanger Institute wrote the C version of samtools. Bob +Handsaker from the Broad Institute implemented the BGZF library and Jue +Ruan from Beijing Genomics Institute wrote the RAZF library. Various +people in the 1000Genomes Project contributed to the SAM format +specification. + +.SH SEE ALSO +.PP +Samtools website: http://samtools.sourceforge.net diff --git a/source.dot b/source.dot new file mode 100644 index 0000000..1735774 --- /dev/null +++ b/source.dot @@ -0,0 +1,19 @@ +digraph { + faidx[label="faidx.c\n(faidx)"] + import[label="bam_import.c\n(import)"] + plcmd[label="bam_plcmd.c\n(pileup)"] + sort[label="bam_sort.c\n(sort, merge)"] + index[label="bam_index.c\n(index)"] + tview[label="bam_tview.c\n(tview)"] + glf[label="glf.c\n(glfview)"] + rmdup[label="bam_rmdup.c\n(rmdup)"] + fixmate[label="bam_mate.c\n(fixmate)"] + "bam_aux.c" -> {"bam.c", import} + glf -> {"bam_maqcns.c", plcmd} + "bgzf.c" -> {"bam.c", glf} + "bam.c" -> {index, "bam_pileup.c", sort, import, rmdup, fixmate} + "bam_pileup.c" -> {"bam_lpileup.c", plcmd} + {"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview + {import, faidx, "bam_maqcns.c"} -> plcmd + {tview, plcmd, faidx, sort, import, index, glf, rmdup, fixmate} -> "bamtk.c\n(view)" +} \ No newline at end of file -- 2.30.2