From aa8ecff068edbb09a03bd874fce716e93e22e53c Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 19 Nov 2010 11:43:01 -0800 Subject: [PATCH] Imported Upstream version 0.2 --- COPYING | 21 + INSTALL | 50 ++ KNOWN_BUGS | 0 MANIFEST.in | 26 + PKG-INFO | 15 + THANKS | 3 + pysam/Pileup.py | 60 ++ pysam/__init__.py | 101 +++ pysam/csamtools.pxd | 251 ++++++ pysam/csamtools.pyx | 1782 +++++++++++++++++++++++++++++++++++++++ pysam/namedtuple.py | 117 +++ pysam/pysam_util.c | 499 +++++++++++ pysam/pysam_util.h | 95 +++ samtools/bam.c | 303 +++++++ samtools/bam.h | 697 +++++++++++++++ samtools/bam_aux.c | 182 ++++ samtools/bam_color.c | 127 +++ samtools/bam_endian.h | 42 + samtools/bam_import.c | 439 ++++++++++ samtools/bam_index.c | 574 +++++++++++++ samtools/bam_lpileup.c | 198 +++++ samtools/bam_maqcns.c | 601 +++++++++++++ samtools/bam_maqcns.h | 56 ++ samtools/bam_mate.c | 70 ++ samtools/bam_md.c | 149 ++++ samtools/bam_pileup.c | 238 ++++++ samtools/bam_plcmd.c | 392 +++++++++ samtools/bam_rmdup.c | 206 +++++ samtools/bam_rmdupse.c | 159 ++++ samtools/bam_sort.c | 357 ++++++++ samtools/bam_stat.c | 78 ++ samtools/bam_tview.c | 415 +++++++++ samtools/bgzf.c | 683 +++++++++++++++ samtools/bgzf.h | 134 +++ samtools/faidx.c | 422 +++++++++ samtools/faidx.h | 103 +++ samtools/glf.c | 236 ++++++ samtools/glf.h | 56 ++ samtools/kaln.c | 370 ++++++++ samtools/kaln.h | 55 ++ samtools/khash.h | 486 +++++++++++ samtools/klist.h | 96 +++ samtools/knetfile.c | 632 ++++++++++++++ samtools/knetfile.h | 75 ++ samtools/kseq.h | 227 +++++ samtools/ksort.h | 271 ++++++ samtools/kstring.c | 165 ++++ samtools/kstring.h | 68 ++ samtools/razf.c | 853 +++++++++++++++++++ samtools/razf.h | 134 +++ samtools/sam.c | 174 ++++ samtools/sam.h | 98 +++ samtools/sam_header.c | 701 +++++++++++++++ samtools/sam_header.h | 24 + samtools/sam_view.c | 224 +++++ setup.cfg | 6 + setup.py | 79 ++ tests/00README.txt | 32 + tests/Makefile | 32 + tests/ex1.fa | 56 ++ tests/ex1.sam.gz | Bin 0 -> 113194 bytes 
tests/ex3.sam | 13 + tests/ex4.sam | 9 + tests/ex5.sam | 5 + tests/ex6.sam | 5 + tests/ex7.sam | 2 + tests/example.py | 121 +++ tests/pysam_test.py | 841 ++++++++++++++++++ tests/segfault_tests.py | 37 + 69 files changed, 15828 insertions(+) create mode 100644 COPYING create mode 100644 INSTALL create mode 100644 KNOWN_BUGS create mode 100644 MANIFEST.in create mode 100644 PKG-INFO create mode 100644 THANKS create mode 100644 pysam/Pileup.py create mode 100644 pysam/__init__.py create mode 100644 pysam/csamtools.pxd create mode 100644 pysam/csamtools.pyx create mode 100644 pysam/namedtuple.py create mode 100644 pysam/pysam_util.c create mode 100644 pysam/pysam_util.h create mode 100644 samtools/bam.c create mode 100644 samtools/bam.h create mode 100644 samtools/bam_aux.c create mode 100644 samtools/bam_color.c create mode 100644 samtools/bam_endian.h create mode 100644 samtools/bam_import.c create mode 100644 samtools/bam_index.c create mode 100644 samtools/bam_lpileup.c create mode 100644 samtools/bam_maqcns.c create mode 100644 samtools/bam_maqcns.h create mode 100644 samtools/bam_mate.c create mode 100644 samtools/bam_md.c create mode 100644 samtools/bam_pileup.c create mode 100644 samtools/bam_plcmd.c create mode 100644 samtools/bam_rmdup.c create mode 100644 samtools/bam_rmdupse.c create mode 100644 samtools/bam_sort.c create mode 100644 samtools/bam_stat.c create mode 100644 samtools/bam_tview.c create mode 100644 samtools/bgzf.c create mode 100644 samtools/bgzf.h create mode 100644 samtools/faidx.c create mode 100644 samtools/faidx.h create mode 100644 samtools/glf.c create mode 100644 samtools/glf.h create mode 100644 samtools/kaln.c create mode 100644 samtools/kaln.h create mode 100644 samtools/khash.h create mode 100644 samtools/klist.h create mode 100644 samtools/knetfile.c create mode 100644 samtools/knetfile.h create mode 100644 samtools/kseq.h create mode 100644 samtools/ksort.h create mode 100644 samtools/kstring.c create mode 100644 
samtools/kstring.h create mode 100644 samtools/razf.c create mode 100644 samtools/razf.h create mode 100644 samtools/sam.c create mode 100644 samtools/sam.h create mode 100644 samtools/sam_header.c create mode 100644 samtools/sam_header.h create mode 100644 samtools/sam_view.c create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 tests/00README.txt create mode 100644 tests/Makefile create mode 100644 tests/ex1.fa create mode 100644 tests/ex1.sam.gz create mode 100644 tests/ex3.sam create mode 100644 tests/ex4.sam create mode 100644 tests/ex5.sam create mode 100644 tests/ex6.sam create mode 100644 tests/ex7.sam create mode 100644 tests/example.py create mode 100755 tests/pysam_test.py create mode 100755 tests/segfault_tests.py diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..82fa2f4 --- /dev/null +++ b/COPYING @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2008-2009 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
\ No newline at end of file diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..65330bc --- /dev/null +++ b/INSTALL @@ -0,0 +1,50 @@ +System Requirements +=================== + +SAMtools depends on the zlib library (http://www.zlib.net). The latest +version 1.2.3 is preferred and with the latest version you can compile +razip and use it to compress a FASTA file. SAMtools' faidx is able to +index a razip-compressed FASTA file to save disk space. Older zlib also +works with SAMtools, but razip cannot be compiled. + +The text-based viewer (tview) requires the GNU ncurses library +(http://www.gnu.org/software/ncurses/), which comes with Mac OS X and +most of the modern Linux/Unix distributions. If you do not have this +library installed, you can still compile the rest of SAMtools by +manually modifying one line in Makefile. + +Pysam requires pyrex (0.9.8 or greater) and python (2.6 or greater). +It has not been tested on many other platforms. + +Compilation +=========== + +Unpack the distribution and enter the pysam directory. Type + +python setup.py build + +to compile. + +Installation +============ + +Type + + python setup.py install + +to install it within the site-packages directory of your python +distribution. Type + + python setup.py install --help + +for more options. + +Architecture specific options +============================= + +Pysam has been compiled on various linux systems and works +with python 2.6 and python 2.5. + +Python 2.7 and Python 3 have not been tested. + +Windows is not supported yet. diff --git a/KNOWN_BUGS b/KNOWN_BUGS new file mode 100644 index 0000000..e69de29 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..11fb9d1 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,26 @@ +# +# Use .add_data_files and .add_data_dir methods in the appropriate +# setup.py files to include non-python files such as documentation, +# data, etc. in the distribution. Avoid using MANIFEST.in for that. 
+# +include MANIFEST.in +include COPYING +include INSTALL +include KNOWN_BUGS +include THANKS +include pysam/csamtools.pxd +include pysam/pysam_util.h +include samtools/*.h +include tests/00README.txt +include tests/Makefile +include tests/ex1.fa +include tests/ex1.sam.gz +include tests/ex3.sam +include tests/ex4.sam +include tests/ex5.sam +include tests/ex6.sam +include tests/ex7.sam +include tests/example.py +include tests/pysam_test.py +include tests/segfault_tests.py + diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..3e3b745 --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,15 @@ +Metadata-Version: 1.0 +Name: pysam +Version: 0.2 +Summary: pysam +Home-page: http://code.google.com/p/pysam/ +Author: Andreas Heger +Author-email: andreas.heger@gmail.com +License: MIT +Description: + + pysam + ***** + + +Platform: ALL diff --git a/THANKS b/THANKS new file mode 100644 index 0000000..02fea67 --- /dev/null +++ b/THANKS @@ -0,0 +1,3 @@ +We would like to thank Heng Li and the other samtools contributors for their support +and their hard work. As a wrapper, pysam merely tries to make their code accessible +to the python community - the heavy lifting has been done by the samtools developers. 
diff --git a/pysam/Pileup.py b/pysam/Pileup.py new file mode 100644 index 0000000..e182d12 --- /dev/null +++ b/pysam/Pileup.py @@ -0,0 +1,60 @@ +'''Tools for working with files in the samtools pileup -c format.''' +import collections +import pysam + +PileupSubstitution = collections.namedtuple( "PileupSubstitution", + " ".join( (\ + "chromosome", + "position", + "reference_base", + "consensus_base", + "consensus_quality", + "snp_quality", + "rms_mapping_quality", + "coverage", + "read_bases", + "base_qualities" ) ) ) + +PileupIndel = collections.namedtuple( "PileupIndel", + " ".join( (\ + "chromosome", + "position", + "reference_base", + "genotype", + "consensus_quality", + "snp_quality", + "rms_mapping_quality", + "coverage", + "first_allelle", + "second_allele", + "reads_first", + "reads_second", + "reads_diff" ) ) ) + +def iterate( infile ): + '''iterate over ``samtools pileup -c`` formatted file. + + *infile* can be any iterator over lines; a trailing newline on each line is optional. + + The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution` + or :class:`pysam.Pileup.PileupIndel`; a line whose fields cannot be converted raises :class:`pysam.SamtoolsError`. + + .. 
note:: + The parser converts to 0-based coordinates + ''' + + conv_subst = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str) + conv_indel = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str,int,int,int) + + for line in infile: + d = line.split() # split() already discards the newline; slicing line[:-1] truncated unterminated lines + if d[2] == "*": + try: + yield PileupIndel( *[x(y) for x,y in zip(conv_indel,d) ] ) + except (TypeError, ValueError): # wrong field count -> TypeError, non-numeric field -> ValueError + raise pysam.SamtoolsError( "parsing error in line: `%s`" % line) + else: + try: + yield PileupSubstitution( *[x(y) for x,y in zip(conv_subst,d) ] ) + except (TypeError, ValueError): # wrong field count -> TypeError, non-numeric field -> ValueError + raise pysam.SamtoolsError( "parsing error in line: `%s`" % line) diff --git a/pysam/__init__.py b/pysam/__init__.py new file mode 100644 index 0000000..3062753 --- /dev/null +++ b/pysam/__init__.py @@ -0,0 +1,101 @@ +from csamtools import * +import Pileup +import sys +import os + +class SamtoolsError( Exception ): + '''exception raised in case of an error incurred in the samtools library.''' + + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +class SamtoolsDispatcher(object): + '''samtools dispatcher. + + Emulates the samtools command line as module calls. + + Captures stdout and stderr. + + Raises a :class:`pysam.SamtoolsError` exception in case + samtools exits with an error code other than 0. + + Some command line options are associated with parsers. + For example, the samtools command "pileup -c" creates + a tab-separated table on standard output. In order to + associate parsers with options, an optional list of + parsers can be supplied. The list will be processed + in order checking for the presence of each option. + + If no parser is given or no appropriate parser is found, + the stdout output of samtools commands will be returned. 
+ ''' + dispatch=None + parsers=None + + def __init__(self,dispatch, parsers): + self.dispatch = dispatch + self.parsers = parsers + self.stderr = [] + + def __call__(self,*args, **kwargs): + '''execute the samtools command with *args*; pass ``raw=True`` to skip the output parsers. + ''' + retval, stderr, stdout = csamtools._samtools_dispatch( self.dispatch, args ) + if retval: raise SamtoolsError( "\n".join( stderr ) ) + self.stderr = stderr + # samtools commands do not propagate the return code correctly. + # I have thus added this patch to throw if there is output on stderr. + # Note that there is sometimes output on stderr that is not an error, + # for example: [sam_header_read2] 2 sequences loaded. + # Ignore messages like these + stderr = [ x for x in stderr if not x.startswith( "[sam_header_read2]" ) ] + if stderr: raise SamtoolsError( "\n".join( stderr ) ) + + # call parser for stdout: + if not kwargs.get("raw") and stdout and self.parsers: + for options, parser in self.parsers: + for option in options: + if option not in args: break + else: + return parser(stdout) + + return stdout + + def getMessages( self ): + return self.stderr + + def usage(self): + '''return the samtools usage information for this command''' + retval, stderr, stdout = csamtools._samtools_dispatch( self.dispatch ) + return "".join(stderr) + +# +# samtools command line options to export in python +# +# import is a python reserved word. 
+SAMTOOLS_DISPATCH = { + "view" : ( "view", None ), + "sort" : ( "sort", None), + "samimport": ( "import", None), + "pileup" : ( "pileup", ( (("-c",), Pileup.iterate ), ), ), + "faidx" : ("faidx", None), + "tview" : ("tview", None), + "index" : ("index", None), + "fixmate" : ("fixmate", None), + "glfview" : ("glfview", None), + "flagstat" : ("flagstat", None), + "calmd" : ("calmd", None), + "merge" : ("merge", None), + "rmdup" : ("rmdup", None) } + +# instantiate samtools commands as python functions +for key, options in SAMTOOLS_DISPATCH.iteritems(): + cmd, parser = options + globals()[key] = SamtoolsDispatcher(cmd, parser) + +# hack to export all the symbols from csamtools +__all__ = csamtools.__all__ + [ "SamtoolsError", "SamtoolsDispatcher" ] + list(SAMTOOLS_DISPATCH) +\ + ["Pileup",] + diff --git a/pysam/csamtools.pxd b/pysam/csamtools.pxd new file mode 100644 index 0000000..7dac38d --- /dev/null +++ b/pysam/csamtools.pxd @@ -0,0 +1,251 @@ + +cdef extern from "string.h": + ctypedef int size_t + void *memcpy(void *dst,void *src,size_t len) + void *memmove(void *dst,void *src,size_t len) + void *memset(void *b,int c,size_t len) + +cdef extern from "stdlib.h": + void free(void *) + void *malloc(size_t) + void *calloc(size_t,size_t) + void *realloc(void *,size_t) + int c_abs "abs" (int) + void qsort(void *base, size_t nmemb, size_t size, + int (*compar)(void *,void *)) + +cdef extern from "stdio.h": + ctypedef struct FILE: + pass + FILE *fopen(char *,char *) + FILE *freopen(char *path, char *mode, FILE *stream) + int fileno(FILE *stream) + int dup2(int oldfd, int newfd) + int fflush(FILE *stream) + + FILE * stderr + FILE * stdout + int fclose(FILE *) + int sscanf(char *str,char *fmt,...) + int printf(char *str,char *fmt,...) + int sprintf(char *str,char *fmt,...) + int fprintf(FILE *ifile,char *fmt,...) 
+ char *fgets(char *str,int size,FILE *ifile) + +cdef extern from "ctype.h": + int toupper(int c) + int tolower(int c) + +cdef extern from "unistd.h": + char *ttyname(int fd) + int isatty(int fd) + +cdef extern from "string.h": + int strcmp(char *s1, char *s2) + int strncmp(char *s1,char *s2,size_t len) + char *strcpy(char *dest,char *src) + char *strncpy(char *dest,char *src, size_t len) + char *strdup(char *) + char *strcat(char *,char *) + size_t strlen(char *s) + int memcmp( void * s1, void *s2, size_t len ) + +cdef extern from "razf.h": + pass + +cdef extern from "stdint.h": + ctypedef int int64_t + ctypedef int int32_t + ctypedef int uint32_t + ctypedef int uint8_t + ctypedef int uint64_t + + +cdef extern from "bam.h": + + # IF _IOLIB=2, bamFile = BGZF, see bgzf.h + # samtools uses KNETFILE, check how this works + + ctypedef struct tamFile: + pass + + ctypedef struct bamFile: + pass + + ctypedef struct bam1_core_t: + int32_t tid + int32_t pos + uint32_t bin + uint32_t qual + uint32_t l_qname + uint32_t flag + uint32_t n_cigar + int32_t l_qseq + int32_t mtid + int32_t mpos + int32_t isize + + ctypedef struct bam1_t: + bam1_core_t core + int l_aux + int data_len + int m_data + uint8_t *data + + ctypedef struct bam_pileup1_t: + bam1_t *b + int32_t qpos + int indel + int level + uint32_t is_del + uint32_t is_head + uint32_t is_tail + + ctypedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *data) + + ctypedef int (*bam_fetch_f)(bam1_t *b, void *data) + + ctypedef struct bam_header_t: + int32_t n_targets + char **target_name + uint32_t *target_len + void *hash + void *rg2lib + int l_text + char *text + + ctypedef struct bam_index_t: + pass + + ctypedef struct bam_plbuf_t: + pass + + bamFile razf_dopen(int data_fd, char *mode) + + # removed - macros not found + + # int64_t bam_seek( bamFile fp, uint64_t voffset, int where) + # int64_t bam_tell( bamFile fp ) + # void bam_destroy1( bam1_t * b) + # void 
bam_init_header_hash(bam_header_t *header) + + bam1_t * bam_dup1( bam1_t *src ) + + bam1_t * bam_copy1(bam1_t *bdst, bam1_t *bsrc) + bam_index_t *bam_index_load(char *f ) + + void bam_index_destroy(bam_index_t *idx) + + int bam_parse_region(bam_header_t *header, char *str, int *ref_id, int *begin, int *end) + + bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) + + int bam_fetch(bamFile fp, bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) + + int bam_plbuf_push(bam1_t *b, bam_plbuf_t *buf) + + void bam_plbuf_destroy(bam_plbuf_t *buf) + + int bam_read1(bamFile fp, bam1_t *b) + + int bam_write1( bamFile fp, bam1_t *b) + + bam_header_t *bam_header_init() + + int bam_header_write( bamFile fp, bam_header_t *header) + + bam_header_t *bam_header_read( bamFile fp ) + + void bam_header_destroy(bam_header_t *header) + + bam1_t * bam_dup1( bam1_t *src ) + + bam1_t * bam_copy1(bam1_t *bdst, bam1_t *bsrc) + + uint8_t *bam_aux_get(bam1_t *b, char tag[2]) + + int bam_aux2i(uint8_t *s) + float bam_aux2f(uint8_t *s) + double bam_aux2d(uint8_t *s) + char bam_aux2A( uint8_t *s) + char *bam_aux2Z( uint8_t *s) + + int bam_reg2bin(uint32_t beg, uint32_t end) + + uint32_t bam_calend(bam1_core_t *c, uint32_t *cigar) + +cdef extern from "sam.h": + + ctypedef struct samfile_t_un: + tamFile tamr + bamFile bam + FILE *tamw + + ctypedef struct samfile_t: + int type + samfile_t_un x + bam_header_t *header + + samfile_t *samopen( char *fn, char * mode, void *aux) + + int sampileup( samfile_t *fp, int mask, bam_pileup_f func, void *data) + + void samclose(samfile_t *fp) + + int samread(samfile_t *fp, bam1_t *b) + + int samwrite(samfile_t *fp, bam1_t *b) + +cdef extern from "faidx.h": + + ctypedef struct faidx_t: + pass + + int fai_build(char *fn) + + void fai_destroy(faidx_t *fai) + + faidx_t *fai_load(char *fn) + + char *fai_fetch(faidx_t *fai, char *reg, int *len) + +cdef extern from "pysam_util.h": + + int pysam_bam_plbuf_push(bam1_t *b, bam_plbuf_t *buf, int 
cont) + + int pysam_get_pos( bam_plbuf_t *buf) + + int pysam_get_tid( bam_plbuf_t *buf) + + bam_pileup1_t * pysam_get_pileup( bam_plbuf_t *buf) + + int pysam_dispatch(int argc, char *argv[] ) + + # stand-in functions for samtools macros + void pysam_bam_destroy1( bam1_t * b) + + # add *nbytes* into the variable length data of *src* at *pos* + bam1_t * pysam_bam_update( bam1_t * b, + size_t nbytes_old, + size_t nbytes_new, + uint8_t * pos ) + + # translate char to unsigned char + unsigned char pysam_translate_sequence( char s ) + + # stand-ins for samtools macros + uint32_t * pysam_bam1_cigar( bam1_t * b) + char * pysam_bam1_qname( bam1_t * b) + uint8_t * pysam_bam1_seq( bam1_t * b) + uint8_t * pysam_bam1_qual( bam1_t * b) + uint8_t * pysam_bam1_aux( bam1_t * b) + + # iterator implemenation + ctypedef struct bam_fetch_iterator_t: + pass + + bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, bam_index_t *idx, int tid, int beg, int end) + + bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter) + + void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter) diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx new file mode 100644 index 0000000..0da8d9e --- /dev/null +++ b/pysam/csamtools.pyx @@ -0,0 +1,1782 @@ +# cython: embedsignature=True +# adds doc-strings for sphinx + +import tempfile, os, sys, types, itertools, struct, ctypes + +# defines imported from samtools +DEF SEEK_SET = 0 +DEF SEEK_CUR = 1 +DEF SEEK_END = 2 + +## These are bits set in the flag. 
+## have to put these definitions here, in csamtools.pxd they got ignored +## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ +DEF BAM_FPAIRED =1 +## @abstract the read is mapped in a proper pair */ +DEF BAM_FPROPER_PAIR =2 +## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ +DEF BAM_FUNMAP =4 +## @abstract the mate is unmapped */ +DEF BAM_FMUNMAP =8 +## @abstract the read is mapped to the reverse strand */ +DEF BAM_FREVERSE =16 +## @abstract the mate is mapped to the reverse strand */ +DEF BAM_FMREVERSE =32 +## @abstract this is read1 */ +DEF BAM_FREAD1 =64 +## @abstract this is read2 */ +DEF BAM_FREAD2 =128 +## @abstract not primary alignment */ +DEF BAM_FSECONDARY =256 +## @abstract QC failure */ +DEF BAM_FQCFAIL =512 +## @abstract optical or PCR duplicate */ +DEF BAM_FDUP =1024 + +DEF BAM_CIGAR_SHIFT=4 +DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1) + +##################################################################### +##################################################################### +##################################################################### +## private factory methods +##################################################################### +cdef class AlignedRead +cdef makeAlignedRead( bam1_t * src): + '''enter src into AlignedRead.''' + cdef AlignedRead dest + dest = AlignedRead() + # destroy dummy delegate created in constructor + # to prevent memory leak. 
+ pysam_bam_destroy1(dest._delegate) + dest._delegate = bam_dup1(src) + return dest + +cdef class PileupProxy +cdef makePileupProxy( bam_plbuf_t * buf, int n ): + cdef PileupProxy dest + dest = PileupProxy() + dest.buf = buf + dest.n = n + return dest + +cdef class PileupRead +cdef makePileupRead( bam_pileup1_t * src ): + '''fill a PileupRead object from a bam_pileup1_t * object.''' + cdef PileupRead dest + dest = PileupRead() + dest._alignment = makeAlignedRead( src.b ) + dest._qpos = src.qpos + dest._indel = src.indel + dest._level = src.level + dest._is_del = src.is_del + dest._is_head = src.is_head + dest._is_tail = src.is_tail + return dest + +##################################################################### +##################################################################### +##################################################################### +## Generic callbacks for inserting python callbacks. +##################################################################### +cdef int fetch_callback( bam1_t *alignment, void *f): + '''callback for bam_fetch. + + calls function in *f* with a new :class:`AlignedRead` object as parameter. + ''' + a = makeAlignedRead( alignment ) + (f)(a) + +class PileupColumn(object): + '''A pileup column. A pileup column contains + all the reads that map to a certain target base. + + tid + chromosome ID as is defined in the header + pos + the target base coordinate (0-based) + n + number of reads mapping to this column + pileups + list of reads (:class:`pysam.PileupRead`) aligned to this column + ''' + def __str__(self): + return "\t".join( map(str, (self.tid, self.pos, self.n))) +\ + "\n" + "\n".join( map(str, self.pileups) ) + +cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *f): + '''callback for pileup. + + calls function in *f* with a new :class:`Pileup` object as parameter. 
+ + tid + chromosome ID as is defined in the header + pos + start coordinate of the alignment, 0-based + n + number of elements in pl array + pl + array of alignments + data + user provided data + ''' + + p = PileupColumn() + p.tid = tid + p.pos = pos + p.n = n + pileups = [] + + for x from 0 <= x < n: + pileups.append( makePileupRead( &(pl[x]) ) ) + p.pileups = pileups + + (f)(p) + +cdef int pileup_fetch_callback( bam1_t *b, void *data): + '''callback for bam_fetch. + + Fetches reads and submits them to pileup. + ''' + cdef bam_plbuf_t * buf + buf = data + bam_plbuf_push(b, buf) + return 0 + +class StderrStore(): + ''' + stderr is captured. + ''' + def __init__(self): + self.stderr_h, self.stderr_f = tempfile.mkstemp() + self.stderr_save = Outs( sys.stderr.fileno() ) + self.stderr_save.setfd( self.stderr_h ) + + def release(self): + self.stderr_save.restore() + if os.path.exists(self.stderr_f): + os.remove( self.stderr_f ) + + def __del__(self): + self.release() + +###################################################################### +###################################################################### +###################################################################### +# valid types for sam headers +VALID_HEADER_TYPES = { "HD" : dict, + "SQ" : list, + "RG" : list, + "PG" : list, + "CO" : list } + +# order of records within sam headers +VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO" ) + +# type conversions within sam header records +VALID_HEADER_FIELDS = { "HD" : { "VN" : str, "SO" : str, "GO" : str }, + "SQ" : { "SN" : str, "LN" : int, "AS" : str, "M5" : str, "UR" : str, "SP" : str }, + "RG" : { "ID" : str, "SM" : str, "LB" : str, "DS" : str, "PU" : str, "PI" : str, "CN" : str, "DT" : str, "PL" : str, }, + "PG" : { "ID" : str, "VN" : str, "CL" : str }, } + +# output order of fields within records +VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ), + "SQ" : ( "SN", "LN", "AS", "M5" , "UR" , "SP" ), + "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , 
"DT", "PL" ), + "PG" : ( "ID", "VN", "CL" ), } + +###################################################################### +###################################################################### +###################################################################### +## Public methods +###################################################################### +cdef class Samfile: + '''*(filename, mode='r', template = None, referencenames = None, referencelengths = None, text = NULL, header = None)* + + A *SAM* file. The file is automatically opened. + + *mode* should be ``r`` for reading or ``w`` for writing. The default is text mode so for binary + (:term:`BAM`) I/O you should append ``b`` for compressed or ``u`` for uncompressed :term:`BAM` output. + Use ``h`` to output header information in text (:term:`TAM`) mode. + + If ``b`` is present, it must immediately follow ``r`` or ``w``. + Currently valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb`` and ``wbu``. + + so to open a :term:`BAM` file for reading:: + + f=Samfile('ex1.bam','rb') + + + For writing, the header of a :term:`TAM` file/:term:`BAM` file can be constituted from several + sources: + + 1. If *template* is given, the header is copied from a another *Samfile* (*template* must be of type *Samfile*). + + 2. If *header* is given, the header is build from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). The second level is then a list of lines, with each line being a list of tag-value pairs. + + 3. If *text* is given, new header text is copied from raw text. + + 4. The names (*referencenames*) and lengths (*referencelengths*) are supplied directly as lists. + + If an index for a BAM file exists (.bai), it will be opened automatically. Without an index random + access to reads via :meth:`fetch` and :meth:`pileup` is disabled. 
+ ''' + + cdef char * filename + # pointer to samfile + cdef samfile_t * samfile + # pointer to index + cdef bam_index_t *index + # true if file is a bam file + cdef int isbam + + # current read within iteration + cdef bam1_t * b + + def __cinit__(self, *args, **kwargs ): + self.samfile = NULL + self.isbam = False + self._open( *args, **kwargs ) + + # allocate memory for iterator + self.b = calloc(1, sizeof(bam1_t)) + + def _isOpen( self ): + '''return true if samfile has been opened.''' + return self.samfile != NULL + + def _hasIndex( self ): + '''return true if samfile has an existing (and opened) index.''' + return self.index != NULL + + def _open( self, + char * filename, + mode ='r', + Samfile template = None, + referencenames = None, + referencelengths = None, + char * text = NULL, + header = None, + ): + '''open a sam/bam file. + + If _open is called on an existing bamfile, the current file will be + closed and a new file will be opened. + ''' + + assert mode in ( "r","w","rb","wb", "wh", "wbu" ), "invalid file opening mode `%s`" % mode + + # close a previously opened file + if self.samfile != NULL: self.close() + self.samfile = NULL + + cdef bam_header_t * header_to_write + header_to_write = NULL + + self.filename = filename + + self.isbam = len(mode) > 1 and mode[1] == 'b' + + if mode[0] == 'w': + # open file for writing + + # header structure (used for writing) + if template: + # copy header from another file + header_to_write = template.samfile.header + + elif header: + header_to_write = self._buildHeader( header ) + + else: + # build header from a target names and lengths + assert referencenames and referencelengths, "either supply options `template`, `header` or both `refernencenames` and `referencelengths` for writing" + assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences" + + # allocate and fill header + header_to_write = bam_header_init() + header_to_write.n_targets = len(referencenames) + n = 0 + 
for x in referencenames: n += len(x) + 1 + header_to_write.target_name = calloc(n, sizeof(char*)) + header_to_write.target_len = calloc(n, sizeof(uint32_t)) + for x from 0 <= x < header_to_write.n_targets: + header_to_write.target_len[x] = referencelengths[x] + name = referencenames[x] + header_to_write.target_name[x] = calloc(len(name)+1, sizeof(char)) + strncpy( header_to_write.target_name[x], name, len(name) ) + + if text != NULL: + # copy without \0 + header_to_write.l_text = strlen(text) + header_to_write.text = calloc( strlen(text), sizeof(char) ) + memcpy( header_to_write.text, text, strlen(text) ) + + header_to_write.hash = NULL + header_to_write.rg2lib = NULL + + # open file. Header gets written to file at the same time for bam files + # and sam files (in the latter case, the mode needs to be wh) + store = StderrStore() + self.samfile = samopen( filename, mode, header_to_write ) + store.release() + + # bam_header_destroy takes care of cleaning up of all the members + if not template and header_to_write != NULL: + bam_header_destroy( header_to_write ) + + elif mode[0] == "r": + # open file for reading + if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ): + raise IOError( "file `%s` not found" % filename) + + store = StderrStore() + self.samfile = samopen( filename, mode, NULL ) + store.release() + + if self.samfile == NULL: + raise IOError("could not open file `%s`" % filename ) + + if mode[0] == "r" and self.isbam: + if not os.path.exists(filename + ".bai"): + self.index = NULL + else: + # returns NULL if there is no index or index could not be opened + self.index = bam_index_load(filename) + if self.index == NULL: + raise IOError("error while opening index `%s` " % filename ) + + def getrname( self, tid ): + '''(tid ) + convert numerical :term:`tid` into :ref:`reference` name.''' + if not 0 <= tid < self.samfile.header.n_targets: + raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets ) + return 
self.samfile.header.target_name[tid] + + def _parseRegion( self, + reference = None, + start = None, + end = None, + region = None ): + '''parse region information. + + raise Value for for invalid regions. + + returns a tuple of region, tid, start and end. Region + is a valid samtools :term:`region` or None if the region + extends over the whole file. + + Note that regions are 1-based, while start,end are python coordinates. + ''' + + cdef int rtid + cdef int rstart + cdef int rend + cdef int max_pos + max_pos = 2 << 29 + + rtid = rstart = rend = 0 + + # translate to a region + if reference: + if start != None and end != None: + region = "%s:%i-%i" % (reference, start+1, end) + else: + region = reference + + if region: + store = StderrStore() + bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend) + store.release() + if rtid < 0: raise ValueError( "invalid region `%s`" % region ) + if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) ) + if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart ) + if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend ) + + return region, rtid, rstart, rend + + def fetch( self, + reference = None, + start = None, + end = None, + region = None, + callback = None, + until_eof = False ): + '''*(reference = None, start = None, end = None, region = None, callback = None, until_eof = False)* + + fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by + :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. + + Without *reference* or *region* all reads will be fetched. The reads will be returned + ordered by reference sequence, which will not necessarily be the order within the file. + If *until_eof* is given, all reads from the current file position will be returned + *as they are sorted within the file*. 
def pileup( self, reference = None, start = None, end = None, region = None, callback = None ):
    '''run a pileup within a :term:`region` using 0-based indexing.

    The region is specified by :term:`reference`, *start* and *end*, or
    alternatively by a samtools *region* string. Without *reference* or
    *region* the pileup is run over every reference in turn, i.e. ordered
    by :term:`reference` sequence and not necessarily by file order.

    If *callback* is given, it is executed for each position within the
    :term:`region` (a region is mandatory in this case) and nothing is
    returned. Otherwise an iterator of type :class:`pysam.IteratorColumn`
    (or a chain of them, one per reference) is returned.

    Only indexed bam files are supported: a missing index raises
    ValueError and a sam file raises NotImplementedError.

    .. Note::

        *all* reads which overlap the region are returned. The first base returned will be the
        first base of the first read *not* necessarily the first base of the region used in the query.
    '''
    cdef int rtid
    cdef int rstart
    cdef int rend
    cdef bam_plbuf_t *buf

    if not self._isOpen():
        raise ValueError( "I/O operation on closed file" )

    region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

    # the file type is checked only after the region has been validated
    if not self.isbam:
        raise NotImplementedError( "pileup of samfiles not implemented yet" )

    if not self._hasIndex():
        raise ValueError( "no index available for pileup" )

    if callback:
        if not region:
            raise ValueError( "callback functionality requires a region/reference" )

        buf = bam_plbuf_init( pileup_callback, callback )
        bam_fetch(self.samfile.x.bam,
                  self.index, rtid, rstart, rend,
                  buf, pileup_fetch_callback )

        # pushing NULL finalizes the pileup before the buffer is released
        bam_plbuf_push( NULL, buf)
        bam_plbuf_destroy(buf)
        return None

    if region:
        return IteratorColumn( self, rtid, rstart, rend )

    # no region: build one column iterator per target and chain them
    rstart = 0
    rend = 1<<29
    per_target = [ IteratorColumn( self, tid, rstart, rend )
                   for tid in range( self.nreferences ) ]
    return itertools.chain( *per_target )
def _buildLine( self, fields, record ):
    '''return one tab-separated header line for *record*.

    For comment records ("CO") *fields* is the comment text itself.
    For all other records *fields* is a dictionary; its entries are
    emitted as ``key:value`` in the order given by
    VALID_HEADER_ORDER[record], skipping keys that are absent.
    '''

    # TODO: add checking for field and sort order
    prefix = "@%s" % record
    if record == "CO":
        columns = [ fields ]
    else:
        columns = [ "%s:%s" % (key, str(fields[key]))
                    for key in VALID_HEADER_ORDER[record]
                    if key in fields ]
    return "\t".join( [ prefix ] + columns )
cdef class Fastafile:
    '''*(filename)*

    A *FASTA* file. The file is automatically opened.

    The file expects an indexed fasta file.

    TODO:
        add automatic indexing.
        add function to get sequence names.
    '''

    # C view of the file name; only valid while _filename_owner is alive
    cdef char * filename
    # python object that owns the memory ``filename`` points into
    cdef object _filename_owner
    # pointer to fastafile
    cdef faidx_t * fastafile

    def __cinit__(self, *args, **kwargs ):
        self.fastafile = NULL
        self._open( *args, **kwargs )

    def __dealloc__( self ):
        '''release the index if the file was never closed explicitly.'''
        # remember: dealloc cannot call other methods, hence close()
        # is repeated inline here.
        if self.fastafile != NULL:
            fai_destroy( self.fastafile )
            self.fastafile = NULL

    def _isOpen( self ):
        '''return true if the fasta file has been opened.'''
        return self.fastafile != NULL

    def _open( self,
               filename ):
        '''open an indexed fasta file.

        This method expects an indexed fasta file.

        raises IOError if the file could not be opened.
        '''

        # close a previously opened file
        if self.fastafile != NULL: self.close()
        # keep a reference to the python string: the char * below points
        # into its buffer and would dangle once the argument is released
        self._filename_owner = filename
        self.filename = filename
        self.fastafile = fai_load( self.filename )

        if self.fastafile == NULL:
            raise IOError("could not open file `%s`" % filename )

    def close( self ):
        '''close the file; calling it on a closed file does nothing.'''
        if self.fastafile != NULL:
            fai_destroy( self.fastafile )
            self.fastafile = NULL

    def fetch( self,
               reference = None,
               start = None,
               end = None,
               region = None):

        '''*(reference = None, start = None, end = None, region = None)*

        fetch the sequence of a :term:`region` as a string using 0-based indexing.
        The region is specified by :term:`reference`, *start* and *end*.
        Alternatively, a samtools :term:`region` string can be supplied.

        raises ValueError if the file is closed, if neither *reference* nor
        *region* is given, if only one of *start* and *end* is given, if the
        coordinates are invalid or if the sequence could not be retrieved.
        '''
        cdef int seq_len, max_pos
        cdef char * seq

        if not self._isOpen():
            raise ValueError( "I/O operation on closed file" )

        # exclusive upper bound for coordinates (2 << 29 == 2^30)
        max_pos = 2 << 29

        if not region:
            if reference is None: raise ValueError( 'no sequence/region supplied.' )
            if start is None and end is None:
                region = "%s" % str(reference)
            elif start is None or end is None:
                raise ValueError( 'only start or only end of region supplied' )
            else:
                if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
                if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
                if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
                # samtools regions are 1-based and closed
                region = "%s:%i-%i" % (reference, start+1, end )

        # samtools adds a '\0' at the end
        seq = fai_fetch( self.fastafile, region, &seq_len )
        # a NULL result must not be converted to a python string
        # NOTE(review): fai_fetch is expected to return NULL on failure,
        # e.g. for an unknown sequence name - confirm against faidx.h
        if seq == NULL:
            raise ValueError( "could not retrieve sequence for region `%s`" % region )
        try:
            # copy to python
            result = seq
        finally:
            # clean up
            free(seq)

        return result
cdef class IteratorColumn:
    '''iterates over columns.

    This iterator wraps the pileup functionality of samtools.

    For reasons of efficiency, the iterator returns the current
    pileup buffer. As this buffer is updated at every iteration,
    the contents of this iterator will change accordingly. Hence the conversion to
    a list will not produce the expected result::

       f = Samfile("file.bam", "rb")
       result = list( f.pileup() )

    Here, result will contain ``n`` objects of type :class:`PileupProxy` for ``n`` columns,
    but each object will contain the same information.

    If the results of several columns are required at the same time, the results
    need to be stored explicitly::

       result = [ x.pileups() for x in f.pileup() ]

    Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`.

    '''
    cdef bam_plbuf_t *buf

    # check if first iteration
    cdef int notfirst
    # result of the last plbuf_push
    cdef int n_pu
    # set to 1 once the underlying row iterator is exhausted
    cdef int eof
    # supplies the alignments that are pushed into the pileup buffer
    cdef IteratorRow iter

    def __cinit__(self, Samfile samfile, int tid, int start, int end ):

        self.iter = IteratorRow( samfile, tid, start, end )
        self.buf = bam_plbuf_init(NULL, NULL )
        self.n_pu = 0
        self.eof = 0

    def __iter__(self):
        return self

    cdef int cnext(self) except -1:
        '''perform next iteration.

        return 1 if there is a buffer to emit. Return 0 for end of iteration.

        raises ValueError if the pileup engine reports an error. The
        ``except -1`` declaration is required for that exception to
        reach the caller of this C-level method.
        '''

        cdef int has_read

        # pysam bam_plbuf_push returns:
        # 1: if buf is full and can be emitted
        # 0: if b has been added
        # -1: if there was an error

        # check if previous plbuf was incomplete. If so, continue within
        # the loop and yield if necessary
        if self.n_pu > 0:
            self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1)
            if self.n_pu < 0: raise ValueError( "error while iterating" )
            if self.n_pu > 0: return 1

        if self.eof: return 0

        # get next alignments and submit until plbuf indicates that
        # a new column has finished
        while self.n_pu == 0:
            has_read = self.iter.cnext()
            # wrap up if no more input
            if has_read == 0:
                self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0)
                self.eof = 1
                if self.n_pu < 0: raise ValueError( "error while iterating" )
                if self.n_pu > 0: return 1
                return 0

            # submit to plbuf
            self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0)
            if self.n_pu < 0: raise ValueError( "error while iterating" )

        # plbuf has yielded
        return 1

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef int ret
        ret = self.cnext()

        if ret > 0 :
            return makePileupProxy( self.buf, self.n_pu )
        else:
            raise StopIteration

    def __dealloc__(self):
        # buf is still NULL if __cinit__ failed before allocating it
        if self.buf != NULL:
            bam_plbuf_destroy(self.buf)
def __cmp__(self, AlignedRead other):
    '''compare the binary contents of this read with ``other``.

    returns 0 if both reads are binary equal and a non-zero value
    otherwise. The fixed-size core is compared first, then the
    length of the variable-length data and finally the data itself.
    '''
    cdef int retval
    cdef bam1_t *t, *o
    t = self._delegate
    o = other._delegate

    retval = memcmp( &t.core,
                     &o.core,
                     sizeof( bam1_core_t ))

    if retval: return retval
    retval = cmp( t.data_len, o.data_len)
    if retval: return retval
    # both lengths are equal here: compare every data byte, not just
    # the first sizeof(int) bytes
    return memcmp( t.data,
                   o.data,
                   t.data_len )
pysam_bam1_qname(src) + + strncpy( p, qname, l ) + + property cigar: + """the :term:`cigar` alignment (None if not present). + """ + def __get__(self): + cdef uint32_t * cigar_p + cdef bam1_t * src + cdef op, l, cigar + src = self._delegate + if src.core.n_cigar == 0: return None + + cigar = [] + cigar_p = pysam_bam1_cigar(src); + for k from 0 <= k < src.core.n_cigar: + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + cigar.append((op, l)) + return cigar + + def __set__(self, values ): + if values == None or len(values) == 0: return + cdef uint32_t * p + cdef bam1_t * src + cdef op, l + cdef int k + + k = 0 + + src = self._delegate + + # get location of cigar string + p = pysam_bam1_cigar(src) + + # create space for cigar data within src.data + pysam_bam_update( src, + src.core.n_cigar * 4, + len(values) * 4, + p ) + + # length is number of cigar operations, not bytes + src.core.n_cigar = len(values) + + # re-acquire pointer to location in memory + # as it might have moved + p = pysam_bam1_cigar(src) + + # insert cigar operations + for op, l in values: + p[k] = l << BAM_CIGAR_SHIFT | op + k += 1 + + ## setting the cigar string also updates the "bin" attribute + src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p)) + + property seq: + """the query sequence (None if not present)""" + def __get__(self): + cdef bam1_t * src + cdef uint8_t * p + cdef char * s + src = self._delegate + bam_nt16_rev_table = "=ACMGRSVTWYHKDBN" + ## parse qseq (bam1_seq) + if src.core.l_qseq == 0: return None + + s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char)) + p = pysam_bam1_seq( src ) + for k from 0 <= k < src.core.l_qseq: + ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c) + s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)] + retval=s + free(s) + return retval + + def __set__(self,seq): + # samtools manages sequence and quality length memory together + # if no quality information is present, the first byte 
property qual:
    """the base quality (None if not present)

    Qualities are read and written as a string in which each
    character is the stored quality value plus 33. The sequence has to
    be set before the qualities; assigning a string whose length
    differs from the sequence length raises ValueError. Assigning None
    or an empty string marks the qualities as absent.
    """
    def __get__(self):
        cdef bam1_t * src
        cdef uint8_t * p
        cdef char * q
        cdef int k
        src = self._delegate
        if src.core.l_qseq == 0: return None

        p = pysam_bam1_qual( src )
        # a leading 0xff marks absent qualities
        if p[0] == 0xff: return None

        # one extra zeroed byte serves as the string terminator
        q = <char *>calloc(src.core.l_qseq + 1 , sizeof(char))
        if q == NULL:
            raise MemoryError( "could not allocate memory for quality string" )
        try:
            for k from 0 <= k < src.core.l_qseq:
                ## equivalent to t[i] + 33 (see bam.c)
                q[k] = p[k] + 33
            # convert to python string
            retval = q
        finally:
            # clean up
            free(q)
        return retval

    def __set__(self,qual):
        # note that space is already allocated via the sequences
        cdef bam1_t * src
        cdef uint8_t * p
        cdef char * q
        cdef int l, k
        src = self._delegate
        p = pysam_bam1_qual( src )
        if qual is None or len(qual) == 0:
            # if absent - set to 0xff. Without a sequence no quality
            # bytes are reserved, so there is nothing to mark.
            if src.core.l_qseq > 0:
                p[0] = 0xff
            return
        # convert to C string
        q = qual
        l = len(qual)
        if src.core.l_qseq != l:
            raise ValueError("quality and sequence mismatch: %i != %i" % (l, src.core.l_qseq))
        for k from 0 <= k < l:
            p[k] = q[k] - 33
cdef bam1_t * src + cdef uint8_t * s + cdef char tpe + + src = self._delegate + if src.l_aux == 0: return None + + s = pysam_bam1_aux( src ) + result = [] + ctag = calloc( 3, sizeof(char) ) + cdef int x + while s < (src.data + src.data_len): + # get tag + ctag[0] = s[0] + ctag[1] = s[1] + pytag = ctag + + s += 2 + + # convert type - is there a better way? + ctag[0] = s[0] + ctag[1] = 0 + pytype = ctag + # get type and value + # how do I do char literal comparison in cython? + # the code below works (i.e, is C comparison) + tpe = toupper(s[0]) + if tpe == 'S'[0]: + value = bam_aux2i(s) + s += 2 + elif tpe == 'I'[0]: + value = bam_aux2i(s) + s += 4 + elif tpe == 'F'[0]: + value = bam_aux2f(s) + s += 4 + elif tpe == 'D'[0]: + value = bam_aux2d(s) + s += 8 + elif tpe == 'C'[0]: + value = bam_aux2i(s) + s += 1 + elif tpe == 'A'[0]: + # there might a more efficient way + # to convert a char into a string + value = "%c" % bam_aux2A(s) + s += 1 + elif tpe == 'Z'[0]: + value = bam_aux2Z(s) + # +1 for NULL terminated string + s += len(value) + 1 + + # skip over type + s += 1 + + # ignore pytype + result.append( (pytag, value) ) + + free( ctag ) + return result + + def __set__(self, tags): + cdef char * ctag + cdef bam1_t * src + cdef uint8_t * s + cdef uint8_t * new_data + cdef int guessed_size, control_size + src = self._delegate + cdef int max_size, size + max_size = 4000 + + # map samtools code to python.struct code and byte size + buffer = ctypes.create_string_buffer(max_size) + + offset = 0 + for pytag, value in tags: + t = type(value) + if t == types.FloatType: + fmt = "= -127: fmt, pytype = "= -32767: fmt, pytype = " 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value ) + else: fmt, pytype = " max_size: + raise NotImplementedError("tags field too large") + + struct.pack_into( fmt, + buffer, + offset, + pytag[0], + pytag[1], + pytype, + value ) + offset += size + + # delete the old data and allocate new + pysam_bam_update( src, + 
property pos:
    """0-based leftmost coordinate

    Setting the position also updates the "bin" attribute.
    """
    def __get__(self): return self._delegate.core.pos
    def __set__(self, pos):
        cdef bam1_t * src
        src = self._delegate
        # store the new position first: the bin below has to be
        # derived from it and not from the previous position
        src.core.pos = pos
        if src.core.n_cigar:
            src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) )
        else:
            # without a cigar the read is treated as covering one base
            src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1)
Returns 0 if not given.''' + def __get__(self): return self._delegate.core.l_qseq + property mapq: + """mapping quality""" + def __get__(self): return self._delegate.core.qual + def __set__(self, qual): self._delegate.core.qual = qual + property mrnm: + """the :term:`reference` id of the mate """ + def __get__(self): return self._delegate.core.mtid + def __set__(self, mtid): self._delegate.core.mtid = mtid + property mpos: + """the position of the mate""" + def __get__(self): return self._delegate.core.mpos + def __set__(self, mpos): self._delegate.core.mpos = mpos + property isize: + """the insert size""" + def __get__(self): return self._delegate.core.isize + def __set__(self, isize): self._delegate.core.isize = isize + property is_paired: + """true if read is paired in sequencing""" + def __get__(self): return (self._delegate.core.flag & BAM_FPAIRED) != 0 + def __set__(self,val): + if val: self._delegate.core.flag |= BAM_FPAIRED + else: self._delegate.core.flag &= ~BAM_FPAIRED + property is_proper_pair: + """true if read is mapped in a proper pair""" + def __get__(self): return (self.flag & BAM_FPROPER_PAIR) != 0 + def __set__(self,val): + if val: self._delegate.core.flag |= BAM_FPROPER_PAIR + else: self._delegate.core.flag &= ~BAM_FPROPER_PAIR + property is_unmapped: + """true if read itself is unmapped""" + def __get__(self): return (self.flag & BAM_FUNMAP) != 0 + def __set__(self,val): + if val: self._delegate.core.flag |= BAM_FUNMAP + else: self._delegate.core.flag &= ~BAM_FUNMAP + property mate_is_unmapped: + """true if the mate is unmapped""" + def __get__(self): return (self.flag & BAM_FMUNMAP) != 0 + def __set__(self,val): + if val: self._delegate.core.flag |= BAM_FMUNMAP + else: self._delegate.core.flag &= ~BAM_FMUNMAP + property is_reverse: + """true if read is mapped to reverse strand""" + def __get__(self):return (self.flag & BAM_FREVERSE) != 0 + def __set__(self,val): + if val: self._delegate.core.flag |= BAM_FREVERSE + else: 
def opt(self, tag):
    """retrieves optional data given a two-letter *tag*

    returns an integer for the type codes c, C, s, S, i and I, a
    float for f and d and a string for A and Z. None is returned for
    any other type code.

    raises KeyError if *tag* is not present.
    """
    #see bam_aux.c: bam_aux_get() and bam_aux2i() etc
    cdef uint8_t * v
    v = bam_aux_get(self._delegate, tag)
    if v == NULL: raise KeyError( "tag '%s' not present" % tag )
    # the first byte of the aux value is its type code
    auxtype = chr(v[0])
    # 'I' is included, matching the integer codes the tags property decodes
    if auxtype in ('c', 'C', 's', 'S', 'i', 'I'):
        return bam_aux2i(v)
    elif auxtype == 'f':
        return bam_aux2f(v)
    elif auxtype == 'd':
        return bam_aux2d(v)
    elif auxtype == 'A':
        # there might a more efficient way
        # to convert a char into a string
        return '%c' % bam_aux2A(v)
    elif auxtype == 'Z':
        return bam_aux2Z(v)
    # unsupported type code
    return None
+ ''' + cdef bam_plbuf_t * buf + cdef int n_pu + + def __cinit__(self ): + pass + + def __str__(self): + return "\t".join( map(str, (self.tid, self.pos, self.n))) +\ + "\n" +\ + "\n".join( map(str, self.pileups) ) + + property tid: + '''the chromosome ID as is defined in the header''' + def __get__(self): return pysam_get_tid( self.buf ) + + property n: + '''number of reads mapping to this column.''' + def __get__(self): return self.n_pu + def __set__(self, n): self.n_pu = n + + property pos: + def __get__(self): return pysam_get_pos( self.buf ) + + property pileups: + '''list of reads (:class:`pysam.PileupRead`) aligned to this column''' + def __get__(self): + cdef bam_pileup1_t * pl + pl = pysam_get_pileup( self.buf ) + pileups = [] + # warning: there could be problems if self.n and self.buf are + # out of sync. + for x from 0 <= x < self.n_pu: + pileups.append( makePileupRead( &pl[x]) ) + return pileups + +cdef class PileupRead: + '''A read aligned to a column. + ''' + + cdef: + AlignedRead _alignment + int32_t _qpos + int _indel + int _level + uint32_t _is_del + uint32_t _is_head + uint32_t _is_tail + + def __cinit__( self ): + pass + + def __str__(self): + return "\t".join( map(str, (self.alignment, self.qpos, self.indel, self.level, self.is_del, self.is_head, self.is_tail ) ) ) + + property alignment: + """a :class:`pysam.AlignedRead` object of the aligned read""" + def __get__(self): + return self._alignment + property qpos: + """position of the read base at the pileup site, 0-based""" + def __get__(self): + return self._qpos + property indel: + """indel length; 0 for no indel, positive for ins and negative for del""" + def __get__(self): + return self._indel + property is_del: + """1 iff the base on the padded read is a deletion""" + def __get__(self): + return self._is_del + property is_head: + def __get__(self): + return self._is_head + property is_tail: + def __get__(self): + return self._is_tail + property level: + def __get__(self): + return 
class Outs:
    '''Temporarily redirect an OS-level file descriptor.

    http://mail.python.org/pipermail/python-list/2000-June/038406.html

    *id* is the file descriptor to redirect (1, i.e. stdout, by default).
    Every call to :meth:`setfd`, :meth:`setfile` or :meth:`setdevice`
    saves a duplicate of the current target of *id* on a stack;
    :meth:`restore` pops the most recent one and re-installs it.
    '''
    def __init__(self, id = 1):
        self.streams = []   # stack of saved duplicates of descriptor ``id``
        self.id = id

    def setdevice(self, filename):
        '''open an existing file, like "/dev/null"'''
        fd = os.open(filename, os.O_WRONLY)
        self.setfd(fd)

    def setfile(self, filename):
        '''open a new file.'''
        # 0o660 instead of 0660: the old-style octal literal is a
        # syntax error on python 3.
        fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0o660)
        self.setfd(fd)

    def setfd(self, fd):
        '''redirect the descriptor to *fd*; *fd* is closed afterwards.'''
        ofd = os.dup(self.id)      #  Save old stream on new unit.
        self.streams.append(ofd)
        # Buffered data goes to old stream. Flush both python-level
        # streams as the redirected descriptor may be either of them
        # (restore() flushes both as well).
        sys.stdout.flush()
        sys.stderr.flush()
        os.dup2(fd, self.id)        #  Open unit 1 on new stream.
        os.close(fd)                #  Close other unit (look out, caller.)

    def restore(self):
        '''restore previous output stream'''
        if self.streams:
            # the following was not sufficient, hence flush both stderr and stdout
            # os.fsync( self.id )
            sys.stdout.flush()
            sys.stderr.flush()
            saved = self.streams.pop()
            os.dup2(saved, self.id)
            os.close(saved)
+ ''' + + # note that debugging this module can be a problem + # as stdout/stderr will not appear + + # redirect stderr and stdout to file + + # open files and redirect into it + stderr_h, stderr_f = tempfile.mkstemp() + stdout_h, stdout_f = tempfile.mkstemp() + + # patch for `samtools view` + # samtools `view` closes stdout, from which I can not + # recover. Thus redirect output to file with -o option. + if method == "view": + if "-o" in args: raise ValueError("option -o is forbidden in samtools view") + args = ( "-o", stdout_f ) + args + + stdout_save = Outs( sys.stdout.fileno() ) + stdout_save.setfd( stdout_h ) + stderr_save = Outs( sys.stderr.fileno() ) + stderr_save.setfd( stderr_h ) + + # do the function call to samtools + cdef char ** cargs + cdef int i, n, retval + + n = len(args) + # allocate two more for first (dummy) argument (contains command) + cargs = calloc( n+2, sizeof( char *) ) + cargs[0] = "samtools" + cargs[1] = method + for i from 0 <= i < n: cargs[i+2] = args[i] + retval = pysam_dispatch(n+2, cargs) + free( cargs ) + + # restore stdout/stderr. This will also flush, so + # needs to be before reading back the file contents + stdout_save.restore() + stderr_save.restore() + + # capture stderr/stdout. 
from operator import itemgetter as _itemgetter
from keyword import iskeyword as _iskeyword
import sys as _sys


def namedtuple(typename, field_names, verbose=False, rename=False):
    """Returns a new subclass of tuple with named fields.

    *field_names* is either a sequence of names or a single string in
    which the names are separated by whitespace and/or commas.  If
    *verbose* is true the generated class definition is printed.  If
    *rename* is true, invalid field names (empty, not an identifier, a
    keyword, starting with a digit or an underscore, or a duplicate) are
    replaced by ``_<index>`` instead of raising ValueError.

    >>> Point = namedtuple('Point', 'x y')
    >>> Point.__doc__                   # docstring for the new class
    'Point(x, y)'
    >>> p = Point(11, y=22)             # instantiate with positional args or keywords
    >>> p[0] + p[1]                     # indexable like a plain tuple
    33
    >>> x, y = p                        # unpack like a regular tuple
    >>> x, y
    (11, 22)
    >>> p.x + p.y                       # fields also accessible by name
    33
    >>> d = p._asdict()                 # convert to a dictionary
    >>> d['x']
    11
    >>> Point(**d)                      # convert from a dictionary
    Point(x=11, y=22)
    >>> p._replace(x=100)               # _replace() is like str.replace() but targets named fields
    Point(x=100, y=22)

    """

    def _only_identifier_chars(name):
        # True if every character is alphanumeric or an underscore.
        # Unlike ``min(<generator>)`` this is well defined for ''.
        for c in name:
            if not (c.isalnum() or c == '_'):
                return False
        return True

    # ``basestring`` only exists on python 2
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Parse and validate the field names.  Validation serves two purposes,
    # generating informative error messages and preventing template injection attacks.
    if isinstance(field_names, string_types):
        field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas
    field_names = tuple(map(str, field_names))
    if rename:
        names = list(field_names)
        seen = set()
        for i, name in enumerate(names):
            # the emptiness test comes first so that the remaining
            # tests never see an empty string
            if (not name or not _only_identifier_chars(name)
                or _iskeyword(name) or name[0].isdigit()
                or name.startswith('_') or name in seen):
                names[i] = '_%d' % i
            seen.add(name)
        field_names = tuple(names)
    for name in (typename,) + field_names:
        if not name:
            raise ValueError('Type names and field names cannot be empty: %r' % name)
        if not _only_identifier_chars(name):
            raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
        if _iskeyword(name):
            raise ValueError('Type names and field names cannot be a keyword: %r' % name)
        if name[0].isdigit():
            raise ValueError('Type names and field names cannot start with a number: %r' % name)
    seen_names = set()
    for name in field_names:
        if name.startswith('_') and not rename:
            raise ValueError('Field names cannot start with an underscore: %r' % name)
        if name in seen_names:
            raise ValueError('Encountered duplicate field name: %r' % name)
        seen_names.add(name)

    # Create and fill-in the class template
    subst = dict(
        typename=typename,
        field_names=field_names,
        numfields=len(field_names),
        argtxt=repr(field_names).replace("'", "")[1:-1],   # tuple repr without parens or quotes
        reprtxt=', '.join(['%s=%%r' % name for name in field_names]),
    )
    template = '''class %(typename)s(tuple):
        '%(typename)s(%(argtxt)s)' \n
        __slots__ = () \n
        _fields = %(field_names)r \n
        def __new__(_cls, %(argtxt)s):
            return _tuple.__new__(_cls, (%(argtxt)s)) \n
        @classmethod
        def _make(cls, iterable, new=tuple.__new__, len=len):
            'Make a new %(typename)s object from a sequence or iterable'
            result = new(cls, iterable)
            if len(result) != %(numfields)d:
                raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
            return result \n
        def __repr__(self):
            return '%(typename)s(%(reprtxt)s)' %% self \n
        def _asdict(self):
            'Return a new dict which maps field names to their values'
            return dict(zip(self._fields, self)) \n
        def _replace(_self, **kwds):
            'Return a new %(typename)s object replacing specified fields with new values'
            result = _self._make(map(kwds.pop, %(field_names)r, _self))
            if kwds:
                raise ValueError('Got unexpected field names: %%r' %% list(kwds))
            return result \n
        def __getnewargs__(self):
            return tuple(self) \n\n''' % subst
    for i, name in enumerate(field_names):
        template += '        %s = _property(_itemgetter(%d))\n' % (name, i)
    if verbose:
        # single parenthesised argument: valid as python 2 statement
        # and as python 3 function call
        print(template)

    # Execute the template string in a temporary namespace
    namespace = dict(_itemgetter=_itemgetter, __name__='namedtuple_%s' % typename,
                     _property=property, _tuple=tuple)
    try:
        # the tuple form of exec is accepted by python 2 and python 3
        exec(template, namespace)
    except SyntaxError:
        # fetch the exception without "except X, e" / "except X as e"
        # so that the module parses on all interpreter versions
        e = _sys.exc_info()[1]
        raise SyntaxError(str(e) + ':\n' + template)
    result = namespace[typename]

    # For pickling to work, the __module__ variable needs to be set to the frame
    # where the named tuple is created.  Bypass this step in environments where
    # sys._getframe is not defined (Jython for example) or sys._getframe is not
    # defined for arguments greater than 0 (IronPython).
    try:
        result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
    except (AttributeError, ValueError):
        pass

    return result
+// ####################################################### + +typedef struct +{ + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} bam_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bam_lidx_t; + +KSORT_INIT(my_off, pair64_t, pair64_lt); +KHASH_MAP_INIT_INT(my_i, bam_binlist_t); + +struct __bam_index_t +{ + int32_t n; + khash_t(my_i) **index; + bam_lidx_t *index2; +}; + +typedef struct __linkbuf_t { + bam1_t b; + uint32_t beg, end; + struct __linkbuf_t *next; +} lbnode_t; + +typedef struct { + int cnt, n, max; + lbnode_t **buf; +} mempool_t; + +struct __bam_plbuf_t { + mempool_t *mp; + lbnode_t *head, *tail, *dummy; + bam_pileup_f func; + void *func_data; + int32_t tid, pos, max_tid, max_pos; + int max_pu, is_eof; + bam_pileup1_t *pu; + int flag_mask; +}; + +static mempool_t *mp_init() +{ + mempool_t *mp; + mp = (mempool_t*)calloc(1, sizeof(mempool_t)); + return mp; +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) { + free(mp->buf[k]->b.data); + free(mp->buf[k]); + } + free(mp->buf); + free(mp); +} +static inline lbnode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, lbnode_t *p) +{ + --mp->cnt; p->next = 0; // clear lbnode_t::next here + if (mp->n == mp->max) { + mp->max = mp->max? 
mp->max<<1 : 256; + mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) +{ + unsigned k; + bam1_t *b = p->b; + bam1_core_t *c = &b->core; + uint32_t x = c->pos, y = 0; + int ret = 1, is_restart = 1; + + if (c->flag&BAM_FUNMAP) return 0; // unmapped read + assert(x <= pos); // otherwise a bug + p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation + int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length + if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip + if (x + l > pos) { // overlap with pos + p->indel = p->is_del = 0; + p->qpos = y + (pos - x); + if (x == pos && is_restart) p->is_head = 1; + if (x + l - 1 == pos) { // come to the end of a match + if (k < c->n_cigar - 1) { // there are additional operation(s) + uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR + int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation + if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del + else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins + if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP) + p->is_tail = 1; // tail + } else p->is_tail = 1; // this is the last operation; set tail + } + } + x += l; y += l; + } else if (op == BAM_CDEL) { // then set ->is_del + if (x + l > pos) { + p->indel = 0; p->is_del = 1; + p->qpos = y + (pos - x); + } + x += l; + } else if (op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); + if (x > pos) { + if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all + break; + } + } + assert(x > pos); // otherwise a bug + return ret; +} + + + + +// the 
following code has been taken from bam_plbuf_push +// and modified such that instead of a function call +// the function returns and will continue (if cont is true). +// from where it left off. + +// returns +// 1: if buf is full and can be emitted +// 0: if b has been added +// -1: if there was an error +int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont) +{ + if (!cont) + { + if (b) { // fill buffer + if (b->core.tid < 0) return 0; + if (b->core.flag & buf->flag_mask) return 0; + bam_copy1(&buf->tail->b, b); + buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); + if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n"); + abort(); + } + buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg; + if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { + buf->tail->next = mp_alloc(buf->mp); + buf->tail = buf->tail->next; + } + } else buf->is_eof = 1; + } + else + // continue end of loop + { + // update tid and pos + if (buf->head->next) { + if (buf->tid > buf->head->b.core.tid) { + fprintf(stderr, "[bam_plbuf_push] unsorted input. 
Pileup aborts.\n"); + return -1; + } + } + if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence + buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference + } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid + buf->pos = buf->head->beg; // jump to the next position + } else ++buf->pos; // scan contiguously + if (buf->is_eof && buf->head->next == 0) return 0; + } + + // enter yield loop + while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) + { + int n_pu = 0; + lbnode_t *p, *q; + buf->dummy->next = buf->head; + for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list + q->next = p->next; mp_free(buf->mp, p); p = q; + } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup + if (n_pu == buf->max_pu) { // then double the capacity + buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256; + buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu); + } + buf->pu[n_pu].b = &p->b; + if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP + } + } + buf->head = buf->dummy->next; // dummy->next may be changed + + // exit if alignments need to be emitted + if (n_pu) { return n_pu; } + + // update tid and pos + if (buf->head->next) { + if (buf->tid > buf->head->b.core.tid) { + fprintf(stderr, "[bam_plbuf_push] unsorted input. 
Pileup aborts.\n"); + return -2; + } + } + if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence + buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference + } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid + buf->pos = buf->head->beg; // jump to the next position + } else ++buf->pos; // scan contiguously + if (buf->is_eof && buf->head->next == 0) break; + } + return 0; +} + +int pysam_get_pos( const bam_plbuf_t *buf) +{ + return buf->pos; +} + + +int pysam_get_tid( const bam_plbuf_t *buf) +{ + return buf->tid; +} + +bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf) +{ + return buf->pu; +} + +// pysam dispatch function to emulate the samtools +// command line within python. +// taken from the main function in bamtk.c +// added code to reset getopt +extern int main_samview(int argc, char *argv[]); +extern int main_import(int argc, char *argv[]); +extern int bam_pileup(int argc, char *argv[]); +extern int bam_merge(int argc, char *argv[]); +extern int bam_sort(int argc, char *argv[]); +extern int bam_index(int argc, char *argv[]); +extern int faidx_main(int argc, char *argv[]); +extern int bam_mating(int argc, char *argv[]); +extern int bam_rmdup(int argc, char *argv[]); +extern int glf3_view_main(int argc, char *argv[]); +extern int bam_flagstat(int argc, char *argv[]); +extern int bam_fillmd(int argc, char *argv[]); + +int pysam_dispatch(int argc, char *argv[] ) +{ + +#ifdef _WIN32 + setmode(fileno(stdout), O_BINARY); + setmode(fileno(stdin), O_BINARY); +#ifdef _USE_KNETFILE + knet_win32_init(); +#endif +#endif + + extern int optind; + + // reset getop + optind = 1; + + if (argc < 2) return 1; + + if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); + else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); + else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); + else if (strcmp(argv[1], "merge") == 0) return 
bam_merge(argc-1, argv+1); + else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); + else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); + else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); + else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); + else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); + else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); + else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); + else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); + +#if _CURSES_LIB != 0 + else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); +#endif + else + { + fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); + return 1; + } + return 0; +} + +// standin for bam_destroy1 in bam.h +// deletes all variable length data +void pysam_bam_destroy1( bam1_t * b ) +{ + if (b == NULL) return; + if (b->data != NULL) free(b->data); + free(b); +} + +// taken from samtools/bam_import.c +static inline uint8_t *alloc_data(bam1_t *b, size_t size) +{ + if (b->m_data < size) + { + b->m_data = size; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + return b->data; +} + +// update the variable length data within a bam1_t entry. +// Adds *nbytes_new* - *nbytes_old* into the variable length data of *src* at *pos*. +// Data within the bam1_t entry is moved so that it is +// consistent with the data field lengths. 
+bam1_t * pysam_bam_update( bam1_t * b, + const size_t nbytes_old, + const size_t nbytes_new, + uint8_t * pos ) +{ + int d = nbytes_new-nbytes_old; + + // no change + if (d == 0) return b; + + int new_size = d + b->data_len; + size_t offset = pos - b->data; + + //printf("d=%i, old=%i, new=%i, old_size=%i, new_size=%i\n", + // d, nbytes_old, nbytes_new, b->data_len, new_size); + + // increase memory if required + if (d > 0) + { + alloc_data( b, new_size ); + pos = b->data + offset; + } + + if (b->data_len != 0) + { + if (offset < 0 || offset > b->data_len) + fprintf(stderr, "[pysam_bam_insert] illegal offset: '%i'\n", (int)offset); + } + + // printf("dest=%p, src=%p, n=%i\n", pos+nbytes_new, pos + nbytes_old, b->data_len - (offset+nbytes_old)); + memmove( pos + nbytes_new, + pos + nbytes_old, + b->data_len - (offset + nbytes_old)); + + b->data_len = new_size; + + return b; +} + +// translate a nucleotide character to binary code +unsigned char pysam_translate_sequence( const unsigned char s ) +{ + return bam_nt16_table[s]; +} + +// stand-ins for samtools macros in bam.h +char * pysam_bam1_qname( const bam1_t * b) +{ + return (char*)b->data; +} + +uint32_t * pysam_bam1_cigar( const bam1_t * b) +{ + return (uint32_t*)(b->data + b->core.l_qname); +} + +uint8_t * pysam_bam1_seq( const bam1_t * b) +{ + return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname); +} + +uint8_t * pysam_bam1_qual( const bam1_t * b) +{ + return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + (b->core.l_qseq + 1)/2); +} + +uint8_t * pysam_bam1_aux( const bam1_t * b) +{ + return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + b->core.l_qseq + (b->core.l_qseq + 1)/2); +} + +// ####################################################### +// Iterator implementation +// ####################################################### + +// functions defined in bam_index.c +extern pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off); + 
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) +{ + uint32_t rbeg = b->core.pos; + uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; + return (rend > beg && rbeg < end); +} + +struct __bam_fetch_iterator_t +{ + bam1_t * b; + pair64_t * off; + int n_off; + uint64_t curr_off; + int curr_chunk; + bamFile fp; + int tid; + int beg; + int end; + int n_seeks; +}; + +bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end) +{ + // iterator contains current alignment position + // and will contain actual alignment during iterations + bam_fetch_iterator_t* iter = (bam_fetch_iterator_t*)calloc(1, sizeof(bam_fetch_iterator_t)); + iter->b = (bam1_t*)calloc(1, sizeof(bam1_t)); + + // list of chunks containing our alignments + iter->off = get_chunk_coordinates(idx, tid, beg, end, &iter->n_off); + + // initialise other state variables in iterator + iter->fp = fp; + iter->curr_chunk = -1; + iter->curr_off = 0; + iter->n_seeks = 0; + iter->tid = tid; + iter->beg = beg; + iter->end = end; + return iter; +} + +bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter) +{ + if (!iter->off) { + return 0; + } + + int ret; + // iterate through all alignments in chunks + for (;;) { + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->curr_chunk].v) { // then jump to the next chunk + if (iter->curr_chunk == iter->n_off - 1) break; // no more chunks + if (iter->curr_chunk >= 0) assert(iter->curr_off == iter->off[iter->curr_chunk].v); // otherwise bug + if (iter->curr_chunk < 0 || iter->off[iter->curr_chunk].v != iter->off[iter->curr_chunk+1].u) { // not adjacent chunks; then seek + bam_seek(iter->fp, iter->off[iter->curr_chunk+1].u, SEEK_SET); + iter->curr_off = bam_tell(iter->fp); + ++iter->n_seeks; + } + ++iter->curr_chunk; + } + if ((ret = bam_read1(iter->fp, iter->b)) > 0) { + iter->curr_off = bam_tell(iter->fp); + if (iter->b->core.tid != iter->tid || 
iter->b->core.pos >= iter->end) break; // no need to proceed + else if (is_overlap(iter->beg, iter->end, iter->b)) + // + //func(iter->b, data); + // + return iter->b; + } else + return 0; // end of file + } + return 0; +} + +void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter) +{ + // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", iter->n_seeks); + bam_destroy1(iter->b); + free(iter->off); +} + + + + diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h new file mode 100644 index 0000000..ff5d569 --- /dev/null +++ b/pysam/pysam_util.h @@ -0,0 +1,95 @@ +#ifndef PYSAM_UTIL_H +#define PYSAM_UTIL_H + +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// +// code for iterator + +/*! @typedef + @Structure for holding current state (current alignment etc.) for iterating through + alignments overlapping a specified region. + @field b pointer to the current alignment + @field off pointer to an array of chunk loci (each with beg/end positions) + @field n_off The number of chunks + @field curr_off The current file positon + @field curr_chunk The item in a list of chunk + @discussion See also bam_fetch_iterate +*/ +struct __bam_fetch_iterator_t; +typedef struct __bam_fetch_iterator_t bam_fetch_iterator_t; + +/*! + @abstract Retrieve the alignments that are overlapped with the + specified region. + + @discussion Returns iterator object to retrieve successive alignments ordered by + start position. + @param fp BAM file handler + @param idx pointer to the alignment index + @param tid chromosome ID as is defined in the header + @param beg start coordinate, 0-based + @param end end coordinate, 0-based +*/ +bam_fetch_iterator_t * bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end); + + +/*! + @abstract Iterates through alignments overlapped the specified region. 
+ @discussion Returns pointer to successive alignments ordered by start position. + Returns null pointer to signal the end of the iteration. + The alignment data is nested within the iterator to avoid unnecessary allocations. +*/ +bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter); + +bam_fetch_iterator_t* bam_init_fetchall_iterator(bamFile fp, const bam_index_t *idx); +bam1_t * bam_fetchall_iterate(bam_fetch_iterator_t *iter); + +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// +// various helper functions + +int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont); + +// accessor functions - necessary as bam_plbuf_t is hidden +// among the implementation +int pysam_get_pos( const bam_plbuf_t *buf); +int pysam_get_tid( const bam_plbuf_t *buf); +bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf); + +int pysam_dispatch(int argc, char *argv[] ); + +// stand-in for macro - not wrappable in pyrex +void pysam_bam_destroy1( bam1_t * b ); + +// stand-in for other samtools macros +uint32_t * pysam_bam1_cigar( const bam1_t * b); +char * pysam_bam1_qname( const bam1_t * b); +uint8_t * pysam_bam1_seq( const bam1_t * b); +uint8_t * pysam_bam1_qual( const bam1_t * b); +uint8_t * pysam_bam1_aux( const bam1_t * b); + +/*! + @abstract Update the variable length data within a bam1_t entry + + Old data is deleted and the data within b are re-arranged to + make place for new data. 
+ + @discussion Returns b + + @param b bam1_t data + @param nbytes_old size of old data + @param nbytes_new size of new data + @param pos position of data +*/ +bam1_t * pysam_bam_update( bam1_t * b, + const size_t nbytes_old, + const size_t nbytes_new, + uint8_t * pos ); + +// translate a nucleotide character to binary code +unsigned char pysam_translate_sequence( const unsigned char s ); + + +#endif diff --git a/samtools/bam.c b/samtools/bam.c new file mode 100644 index 0000000..ee7642b --- /dev/null +++ b/samtools/bam.c @@ -0,0 +1,303 @@ +#include +#include +#include +#include +#include "bam.h" +#include "bam_endian.h" +#include "kstring.h" +#include "sam_header.h" + +int bam_is_be = 0; +char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; + +/************************** + * CIGAR related routines * + **************************/ + +uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) +{ + uint32_t k, end; + end = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) + end += cigar[k] >> BAM_CIGAR_SHIFT; + } + return end; +} + +int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) +{ + uint32_t k; + int32_t l = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) + l += cigar[k] >> BAM_CIGAR_SHIFT; + } + return l; +} + +/******************** + * BAM I/O routines * + ********************/ + +bam_header_t *bam_header_init() +{ + bam_is_be = bam_is_big_endian(); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); +} + +void bam_header_destroy(bam_header_t *header) +{ + int32_t i; + extern void bam_destroy_header_hash(bam_header_t *header); + if (header == 0) return; + if (header->target_name) { + for (i = 0; i < header->n_targets; ++i) + free(header->target_name[i]); + free(header->target_name); + free(header->target_len); + } + free(header->text); + if 
(header->dict) sam_header_free(header->dict); + if (header->rg2lib) sam_tbl_destroy(header->rg2lib); + bam_destroy_header_hash(header); + free(header); +} + +bam_header_t *bam_header_read(bamFile fp) +{ + bam_header_t *header; + char buf[4]; + int32_t i = 1, name_len; + // check EOF + i = bgzf_check_EOF(fp); + if (i < 0) { + // If the file is a pipe, checking the EOF marker will *always* fail + // with ESPIPE. Suppress the error message in this case. + if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF"); + } + else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n"); + // read "BAM1" + if (bam_read(fp, buf, 4) != 4) return 0; + if (strncmp(buf, "BAM\001", 4)) { + fprintf(stderr, "[bam_header_read] wrong header\n"); + return 0; + } + header = bam_header_init(); + // read plain text and the number of reference sequences + bam_read(fp, &header->l_text, 4); + if (bam_is_be) bam_swap_endian_4p(&header->l_text); + header->text = (char*)calloc(header->l_text + 1, 1); + bam_read(fp, header->text, header->l_text); + bam_read(fp, &header->n_targets, 4); + if (bam_is_be) bam_swap_endian_4p(&header->n_targets); + // read reference sequence names and lengths + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); + for (i = 0; i != header->n_targets; ++i) { + bam_read(fp, &name_len, 4); + if (bam_is_be) bam_swap_endian_4p(&name_len); + header->target_name[i] = (char*)calloc(name_len, 1); + bam_read(fp, header->target_name[i], name_len); + bam_read(fp, &header->target_len[i], 4); + if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); + } + return header; +} + +int bam_header_write(bamFile fp, const bam_header_t *header) +{ + char buf[4]; + int32_t i, name_len, x; + // write "BAM1" + strncpy(buf, "BAM\001", 4); + bam_write(fp, buf, 4); + // write plain text and the number of reference sequences + if (bam_is_be) { + x = bam_swap_endian_4(header->l_text); + 
bam_write(fp, &x, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + x = bam_swap_endian_4(header->n_targets); + bam_write(fp, &x, 4); + } else { + bam_write(fp, &header->l_text, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + bam_write(fp, &header->n_targets, 4); + } + // write sequence names and lengths + for (i = 0; i != header->n_targets; ++i) { + char *p = header->target_name[i]; + name_len = strlen(p) + 1; + if (bam_is_be) { + x = bam_swap_endian_4(name_len); + bam_write(fp, &x, 4); + } else bam_write(fp, &name_len, 4); + bam_write(fp, p, name_len); + if (bam_is_be) { + x = bam_swap_endian_4(header->target_len[i]); + bam_write(fp, &x, 4); + } else bam_write(fp, &header->target_len[i], 4); + } + return 0; +} + +static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint8_t *s; + uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); + while (s < data + data_len) { + uint8_t type; + s += 2; // skip key + type = toupper(*s); ++s; // skip type + if (type == 'C' || type == 'A') ++s; + else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } + else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } + else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } +} + +int bam_read1(bamFile fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + + assert(BAM_CORE_SIZE == 32); + if ((ret = bam_read(fp, &block_len, 4)) != 4) { + if (ret == 0) return -1; // normal end-of-file + else return -2; // truncated + } + if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3; + if (bam_is_be) { + bam_swap_endian_4p(&block_len); + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = 
x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->data_len = block_len - BAM_CORE_SIZE; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; + b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + return 4 + block_len; +} + +inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y; + int i; + assert(BAM_CORE_SIZE == 32); + x[0] = c->tid; + x[1] = c->pos; + x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; + x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + x[4] = c->l_qseq; + x[5] = c->mtid; + x[6] = c->mpos; + x[7] = c->isize; + if (bam_is_be) { + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + y = block_len; + bam_write(fp, bam_swap_endian_4p(&y), 4); + swap_endian_data(c, data_len, data); + } else bam_write(fp, &block_len, 4); + bam_write(fp, x, BAM_CORE_SIZE); + bam_write(fp, data, data_len); + if (bam_is_be) swap_endian_data(c, data_len, data); + return 4 + block_len; +} + +int bam_write1(bamFile fp, const bam1_t *b) +{ + return bam_write1_core(fp, &b->core, b->data_len, b->data); +} + +char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) +{ + uint8_t *s = bam1_seq(b), *t = bam1_qual(b); + int i; + const bam1_core_t *c = &b->core; + kstring_t str; + str.l = str.m = 0; str.s = 0; + + ksprintf(&str, "%s\t", bam1_qname(b)); + if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag); + else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); + else { // BAM_OFSTR + for (i = 0; i < 16; ++i) + if ((c->flag & 1<tid < 0) kputs("*\t", &str); + else ksprintf(&str, "%s\t", 
header->target_name[c->tid]); + ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); + if (c->n_cigar == 0) kputc('*', &str); + else { + for (i = 0; i < c->n_cigar; ++i) + ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); + } + kputc('\t', &str); + if (c->mtid < 0) kputs("*\t", &str); + else if (c->mtid == c->tid) kputs("=\t", &str); + else ksprintf(&str, "%s\t", header->target_name[c->mtid]); + ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); + if (c->l_qseq) { + for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); + kputc('\t', &str); + if (t[0] == 0xff) kputc('*', &str); + else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); + } else ksprintf(&str, "*\t*"); + s = bam1_aux(b); + while (s < b->data + b->data_len) { + uint8_t type, key[2]; + key[0] = s[0]; key[1] = s[1]; + s += 2; type = *s; ++s; + ksprintf(&str, "\t%c%c:", key[0], key[1]); + if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } + else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } + else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; } + else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } + else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } + else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } + else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } + else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } + else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } + else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } + } + return str.s; +} + +char *bam_format1(const bam_header_t *header, const bam1_t *b) +{ + return bam_format1_core(header, b, BAM_OFDEC); +} + +void bam_view1(const bam_header_t *header, const bam1_t *b) +{ + char *s = bam_format1(header, b); + printf("%s\n", s); + free(s); +} + +// FIXME: we should also check the LB tag 
associated with each alignment +const char *bam_get_library(bam_header_t *h, const bam1_t *b) +{ + const uint8_t *rg; + if (h->dict == 0) h->dict = sam_header_parse2(h->text); + if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB"); + rg = bam_aux_get(b, "RG"); + return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1)); +} diff --git a/samtools/bam.h b/samtools/bam.h new file mode 100644 index 0000000..291b303 --- /dev/null +++ b/samtools/bam.h @@ -0,0 +1,697 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef BAM_BAM_H +#define BAM_BAM_H + +/*! + @header + + BAM library provides I/O and various operations on manipulating files + in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) + format. 
It now supports importing from or exporting to TAM, sorting, + merging, generating pileup, and quickly retrieval of reads overlapped + with a specified region. + + @copyright Genome Research Ltd. + */ + +#include +#include +#include +#include + +#ifndef BAM_LITE +#define BAM_VIRTUAL_OFFSET16 +#include "bgzf.h" +/*! @abstract BAM file handler */ +typedef BGZF *bamFile; +#define bam_open(fn, mode) bgzf_open(fn, mode) +#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) +#define bam_close(fp) bgzf_close(fp) +#define bam_read(fp, buf, size) bgzf_read(fp, buf, size) +#define bam_write(fp, buf, size) bgzf_write(fp, buf, size) +#define bam_tell(fp) bgzf_tell(fp) +#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) +#else +#define BAM_TRUE_OFFSET +#include +typedef gzFile bamFile; +#define bam_open(fn, mode) gzopen(fn, mode) +#define bam_dopen(fd, mode) gzdopen(fd, mode) +#define bam_close(fp) gzclose(fp) +#define bam_read(fp, buf, size) gzread(fp, buf, size) +/* no bam_write/bam_tell/bam_seek() here */ +#endif + +/*! @typedef + @abstract Structure for the alignment header. + @field n_targets number of reference sequences + @field target_name names of the reference sequences + @field target_len lengths of the referene sequences + @field dict header dictionary + @field hash hash table for fast name lookup + @field rg2lib hash table for @RG-ID -> LB lookup + @field l_text length of the plain text in the header + @field text plain text + + @discussion Field hash points to null by default. It is a private + member. + */ +typedef struct { + int32_t n_targets; + char **target_name; + uint32_t *target_len; + void *dict, *hash, *rg2lib; + int l_text; + char *text; +} bam_header_t; + +/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ +#define BAM_FPAIRED 1 +/*! @abstract the read is mapped in a proper pair */ +#define BAM_FPROPER_PAIR 2 +/*! 
/*! @typedef
  @abstract Structure for core alignment information.
  @field  tid     chromosome ID, defined by bam_header_t
  @field  pos     0-based leftmost coordinate
  @field  bin     bin calculated by bam_reg2bin()
  @field  qual    mapping quality
  @field  l_qname length of the query name
  @field  flag    bitwise flag
  @field  n_cigar number of CIGAR operations
  @field  l_qseq  length of the query sequence (read)
  @field  mtid    chromosome ID of the mate; negative when unavailable
                  (bam_format1_core() then prints "*")
  @field  mpos    coordinate of the mate; printed as mpos + 1 by
                  bam_format1_core(), so presumably 0-based like pos
  @field  isize   value printed after the mate position by
                  bam_format1_core(); presumably the insert size -- confirm

  @discussion There is no strand member: the strand is encoded in flag
  and is read with bam1_strand().
 */
typedef struct {
	int32_t tid;
	int32_t pos;
	uint32_t bin:16, qual:8, l_qname:8;
	uint32_t flag:16, n_cigar:16;
	int32_t l_qseq;
	int32_t mtid;
	int32_t mpos;
	int32_t isize;
} bam1_core_t;

/*! @typedef
  @abstract Structure for one alignment.
  @field  core       core information about the alignment
  @field  l_aux      length of auxiliary data
  @field  data_len   current length of bam1_t::data
  @field  m_data     maximum length of bam1_t::data
  @field  data       all variable-length data, concatenated; structure:
                     qname-cigar-seq-qual-aux (the order used by
                     bam1_qname(), bam1_cigar(), bam1_seq(), bam1_qual()
                     and bam1_aux())

  @discussion Notes:

   1. qname is zero tailing and core.l_qname includes the tailing '\0'.
   2. On reading, bam_read1() takes l_qseq from the record core and
      derives l_aux from the total length of the alignment block.
 */
typedef struct {
	bam1_core_t core;
	int l_aux, data_len, m_data;
	uint8_t *data;
} bam1_t;
/*! @function
  @abstract  Get auxiliary data
  @param  b  pointer to an alignment
  @return    pointer to the concatenated auxiliary data

  @discussion The auxiliary fields are the last part of bam1_t::data:
  they start after the query name (l_qname bytes), the CIGAR array
  (n_cigar 32-bit integers), the packed sequence ((l_qseq+1)/2 bytes)
  and the qualities (l_qseq bytes).
 */
#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
@abstract TAM file handler */ + typedef struct __tamFile_t *tamFile; + + /*! + @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. + @param fn SAM file name + @return SAM file handler + */ + tamFile sam_open(const char *fn); + + /*! + @abstract Close a SAM file handler + @param fp SAM file handler + */ + void sam_close(tamFile fp); + + /*! + @abstract Read one alignment from a SAM file handler + @param fp SAM file handler + @param header header information (ordered names of chromosomes) + @param b read alignment; all members in b will be updated + @return 0 if successful; otherwise negative + */ + int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); + + /*! + @abstract Read header information from a TAB-delimited list file. + @param fn_list file name for the list + @return a pointer to the header structure + + @discussion Each line in this file consists of chromosome name and + the length of chromosome. + */ + bam_header_t *sam_header_read2(const char *fn_list); + + /*! + @abstract Read header from a SAM file (if present) + @param fp SAM file handler + @return pointer to header struct; 0 if no @SQ lines available + */ + bam_header_t *sam_header_read(tamFile fp); + + /*! + @abstract Parse @SQ lines and update a header struct + @param h pointer to the header struct to be updated + @return number of target sequences + + @discussion bam_header_t::{n_targets,target_len,target_name} will + be destroyed in the first place. + */ + int sam_header_parse(bam_header_t *h); + + /*! + @abstract Parse @RG lines and update a header struct + @param h pointer to the header struct to be updated + @return number of @RG lines + + @discussion bam_header_t::rg2lib will be destroyed in the first + place. 
+ */ + int sam_header_parse_rg(bam_header_t *h); + +#define sam_write1(header, b) bam_view1(header, b) + + int bam_strmap_put(void *strmap, const char *rg, const char *lib); + const char *bam_strmap_get(const void *strmap, const char *rg); + void *bam_strmap_dup(const void*); + void *bam_strmap_init(); + void bam_strmap_destroy(void *strmap); + + /*! + @abstract Initialize a header structure. + @return the pointer to the header structure + + @discussion This function also modifies the global variable + bam_is_be. + */ + bam_header_t *bam_header_init(); + + /*! + @abstract Destroy a header structure. + @param header pointer to the header + */ + void bam_header_destroy(bam_header_t *header); + + /*! + @abstract Read a header structure from BAM. + @param fp BAM file handler, opened by bam_open() + @return pointer to the header structure + + @discussion The file position indicator must be placed at the + beginning of the file. Upon success, the position indicator will + be set at the start of the first alignment. + */ + bam_header_t *bam_header_read(bamFile fp); + + /*! + @abstract Write a header structure to BAM. + @param fp BAM file handler + @param header pointer to the header structure + @return always 0 currently + */ + int bam_header_write(bamFile fp, const bam_header_t *header); + + /*! + @abstract Read an alignment from BAM. + @param fp BAM file handler + @param b read alignment; all members are updated. + @return number of bytes read from the file + + @discussion The file position indicator must be + placed right before an alignment. Upon success, this function + will set the position indicator to the start of the next + alignment. This function is not affected by the machine + endianness. + */ + int bam_read1(bamFile fp, bam1_t *b); + + /*! + @abstract Write an alignment to BAM. 
+ @param fp BAM file handler + @param c pointer to the bam1_core_t structure + @param data_len total length of variable size data related to + the alignment + @param data pointer to the concatenated data + @return number of bytes written to the file + + @discussion This function is not affected by the machine + endianness. + */ + int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data); + + /*! + @abstract Write an alignment to BAM. + @param fp BAM file handler + @param b alignment to write + @return number of bytes written to the file + + @abstract It is equivalent to: + bam_write1_core(fp, &b->core, b->data_len, b->data) + */ + int bam_write1(bamFile fp, const bam1_t *b); + + /*! @function + @abstract Initiate a pointer to bam1_t struct + */ +#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) + + /*! @function + @abstract Free the memory allocated for an alignment. + @param b pointer to an alignment + */ +#define bam_destroy1(b) do { \ + if (b) { free((b)->data); free(b); } \ + } while (0) + + /*! + @abstract Format a BAM record in the SAM format + @param header pointer to the header structure + @param b alignment to print + @return a pointer to the SAM string + */ + char *bam_format1(const bam_header_t *header, const bam1_t *b); + + char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); + + const char *bam_get_library(bam_header_t *header, const bam1_t *b); + + /*! @typedef + @abstract Structure for one alignment covering the pileup position. + @field b pointer to the alignment + @field qpos position of the read base at the pileup site, 0-based + @field indel indel length; 0 for no indel, positive for ins and negative for del + @field is_del 1 iff the base on the padded read is a deletion + @field level the level of the read in the "viewer" mode + + @discussion See also bam_plbuf_push() and bam_lplbuf_push(). 
The + difference between the two functions is that the former does not + set bam_pileup1_t::level, while the latter does. Level helps the + implementation of alignment viewers, but calculating this has some + overhead. + */ + typedef struct { + bam1_t *b; + int32_t qpos; + int indel, level; + uint32_t is_del:1, is_head:1, is_tail:1; + } bam_pileup1_t; + + struct __bam_plbuf_t; + /*! @abstract pileup buffer */ + typedef struct __bam_plbuf_t bam_plbuf_t; + + void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + + /*! @typedef + @abstract Type of function to be called by bam_plbuf_push(). + @param tid chromosome ID as is defined in the header + @param pos start coordinate of the alignment, 0-based + @param n number of elements in pl array + @param pl array of alignments + @param data user provided data + @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t. + */ + typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); + + /*! + @abstract Reset a pileup buffer for another pileup process + @param buf the pileup buffer to be reset + */ + void bam_plbuf_reset(bam_plbuf_t *buf); + + /*! + @abstract Initialize a buffer for pileup. + @param func function to be called by bam_pileup_core() + @param data user provided data + @return pointer to the pileup buffer + */ + bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); + + /*! + @abstract Destroy a pileup buffer. + @param buf pointer to the pileup buffer + */ + void bam_plbuf_destroy(bam_plbuf_t *buf); + + /*! + @abstract Push an alignment to the pileup buffer. + @param b alignment to be pushed + @param buf pileup buffer + @see bam_plbuf_init() + @return always 0 currently + + @discussion If all the alignments covering a particular site have + been collected, this function will call the user defined function + as is provided to bam_plbuf_init(). 
The coordinate of the site and + all the alignments will be transferred to the user defined + function as function parameters. + + When all the alignments are pushed to the buffer, this function + needs to be called with b equal to NULL. This will flush the + buffer. A pileup buffer can only be reused when bam_plbuf_reset() + is called. + */ + int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); + + int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); + + struct __bam_lplbuf_t; + typedef struct __bam_lplbuf_t bam_lplbuf_t; + + void bam_lplbuf_reset(bam_lplbuf_t *buf); + + /*! @abstract bam_plbuf_init() equivalent with level calculated. */ + bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data); + + /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */ + void bam_lplbuf_destroy(bam_lplbuf_t *tv); + + /*! @abstract bam_plbuf_push() equivalent with level calculated. */ + int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); + + struct __bam_index_t; + typedef struct __bam_index_t bam_index_t; + + /*! + @abstract Build index for a BAM file. + @discussion Index file "fn.bai" will be created. + @param fn name of the BAM file + @return always 0 currently + */ + int bam_index_build(const char *fn); + + /*! + @abstract Load index from file "fn.bai". + @param fn name of the BAM file (NOT the index file) + @return pointer to the index structure + */ + bam_index_t *bam_index_load(const char *fn); + + /*! + @abstract Destroy an index structure. + @param idx pointer to the index structure + */ + void bam_index_destroy(bam_index_t *idx); + + /*! @typedef + @abstract Type of function to be called by bam_fetch(). + @param b the alignment + @param data user provided data + */ + typedef int (*bam_fetch_f)(const bam1_t *b, void *data); + + /*! + @abstract Retrieve the alignments that are overlapped with the + specified region. 
+ + @discussion A user defined function will be called for each + retrieved alignment ordered by its start position. + + @param fp BAM file handler + @param idx pointer to the alignment index + @param tid chromosome ID as is defined in the header + @param beg start coordinate, 0-based + @param end end coordinate, 0-based + @param data user provided data (will be transferred to func) + @param func user defined function + */ + int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); + + /*! + @abstract Parse a region in the format: "chr2:100,000-200,000". + @discussion bam_header_t::hash will be initialized if empty. + @param header pointer to the header structure + @param str string to be parsed + @param ref_id the returned chromosome ID + @param begin the returned start coordinate + @param end the returned end coordinate + @return 0 on success; -1 on failure + */ + int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + + /*! + @abstract Retrieve data of a tag + @param b pointer to an alignment struct + @param tag two-character tag to be retrieved + + @return pointer to the type and data. The first character is the + type that can be 'iIsScCdfAZH'. + + @discussion Use bam_aux2?() series to convert the returned data to + the corresponding type. + */ + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); + + int32_t bam_aux2i(const uint8_t *s); + float bam_aux2f(const uint8_t *s); + double bam_aux2d(const uint8_t *s); + char bam_aux2A(const uint8_t *s); + char *bam_aux2Z(const uint8_t *s); + + int bam_aux_del(bam1_t *b, uint8_t *s); + void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); + uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() + + /*! + @abstract Calculate the rightmost coordinate of an alignment on the + reference genome. 
/*!
  @abstract     Find the smallest bin that contains the whole region [beg,end).
  @param  beg   start of the region, 0-based
  @param  end   end of the region, 0-based and exclusive
  @return       bin number; 0 when no finer level holds the region
 */
static inline int bam_reg2bin(uint32_t beg, uint32_t end)
{
	// levels from finest to coarsest: windows of 2^14 ... 2^26 positions,
	// each with the number of its first bin
	static const int level_shift[5] = { 14, 17, 20, 23, 26 };
	static const uint32_t level_first[5] = { 4681, 585, 73, 9, 1 };
	const uint32_t last = end - 1; // last position covered by the half-open region
	int lv;

	for (lv = 0; lv < 5; ++lv) {
		const uint32_t window = beg >> level_shift[lv];
		// both ends fall into the same window of this level
		if (window == last >> level_shift[lv]) return level_first[lv] + window;
	}
	return 0; // only the top-level bin spans the region
}
+ @abstract Duplicate an alignment + @param src source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_dup1(const bam1_t *src) +{ + bam1_t *b; + b = bam_init1(); + *b = *src; + b->m_data = b->data_len; + b->data = (uint8_t*)calloc(b->data_len, 1); + memcpy(b->data, src->data, b->data_len); + return b; +} + +#endif diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c new file mode 100644 index 0000000..89e99f2 --- /dev/null +++ b/samtools/bam_aux.c @@ -0,0 +1,182 @@ +#include +#include "bam.h" +#include "khash.h" +typedef char *str_p; +KHASH_MAP_INIT_STR(s, int) +KHASH_MAP_INIT_STR(r2l, str_p) + +void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) +{ + int ori_len = b->data_len; + b->data_len += 3 + len; + b->l_aux += 3 + len; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; + b->data[ori_len + 2] = type; + memcpy(b->data + ori_len + 3, data, len); +} + +uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) +{ + return bam_aux_get(b, tag); +} + +#define __skip_tag(s) do { \ + int type = toupper(*(s)); \ + ++(s); \ + if (type == 'C' || type == 'A') ++(s); \ + else if (type == 'S') (s) += 2; \ + else if (type == 'I' || type == 'F') (s) += 4; \ + else if (type == 'D') (s) += 8; \ + else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ + } while (0) + +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) +{ + uint8_t *s; + int y = tag[0]<<8 | tag[1]; + s = bam1_aux(b); + while (s < b->data + b->data_len) { + int x = (int)s[0]<<8 | s[1]; + s += 2; + if (x == y) return s; + __skip_tag(s); + } + return 0; +} +// s MUST BE returned by bam_aux_get() +int bam_aux_del(bam1_t *b, uint8_t *s) +{ + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(p, s, b->l_aux - (s - aux)); + b->data_len -= s 
- p; + b->l_aux -= s - p; + return 0; +} + +void bam_init_header_hash(bam_header_t *header) +{ + if (header->hash == 0) { + int ret, i; + khiter_t iter; + khash_t(s) *h; + header->hash = h = kh_init(s); + for (i = 0; i < header->n_targets; ++i) { + iter = kh_put(s, h, header->target_name[i], &ret); + kh_value(h, iter) = i; + } + } +} + +void bam_destroy_header_hash(bam_header_t *header) +{ + if (header->hash) + kh_destroy(s, (khash_t(s)*)header->hash); +} + +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) +{ + khint_t k; + khash_t(s) *h = (khash_t(s)*)header->hash; + k = kh_get(s, h, seq_name); + return k == kh_end(h)? -1 : kh_value(h, k); +} + +int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + khiter_t iter; + khash_t(s) *h; + + bam_init_header_hash(header); + h = (khash_t(s)*)header->hash; + + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the ref_id */ + if (iter == kh_end(h)) { // name not found + *ref_id = -1; free(s); + return -1; + } + *ref_id = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return -1; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) { + fprintf(stderr, "[bam_parse_region] invalid region.\n"); + return -1; + } + return 0; +} + +int32_t bam_aux2i(const uint8_t *s) +{ + int type; + if (s == 0) return 0; + type = *s++; + if (type == 'c') return (int32_t)*(int8_t*)s; + else if (type == 'C') return (int32_t)*(uint8_t*)s; + else if (type == 's') return (int32_t)*(int16_t*)s; + else if (type == 'S') return 
/*
 * Converters for the pointer returned by bam_aux_get(): s points at the
 * type character, immediately followed by the value.  Each one returns a
 * zero value when s is NULL (tag not found) or the type does not match.
 */

/* Value of an 'f' field, or 0.0. */
float bam_aux2f(const uint8_t *s)
{
	float v;
	if (s == 0) return 0.0; // must be tested before s is dereferenced
	if (*s != 'f') return 0.0;
	// the value follows a 3-byte tag/type prefix, so it may be unaligned
	memcpy(&v, s + 1, sizeof(v));
	return v;
}

/* Value of a 'd' field, or 0.0. */
double bam_aux2d(const uint8_t *s)
{
	double v;
	if (s == 0) return 0.0;
	if (*s != 'd') return 0.0;
	memcpy(&v, s + 1, sizeof(v)); // see bam_aux2f() on alignment
	return v;
}

/* Character of an 'A' field, or 0. */
char bam_aux2A(const uint8_t *s)
{
	if (s == 0) return 0;
	if (*s != 'A') return 0;
	return *(char*)(s + 1);
}

/* NUL-terminated string of a 'Z' or 'H' field (points into the record), or NULL. */
char *bam_aux2Z(const uint8_t *s)
{
	if (s == 0) return 0;
	if (*s == 'Z' || *s == 'H') return (char*)(s + 1);
	return 0;
}
/*!
  @abstract  Convert a nucleotide letter to a small integer
  @param  a  nucleotide character, either case
  @return    0, 1, 2, 3 for A, C, G, T; 4 for anything else
 */
char bam_aux_nt2int(char a)
{
	// toupper() is only defined for unsigned char values (and EOF)
	switch (toupper((unsigned char)a)) {
	case 'A': return 0;
	case 'C': return 1;
	case 'G': return 2;
	case 'T': return 3;
	default:  return 4;
	}
}

/*!
  @abstract  Get the color encoding a pair of adjacent bases
  @param  a  first base
  @param  b  second base
  @return    '0'..'3' (XOR of the two base codes), or '4' when either
             base is not one of A, C, G, T
 */
char bam_aux_ntnt2cs(char a, char b)
{
	a = bam_aux_nt2int(a);
	b = bam_aux_nt2int(b);
	if (4 == a || 4 == b) return '4';
	return "0123"[(int)(a ^ b)];
}
cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + + // corrected color + cor_color = bam_aux_ntnt2cs(prev_b, cur_b); + + if(cur_color == cor_color) { + return '-'; + } + else { + return cur_color; + } +} diff --git a/samtools/bam_endian.h b/samtools/bam_endian.h new file mode 100644 index 0000000..0fc74a8 --- /dev/null +++ b/samtools/bam_endian.h @@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif diff --git a/samtools/bam_import.c b/samtools/bam_import.c new file mode 100644 index 0000000..9d463d1 --- /dev/null +++ b/samtools/bam_import.c @@ -0,0 +1,439 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#endif +#include "kstring.h" +#include "bam.h" +#include "sam_header.h" +#include "kseq.h" +#include "khash.h" + +KSTREAM_INIT(gzFile, gzread, 8192) +KHASH_MAP_INIT_STR(ref, uint64_t) + 
+void bam_init_header_hash(bam_header_t *header); +void bam_destroy_header_hash(bam_header_t *header); +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + +unsigned char bam_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +unsigned short bam_char2flag_table[256] = { + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0, + BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 +}; + +char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"; + +struct __tamFile_t { + gzFile fp; + kstream_t *ks; + kstring_t *str; + uint64_t 
n_lines; + int is_first; +}; + +char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only +{ + char **list = 0, *s; + int n = 0, dret, m = 0; + gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + kstream_t *ks; + kstring_t *str; + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, '\n', str, &dret) > 0) { + if (n == m) { + m = m? m << 1 : 16; + list = (char**)realloc(list, m * sizeof(char*)); + } + if (str->s[str->l-1] == '\r') + str->s[--str->l] = '\0'; + s = list[n++] = (char*)calloc(str->l + 1, 1); + strcpy(s, str->s); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + *_n = n; + return list; +} + +static bam_header_t *hash2header(const kh_ref_t *hash) +{ + bam_header_t *header; + khiter_t k; + header = bam_header_init(); + header->n_targets = kh_size(hash); + header->target_name = (char**)calloc(kh_size(hash), sizeof(char*)); + header->target_len = (uint32_t*)calloc(kh_size(hash), 4); + for (k = kh_begin(hash); k != kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + int i = (int)kh_value(hash, k); + header->target_name[i] = (char*)kh_key(hash, k); + header->target_len[i] = kh_value(hash, k)>>32; + } + } + bam_init_header_hash(header); + return header; +} +bam_header_t *sam_header_read2(const char *fn) +{ + bam_header_t *header; + int c, dret, ret; + gzFile fp; + kstream_t *ks; + kstring_t *str; + kh_ref_t *hash; + khiter_t k; + if (fn == 0) return 0; + fp = (strcmp(fn, "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + if (fp == 0) return 0; + hash = kh_init(ref); + ks = ks_init(fp); + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + while (ks_getuntil(ks, 0, str, &dret) > 0) { + char *s = strdup(str->s); + int len, i; + i = kh_size(hash); + ks_getuntil(ks, 0, str, &dret); + len = atoi(str->s); + k = kh_put(ref, hash, s, &ret); + kh_value(hash, k) = (uint64_t)len<<32 | i; + if (dret != '\n') + while ((c = ks_getc(ks)) != '\n' && c != -1); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); + header = hash2header(hash); + kh_destroy(ref, hash); + return header; +} +static inline uint8_t *alloc_data(bam1_t *b, int size) +{ + if (b->m_data < size) { + b->m_data = size; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + return b->data; +} +static inline void parse_error(int64_t n_lines, const char * __restrict msg) +{ + fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg); + abort(); +} +static inline void append_text(bam_header_t *header, kstring_t *str) +{ + int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null + kroundup32(x); kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. 
+ header->l_text += str->l + 1; + header->text[header->l_text] = 0; +} + +int sam_header_parse(bam_header_t *h) +{ + char **tmp; + int i; + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + if (h->l_text < 3) return 0; + if (h->dict == 0) h->dict = sam_header_parse2(h->text); + tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets); + if (h->n_targets == 0) return 0; + h->target_name = calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) + h->target_name[i] = strdup(tmp[i]); + free(tmp); + tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets); + h->target_len = calloc(h->n_targets, 4); + for (i = 0; i < h->n_targets; ++i) + h->target_len[i] = atoi(tmp[i]); + free(tmp); + return h->n_targets; +} + +bam_header_t *sam_header_read(tamFile fp) +{ + int ret, dret; + bam_header_t *header = bam_header_init(); + kstring_t *str = fp->str; + while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header + str->s[str->l] = dret; // note that str->s is NOT null terminated!! + append_text(header, str); + if (dret != '\n') { + ret = ks_getuntil(fp->ks, '\n', str, &dret); + str->s[str->l] = '\n'; // NOT null terminated!! 
+ append_text(header, str); + } + ++fp->n_lines; + } + sam_header_parse(header); + bam_init_header_hash(header); + fp->is_first = 1; + return header; +} + +int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) +{ + int ret, doff, doff0, dret, z = 0; + bam1_core_t *c = &b->core; + kstring_t *str = fp->str; + kstream_t *ks = fp->ks; + + if (fp->is_first) { + fp->is_first = 0; + ret = str->l; + } else { + do { // special consideration for empty lines + ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); + if (ret >= 0) z += str->l + 1; + } while (ret == 0); + } + if (ret < 0) return -1; + ++fp->n_lines; + doff = 0; + + { // name + c->l_qname = strlen(str->s) + 1; + memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname); + doff += c->l_qname; + } + { // flag + long flag; + char *s; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + flag = strtol((char*)str->s, &s, 0); + if (*s) { // not the end of the string + flag = 0; + for (s = str->s; *s; ++s) + flag |= bam_char2flag_table[(int)*s]; + } + c->flag = flag; + } + { // tid, pos, qual + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); + if (c->tid < 0 && strcmp(str->s, "*")) { + if (header->n_targets == 0) { + fprintf(stderr, "[sam_read1] missing header? Abort!\n"); + exit(1); + } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s); + } + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? 
atoi(str->s) : 0; + if (ret < 0) return -2; + } + { // cigar + char *s, *t; + int i, op; + long x; + c->n_cigar = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; + z += str->l + 1; + if (str->s[0] != '*') { + for (s = str->s; *s; ++s) { + if (isalpha(*s)) ++c->n_cigar; + else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); + } + b->data = alloc_data(b, doff + c->n_cigar * 4); + for (i = 0, s = str->s; i != c->n_cigar; ++i) { + x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; + else if (op == 'I') op = BAM_CINS; + else if (op == 'D') op = BAM_CDEL; + else if (op == 'N') op = BAM_CREF_SKIP; + else if (op == 'S') op = BAM_CSOFT_CLIP; + else if (op == 'H') op = BAM_CHARD_CLIP; + else if (op == 'P') op = BAM_CPAD; + else parse_error(fp->n_lines, "invalid CIGAR operation"); + s = t + 1; + bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; + } + if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); + c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); + doff += c->n_cigar * 4; + } else { + if (!(c->flag&BAM_FUNMAP)) { + fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines); + c->flag |= BAM_FUNMAP; + } + c->bin = bam_reg2bin(c->pos, c->pos + 1); + } + } + { // mtid, mpos, isize + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? 
atoi(str->s) : 0; + if (ret < 0) return -4; + } + { // seq and qual + int i; + uint8_t *p = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq + z += str->l + 1; + if (strcmp(str->s, "*")) { + c->l_qseq = strlen(str->s); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; + memset(p, 0, (c->l_qseq+1)/2); + for (i = 0; i < c->l_qseq; ++i) + p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); + } else c->l_qseq = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual + z += str->l + 1; + if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) + parse_error(fp->n_lines, "sequence and quality are inconsistent"); + p += (c->l_qseq+1)/2; + if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff; + else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; + doff += c->l_qseq + (c->l_qseq+1)/2; + } + doff0 = doff; + if (dret != '\n' && dret != '\r') { // aux + while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) { + uint8_t *s, type, key[2]; + z += str->l + 1; + if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') + parse_error(fp->n_lines, "missing colon in auxiliary data"); + key[0] = str->s[0]; key[1] = str->s[1]; + type = str->s[3]; + s = alloc_data(b, doff + 3) + doff; + s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility + s = alloc_data(b, doff + 2) + doff; + *s++ = 'A'; *s = str->s[5]; + doff += 2; + } else if (type == 'I' || type == 'i') { + long long x; + s = alloc_data(b, doff + 5) + doff; + x = (long long)atoll(str->s + 5); + if (x < 0) { + if (x >= -127) { + *s++ = 'c'; *(int8_t*)s = (int8_t)x; + s += 1; doff += 2; + } else if (x >= -32767) { + *s++ = 's'; *(int16_t*)s = (int16_t)x; + s += 2; doff += 3; + } else { + *s++ = 'i'; 
*(int32_t*)s = (int32_t)x; + s += 4; doff += 5; + if (x < -2147483648ll) + fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", + (long long)fp->n_lines, x); + } + } else { + if (x <= 255) { + *s++ = 'C'; *s++ = (uint8_t)x; + doff += 2; + } else if (x <= 65535) { + *s++ = 'S'; *(uint16_t*)s = (uint16_t)x; + s += 2; doff += 3; + } else { + *s++ = 'I'; *(uint32_t*)s = (uint32_t)x; + s += 4; doff += 5; + if (x > 4294967295ll) + fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", + (long long)fp->n_lines, x); + } + } + } else if (type == 'f') { + s = alloc_data(b, doff + 5) + doff; + *s++ = 'f'; + *(float*)s = (float)atof(str->s + 5); + s += 4; doff += 5; + } else if (type == 'd') { + s = alloc_data(b, doff + 9) + doff; + *s++ = 'd'; + *(float*)s = (float)atof(str->s + 9); + s += 8; doff += 9; + } else if (type == 'Z' || type == 'H') { + int size = 1 + (str->l - 5) + 1; + if (type == 'H') { // check whether the hex string is valid + int i; + if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even"); + for (i = 0; i < str->l - 5; ++i) { + int c = toupper(str->s[5 + i]); + if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) + parse_error(fp->n_lines, "invalid hex character"); + } + } + s = alloc_data(b, doff + size) + doff; + *s++ = type; + memcpy(s, str->s + 5, str->l - 5); + s[str->l - 5] = 0; + doff += size; + } else parse_error(fp->n_lines, "unrecognized type"); + if (dret == '\n' || dret == '\r') break; + } + } + b->l_aux = doff - doff0; + b->data_len = doff; + return z; +} + +tamFile sam_open(const char *fn) +{ + tamFile fp; + gzFile gzfp = (strcmp(fn, "-") == 0)? 
gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb"); + if (gzfp == 0) return 0; + fp = (tamFile)calloc(1, sizeof(struct __tamFile_t)); + fp->str = (kstring_t*)calloc(1, sizeof(kstring_t)); + fp->fp = gzfp; + fp->ks = ks_init(fp->fp); + return fp; +} + +void sam_close(tamFile fp) +{ + if (fp) { + ks_destroy(fp->ks); + gzclose(fp->fp); + free(fp->str->s); free(fp->str); + free(fp); + } +} diff --git a/samtools/bam_index.c b/samtools/bam_index.c new file mode 100644 index 0000000..a627884 --- /dev/null +++ b/samtools/bam_index.c @@ -0,0 +1,574 @@ +#include +#include +#include "bam.h" +#include "khash.h" +#include "ksort.h" +#include "bam_endian.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +/*! + @header + + Alignment indexing. Before indexing, BAM must be sorted based on the + leftmost coordinate of alignments. In indexing, BAM uses two indices: + a UCSC binning index and a simple linear index. The binning index is + efficient for alignments spanning long distance, while the auxiliary + linear index helps to reduce unnecessary seek calls especially for + short alignments. + + The UCSC binning scheme was suggested by Richard Durbin and Lincoln + Stein and is explained by Kent et al. (2002). In this scheme, each bin + represents a contiguous genomic region which can be fully contained in + another bin; each alignment is associated with a bin which represents + the smallest region containing the entire alignment. The binning + scheme is essentially another representation of R-tree. A distinct bin + uniquely corresponds to a distinct internal node in a R-tree. Bin A is + a child of Bin B if region A is contained in B. + + In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin + 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp, + 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. 
If we want to + find the alignments overlapped with a region [rbeg,rend), we need to + calculate the list of bins that may be overlapped the region and test + the alignments in the bins to confirm the overlaps. If the specified + region is short, typically only a few alignments in six bins need to + be retrieved. The overlapping alignments can be quickly fetched. + + */ + +#define BAM_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. +#define BAM_LIDX_SHIFT 14 + +typedef struct { + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) +KSORT_INIT(off, pair64_t, pair64_lt) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} bam_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bam_lidx_t; + +KHASH_MAP_INIT_INT(i, bam_binlist_t) + +struct __bam_index_t { + int32_t n; + khash_t(i) **index; + bam_lidx_t *index2; +}; + +// requirement: len <= LEN_MASK +static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + bam_binlist_t *l; + int ret; + k = kh_put(i, h, bin, &ret); + l = &kh_value(h, k); + if (ret) { // not present + l->m = 1; l->n = 0; + l->list = (pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; l->list[l->n++].v = end; +} + +static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) +{ + int i, beg, end; + beg = b->core.pos >> BAM_LIDX_SHIFT; + end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + for (i = beg + 1; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + index2->n = end + 1; +} + +static void merge_chunks(bam_index_t *idx) +{ +#if defined(BAM_TRUE_OFFSET) || 
defined(BAM_VIRTUAL_OFFSET16) + khash_t(i) *index; + int i, l, m; + khint_t k; + for (i = 0; i < idx->n; ++i) { + index = idx->index[i]; + for (k = kh_begin(index); k != kh_end(index); ++k) { + bam_binlist_t *p; + if (!kh_exist(index, k)) continue; + p = &kh_value(index, k); + m = 0; + for (l = 1; l < p->n; ++l) { +#ifdef BAM_TRUE_OFFSET + if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; +#else + if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; +#endif + else p->list[++m] = p->list[l]; + } // ~for(l) + p->n = m + 1; + } // ~for(k) + } // ~for(i) +#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) +} + +bam_index_t *bam_index_core(bamFile fp) +{ + bam1_t *b; + bam_header_t *h; + int i, ret; + bam_index_t *idx; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; + bam1_core_t *c; + uint64_t save_off, last_off; + + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + h = bam_header_read(fp); + c = &b->core; + + idx->n = h->n_targets; + bam_header_destroy(h); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); + idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); + + save_bin = save_tid = last_tid = last_bin = 0xffffffffu; + save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; + while ((ret = bam_read1(fp, b)) >= 0) { + if (last_tid != c->tid) { // change of chromosomes + last_tid = c->tid; + last_bin = 0xffffffffu; + } else if (last_coor > c->pos) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", + bam1_qname(b), last_coor, c->pos, c->tid+1); + exit(1); + } + if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->bin != last_bin) { // then possibly write the binning index + if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + 
insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + save_off = last_off; + save_bin = last_bin = c->bin; + save_tid = c->tid; + if (save_tid < 0) break; + } + if (bam_tell(fp) <= last_off) { + fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", + (unsigned long long)bam_tell(fp), (unsigned long long)last_off); + exit(1); + } + last_off = bam_tell(fp); + last_coor = b->core.pos; + } + if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + merge_chunks(idx); + if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); + free(b->data); free(b); + return idx; +} + +void bam_index_destroy(bam_index_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) + free(kh_value(index, k).list); + } + kh_destroy(i, index); + free(index2->offset); + } + free(idx->index); free(idx->index2); + free(idx); +} + +void bam_index_save(const bam_index_t *idx, FILE *fp) +{ + int32_t i, size; + khint_t k; + fwrite("BAI\1", 1, 4, fp); + if (bam_is_be) { + uint32_t x = idx->n; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&idx->n, 4, 1, fp); + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + // write binning index + size = kh_size(index); + if (bam_is_be) { // big endian + uint32_t x = size; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&size, 4, 1, fp); + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) { + bam_binlist_t *p = &kh_value(index, k); + if (bam_is_be) { // big endian + uint32_t x; + x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); 
+ bam_swap_endian_8p(&p->list[x].v); + } + fwrite(p->list, 16, p->n, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } else { + fwrite(&kh_key(index, k), 4, 1, fp); + fwrite(&p->n, 4, 1, fp); + fwrite(p->list, 16, p->n, fp); + } + } + } + // write linear index (index2) + if (bam_is_be) { + int x = index2->n; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&index2->n, 4, 1, fp); + if (bam_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + fwrite(index2->offset, 8, index2->n, fp); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else fwrite(index2->offset, 8, index2->n, fp); + } + fflush(fp); +} + +static bam_index_t *bam_index_load_core(FILE *fp) +{ + int i; + char magic[4]; + bam_index_t *idx; + if (fp == 0) { + fprintf(stderr, "[bam_index_load_core] fail to load index.\n"); + return 0; + } + fread(magic, 1, 4, fp); + if (strncmp(magic, "BAI\1", 4)) { + fprintf(stderr, "[bam_index_load] wrong magic number.\n"); + fclose(fp); + return 0; + } + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); + fread(&idx->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&idx->n); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index; + bam_lidx_t *index2 = idx->index2 + i; + uint32_t key, size; + khint_t k; + int j, ret; + bam_binlist_t *p; + index = idx->index[i] = kh_init(i); + // load binning index + fread(&size, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&size); + for (j = 0; j < (int)size; ++j) { + fread(&key, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&key); + k = kh_put(i, index, key, &ret); + p = &kh_value(index, k); + fread(&p->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&p->n); + p->m = p->n; + p->list = (pair64_t*)malloc(p->m * 16); + fread(p->list, 
16, p->n, fp); + if (bam_is_be) { + int x; + for (x = 0; x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } + } + // load linear index + fread(&index2->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + fread(index2->offset, index2->n, 8, fp); + if (bam_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +bam_index_t *bam_index_load_local(const char *_fn) +{ + FILE *fp; + char *fnidx, *fn; + + if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) { + const char *p; + int l = strlen(_fn); + for (p = _fn + l - 1; p >= _fn; --p) + if (*p == '/') break; + fn = strdup(p + 1); + } else fn = strdup(_fn); + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + fp = fopen(fnidx, "r"); + if (fp == 0) { // try "{base}.bai" + char *s = strstr(fn, "bam"); + if (s == fn + strlen(fn) - 3) { + strcpy(fnidx, fn); + fnidx[strlen(fn)-1] = 'i'; + fp = fopen(fnidx, "r"); + } + } + free(fnidx); free(fn); + if (fp) { + bam_index_t *idx = bam_index_load_core(fp); + fclose(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + 
while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif + +bam_index_t *bam_index_load(const char *fn) +{ + bam_index_t *idx; + idx = bam_index_load_local(fn); + if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) { + char *fnidx = calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bai"); + fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n"); + download_from_remote(fnidx); + idx = bam_index_load_local(fn); + } + if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); + return idx; +} + +int bam_index_build2(const char *fn, const char *_fnidx) +{ + char *fnidx; + FILE *fpidx; + bamFile fp; + bam_index_t *idx; + if ((fp = bam_open(fn, "r")) == 0) { + fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n"); + return -1; + } + idx = bam_index_core(fp); + bam_close(fp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + } else fnidx = strdup(_fnidx); + fpidx = fopen(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + bam_index_save(idx, fpidx); + bam_index_destroy(idx); + fclose(fpidx); + free(fnidx); + return 0; +} + +int bam_index_build(const char *fn) +{ + return bam_index_build2(fn, 0); +} + +int bam_index(int argc, char *argv[]) +{ + if (argc < 2) { + fprintf(stderr, "Usage: samtools index []\n"); + return 1; + } + if (argc >= 3) bam_index_build2(argv[1], argv[2]); + else bam_index_build(argv[1]); + return 0; +} + +#define MAX_BIN 37450 // =(8^6-1)/7+1 + +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +{ + int i = 0, k; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); 
k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; +} + +static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) +{ + uint32_t rbeg = b->core.pos; + uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; + return (rend > beg && rbeg < end); +} + +// bam_fetch helper function retrieves +pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off) +{ + uint16_t *bins; + int i, n_bins, n_off; + pair64_t *off; + khint_t k; + khash_t(i) *index; + uint64_t min_off; + + bins = (uint16_t*)calloc(MAX_BIN, 2); + n_bins = reg2bins(beg, end, bins); + index = idx->index[tid]; + min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) + n_off += kh_value(index, k).n; + } + if (n_off == 0) { + free(bins); return 0; + } + off = (pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { + int j; + bam_binlist_t *p = &kh_value(index, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + free(bins); + { + bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); + int l; + ks_introsort(off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + { // merge adjacent blocks +#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) + for (i = 1, l = 0; i < 
n_off; ++i) { +#ifdef BAM_TRUE_OFFSET + if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; +#else + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; +#endif + else off[++l] = off[i]; + } + n_off = l + 1; +#endif + } + bam_destroy1(b); + } + *cnt_off = n_off; + return off; +} + +int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +{ + int n_off; + pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off); + if (off == 0) return 0; + { + // retrive alignments + uint64_t curr_off; + int i, ret, n_seeks; + n_seeks = 0; i = -1; curr_off = 0; + bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); + for (;;) { + if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk + if (i == n_off - 1) break; // no more chunks + if (i >= 0) assert(curr_off == off[i].v); // otherwise bug + if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek + bam_seek(fp, off[i+1].u, SEEK_SET); + curr_off = bam_tell(fp); + ++n_seeks; + } + ++i; + } + if ((ret = bam_read1(fp, b)) > 0) { + curr_off = bam_tell(fp); + if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed + else if (is_overlap(beg, end, b)) func(b, data); + } else break; // end of file + } +// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); + bam_destroy1(b); + } + free(off); + return 0; +} diff --git a/samtools/bam_lpileup.c b/samtools/bam_lpileup.c new file mode 100644 index 0000000..d4dd63b --- /dev/null +++ b/samtools/bam_lpileup.c @@ -0,0 +1,198 @@ +#include +#include +#include +#include "bam.h" +#include "ksort.h" + +#define TV_GAP 2 + +typedef struct __freenode_t { + uint32_t level:28, cnt:4; + struct __freenode_t *next; +} freenode_t, *freenode_p; + +#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) +KSORT_INIT(node, freenode_p, freenode_lt) + +/* Memory pool, similar to the one in bam_pileup.c */ +typedef struct { + int cnt, n, max; 
+ freenode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + return (mempool_t*)calloc(1, sizeof(mempool_t)); +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) free(mp->buf[k]); + free(mp->buf); free(mp); +} +static inline freenode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, freenode_t *p) +{ + --mp->cnt; p->next = 0; p->cnt = TV_GAP; + if (mp->n == mp->max) { + mp->max = mp->max? mp->max<<1 : 256; + mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* core part */ +struct __bam_lplbuf_t { + int max, n_cur, n_pre; + int max_level, *cur_level, *pre_level; + mempool_t *mp; + freenode_t **aux, *head, *tail; + int n_nodes, m_aux; + bam_pileup_f func; + void *user_data; + bam_plbuf_t *plbuf; +}; + +void bam_lplbuf_reset(bam_lplbuf_t *buf) +{ + freenode_t *p, *q; + bam_plbuf_reset(buf->plbuf); + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; + buf->max_level = 0; + buf->n_cur = buf->n_pre = 0; + buf->n_nodes = 0; +} + +static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + bam_lplbuf_t *tv = (bam_lplbuf_t*)data; + freenode_t *p; + int i, l, max_level; + // allocate memory if necessary + if (tv->max < n) { // enlarge + tv->max = n; + kroundup32(tv->max); + tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); + tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); + } + tv->n_cur = n; + // update cnt + for (p = tv->head; p->next; p = p->next) + if (p->cnt > 0) --p->cnt; + // calculate cur_level[] + max_level = 0; + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) { + if (tv->head->next && tv->head->cnt == 0) { // then take a free slot + freenode_t *p = 
tv->head->next; + tv->cur_level[i] = tv->head->level; + mp_free(tv->mp, tv->head); + tv->head = p; + --tv->n_nodes; + } else tv->cur_level[i] = ++tv->max_level; + } else { + tv->cur_level[i] = tv->pre_level[l++]; + if (p->is_tail) { // then return a free slot + tv->tail->level = tv->cur_level[i]; + tv->tail->next = mp_alloc(tv->mp); + tv->tail = tv->tail->next; + ++tv->n_nodes; + } + } + if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; + ((bam_pileup1_t*)p)->level = tv->cur_level[i]; + } + assert(l == tv->n_pre); + tv->func(tid, pos, n, pl, tv->user_data); + // sort the linked list + if (tv->n_nodes) { + freenode_t *q; + if (tv->n_nodes + 1 > tv->m_aux) { // enlarge + tv->m_aux = tv->n_nodes + 1; + kroundup32(tv->m_aux); + tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); + } + for (p = tv->head, i = l = 0; p->next;) { + if (p->level > max_level) { // then discard this entry + q = p->next; + mp_free(tv->mp, p); + p = q; + } else { + tv->aux[i++] = p; + p = p->next; + } + } + tv->aux[i] = tv->tail; // add a proper tail for the loop below + tv->n_nodes = i; + if (tv->n_nodes) { + ks_introsort(node, tv->n_nodes, tv->aux); + for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; + tv->head = tv->aux[0]; + } else tv->head = tv->tail; + } + // clean up + tv->max_level = max_level; + memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); + // squeeze out terminated levels + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!p->is_tail) + tv->pre_level[l++] = tv->pre_level[i]; + } + tv->n_pre = l; +/* + fprintf(stderr, "%d\t", pos+1); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) fprintf(stderr, "^"); + if (p->is_tail) fprintf(stderr, "$"); + fprintf(stderr, "%d,", p->level); + } + fprintf(stderr, "\n"); +*/ + return 0; +} + +bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) +{ + bam_lplbuf_t *tv; + tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); + tv->mp = 
mp_init(); + tv->head = tv->tail = mp_alloc(tv->mp); + tv->func = func; + tv->user_data = data; + tv->plbuf = bam_plbuf_init(tview_func, tv); + return (bam_lplbuf_t*)tv; +} + +void bam_lplbuf_destroy(bam_lplbuf_t *tv) +{ + freenode_t *p, *q; + free(tv->cur_level); free(tv->pre_level); + bam_plbuf_destroy(tv->plbuf); + free(tv->aux); + for (p = tv->head; p->next;) { + q = p->next; + mp_free(tv->mp, p); p = q; + } + mp_free(tv->mp, p); + assert(tv->mp->cnt == 0); + mp_destroy(tv->mp); + free(tv); +} + +int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) +{ + return bam_plbuf_push(b, tv->plbuf); +} diff --git a/samtools/bam_maqcns.c b/samtools/bam_maqcns.c new file mode 100644 index 0000000..71c2185 --- /dev/null +++ b/samtools/bam_maqcns.c @@ -0,0 +1,601 @@ +#include +#include +#include "bam.h" +#include "bam_maqcns.h" +#include "ksort.h" +#include "kaln.h" +KSORT_INIT_GENERIC(uint32_t) + +#define INDEL_WINDOW_SIZE 50 +#define INDEL_EXT_DEP 0.9 + +typedef struct __bmc_aux_t { + int max; + uint32_t *info; +} bmc_aux_t; + +typedef struct { + float esum[4], fsum[4]; + uint32_t c[4]; + uint32_t rms_mapQ; +} glf_call_aux_t; + +char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +/* + P() = \theta \sum_{i=1}^{N-1} 1/i + P(D|) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2] + p_k = 1/k / \sum_{i=1}^{N-1} 1/i + */ +static void cal_het(bam_maqcns_t *aa) +{ + int k, n1, n2; + double sum_harmo; // harmonic sum + double poly_rate; + + free(aa->lhet); + aa->lhet = (double*)calloc(256 * 256, sizeof(double)); + sum_harmo = 0.0; + for (k = 1; k <= aa->n_hap - 1; ++k) + sum_harmo += 1.0 / k; + for (n1 = 0; n1 < 256; ++n1) { + for (n2 = 0; n2 < 256; ++n2) { + long double sum = 0.0; + double lC = aa->is_soap? 
0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1} + for (k = 1; k <= aa->n_hap - 1; ++k) { + double pk = 1.0 / k / sum_harmo; + double log1 = log((double)k/aa->n_hap); + double log2 = log(1.0 - (double)k/aa->n_hap); + sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2)); + } + aa->lhet[n1<<8|n2] = lC + logl(sum); + } + } + poly_rate = aa->het_rate * sum_harmo; + aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate)); +} + +/** initialize the helper structure */ +static void cal_coef(bam_maqcns_t *aa) +{ + int k, n, q; + long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256]; + double *lC; + + // aa->lhet will be allocated and initialized + free(aa->fk); free(aa->coef); + aa->coef = 0; + aa->fk = (double*)calloc(256, sizeof(double)); + aa->fk[0] = fk2[0] = 1.0; + for (n = 1; n != 256; ++n) { + aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta; + fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands + } + if (aa->is_soap) return; + aa->coef = (double*)calloc(256*256*64, sizeof(double)); + lC = (double*)calloc(256 * 256, sizeof(double)); + for (n = 1; n != 256; ++n) + for (k = 1; k <= n; ++k) + lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); + for (q = 1; q != 64; ++q) { + double e = pow(10.0, -q/10.0); + double le = log(e); + double le1 = log(1.0-e); + for (n = 1; n != 256; ++n) { + double *coef = aa->coef + (q<<16|n<<8); + sum_a[n+1] = 0.0; + for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k} + sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1); + b[k] = sum_a[k+1] / sum_a[k]; + if (b[k] > 0.99) b[k] = 0.99; + } + for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k}) + q_c[k] = -4.343 * fk2[k] * logl(b[k] / e); + for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i + for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9 + tmp[k] = -4.343 
* logl(1.0 - expl(fk2[k] * logl(b[k]))); + coef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk} + } + } + } + free(lC); +} + +bam_maqcns_t *bam_maqcns_init() +{ + bam_maqcns_t *bm; + bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t)); + bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t)); + bm->het_rate = 0.001; + bm->theta = 0.85; + bm->n_hap = 2; + bm->eta = 0.03; + bm->cap_mapQ = 60; + return bm; +} + +void bam_maqcns_prepare(bam_maqcns_t *bm) +{ + cal_coef(bm); cal_het(bm); +} + +void bam_maqcns_destroy(bam_maqcns_t *bm) +{ + if (bm == 0) return; + free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); + free(bm->aux); free(bm); +} + +glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm) +{ + glf_call_aux_t *b; + int i, j, k, w[8], c, n; + glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t)); + float p[16], min_p = 1e30; + uint64_t rms; + + g->ref_base = ref_base; + if (_n == 0) return g; + + // construct aux array + if (bm->aux->max < _n) { + bm->aux->max = _n; + kroundup32(bm->aux->max); + bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max); + } + for (i = n = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + uint32_t q, x = 0, qq; + if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; + q = (uint32_t)bam1_qual(p->b)[p->qpos]; + x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; + if (p->b->core.qual < q) q = p->b->core.qual; + x |= q << 24; + qq = bam1_seqi(bam1_seq(p->b), p->qpos); + q = bam_nt16_nt4_table[qq? 
qq : ref_base]; + if (!p->is_del && q < 4) x |= 1 << 21 | q << 16; + bm->aux->info[n++] = x; + } + ks_introsort(uint32_t, n, bm->aux->info); + // generate esum and fsum + b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t)); + for (k = 0; k != 8; ++k) w[k] = 0; + rms = 0; + for (j = n - 1; j >= 0; --j) { // calculate esum and fsum + uint32_t info = bm->aux->info[j]; + int tmp; + if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); + k = info>>16&7; + if (info>>24 > 0) { + b->esum[k&3] += bm->fk[w[k]] * (info>>24); + b->fsum[k&3] += bm->fk[w[k]]; + if (w[k] < 0xff) ++w[k]; + ++b->c[k&3]; + } + tmp = (int)(info&0xff) < bm->cap_mapQ? (int)(info&0xff) : bm->cap_mapQ; + rms += tmp * tmp; + } + b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499); + // rescale ->c[] + for (j = c = 0; j != 4; ++j) c += b->c[j]; + if (c > 255) { + for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5); + for (j = c = 0; j != 4; ++j) c += b->c[j]; + } + if (!bm->is_soap) { + // generate likelihood + for (j = 0; j != 4; ++j) { + // homozygous + float tmp1, tmp3; + int tmp2, bar_e; + for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) { + if (j == k) continue; + tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.5); + if (bar_e < 4) bar_e = 4; // should not happen + if (bar_e > 63) bar_e = 63; + p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; + } else p[j<<2|j] = 0.0; // all the bases are j + // heterozygous + for (k = j + 1; k < 4; ++k) { + for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) { + if (i == j || i == k) continue; + tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.5); + if (bar_e < 4) bar_e = 4; + if (bar_e > 63) bar_e = 63; + p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; + } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either 
j or k + } + // + for (k = 0; k != 4; ++k) + if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0; + } + + { // fix p[k<<2|k] + float max1, max2, min1, min2; + int max_k, min_k; + max_k = min_k = -1; + max1 = max2 = -1.0; min1 = min2 = 1e30; + for (k = 0; k < 4; ++k) { + if (b->esum[k] > max1) { + max2 = max1; max1 = b->esum[k]; max_k = k; + } else if (b->esum[k] > max2) max2 = b->esum[k]; + } + for (k = 0; k < 4; ++k) { + if (p[k<<2|k] < min1) { + min2 = min1; min1 = p[k<<2|k]; min_k = k; + } else if (p[k<<2|k] < min2) min2 = p[k<<2|k]; + } + if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) + p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0; + } + } else { // apply the SOAP model + // generate likelihood + for (j = 0; j != 4; ++j) { + float tmp; + // homozygous + for (k = 0, tmp = 0.0; k != 4; ++k) + if (j != k) tmp += b->esum[k]; + p[j<<2|j] = tmp; + // heterozygous + for (k = j + 1; k < 4; ++k) { + for (i = 0, tmp = 0.0; i != 4; ++i) + if (i != j && i != k) tmp += b->esum[i]; + p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp; + } + } + } + + // convert necessary information to glf1_t + g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ; + g->depth = n > 16777215? 16777215 : n; + for (j = 0; j != 4; ++j) + for (k = j; k < 4; ++k) + if (p[j<<2|k] < min_p) min_p = p[j<<2|k]; + g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5); + for (j = c = 0; j != 4; ++j) + for (k = j; k < 4; ++k) + g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5); + + free(b); + return g; +} + +uint32_t glf2cns(const glf1_t *g, int q_r) +{ + int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1; + uint32_t x = 0; + for (i = k = 0; i < 4; ++i) + for (j = i; j < 4; ++j) { + tmp[j<<2|i] = -1; + tmp[i<<2|j] = g->lk[k++] + (i == j? 
0 : q_r); + } + for (i = 0; i < 16; ++i) { + if (tmp[i] < 0) continue; + if (tmp[i] < min) { + min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i; + } else if (tmp[i] < min2) { + min3 = min2; min2 = tmp[i]; min_g2 = i; + } else if (tmp[i] < min3) min3 = tmp[i]; + } + x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xf << 28; + x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xf << 24; + x |= (uint32_t)g->max_mapQ << 16; + x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8; + x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? min3 - min2 : 255) : 0xff; + return x; +} + +uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) +{ + glf1_t *g; + uint32_t x; + if (n) { + g = bam_maqcns_glfgen(n, pl, 0xf, bm); + x = glf2cns(g, (int)(bm->q_r + 0.5)); + free(g); + } else x = 0xfU<<28 | 0xfU<<24; + return x; +} + +/************** *****************/ + +bam_maqindel_opt_t *bam_maqindel_opt_init() +{ + bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); + mi->q_indel = 40; + mi->r_indel = 0.00015; + // + mi->mm_penalty = 3; + mi->indel_err = 4; + mi->ambi_thres = 10; + return mi; +} + +void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) +{ + if (mir == 0) return; + free(mir->s[0]); free(mir->s[1]); free(mir); +} + +int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +{ + int k, x = c->pos, y = 0, last_y = 0; + *_tpos = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int l = cigar[k] >> BAM_CIGAR_SHIFT; + if (op == BAM_CMATCH) { + if (c->pos > tpos) return y; + if (x + l > tpos) { + *_tpos = tpos; + return y + (tpos - x); + } + x += l; y += l; + last_y = y; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + if (x + l > tpos) { + *_tpos = is_left? 
x : x + l; + return y; + } + x += l; + } + } + *_tpos = x; + return last_y; +} + +#define MINUS_CONST 0x10000000 + +bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types) +{ + int i, j, n_types, *types, left, right, max_rd_len = 0; + bam_maqindel_ret_t *ret = 0; + // if there is no proposed indel, check if there is an indel from the alignment + if (_n_types == 0) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; + } + if (i == n) return 0; // no indel + } + { // calculate how many types of indels are available (set n_types and types) + int m; + uint32_t *aux; + aux = (uint32_t*)calloc(n + _n_types + 1, 4); + m = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) + aux[m++] = MINUS_CONST + p->indel; + j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + if (_n_types) // then also add this to aux[] + for (i = 0; i < _n_types; ++i) + if (_types[i]) aux[m++] = MINUS_CONST + _types[i]; + ks_introsort(uint32_t, m, aux); + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + types = (int*)calloc(n_types, sizeof(int)); + j = 0; + types[j++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) { + if (aux[i] != aux[i-1]) + types[j++] = aux[i] - MINUS_CONST; + } + free(aux); + } + { // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? 
pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + } + { // the core part + char *ref2, *rs, *inscns = 0; + int k, l, *score, *pscore, max_ins = types[n_types-1]; + if (max_ins > 0) { // get the consensus of inserted sequences + int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); + // count occurrences + for (i = 0; i < n_types; ++i) { + if (types[i] <= 0) continue; // not insertion + for (j = 0; j < n; ++j) { + const bam_pileup1_t *p = pl + j; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) { + for (k = 1; k <= p->indel; ++k) { + int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)]; + if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c]; + } + } + } + } + // construct the consensus of inserted sequence + inscns = (char*)calloc(n_types * max_ins, sizeof(char)); + for (i = 0; i < n_types; ++i) { + for (j = 0; j < types[i]; ++j) { + int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4; + for (k = 0; k < 4; ++k) { + if (ia[k] > max) { + max = ia[k]; + max_k = k; + } + } + inscns[i*max_ins + j] = max? 
1<b->core.flag & BAM_FUNMAP) continue; + qbeg = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); + qend = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); + assert(tbeg >= left); + for (l = qbeg; l < qend; ++l) + rs[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), l)]; + { + int x, y, n_acigar, ps; + uint32_t *acigar; + ps = 0; + if (tend - tbeg + types[i] <= 0) { + score[i*n+j] = -(1<<20); + pscore[i*n+j] = 1<<20; + continue; + } + acigar = ka_global_core((uint8_t*)ref2 + tbeg - left, tend - tbeg + types[i], (uint8_t*)rs, qend - qbeg, &ap, &score[i*n+j], &n_acigar); + x = tbeg - left; y = 0; + for (l = 0; l < n_acigar; ++l) { + int op = acigar[l]&0xf; + int len = acigar[l]>>4; + if (op == BAM_CMATCH) { + int k; + for (k = 0; k < len; ++k) + if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k]; + x += len; y += len; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + if (op == BAM_CINS) ps += mi->q_indel * len; + y += len; + } else if (op == BAM_CDEL) { + ps += mi->q_indel * len; + x += len; + } + } + pscore[i*n+j] = ps; + /*if (pos == 2618517) { // for debugging only + fprintf(stderr, "pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend); + for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); fprintf(stderr, "\n"); + for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l]], stderr); fputc('\n', stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); fputc('\n', stderr); + }*/ + free(acigar); + } + } + } + { // get final result + int *sum, max1, max2, max1_i, max2_i; + // pick up the best two score + sum = (int*)calloc(n_types, sizeof(int)); + for (i = 0; i < n_types; ++i) + for (j = 0; j < n; ++j) + sum[i] += -pscore[i*n+j]; + max1 = max2 = -0x7fffffff; max1_i = max2_i = -1; + for (i = 0; i < n_types; ++i) { + if (sum[i] > max1) { + max2 = max1; max2_i = max1_i; max1 = 
sum[i]; max1_i = i; + } else if (sum[i] > max2) { + max2 = sum[i]; max2_i = i; + } + } + free(sum); + // write ret + ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t)); + ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; + ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); + ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1); + // write indel sequence + if (ret->indel1 > 0) { + ret->s[0][0] = '+'; + for (k = 0; k < ret->indel1; ++k) + ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; + } else if (ret->indel1 < 0) { + ret->s[0][0] = '-'; + for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) + ret->s[0][k+1] = ref[pos + k + 1]; + } else ret->s[0][0] = '*'; + if (ret->indel2 > 0) { + ret->s[1][0] = '+'; + for (k = 0; k < ret->indel2; ++k) + ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; + } else if (ret->indel2 < 0) { + ret->s[1][0] = '-'; + for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k) + ret->s[1][k+1] = ref[pos + k + 1]; + } else ret->s[1][0] = '*'; + // write count + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel == ret->indel1) ++ret->cnt1; + else if (p->indel == ret->indel2) ++ret->cnt2; + else ++ret->cnt_anti; + } + { // write gl[] + int tmp, seq_err = 0; + double x = 1.0; + tmp = max1_i - max2_i; + if (tmp < 0) tmp = -tmp; + for (j = 0; j < tmp + 1; ++j) x *= INDEL_EXT_DEP; + seq_err = mi->q_indel * (1.0 - x) / (1.0 - INDEL_EXT_DEP); + ret->gl[0] = ret->gl[1] = 0; + for (j = 0; j < n; ++j) { + int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; + //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2); + if (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err; + else ret->gl[1] += s2 - s1 < seq_err? 
s2 - s1 : seq_err; + } + } + // write cnt_ref and cnt_ambi + if (max1_i != 0 && max2_i != 0) { + for (j = 0; j < n; ++j) { + int diff1 = score[j] - score[max1_i * n + j]; + int diff2 = score[j] - score[max2_i * n + j]; + if (diff1 > 0 && diff2 > 0) ++ret->cnt_ref; + else if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi; + } + } + } + free(score); free(pscore); free(ref2); free(rs); free(inscns); + } + { // call genotype + int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); + int min1, min2, min1_i; + q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; + q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; + q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel; + min1 = min2 = 0x7fffffff; min1_i = -1; + for (i = 0; i < 3; ++i) { + if (q[i] < min1) { + min2 = min1; min1 = q[i]; min1_i = i; + } else if (q[i] < min2) min2 = q[i]; + } + ret->gt = min1_i; + ret->q_cns = min2 - min1; + // set q_ref + if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3; + else ret->q_ref = (ret->s[0][0] == '*')? 
q[0] - q[2] : q[1] - q[2]; + if (ret->q_ref < 0) ret->q_ref = 0; + } + free(types); + return ret; +} diff --git a/samtools/bam_maqcns.h b/samtools/bam_maqcns.h new file mode 100644 index 0000000..fa5489d --- /dev/null +++ b/samtools/bam_maqcns.h @@ -0,0 +1,56 @@ +#ifndef BAM_MAQCNS_H +#define BAM_MAQCNS_H + +#include "glf.h" + +struct __bmc_aux_t; + +typedef struct { + float het_rate, theta; + int n_hap, cap_mapQ, is_soap; + + float eta, q_r; + double *fk, *coef; + double *lhet; + struct __bmc_aux_t *aux; +} bam_maqcns_t; + +typedef struct { + int q_indel; + float r_indel; + // hidden parameters, unchangeable from command line + int mm_penalty, indel_err, ambi_thres; +} bam_maqindel_opt_t; + +typedef struct { + int indel1, indel2; + int cnt1, cnt2, cnt_anti; + int cnt_ref, cnt_ambi; + char *s[2]; + // + int gt, gl[2]; + int q_cns, q_ref; +} bam_maqindel_ret_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bam_maqcns_t *bam_maqcns_init(); + void bam_maqcns_prepare(bam_maqcns_t *bm); + void bam_maqcns_destroy(bam_maqcns_t *bm); + glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); + uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); + // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2 + uint32_t glf2cns(const glf1_t *g, int q_r); + + bam_maqindel_opt_t *bam_maqindel_opt_init(); + bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types); + void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c new file mode 100644 index 0000000..61f808a --- /dev/null +++ b/samtools/bam_mate.c @@ -0,0 +1,70 @@ +#include +#include +#include "bam.h" + +// currently, this function ONLY works if each read has one hit +void bam_mating_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b[2]; + int curr, has_prev; + 
+ header = bam_header_read(in); + bam_header_write(out, header); + + b[0] = bam_init1(); + b[1] = bam_init1(); + curr = 0; has_prev = 0; + while (bam_read1(in, b[curr]) >= 0) { + bam1_t *cur = b[curr], *pre = b[1-curr]; + if (has_prev) { + if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name + cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; + pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; + if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) + { + uint32_t cur5, pre5; + cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; + cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; + } else cur->core.isize = pre->core.isize = 0; + if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; + else cur->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; + else pre->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } + if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_write1(out, pre); + bam_write1(out, cur); + has_prev = 0; + } else { // unpaired or singleton + pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; + if (pre->core.flag & BAM_FPAIRED) { + pre->core.flag |= BAM_FMUNMAP; + pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; + } + bam_write1(out, pre); + } + } else has_prev = 1; + curr = 1 - curr; + } + if (has_prev) bam_write1(out, b[1-curr]); + bam_header_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); +} + +int bam_mating(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "samtools fixmate \n"); + return 1; + } + in = (strcmp(argv[1], 
"-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + bam_mating_core(in, out); + bam_close(in); bam_close(out); + return 0; +} diff --git a/samtools/bam_md.c b/samtools/bam_md.c new file mode 100644 index 0000000..3ca7309 --- /dev/null +++ b/samtools/bam_md.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include "faidx.h" +#include "sam.h" +#include "kstring.h" + +void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +{ + uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, u = 0; + kstring_t *str; + uint8_t *old_md, *old_nm; + int32_t old_nm_i = -1, nm = 0; + + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match + if (is_equal) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; + ++u; + } else { + ksprintf(str, "%d", u); + kputc(ref[x+j], str); + u = 0; ++nm; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL) { + ksprintf(str, "%d", u); + kputc('^', str); + for (j = 0; j < l; ++j) { + if (ref[x+j] == 0) break; + kputc(ref[x+j], str); + } + u = 0; + if (j < l) break; + x += l; nm += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + if (op == BAM_CINS) nm += l; + } else if (op == BAM_CREF_SKIP) { + x += l; + } + } + ksprintf(str, "%d", u); + // update NM + old_nm = bam_aux_get(b, "NM"); + if (c->flag & BAM_FUNMAP) return; + if (old_nm) old_nm_i = bam_aux2i(old_nm); + if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + else if (nm != old_nm_i) { + fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); + bam_aux_del(b, old_nm); + bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + } + // update MD + old_md = bam_aux_get(b, "MD"); + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } else is_diff = 1; + if (is_diff) { + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); + bam_aux_del(b, old_md); + bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + } + } + free(str->s); free(str); +} + +int bam_fillmd(int argc, char *argv[]) +{ + int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed; + samfile_t *fp, *fpout = 0; + faidx_t *fai; + char *ref = 0, mode_w[8], mode_r[8]; + bam1_t *b; + + is_bam_out = is_sam_in = is_uncompressed = 0; + mode_w[0] = mode_r[0] = 0; + strcpy(mode_r, "r"); strcpy(mode_w, "w"); + while ((c = getopt(argc, argv, "eubS")) >= 0) { + switch (c) { + case 'e': is_equal = 1; break; + case 'b': is_bam_out = 1; 
break; + case 'u': is_uncompressed = is_bam_out = 1; break; + case 'S': is_sam_in = 1; break; + default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; + } + } + if (!is_sam_in) strcat(mode_r, "b"); + if (is_bam_out) strcat(mode_w, "b"); + else strcat(mode_w, "h"); + if (is_uncompressed) strcat(mode_w, "u"); + if (optind + 1 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools fillmd [-eubS] \n\n"); + fprintf(stderr, "Options: -e change identical bases to '='\n"); + fprintf(stderr, " -u uncompressed BAM output (for piping)\n"); + fprintf(stderr, " -b compressed BAM output\n"); + fprintf(stderr, " -S the input is SAM with header\n\n"); + return 1; + } + fp = samopen(argv[optind], mode_r, 0); + if (fp == 0) return 1; + if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) { + fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); + return 1; + } + fpout = samopen("-", mode_w, fp->header); + fai = fai_load(argv[optind+1]); + + b = bam_init1(); + while ((ret = samread(fp, b)) >= 0) { + if (b->core.tid >= 0) { + if (tid != b->core.tid) { + free(ref); + ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len); + tid = b->core.tid; + if (ref == 0) + fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", + fp->header->target_name[tid]); + } + if (ref) bam_fillmd1(b, ref, is_equal); + } + samwrite(fpout, b); + } + bam_destroy1(b); + + free(ref); + fai_destroy(fai); + samclose(fp); samclose(fpout); + return 0; +} diff --git a/samtools/bam_pileup.c b/samtools/bam_pileup.c new file mode 100644 index 0000000..f68f400 --- /dev/null +++ b/samtools/bam_pileup.c @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include "sam.h" + +typedef struct __linkbuf_t { + bam1_t b; + uint32_t beg, end; + struct __linkbuf_t *next; +} lbnode_t; + +/* --- BEGIN: Memory pool */ + +typedef struct { + int cnt, n, max; + lbnode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ 
+ mempool_t *mp; + mp = (mempool_t*)calloc(1, sizeof(mempool_t)); + return mp; +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) { + free(mp->buf[k]->b.data); + free(mp->buf[k]); + } + free(mp->buf); + free(mp); +} +static inline lbnode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, lbnode_t *p) +{ + --mp->cnt; p->next = 0; // clear lbnode_t::next here + if (mp->n == mp->max) { + mp->max = mp->max? mp->max<<1 : 256; + mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* --- END: Memory pool */ + +/* --- BEGIN: Auxiliary functions */ + +static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) +{ + unsigned k; + bam1_t *b = p->b; + bam1_core_t *c = &b->core; + uint32_t x = c->pos, y = 0; + int ret = 1, is_restart = 1; + + if (c->flag&BAM_FUNMAP) return 0; // unmapped read + assert(x <= pos); // otherwise a bug + p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0; + for (k = 0; k < c->n_cigar; ++k) { + int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation + int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length + if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip + if (x + l > pos) { // overlap with pos + p->indel = p->is_del = 0; + p->qpos = y + (pos - x); + if (x == pos && is_restart) p->is_head = 1; + if (x + l - 1 == pos) { // come to the end of a match + if (k < c->n_cigar - 1) { // there are additional operation(s) + uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR + int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation + if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del + else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins + if (op_next == BAM_CDEL || op_next == BAM_CINS) { + if (k + 2 < c->n_cigar) 
op_next = bam1_cigar(b)[k+2]&BAM_CIGAR_MASK; + else p->is_tail = 1; + } + if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP) + p->is_tail = 1; // tail + } else p->is_tail = 1; // this is the last operation; set tail + } + } + x += l; y += l; + } else if (op == BAM_CDEL) { // then set ->is_del + if (x + l > pos) { + p->indel = 0; p->is_del = 1; + p->qpos = y + (pos - x); + } + x += l; + } else if (op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); + if (x > pos) { + if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all + break; + } + } + assert(x > pos); // otherwise a bug + return ret; +} + +/* --- END: Auxiliary functions */ + +struct __bam_plbuf_t { + mempool_t *mp; + lbnode_t *head, *tail, *dummy; + bam_pileup_f func; + void *func_data; + int32_t tid, pos, max_tid, max_pos; + int max_pu, is_eof; + bam_pileup1_t *pu; + int flag_mask; +}; + +void bam_plbuf_reset(bam_plbuf_t *buf) +{ + lbnode_t *p, *q; + buf->max_tid = buf->max_pos = -1; + buf->tid = buf->pos = 0; + buf->is_eof = 0; + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; +} + +void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) +{ + if (mask < 0) buf->flag_mask = BAM_DEF_MASK; + else buf->flag_mask = BAM_FUNMAP | mask; +} + +bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +{ + bam_plbuf_t *buf; + buf = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t)); + buf->func = func; buf->func_data = data; + buf->mp = mp_init(); + buf->head = buf->tail = mp_alloc(buf->mp); + buf->dummy = mp_alloc(buf->mp); + buf->max_tid = buf->max_pos = -1; + buf->flag_mask = BAM_DEF_MASK; + return buf; +} + +void bam_plbuf_destroy(bam_plbuf_t *buf) +{ + mp_free(buf->mp, buf->dummy); + mp_free(buf->mp, buf->head); + if (buf->mp->cnt != 0) + fprintf(stderr, "[bam_plbuf_destroy] memory leak: 
%d. Continue anyway.\n", buf->mp->cnt); + mp_destroy(buf->mp); + free(buf->pu); + free(buf); +} + +int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +{ + if (b) { // fill buffer + if (b->core.tid < 0) return 0; + if (b->core.flag & buf->flag_mask) return 0; + bam_copy1(&buf->tail->b, b); + buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); + if (b->core.tid < buf->max_tid) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); + return -1; + } + if ((b->core.tid == buf->max_tid) && (buf->tail->beg < buf->max_pos)) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n"); + return -1; + } + buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg; + if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { + buf->tail->next = mp_alloc(buf->mp); + buf->tail = buf->tail->next; + } + } else buf->is_eof = 1; + while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) { + int n_pu = 0; + lbnode_t *p, *q; + buf->dummy->next = buf->head; + for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list + q->next = p->next; mp_free(buf->mp, p); p = q; + } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup + if (n_pu == buf->max_pu) { // then double the capacity + buf->max_pu = buf->max_pu? 
buf->max_pu<<1 : 256; + buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu); + } + buf->pu[n_pu].b = &p->b; + if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP + } + } + buf->head = buf->dummy->next; // dummy->next may be changed + if (n_pu) { // then call user defined function + buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data); + } + // update tid and pos + if (buf->head->next) { + if (buf->tid > buf->head->b.core.tid) { + fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n"); + return 1; + } + } + if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence + buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference + } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid + buf->pos = buf->head->beg; // jump to the next position + } else ++buf->pos; // scan contiguously + if (buf->is_eof && buf->head->next == 0) break; + } + return 0; +} + +int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) +{ + bam_plbuf_t *buf; + int ret; + bam1_t *b; + b = bam_init1(); + buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); + while ((ret = bam_read1(fp, b)) >= 0) + bam_plbuf_push(b, buf); + bam_plbuf_push(0, buf); + bam_plbuf_destroy(buf); + bam_destroy1(b); + return 0; +} diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c new file mode 100644 index 0000000..ba787a9 --- /dev/null +++ b/samtools/bam_plcmd.c @@ -0,0 +1,392 @@ +#include +#include +#include +#include +#include "sam.h" +#include "faidx.h" +#include "bam_maqcns.h" +#include "khash.h" +#include "glf.h" +#include "kstring.h" + +typedef int *indel_list_t; +KHASH_MAP_INIT_INT64(64, indel_list_t) + +#define BAM_PLF_SIMPLE 0x01 +#define BAM_PLF_CNS 0x02 +#define BAM_PLF_INDEL_ONLY 0x04 +#define BAM_PLF_GLF 0x08 +#define BAM_PLF_VAR_ONLY 0x10 +#define BAM_PLF_2ND 0x20 + +typedef struct { + 
bam_header_t *h; + bam_maqcns_t *c; + bam_maqindel_opt_t *ido; + faidx_t *fai; + khash_t(64) *hash; + uint32_t format; + int tid, len, last_pos; + int mask; + char *ref; + glfFile fp_glf; // for glf output only +} pu_data_t; + +char **__bam_get_lines(const char *fn, int *_n); +void bam_init_header_hash(bam_header_t *header); +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + +static khash_t(64) *load_pos(const char *fn, bam_header_t *h) +{ + char **list; + int i, j, n, *fields, max_fields; + khash_t(64) *hash; + bam_init_header_hash(h); + list = __bam_get_lines(fn, &n); + hash = kh_init(64); + max_fields = 0; fields = 0; + for (i = 0; i < n; ++i) { + char *str = list[i]; + int chr, n_fields, ret; + khint_t k; + uint64_t x; + n_fields = ksplit_core(str, 0, &max_fields, &fields); + if (n_fields < 2) continue; + chr = bam_get_tid(h, str + fields[0]); + if (chr < 0) { + fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]); + continue; + } + x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1); + k = kh_put(64, hash, x, &ret); + if (ret == 0) { + fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]); + continue; + } + kh_val(hash, k) = 0; + if (n_fields > 2) { + // count + for (j = 2; j < n_fields; ++j) { + char *s = str + fields[j]; + if ((*s != '+' && *s != '-') || !isdigit(s[1])) break; + } + if (j > 2) { // update kh_val() + int *q, y, z; + q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int)); + q[0] = j - 2; z = j; y = 1; + for (j = 2; j < z; ++j) + q[y++] = atoi(str + fields[j]); + } + } + free(str); + } + free(list); free(fields); + return hash; +} + +// an analogy to pileup_func() below +static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) +{ + pu_data_t *d = (pu_data_t*)data; + bam_maqindel_ret_t *r = 0; + int rb, *proposed_indels = 0; + glf1_t *g; + glf3_t *g3; + + if (d->fai == 0) { + fprintf(stderr, "[glt3_func] reference 
sequence is required for generating GLT. Abort!\n"); + exit(1); + } + if (d->hash) { // only output a list of sites + khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); + if (k == kh_end(d->hash)) return 0; + proposed_indels = kh_val(d->hash, k); + } + g3 = glf3_init1(); + if (d->fai && (int)tid != d->tid) { + if (d->ref) { // then write the end mark + g3->rtype = GLF3_RTYPE_END; + glf3_write1(d->fp_glf, g3); + } + glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference + free(d->ref); + d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->tid = tid; + d->last_pos = 0; + } + rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; + g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); + memcpy(g3, g, sizeof(glf1_t)); + g3->rtype = GLF3_RTYPE_SUB; + g3->offset = pos - d->last_pos; + d->last_pos = pos; + glf3_write1(d->fp_glf, g3); + if (pos < d->len) { + if (proposed_indels) + r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + } + if (r) { // then write indel line + int het = 3 * n, min; + min = het; + if (min > r->gl[0]) min = r->gl[0]; + if (min > r->gl[1]) min = r->gl[1]; + g3->ref_base = 0; + g3->rtype = GLF3_RTYPE_INDEL; + memset(g3->lk, 0, 10); + g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255; + g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255; + g3->lk[2] = het - min < 255? het - min : 255; + g3->offset = 0; + g3->indel_len[0] = r->indel1; + g3->indel_len[1] = r->indel2; + g3->min_lk = min < 255? min : 255; + g3->max_len = (abs(r->indel1) > abs(r->indel2)? 
abs(r->indel1) : abs(r->indel2)) + 1; + g3->indel_seq[0] = strdup(r->s[0]+1); + g3->indel_seq[1] = strdup(r->s[1]+1); + glf3_write1(d->fp_glf, g3); + bam_maqindel_ret_destroy(r); + } + free(g); + glf3_destroy1(g3); + return 0; +} + +static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) +{ + pu_data_t *d = (pu_data_t*)data; + bam_maqindel_ret_t *r = 0; + int i, j, rb, rms_mapq = -1, *proposed_indels = 0; + uint64_t rms_aux; + uint32_t cns = 0; + + // if GLF is required, suppress -c completely + if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data); + // if d->hash is initialized, only output the sites in the hash table + if (d->hash) { + khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); + if (k == kh_end(d->hash)) return 0; + proposed_indels = kh_val(d->hash, k); + } + // update d->ref if necessary + if (d->fai && (int)tid != d->tid) { + free(d->ref); + d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->tid = tid; + } + rb = (d->ref && (int)pos < d->len)? 
d->ref[pos] : 'N'; + // when the indel-only mode is asked for, return if no reads mapped with indels + if (d->format & BAM_PLF_INDEL_ONLY) { + for (i = 0; i < n; ++i) + if (pu[i].indel != 0) break; + if (i == n) return 0; + } + // call the consensus and indel + if (d->format & BAM_PLF_CNS) // call consensus + cns = bam_maqcns_call(n, pu, d->c); + if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels + if (proposed_indels) // the first element gives the size of the array + r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + } + // when only variant sites are asked for, test if the site is a variant + if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) { + if (!(bam_nt16_table[rb] != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP + if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel + if (r) bam_maqindel_ret_destroy(r); + return 0; + } + } + } + // print the first 3 columns + printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb); + // print consensus information if required + if (d->format & BAM_PLF_CNS) { + int ref_q, rb4 = bam_nt16_table[rb]; + ref_q = 0; + if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP + ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff); + if (ref_q > 255) ref_q = 255; + } + rms_mapq = cns>>16&0xff; + printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq); + } + // print pileup sequences + printf("%d\t", n); + rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; + rms_aux += tmp * tmp; + if (p->is_head) printf("^%c", p->b->core.qual > 93? 
126 : p->b->core.qual + 33); + if (!p->is_del) { + int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + putchar(c); + if (p->indel > 0) { + printf("+%d", p->indel); + for (j = 1; j <= p->indel; ++j) { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } else if (p->indel < 0) { + printf("%d", p->indel); + for (j = 1; j <= -p->indel; ++j) { + c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N'; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } + } else putchar('*'); + if (p->is_tail) putchar('$'); + } + // finalize rms_mapq + rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); + if (rms_mapq < 0) rms_mapq = rms_aux; + putchar('\t'); + // print quality + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + int c = bam1_qual(p->b)[p->qpos] + 33; + if (c > 126) c = 126; + putchar(c); + } + if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities + const unsigned char *q; + putchar('\t'); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + q = bam_aux_get(p->b, "E2"); + putchar(q? q[p->qpos + 1] : 'N'); + } + putchar('\t'); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + q = bam_aux_get(p->b, "U2"); + putchar(q? q[p->qpos + 1] : '!'); + } + } + // print mapping quality if -s is flagged on the command line + if (d->format & BAM_PLF_SIMPLE) { + putchar('\t'); + for (i = 0; i < n; ++i) { + int c = pu[i].b->core.qual + 33; + if (c > 126) c = 126; + putchar(c); + } + } + putchar('\n'); + // print the indel line if r has been calculated. 
This only happens if: + // a) -c or -i are flagged, AND b) the reference sequence is available + if (r) { + printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1); + if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]); + else printf("%s/%s\t", r->s[0], r->s[1]); + printf("%d\t%d\t", r->q_cns, r->q_ref); + printf("%d\t%d\t", rms_mapq, n); + printf("%s\t%s\t", r->s[0], r->s[1]); + //printf("%d\t%d\t", r->gl[0], r->gl[1]); + printf("%d\t%d\t%d\t", r->cnt1, r->cnt2, r->cnt_anti); + printf("%d\t%d\n", r->cnt_ref, r->cnt_ambi); + bam_maqindel_ret_destroy(r); + } + return 0; +} + +int bam_pileup(int argc, char *argv[]) +{ + int c, is_SAM = 0; + char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; + pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); + d->tid = -1; d->mask = BAM_DEF_MASK; + d->c = bam_maqcns_init(); + d->ido = bam_maqindel_opt_init(); + while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2a")) >= 0) { + switch (c) { + case 'a': d->c->is_soap = 1; break; + case 's': d->format |= BAM_PLF_SIMPLE; break; + case 't': fn_list = strdup(optarg); break; + case 'l': fn_pos = strdup(optarg); break; + case 'f': fn_fa = strdup(optarg); break; + case 'T': d->c->theta = atof(optarg); break; + case 'N': d->c->n_hap = atoi(optarg); break; + case 'r': d->c->het_rate = atof(optarg); break; + case 'M': d->c->cap_mapQ = atoi(optarg); break; + case 'c': d->format |= BAM_PLF_CNS; break; + case 'i': d->format |= BAM_PLF_INDEL_ONLY; break; + case 'v': d->format |= BAM_PLF_VAR_ONLY; break; + case 'm': d->mask = strtol(optarg, 0, 0); break; + case 'g': d->format |= BAM_PLF_GLF; break; + case '2': d->format |= BAM_PLF_2ND; break; + case 'I': d->ido->q_indel = atoi(optarg); break; + case 'G': d->ido->r_indel = atof(optarg); break; + case 'S': is_SAM = 1; break; + default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; + } + } + if (fn_list) is_SAM = 1; + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools pileup [options] |\n\n"); + 
fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); + fprintf(stderr, " -S the input is in SAM\n"); + fprintf(stderr, " -a use the SOAPsnp model for SNP calling\n"); + fprintf(stderr, " -2 output the 2nd best call and quality\n"); + fprintf(stderr, " -i only show lines/consensus with indels\n"); + fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ); + fprintf(stderr, " -t FILE list of reference sequences (force -S)\n"); + fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); + fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); + fprintf(stderr, " -c output the maq consensus sequence\n"); + fprintf(stderr, " -v print variants only (for -c)\n"); + fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n"); + fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta); + fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap); + fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate); + fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel); + fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. 
(for -c/-g) [%d]\n", d->ido->q_indel); + fprintf(stderr, "\n"); + free(fn_list); free(fn_fa); free(d); + return 1; + } + if (fn_fa) d->fai = fai_load(fn_fa); + if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling + if (d->format & BAM_PLF_GLF) { // for glf output + glf3_header_t *h; + h = glf3_header_init(); + d->fp_glf = bgzf_fdopen(fileno(stdout), "w"); + glf3_header_write(d->fp_glf, h); + glf3_header_destroy(h); + } + if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY))) + fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n"); + if (fn_fa && is_SAM && fn_list == 0) fn_list = samfaipath(fn_fa); + + { + samfile_t *fp; + fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0); + if (fp == 0 || fp->header == 0) { + fprintf(stderr, "[bam_pileup] fail to read the header: non-exisiting file or wrong format.\n"); + return 1; + } + d->h = fp->header; + if (fn_pos) d->hash = load_pos(fn_pos, d->h); + sampileup(fp, d->mask, pileup_func, d); + samclose(fp); // d->h will be destroyed here + } + + // free + if (d->format & BAM_PLF_GLF) bgzf_close(d->fp_glf); + if (fn_pos) { // free the hash table + khint_t k; + for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k) + if (kh_exist(d->hash, k)) free(kh_val(d->hash, k)); + kh_destroy(64, d->hash); + } + free(fn_pos); free(fn_list); free(fn_fa); + if (d->fai) fai_destroy(d->fai); + bam_maqcns_destroy(d->c); + free(d->ido); free(d->ref); free(d); + return 0; +} diff --git a/samtools/bam_rmdup.c b/samtools/bam_rmdup.c new file mode 100644 index 0000000..f0d2b5d --- /dev/null +++ b/samtools/bam_rmdup.c @@ -0,0 +1,206 @@ +#include +#include +#include +#include +#include +#include "sam.h" + +typedef bam1_t *bam1_p; + +#include "khash.h" +KHASH_SET_INIT_STR(name) +KHASH_MAP_INIT_INT64(pos, bam1_p) + +#define BUFFER_SIZE 0x40000 + +typedef struct { + uint64_t n_checked, n_removed; + khash_t(pos) *best_hash; +} lib_aux_t; 
+KHASH_MAP_INIT_STR(lib, lib_aux_t) + +typedef struct { + int n, max; + bam1_t **a; +} tmp_stack_t; + +static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) +{ + if (stack->n == stack->max) { + stack->max = stack->max? stack->max<<1 : 0x10000; + stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); + } + stack->a[stack->n++] = b; +} + +static inline void dump_best(tmp_stack_t *stack, samfile_t *out) +{ + int i; + for (i = 0; i != stack->n; ++i) { + samwrite(out, stack->a[i]); + bam_destroy1(stack->a[i]); + } + stack->n = 0; +} + +static void clear_del_set(khash_t(name) *del_set) +{ + khint_t k; + for (k = kh_begin(del_set); k < kh_end(del_set); ++k) + if (kh_exist(del_set, k)) + free((char*)kh_key(del_set, k)); + kh_clear(name, del_set); +} + +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) +{ + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->n_checked = q->n_removed = 0; + q->best_hash = kh_init(pos); + return q; + } else return &kh_val(aux, k); +} + +static void clear_best(khash_t(lib) *aux, int max) +{ + khint_t k; + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + if (kh_size(q->best_hash) >= max) + kh_clear(pos, q->best_hash); + } + } +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; + return q; +} + +void bam_rmdup_core(samfile_t *in, samfile_t *out) +{ + bam1_t *b; + int last_tid = -1, last_pos = -1; + tmp_stack_t stack; + khint_t k; + khash_t(lib) *aux; + khash_t(name) *del_set; + + aux = kh_init(lib); + del_set = kh_init(name); + b = bam_init1(); + memset(&stack, 0, sizeof(tmp_stack_t)); + + kh_resize(name, del_set, 4 * BUFFER_SIZE); + while (samread(in, b) >= 0) { + bam1_core_t *c = &b->core; + if (c->tid != last_tid || last_pos 
!= c->pos) { + dump_best(&stack, out); // write the result + clear_best(aux, BUFFER_SIZE); + if (c->tid != last_tid) { + clear_best(aux, 0); + if (kh_size(del_set)) { // check + fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); + clear_del_set(del_set); + } + if ((int)c->tid == -1) { // append unmapped reads + samwrite(out, b); + while (samread(in, b) >= 0) samwrite(out, b); + break; + } + last_tid = c->tid; + fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); + } + } + if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { + samwrite(out, b); + } else if (c->isize > 0) { // paired, head + uint64_t key = (uint64_t)c->pos<<32 | c->isize; + const char *lib; + lib_aux_t *q; + int ret; + lib = bam_get_library(in->header, b); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + k = kh_put(pos, q->best_hash, key, &ret); + if (ret == 0) { // found in best_hash + bam1_t *p = kh_val(q->best_hash, k); + ++q->n_removed; + if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle + kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed + bam_copy1(p, b); // replaced as b + } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed + if (ret == 0) + fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam1_qname(b)); + } else { // not found in best_hash + kh_val(q->best_hash, k) = bam_dup1(b); + stack_insert(&stack, kh_val(q->best_hash, k)); + } + } else { // paired, tail + k = kh_get(name, del_set, bam1_qname(b)); + if (k != kh_end(del_set)) { + free((char*)kh_key(del_set, k)); + kh_del(name, del_set, k); + } else samwrite(out, b); + } + last_pos = c->pos; + } + + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + dump_best(&stack, out); + fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(pos, q->best_hash); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + + clear_del_set(del_set); + kh_destroy(name, del_set); + free(stack.a); + bam_destroy1(b); +} + +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); + +int bam_rmdup(int argc, char *argv[]) +{ + int c, is_se = 0, force_se = 0; + samfile_t *in, *out; + while ((c = getopt(argc, argv, "sS")) >= 0) { + switch (c) { + case 's': is_se = 1; break; + case 'S': force_se = is_se = 1; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools rmdup [-sS] \n\n"); + fprintf(stderr, "Option: -s rmdup for SE reads\n"); + fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); + return 1; + } + in = samopen(argv[optind], "rb", 0); + out = samopen(argv[optind+1], "wb", in->header); + if (in == 0 || out == 0) { + fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); + return 1; + } + if (is_se) bam_rmdupse_core(in, out, force_se); + else bam_rmdup_core(in, out); + samclose(in); samclose(out); + return 0; +} diff --git a/samtools/bam_rmdupse.c b/samtools/bam_rmdupse.c new file mode 100644 index 0000000..e7dbdc7 --- /dev/null +++ b/samtools/bam_rmdupse.c @@ -0,0 +1,159 @@ +#include +#include "sam.h" +#include 
"khash.h" +#include "klist.h" + +#define QUEUE_CLEAR_SIZE 0x100000 +#define MAX_POS 0x7fffffff + +typedef struct { + int endpos; + uint32_t score:31, discarded:1; + bam1_t *b; +} elem_t, *elem_p; +#define __free_elem(p) bam_destroy1((p)->data.b) +KLIST_INIT(q, elem_t, __free_elem) +typedef klist_t(q) queue_t; + +KHASH_MAP_INIT_INT(best, elem_p) +typedef khash_t(best) besthash_t; + +typedef struct { + uint64_t n_checked, n_removed; + besthash_t *left, *rght; +} lib_aux_t; +KHASH_MAP_INIT_STR(lib, lib_aux_t) + +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) +{ + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->left = kh_init(best); + q->rght = kh_init(best); + q->n_checked = q->n_removed = 0; + return q; + } else return &kh_val(aux, k); +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; + return q; +} + +static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) +{ + elem_t *p = kl_pushp(q, queue); + p->discarded = 0; + p->endpos = endpos; p->score = score; + if (p->b == 0) p->b = bam_init1(); + bam_copy1(p->b, b); + return p; +} + +static void clear_besthash(besthash_t *h, int32_t pos) +{ + khint_t k; + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) + kh_del(best, h, k); +} + +static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) +{ + if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { + khint_t k; + while (1) { + elem_t *q; + if (queue->head == queue->tail) break; + q = &kl_val(queue->head); + if (q->discarded) { + q->b->data_len = 0; + kl_shift(q, queue, 0); + continue; + } + if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; + samwrite(out, q->b); + q->b->data_len = 0; + kl_shift(q, queue, 0); + } 
+ for (k = kh_begin(h); k != kh_end(h); ++k) { + if (kh_exist(h, k)) { + clear_besthash(kh_val(h, k).left, pos); + clear_besthash(kh_val(h, k).rght, pos); + } + } + } +} + +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) +{ + bam1_t *b; + queue_t *queue; + khint_t k; + int last_tid = -2; + khash_t(lib) *aux; + + aux = kh_init(lib); + b = bam_init1(); + queue = kl_init(q); + while (samread(in, b) >= 0) { + bam1_core_t *c = &b->core; + int endpos = bam_calend(c, bam1_cigar(b)); + int score = sum_qual(b); + + if (last_tid != c->tid) { + if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); + last_tid = c->tid; + } else dump_alignment(out, queue, c->pos, aux); + if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { + push_queue(queue, b, endpos, score); + } else { + const char *lib; + lib_aux_t *q; + besthash_t *h; + uint32_t key; + int ret; + lib = bam_get_library(in->header, b); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + h = (c->flag&BAM_FREVERSE)? q->rght : q->left; + key = (c->flag&BAM_FREVERSE)? 
endpos : c->pos; + k = kh_put(best, h, key, &ret); + if (ret == 0) { // in the hash table + elem_t *p = kh_val(h, k); + ++q->n_removed; + if (p->score < score) { + if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue + p->discarded = 1; + kh_val(h, k) = push_queue(queue, b, endpos, score); + } else { // replace + p->score = score; p->endpos = endpos; + bam_copy1(p->b, b); + } + } // otherwise, discard the alignment + } else kh_val(h, k) = push_queue(queue, b, endpos, score); + } + } + dump_alignment(out, queue, MAX_POS, aux); + + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(best, q->left); kh_destroy(best, q->rght); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + bam_destroy1(b); + kl_destroy(q, queue); +} diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c new file mode 100644 index 0000000..9884f3d --- /dev/null +++ b/samtools/bam_sort.c @@ -0,0 +1,357 @@ +#include +#include +#include +#include +#include +#include +#include "bam.h" +#include "ksort.h" + +static int g_is_by_qname = 0; + +static inline int strnum_cmp(const char *a, const char *b) /* compares two strings, treating runs of decimal digits as numbers (via strtol) rather than character by character; returns -1, 0 or 1 */ +{ + char *pa, *pb; + pa = (char*)a; pb = (char*)b; + while (*pa && *pb) { + if (isdigit(*pa) && isdigit(*pb)) { /* both sides start a number: compare by value; strtol advances pa/pb past the digits */ + long ai, bi; + ai = strtol(pa, &pa, 10); + bi = strtol(pb, &pb, 10); + if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0; + } else { + if (*pa != *pb) break; + ++pa; ++pb; + } + } + if (*pa == *pb) /* same stopping character: order by how many characters each side consumed */ + return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; + return *pa<*pb? -1 : *pa>*pb? 
1 : 0; +} + +#define HEAP_EMPTY 0xffffffffffffffffull + +typedef struct { + int i; + uint64_t pos, idx; + bam1_t *b; +} heap1_t; + +#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx)))) + +static inline int heap_lt(const heap1_t a, const heap1_t b) +{ + if (g_is_by_qname) { + int t; + if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0; + t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); + return (t > 0 || (t == 0 && __pos_cmp(a, b))); + } else return __pos_cmp(a, b); +} + +KSORT_INIT(heap, heap1_t, heap_lt) + +static void swap_header_text(bam_header_t *h1, bam_header_t *h2) +{ + int tempi; + char *temps; + tempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi; + temps = h1->text, h1->text = h2->text, h2->text = temps; +} + +/*! + @abstract Merge multiple sorted BAM. + @param is_by_qname whether to sort by query name + @param out output BAM file name + @param headers name of SAM file from which to copy '@' header lines, + or NULL to copy them from the first file to be merged + @param n number of files to be merged + @param fn names of files to be merged + + @discussion Padding information may NOT correctly maintained. This + function is NOT thread safe. + */ +void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int add_RG) +{ + bamFile fpout, *fp; + heap1_t *heap; + bam_header_t *hout = 0; + bam_header_t *hheaders = NULL; + int i, j, *RG_len = 0; + uint64_t idx = 0; + char **RG = 0; + + if (headers) { + tamFile fpheaders = sam_open(headers); + if (fpheaders == 0) { + fprintf(stderr, "[bam_merge_core] Cannot open file `%s'. 
Continue anyway.\n", headers); + } else { + hheaders = sam_header_read(fpheaders); + sam_close(fpheaders); + } + } + + g_is_by_qname = by_qname; + fp = (bamFile*)calloc(n, sizeof(bamFile)); + heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + // prepare RG tag + if (add_RG) { + RG = (char**)calloc(n, sizeof(void*)); + RG_len = (int*)calloc(n, sizeof(int)); + for (i = 0; i != n; ++i) { + int l = strlen(fn[i]); + const char *s = fn[i]; + if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; + for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; + ++j; l -= j; + RG[i] = calloc(l + 1, 1); + RG_len[i] = l; + strncpy(RG[i], s + j, l); + } + } + // read the first + for (i = 0; i != n; ++i) { + heap1_t *h; + bam_header_t *hin; + fp[i] = bam_open(fn[i], "r"); + if (fp[i] == 0) { + int j; + fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); + for (j = 0; j < i; ++j) { bam_close(fp[j]); free(heap[j].b->data); free(heap[j].b); } /* every earlier file also owns a record buffer */ + free(fp); free(heap); + if (add_RG) { for (j = 0; j != n; ++j) free(RG[j]); free(RG); free(RG_len); } if (hout) bam_header_destroy(hout); if (hheaders) bam_header_destroy(hheaders); /* release the RG names and any header still owned here before bailing out */ + return; + } + hin = bam_header_read(fp[i]); + if (i == 0) { // the first SAM + hout = hin; + if (hheaders) { + // If the text headers to be swapped in include any @SQ headers, + // check that they are consistent with the existing binary list + // of reference information. + if (hheaders->n_targets > 0) { + if (hout->n_targets != hheaders->n_targets) + fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences\n", headers); + for (j = 0; j < hout->n_targets && j < hheaders->n_targets; ++j) /* the two lists may differ in length: compare only the names both have */ + if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) + fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); + } + swap_header_text(hout, hheaders); + bam_header_destroy(hheaders); + hheaders = NULL; + } + } else { // validate multiple baf + if (hout->n_targets != hin->n_targets) { + fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. 
Abort!\n", fn[i]); + exit(1); + } + for (j = 0; j < hout->n_targets; ++j) { + if (strcmp(hout->target_name[j], hin->target_name[j])) { + fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n", + hout->target_name[j], hin->target_name[j], fn[i]); + exit(1); + } + } + bam_header_destroy(hin); + } + h = heap + i; + h->i = i; + h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); + if (bam_read1(fp[i], h->b) >= 0) { + h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); + h->idx = idx++; + } + else h->pos = HEAP_EMPTY; + } + fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); + assert(fpout); + bam_header_write(fpout, hout); + bam_header_destroy(hout); + + ks_heapmake(heap, n, heap); + while (heap->pos != HEAP_EMPTY) { + bam1_t *b = heap->b; + if (add_RG && bam_aux_get(b, "RG") == 0) + bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); + bam_write1_core(fpout, &b->core, b->data_len, b->data); + if ((j = bam_read1(fp[heap->i], b)) >= 0) { + heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); + heap->idx = idx++; + } else if (j == -1) { + heap->pos = HEAP_EMPTY; + free(heap->b->data); free(heap->b); + heap->b = 0; + } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. 
Continue anyway.\n", fn[heap->i]); + ks_heapadjust(heap, 0, n, heap); + } + + if (add_RG) { + for (i = 0; i != n; ++i) free(RG[i]); + free(RG); free(RG_len); + } + for (i = 0; i != n; ++i) bam_close(fp[i]); + bam_close(fpout); + free(fp); free(heap); +} +int bam_merge(int argc, char *argv[]) +{ + int c, is_by_qname = 0, add_RG = 0; + char *fn_headers = NULL; + + while ((c = getopt(argc, argv, "h:nr")) >= 0) { + switch (c) { + case 'r': add_RG = 1; break; + case 'h': fn_headers = strdup(optarg); break; + case 'n': is_by_qname = 1; break; + } + } + if (optind + 2 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] [...]\n\n"); + fprintf(stderr, "Options: -n sort by read names\n"); + fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); + fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); + fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); + fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n"); + fprintf(stderr, " the header dictionary in merging.\n\n"); + return 1; + } + bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, add_RG); + free(fn_headers); + return 0; +} + +typedef bam1_t *bam1_p; + +static inline int bam1_lt(const bam1_p a, const bam1_p b) +{ + if (g_is_by_qname) { + int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); + return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)))); + } else return (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)); +} +KSORT_INIT(sort, bam1_p, bam1_lt) + +static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) +{ + char *name; + int i; + bamFile fp; + ks_mergesort(sort, k, buf, 0); + name = (char*)calloc(strlen(prefix) + 20, 1); + if (n >= 0) sprintf(name, "%s.%.4d.bam", 
prefix, n); + else sprintf(name, "%s.bam", prefix); + fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w"); + if (fp == 0) { + fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); + free(name); + // FIXME: possible memory leak + return; + } + free(name); + bam_header_write(fp, h); + for (i = 0; i < k; ++i) + bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data); + bam_close(fp); +} + +/*! + @abstract Sort an unsorted BAM file based on the chromosome order + and the leftmost position of an alignment + + @param is_by_qname whether to sort by query name + @param fn name of the file to be sorted + @param prefix prefix of the output and the temporary files; upon + sucessess, prefix.bam will be written. + @param max_mem approxiate maximum memory (very inaccurate) + + @discussion It may create multiple temporary subalignment files + and then merge them by calling bam_merge_core(). This function is + NOT thread safe. + */ +void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) +{ + int n, ret, k, i; + size_t mem; + bam_header_t *header; + bamFile fp; + bam1_t *b, **buf; + + g_is_by_qname = is_by_qname; + n = k = 0; mem = 0; + fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); + return; + } + header = bam_header_read(fp); + buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); + // write sub files + for (;;) { + if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); + b = buf[k]; + if ((ret = bam_read1(fp, b)) < 0) break; + mem += ret; + ++k; + if (mem >= max_mem) { + sort_blocks(n++, k, buf, prefix, header, is_stdout); + mem = 0; k = 0; + } + } + if (ret != -1) + fprintf(stderr, "[bam_sort_core] truncated file. 
Continue anyway.\n"); + if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); + else { // then merge + char **fns, *fnout; + fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); + sort_blocks(n++, k, buf, prefix, header, is_stdout); + fnout = (char*)calloc(strlen(prefix) + 20, 1); + if (is_stdout) sprintf(fnout, "-"); + else sprintf(fnout, "%s.bam", prefix); + fns = (char**)calloc(n, sizeof(char*)); + for (i = 0; i < n; ++i) { + fns[i] = (char*)calloc(strlen(prefix) + 20, 1); + sprintf(fns[i], "%s.%.4d.bam", prefix, i); + } + bam_merge_core(is_by_qname, fnout, 0, n, fns, 0); + free(fnout); + for (i = 0; i < n; ++i) { + unlink(fns[i]); + free(fns[i]); + } + free(fns); + } + for (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) { + if (buf[k]) { + free(buf[k]->data); + free(buf[k]); + } + } + free(buf); + bam_header_destroy(header); + bam_close(fp); +} + +void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) +{ + bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0); +} + +int bam_sort(int argc, char *argv[]) +{ + size_t max_mem = 500000000; + int c, is_by_qname = 0, is_stdout = 0; + while ((c = getopt(argc, argv, "nom:")) >= 0) { + switch (c) { + case 'o': is_stdout = 1; break; + case 'n': is_by_qname = 1; break; + case 'm': max_mem = atol(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: samtools sort [-on] [-m ] \n"); + return 1; + } + bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout); + return 0; +} diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c new file mode 100644 index 0000000..ea9deee --- /dev/null +++ b/samtools/bam_stat.c @@ -0,0 +1,78 @@ +#include +#include +#include "bam.h" + +typedef struct { + long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; + long long n_sgltn, n_read1, n_read2; + long long n_qcfail, n_dup; + long long n_diffchr, n_diffhigh; +} bam_flagstat_t; + +#define flagstat_loop(s, c) do { \ + ++(s)->n_reads; \ + 
if ((c)->flag & BAM_FPAIRED) { \ + ++(s)->n_pair_all; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ + if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \ + if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ + ++(s)->n_pair_map; \ + if ((c)->mtid != (c)->tid) { \ + ++(s)->n_diffchr; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh; \ + } \ + } \ + } \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ + if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ + } while (0) + +bam_flagstat_t *bam_flagstat_core(bamFile fp) +{ + bam_flagstat_t *s; + bam1_t *b; + bam1_core_t *c; + int ret; + s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); + b = bam_init1(); + c = &b->core; + while ((ret = bam_read1(fp, b)) >= 0) + flagstat_loop(s, c); + bam_destroy1(b); + if (ret != -1) + fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + return s; +} +int bam_flagstat(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *header; + bam_flagstat_t *s; + if (argc == optind) { + fprintf(stderr, "Usage: samtools flagstat \n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + s = bam_flagstat_core(fp); + printf("%lld in total\n", s->n_reads); + printf("%lld QC failure\n", s->n_qcfail); + printf("%lld duplicates\n", s->n_dup); + printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); + printf("%lld paired in sequencing\n", s->n_pair_all); + printf("%lld read1\n", s->n_read1); + printf("%lld read2\n", s->n_read2); + printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); + printf("%lld with itself and mate mapped\n", s->n_pair_map); + printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); + printf("%lld with mate mapped to a different chr\n", s->n_diffchr); + printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); + free(s); + bam_header_destroy(header); + bam_close(fp); + return 0; +} diff --git a/samtools/bam_tview.c b/samtools/bam_tview.c new file mode 100644 index 0000000..4c121e7 --- /dev/null +++ b/samtools/bam_tview.c @@ -0,0 +1,415 @@ +#undef _HAVE_CURSES + +#if _CURSES_LIB == 0 +#elif _CURSES_LIB == 1 +#include +#ifndef NCURSES_VERSION +#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled" +#else +#define _HAVE_CURSES +#endif +#elif _CURSES_LIB == 2 +#include +#define _HAVE_CURSES +#else +#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled" +#endif + +#ifdef _HAVE_CURSES +#include +#include +#include +#include "bam.h" +#include "faidx.h" +#include "bam_maqcns.h" + +char bam_aux_getCEi(bam1_t *b, int i); +char bam_aux_getCSi(bam1_t *b, int i); +char bam_aux_getCQi(bam1_t *b, int i); + +#define TV_MIN_ALNROW 2 +#define TV_MAX_GOTO 40 +#define TV_LOW_MAPQ 10 + +#define TV_COLOR_MAPQ 0 +#define TV_COLOR_BASEQ 1 +#define TV_COLOR_NUCL 2 +#define TV_COLOR_COL 3 +#define TV_COLOR_COLQ 4 + +#define TV_BASE_NUCL 0 +#define TV_BASE_COLOR_SPACE 1 + +typedef 
struct { + int mrow, mcol; + WINDOW *wgoto, *whelp; + + bam_index_t *idx; + bam_lplbuf_t *lplbuf; + bam_header_t *header; + bamFile fp; + int curr_tid, left_pos; + faidx_t *fai; + bam_maqcns_t *bmc; + + int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; + char *ref; +} tview_t; + +int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + tview_t *tv = (tview_t*)data; + int i, j, c, rb, attr, max_ins = 0; + uint32_t call = 0; + if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen + // print referece + rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; + for (i = tv->last_pos + 1; i < pos; ++i) { + if (i%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", i+1); + c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; + mvaddch(1, tv->ccol++, c); + } + if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); + // print consensus + call = bam_maqcns_call(n, pl, tv->bmc); + attr = A_UNDERLINE; + c = ",ACMGRSVTWYHKDBN"[call>>28&0xf]; + i = (call>>8&0xff)/10+1; + if (i > 4) i = 4; + attr |= COLOR_PAIR(i); + if (c == toupper(rb)) c = '.'; + attron(attr); + mvaddch(2, tv->ccol, c); + attroff(attr); + if(tv->ins) { + // calculate maximum insert + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel; + } + } + // core loop + for (j = 0; j <= max_ins; ++j) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + int row = TV_MIN_ALNROW + p->level - tv->row_shift; + if (j == 0) { + if (!p->is_del) { + if (tv->base_for == TV_BASE_COLOR_SPACE && + (c = bam_aux_getCSi(p->b, p->qpos))) { + c = bam_aux_getCSi(p->b, p->qpos); + // assume that if we found one color, we will be able to get the color error + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? 
',' : '.'; + } else { + if (tv->show_name) { + char *name = bam1_qname(p->b); + c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos]; + } else { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + } + } + } else c = '*'; + } else { // padding + if (j > p->indel) c = '*'; + else { // insertion + if (tv->base_for == TV_BASE_NUCL) { + if (tv->show_name) { + char *name = bam1_qname(p->b); + c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j]; + } else { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + } + } else { + c = bam_aux_getCSi(p->b, p->qpos + j); + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.'; + } + } + } + if (row > TV_MIN_ALNROW && row < tv->mrow) { + int x; + attr = 0; + if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) + || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE; + if (tv->color_for == TV_COLOR_BASEQ) { + x = bam1_qual(p->b)[p->qpos]/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } else if (tv->color_for == TV_COLOR_MAPQ) { + x = p->b->core.qual/10 + 1; + if (x > 4) x = 4; + attr |= COLOR_PAIR(x); + } else if (tv->color_for == TV_COLOR_NUCL) { + x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5; + attr |= COLOR_PAIR(x); + } else if(tv->color_for == TV_COLOR_COL) { + x = 0; + switch(bam_aux_getCSi(p->b, p->qpos)) { + case '0': x = 0; break; + case '1': x = 1; break; + case '2': x = 2; break; + case '3': x = 3; break; + case '4': x = 4; break; + default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break; + } + x+=5; + attr |= COLOR_PAIR(x); + } else if(tv->color_for == TV_COLOR_COLQ) { + x = bam_aux_getCQi(p->b, p->qpos); + if(0 == x) x = bam1_qual(p->b)[p->qpos]; + x = x/10 + 1; + if (x > 4) x = 4; + attr |= 
COLOR_PAIR(x); + } + attron(attr); + mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); + attroff(attr); + } + } + c = j? '*' : rb; + if (c == '*') { + attr = COLOR_PAIR(8); + attron(attr); + mvaddch(1, tv->ccol++, c); + attroff(attr); + } else mvaddch(1, tv->ccol++, c); + } + tv->last_pos = pos; + return 0; +} + +tview_t *tv_init(const char *fn, const char *fn_fa) +{ + tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t)); + tv->is_dot = 1; + tv->idx = bam_index_load(fn); + if (tv->idx == 0) exit(1); + tv->fp = bam_open(fn, "r"); + bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); + assert(tv->fp); + tv->header = bam_header_read(tv->fp); + tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); + if (fn_fa) tv->fai = fai_load(fn_fa); + tv->bmc = bam_maqcns_init(); + tv->ins = 1; + bam_maqcns_prepare(tv->bmc); + + initscr(); + keypad(stdscr, TRUE); + clear(); + noecho(); + cbreak(); + tv->mrow = 24; tv->mcol = 80; + getmaxyx(stdscr, tv->mrow, tv->mcol); + tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); + tv->whelp = newwin(29, 40, 5, 5); + tv->color_for = TV_COLOR_MAPQ; + start_color(); + init_pair(1, COLOR_BLUE, COLOR_BLACK); + init_pair(2, COLOR_GREEN, COLOR_BLACK); + init_pair(3, COLOR_YELLOW, COLOR_BLACK); + init_pair(4, COLOR_WHITE, COLOR_BLACK); + init_pair(5, COLOR_GREEN, COLOR_BLACK); + init_pair(6, COLOR_CYAN, COLOR_BLACK); + init_pair(7, COLOR_YELLOW, COLOR_BLACK); + init_pair(8, COLOR_RED, COLOR_BLACK); + init_pair(9, COLOR_BLUE, COLOR_BLACK); + return tv; +} + +void tv_destroy(tview_t *tv) +{ + delwin(tv->wgoto); delwin(tv->whelp); + endwin(); + + bam_lplbuf_destroy(tv->lplbuf); + bam_maqcns_destroy(tv->bmc); + bam_index_destroy(tv->idx); + if (tv->fai) fai_destroy(tv->fai); + free(tv->ref); + bam_header_destroy(tv->header); + bam_close(tv->fp); + free(tv); +} + +int tv_fetch_func(const bam1_t *b, void *data) +{ + tview_t *tv = (tview_t*)data; + if (tv->no_skip) { + uint32_t *cigar = bam1_cigar(b); // this is cheating... 
+ int i; + for (i = 0; i core.n_cigar; ++i) { + if ((cigar[i]&0xf) == BAM_CREF_SKIP) + cigar[i] = cigar[i]>>4<<4 | BAM_CDEL; + } + } + bam_lplbuf_push(b, tv->lplbuf); + return 0; +} + +int tv_draw_aln(tview_t *tv, int tid, int pos) +{ + // reset + clear(); + tv->curr_tid = tid; tv->left_pos = pos; + tv->last_pos = tv->left_pos - 1; + tv->ccol = 0; + // print ref and consensus + if (tv->fai) { + char *str; + if (tv->ref) free(tv->ref); + str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); + sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); + tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); + free(str); + } + // draw aln + bam_lplbuf_reset(tv->lplbuf); + bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func); + bam_lplbuf_push(0, tv->lplbuf); + + while (tv->ccol < tv->mcol) { + int pos = tv->last_pos + 1; + if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); + mvaddch(1, tv->ccol++, (tv->ref && pos < tv->l_ref)? 
tv->ref[pos - tv->left_pos] : 'N'); + ++tv->last_pos; + } + return 0; +} + +static void tv_win_goto(tview_t *tv, int *tid, int *pos) +{ + char str[256]; + int i, l = 0; + wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(tv->wgoto, 1, 2, "Goto: "); + for (;;) { + int c = wgetch(tv->wgoto); + wrefresh(tv->wgoto); + if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { + --l; + } else if (c == KEY_ENTER || c == '\012' || c == '\015') { + int _tid = -1, _beg, _end; + bam_parse_region(tv->header, str, &_tid, &_beg, &_end); + if (_tid >= 0) { + *tid = _tid; *pos = _beg; + return; + } + } else if (isgraph(c)) { + if (l < TV_MAX_GOTO) str[l++] = c; + } else if (c == '\027') l = 0; + else if (c == '\033') return; + str[l] = '\0'; + for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); + mvwprintw(tv->wgoto, 1, 8, "%s", str); + } +} + +static void tv_win_help(tview_t *tv) { + int r = 1; + WINDOW *win = tv->whelp; + wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(win, r++, 2, " -=- Help -=- "); + r++; + mvwprintw(win, r++, 2, "? This window"); + mvwprintw(win, r++, 2, "Arrows Small scroll movement"); + mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); + mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); + mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); + mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); + mvwprintw(win, r++, 2, "space Scroll one screen"); + mvwprintw(win, r++, 2, "backspace Scroll back one screen"); + mvwprintw(win, r++, 2, "g Go to specific location"); + mvwprintw(win, r++, 2, "m Color for mapping qual"); + mvwprintw(win, r++, 2, "n Color for nucleotide"); + mvwprintw(win, r++, 2, "b Color for base quality"); + mvwprintw(win, r++, 2, "c Color for cs color"); + mvwprintw(win, r++, 2, "z Color for cs qual"); + mvwprintw(win, r++, 2, ". 
Toggle on/off dot view"); + mvwprintw(win, r++, 2, "s Toggle on/off ref skip"); + mvwprintw(win, r++, 2, "r Toggle on/off rd name"); + mvwprintw(win, r++, 2, "N Turn on nt view"); + mvwprintw(win, r++, 2, "C Turn on cs view"); + mvwprintw(win, r++, 2, "i Toggle on/off ins"); + mvwprintw(win, r++, 2, "q Exit"); + r++; + mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); + mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); + mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); + wrefresh(win); + wgetch(win); +} + +void tv_loop(tview_t *tv) +{ + int tid, pos; + tid = tv->curr_tid; pos = tv->left_pos; + while (1) { + int c = getch(); + switch (c) { + case '?': tv_win_help(tv); break; + case '\033': + case 'q': goto end_loop; + case 'g': tv_win_goto(tv, &tid, &pos); break; + case 'm': tv->color_for = TV_COLOR_MAPQ; break; + case 'b': tv->color_for = TV_COLOR_BASEQ; break; + case 'n': tv->color_for = TV_COLOR_NUCL; break; + case 'c': tv->color_for = TV_COLOR_COL; break; + case 'z': tv->color_for = TV_COLOR_COLQ; break; + case 's': tv->no_skip = !tv->no_skip; break; + case 'r': tv->show_name = !tv->show_name; break; + case KEY_LEFT: + case 'h': --pos; break; + case KEY_RIGHT: + case 'l': ++pos; break; + case KEY_SLEFT: + case 'H': pos -= 20; break; + case KEY_SRIGHT: + case 'L': pos += 20; break; + case '.': tv->is_dot = !tv->is_dot; break; + case 'N': tv->base_for = TV_BASE_NUCL; break; + case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; + case 'i': tv->ins = !tv->ins; break; + case '\010': pos -= 1000; break; + case '\014': pos += 1000; break; + case ' ': pos += tv->mcol; break; + case KEY_UP: + case 'j': --tv->row_shift; break; + case KEY_DOWN: + case 'k': ++tv->row_shift; break; + case KEY_BACKSPACE: + case '\177': pos -= tv->mcol; break; + case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; + default: continue; + } + if (pos < 0) pos = 0; + if (tv->row_shift < 0) tv->row_shift = 0; + tv_draw_aln(tv, tid, pos); + } +end_loop: + return; +} + +int 
bam_tview_main(int argc, char *argv[]) +{ + tview_t *tv; + if (argc == 1) { + fprintf(stderr, "Usage: bamtk tview [ref.fasta]\n"); + return 1; + } + tv = tv_init(argv[1], (argc == 2)? 0 : argv[2]); + tv_draw_aln(tv, 0, 0); + tv_loop(tv); + tv_destroy(tv); + return 0; +} +#else // #ifdef _HAVE_CURSES +#include +#warning "No curses library is available; tview is disabled." +int bam_tview_main(int argc, char *argv[]) +{ + fprintf(stderr, "[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n"); + return 1; +} +#endif // #ifdef _HAVE_CURSES diff --git a/samtools/bgzf.c b/samtools/bgzf.c new file mode 100644 index 0000000..59f902f --- /dev/null +++ b/samtools/bgzf.c @@ -0,0 +1,683 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. 
/* Store `value` at `buffer` as two bytes, least significant byte first.
 * Declared static: a plain `inline` definition provides no external
 * definition in C99, which breaks linking when the call is not inlined. */
static inline
void
packInt16(uint8_t* buffer, uint16_t value)
{
    buffer[0] = (uint8_t)(value & 0xff);
    buffer[1] = (uint8_t)(value >> 8);
}

/* Read two bytes at `buffer`, least significant byte first, and return the
 * value as a non-negative int. */
static inline
int
unpackInt16(const uint8_t* buffer)
{
    return (buffer[0] | (buffer[1] << 8));
}

/* Store `value` at `buffer` as four bytes, least significant byte first. */
static inline
void
packInt32(uint8_t* buffer, uint32_t value)
{
    buffer[0] = (uint8_t)(value & 0xff);
    buffer[1] = (uint8_t)((value >> 8) & 0xff);
    buffer[2] = (uint8_t)((value >> 16) & 0xff);
    buffer[3] = (uint8_t)(value >> 24);
}
x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); + return fp; +} + +static +BGZF* +open_read(int fd) +{ +#ifdef _USE_KNETFILE + knetFile *file = knet_dopen(fd, "r"); +#else + FILE* file = fdopen(fd, "r"); +#endif + BGZF* fp; + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = fd; + fp->open_mode = 'r'; +#ifdef _USE_KNETFILE + fp->x.fpr = file; +#else + fp->file = file; +#endif + return fp; +} + +static +BGZF* +open_write(int fd, bool is_uncompressed) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp; + if (file == 0) return 0; + fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; +#ifdef _USE_KNETFILE + fp->x.fpw = file; +#else + fp->file = file; +#endif + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. 
*/ +#ifdef _USE_KNETFILE + knetFile *file = knet_open(path, mode); + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = -1; + fp->open_mode = 'r'; + fp->x.fpr = file; +#else + int fd, oflag = O_RDONLY; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag); + if (fd == -1) return 0; + fp = open_read(fd); +#endif + } else if (mode[0] == 'w' || mode[0] == 'W') { + int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag, 0666); + if (fd == -1) return 0; + fp = open_write(fd, strstr(mode, "u")? 1 : 0); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (fd == -1) return 0; + if (mode[0] == 'r' || mode[0] == 'R') { + return open_read(fd); + } else if (mode[0] == 'w' || mode[0] == 'W') { + return open_write(fd, strstr(mode, "u")? 1 : 0); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + bgzf_byte_t* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + int compress_level = fp->is_uncompressed? 
/* Inflate the block held in fp->compressed_block (gzip header included)
 * into fp->uncompressed_block. `block_length` is the total size of the
 * compressed block in bytes. Returns the number of uncompressed bytes, or
 * -1 after recording a message with report_error(). */
static
int
inflate_block(BGZF* fp, int block_length)
{
    z_stream zs;
    int status;

    // A block must at least hold its header and footer; anything shorter
    // would make the input size below negative.
    if (block_length < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) {
        report_error(fp, "invalid block length");
        return -1;
    }

    // zlib requires zalloc, zfree and opaque to be set before inflateInit2().
    zs.zalloc = NULL;
    zs.zfree = NULL;
    zs.opaque = NULL;
    zs.next_in = fp->compressed_block + 18;
    // Offer only the bytes that belong to this block after its header, so
    // inflate() can never be handed bytes beyond the block.
    zs.avail_in = block_length - BLOCK_HEADER_LENGTH;
    zs.next_out = fp->uncompressed_block;
    zs.avail_out = fp->uncompressed_block_size;

    status = inflateInit2(&zs, GZIP_WINDOW_BITS);
    if (status != Z_OK) {
        report_error(fp, "inflate init failed");
        return -1;
    }
    status = inflate(&zs, Z_FINISH);
    if (status != Z_STREAM_END) {
        inflateEnd(&zs);
        report_error(fp, "inflate failed");
        return -1;
    }
    status = inflateEnd(&zs);
    if (status != Z_OK) {
        report_error(fp, "inflate failed");
        return -1;
    }
    return zs.total_out;
}
cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + +static +int +read_block(BGZF* fp) +{ + bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + int size = 0; +#ifdef _USE_KNETFILE + int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; + int count = knet_read(fp->x.fpr, header, sizeof(header)); +#else + int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; + int count = fread(header, 1, sizeof(header), fp->file); +#endif + if (count == 0) { + fp->block_length = 0; + return 0; + } + size = count; + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; +#ifdef _USE_KNETFILE + count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); +#else + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); +#endif + if (count != remaining) { + 
report_error(fp, "read failed"); + return -1; + } + size += count; + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + bgzf_byte_t* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = bgzf_min(length-bytes_read, available); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } +#ifdef _USE_KNETFILE + int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + 
return -1; + } + + if (fp->uncompressed_block == NULL) { + fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const bgzf_byte_t* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } + { // add an empty block + int count, block_length = deflate_block(fp, 0); +#ifdef _USE_KNETFILE + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + } +#ifdef _USE_KNETFILE + if (fflush(fp->x.fpw) != 0) { +#else + if (fflush(fp->file) != 0) { +#endif + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { +#ifdef _USE_KNETFILE + int ret; + if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); + else ret = knet_close(fp->x.fpr); + if (ret != 0) return -1; +#else + if (fclose(fp->file) != 0) { + return -1; + } +#endif + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +int64_t +bgzf_tell(BGZF* fp) +{ + return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + uint8_t buf[28]; + off_t offset; +#ifdef _USE_KNETFILE + offset = knet_tell(fp->x.fpr); + if (knet_seek(fp->x.fpr, 
-28, SEEK_END) != 0) return -1; + knet_read(fp->x.fpr, buf, 28); + knet_seek(fp->x.fpr, offset, SEEK_SET); +#else + offset = ftello(fp->file); + if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; + fread(buf, 1, 28, fp->file); + fseeko(fp->file, offset, SEEK_SET); +#endif + return (memcmp(magic, buf, 28) == 0)? 1 : 0; +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { +#else + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { +#endif + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} diff --git a/samtools/bgzf.h b/samtools/bgzf.h new file mode 100644 index 0000000..91b3317 --- /dev/null +++ b/samtools/bgzf.h @@ -0,0 +1,134 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include +#include +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. 
+ */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpretation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +int64_t bgzf_tell(BGZF* fp); + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. 
+ */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +int bgzf_check_EOF(BGZF *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/faidx.c b/samtools/faidx.c new file mode 100644 index 0000000..811bdf8 --- /dev/null +++ b/samtools/faidx.c @@ -0,0 +1,422 @@ +#include +#include +#include +#include +#include "faidx.h" +#include "khash.h" + +typedef struct { + uint64_t len:32, line_len:16, line_blen:16; + uint64_t offset; +} faidx1_t; +KHASH_MAP_INIT_STR(s, faidx1_t) + +#ifndef _NO_RAZF +#include "razf.h" +#else +#ifdef _WIN32 +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif +#define RAZF FILE +#define razf_read(fp, buf, size) fread(buf, 1, size, fp) +#define razf_open(fn, mode) fopen(fn, mode) +#define razf_close(fp) fclose(fp) +#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) +#define razf_tell(fp) ftello(fp) +#endif +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +struct __faidx_t { + RAZF *rz; + int n, m; + char **name; + khash_t(s) *hash; +}; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) +{ + khint_t k; + int ret; + faidx1_t t; + if (idx->n == idx->m) { + idx->m = idx->m? 
idx->m<<1 : 16; + idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); + } + idx->name[idx->n] = strdup(name); + k = kh_put(s, idx->hash, idx->name[idx->n], &ret); + t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; + kh_value(idx->hash, k) = t; + ++idx->n; +} + +faidx_t *fai_build_core(RAZF *rz) +{ + char c, *name; + int l_name, m_name, ret; + int len, line_len, line_blen, state; + int l1, l2; + faidx_t *idx; + uint64_t offset; + + idx = (faidx_t*)calloc(1, sizeof(faidx_t)); + idx->hash = kh_init(s); + name = 0; l_name = m_name = 0; + len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; + while (razf_read(rz, &c, 1)) { + if (c == '\n') { // an empty line + if (state == 1) { + offset = razf_tell(rz); + continue; + } else if ((state == 0 && len < 0) || state == 2) continue; + } + if (c == '>') { // fasta header + if (len >= 0) + fai_insert_index(idx, name, len, line_len, line_blen, offset); + l_name = 0; + while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { + if (m_name < l_name + 2) { + m_name = l_name + 2; + kroundup32(m_name); + name = (char*)realloc(name, m_name); + } + name[l_name++] = c; + } + name[l_name] = '\0'; + if (ret == 0) { + fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); + free(name); fai_destroy(idx); + return 0; + } + if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); + state = 1; len = 0; + offset = razf_tell(rz); + } else { + if (state == 3) { + fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 2) state = 3; + l1 = l2 = 0; + do { + ++l1; + if (isgraph(c)) ++l2; + } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); + if (state == 3 && l2) { + fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + ++l1; len += l2; + if (l2 >= 0x10000) { + fprintf(stderr, 
"[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 1) line_len = l1, line_blen = l2, state = 0; + else if (state == 0) { + if (l1 != line_len || l2 != line_blen) state = 2; + } + } + } + fai_insert_index(idx, name, len, line_len, line_blen, offset); + free(name); + return idx; +} + +void fai_save(const faidx_t *fai, FILE *fp) +{ + khint_t k; + int i; + for (i = 0; i < fai->n; ++i) { + faidx1_t x; + k = kh_get(s, fai->hash, fai->name[i]); + x = kh_value(fai->hash, k); +#ifdef _WIN32 + fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); +#else + fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); +#endif + } +} + +faidx_t *fai_read(FILE *fp) +{ + faidx_t *fai; + char *buf, *p; + int len, line_len, line_blen; +#ifdef _WIN32 + long offset; +#else + long long offset; +#endif + fai = (faidx_t*)calloc(1, sizeof(faidx_t)); + fai->hash = kh_init(s); + buf = (char*)calloc(0x10000, 1); + while (!feof(fp) && fgets(buf, 0x10000, fp)) { + for (p = buf; *p && isgraph(*p); ++p); + *p = 0; ++p; +#ifdef _WIN32 + sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); +#else + sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); +#endif + fai_insert_index(fai, buf, len, line_len, line_blen, offset); + } + free(buf); + return fai; +} + +void fai_destroy(faidx_t *fai) +{ + int i; + for (i = 0; i < fai->n; ++i) free(fai->name[i]); + free(fai->name); + kh_destroy(s, fai->hash); + if (fai->rz) razf_close(fai->rz); + free(fai); +} + +int fai_build(const char *fn) +{ + char *str; + RAZF *rz; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + rz = razf_open(fn, "r"); + if (rz == 0) { + fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",str); + free(str); + return -1; + } + fai = fai_build_core(rz); + 
razf_close(rz); + fp = fopen(str, "wb"); + if (fp == 0) { + fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); + fai_destroy(fai); free(str); + return -1; + } + fai_save(fai, fp); + fclose(fp); + free(str); + fai_destroy(fai); + return 0; +} + +#ifdef _USE_KNETFILE +FILE *download_and_open(const char *fn) +{ + const int buf_size = 1 * 1024 * 1024; + uint8_t *buf; + FILE *fp; + knetFile *fp_remote; + const char *url = fn; + const char *p; + int l = strlen(fn); + for (p = fn + l - 1; p >= fn; --p) + if (*p == '/') break; + fn = p + 1; + + // First try to open a local copy + fp = fopen(fn, "r"); + if (fp) + return fp; + + // If failed, download from remote and open + fp_remote = knet_open(url, "rb"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); + return NULL; + } + if ((fp = fopen(fn, "wb")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); + knet_close(fp_remote); + return NULL; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); + + return fopen(fn, "r"); +} +#endif + +faidx_t *fai_load(const char *fn) +{ + char *str; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + +#ifdef _USE_KNETFILE + if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) + { + fp = download_and_open(str); + if ( !fp ) + { + fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); + free(str); + return 0; + } + } + else +#endif + fp = fopen(str, "rb"); + if (fp == 0) { + fprintf(stderr, "[fai_load] build FASTA index.\n"); + fai_build(fn); + fp = fopen(str, "rb"); + if (fp == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); + free(str); + return 0; + } + } + + fai = fai_read(fp); + fclose(fp); + + fai->rz = razf_open(fn, "rb"); + free(str); + if 
(fai->rz == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); + return 0; + } + return fai; +} + +char *fai_fetch(const faidx_t *fai, const char *str, int *len) +{ + char *s, *p, c; + int i, l, k; + khiter_t iter; + faidx1_t val; + khash_t(s) *h; + int beg, end; + + beg = end = -1; + h = fai->hash; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the ref_id */ + if (iter == kh_end(h)) { + *len = 0; + free(s); return 0; + } + val = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + beg = 0; end = val.len; + } else { + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + beg = atoi(p); + if (i < k) { + p = s + i + 1; + end = atoi(p); + } else end = val.len; + } + if (beg > 0) --beg; + if (beg >= val.len) beg = val.len; + if (end >= val.len) end = val.len; + if (beg > end) beg = end; + free(s); + + // now retrieve the sequence + l = 0; + s = (char*)malloc(end - beg + 2); + razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); + while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) + if (isgraph(c)) s[l++] = c; + s[l] = '\0'; + *len = l; + return s; +} + +int faidx_main(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "Usage: faidx [ [...]]\n"); + return 1; + } else { + if (argc == 2) fai_build(argv[1]); + else { + int i, j, k, l; + char *s; + faidx_t *fai; + fai = fai_load(argv[1]); + if (fai == 0) return 1; + for (i = 2; i != argc; ++i) { + printf(">%s\n", argv[i]); + s = fai_fetch(fai, argv[i], &l); + for (j = 0; j < l; j += 60) { + for (k = 0; k < 60 && k < l - j; ++k) + putchar(s[j + k]); + putchar('\n'); + } + free(s); + } + fai_destroy(fai); + } + } + return 0; +} + +int faidx_fetch_nseq(const faidx_t *fai) +{ + return fai->n; 
+} + +char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) +{ + int l; + char c; + khiter_t iter; + faidx1_t val; + char *seq=NULL; + + // Adjust position + iter = kh_get(s, fai->hash, c_name); + if(iter == kh_end(fai->hash)) return 0; + val = kh_value(fai->hash, iter); + if(p_end_i < p_beg_i) p_beg_i = p_end_i; + if(p_beg_i < 0) p_beg_i = 0; + else if(val.len <= p_beg_i) p_beg_i = val.len - 1; + if(p_end_i < 0) p_end_i = 0; + else if(val.len <= p_end_i) p_end_i = val.len - 1; + + // Now retrieve the sequence + l = 0; + seq = (char*)malloc(p_end_i - p_beg_i + 2); + razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); + while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) + if (isgraph(c)) seq[l++] = c; + seq[l] = '\0'; + *len = l; + return seq; +} + +#ifdef FAIDX_MAIN +int main(int argc, char *argv[]) { return faidx_main(argc, argv); } +#endif diff --git a/samtools/faidx.h b/samtools/faidx.h new file mode 100644 index 0000000..1fb1b1f --- /dev/null +++ b/samtools/faidx.h @@ -0,0 +1,103 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef FAIDX_H +#define FAIDX_H + +/*! + @header + + Index FASTA files and extract subsequence. + + @copyright The Wellcome Trust Sanger Institute. + */ + +struct __faidx_t; +typedef struct __faidx_t faidx_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Build index for a FASTA or razip compressed FASTA file. + @param fn FASTA file name + @return 0 on success; or -1 on failure + @discussion File "fn.fai" will be generated. + */ + int fai_build(const char *fn); + + /*! + @abstract Destroy a faidx_t struct. + @param fai Pointer to the struct to be destroyed + */ + void fai_destroy(faidx_t *fai); + + /*! + @abstract Load index from "fn.fai". + @param fn File name of the FASTA file + */ + faidx_t *fai_load(const char *fn); + + /*! + @abstract Fetch the sequence in a region. + @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @param len Length of the region + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *fai_fetch(const faidx_t *fai, const char *reg, int *len); + + /*! + @abstract Fetch the number of sequences. + @param fai Pointer to the faidx_t struct + @return The number of sequences + */ + int faidx_fetch_nseq(const faidx_t *fai); + + /*! + @abstract Fetch the sequence in a region. 
+ @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/glf.c b/samtools/glf.c new file mode 100644 index 0000000..8d5346a --- /dev/null +++ b/samtools/glf.c @@ -0,0 +1,236 @@ +#include +#include +#include "glf.h" + +#ifdef _NO_BGZF +// then alias bgzf_*() functions +#endif + +static int glf3_is_BE = 0; + +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} + +glf3_header_t *glf3_header_init() +{ + glf3_is_BE = bam_is_big_endian(); + return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); +} + +glf3_header_t *glf3_header_read(glfFile fp) +{ + glf3_header_t *h; + char magic[4]; + h = glf3_header_init(); + bgzf_read(fp, magic, 4); + if (strncmp(magic, "GLF\3", 4)) { + fprintf(stderr, "[glf3_header_read] invalid magic.\n"); + glf3_header_destroy(h); + return 0; + } + bgzf_read(fp, &h->l_text, 4); + if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); + if (h->l_text) { + h->text = (uint8_t*)calloc(h->l_text + 1, 1); + bgzf_read(fp, h->text, h->l_text); + } + return h; +} + +void glf3_header_write(glfFile fp, const glf3_header_t *h) +{ + int32_t x; + bgzf_write(fp, "GLF\3", 4); + x = glf3_is_BE? 
bam_swap_endian_4(h->l_text) : h->l_text; + bgzf_write(fp, &x, 4); + if (h->l_text) bgzf_write(fp, h->text, h->l_text); +} + +void glf3_header_destroy(glf3_header_t *h) +{ + free(h->text); + free(h); +} + +char *glf3_ref_read(glfFile fp, int *len) +{ + int32_t n, x; + char *str; + *len = 0; + if (bgzf_read(fp, &n, 4) != 4) return 0; + if (glf3_is_BE) n = bam_swap_endian_4(n); + if (n < 0) { + fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); + return 0; + } + str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact + x = bgzf_read(fp, str, n); + x += bgzf_read(fp, len, 4); + if (x != n + 4) { + free(str); *len = -1; return 0; // truncated + } + if (glf3_is_BE) *len = bam_swap_endian_4(*len); + return str; +} + +void glf3_ref_write(glfFile fp, const char *str, int len) +{ + int32_t m, n = strlen(str) + 1; + m = glf3_is_BE? bam_swap_endian_4(n) : n; + bgzf_write(fp, &m, 4); + bgzf_write(fp, str, n); + if (glf3_is_BE) len = bam_swap_endian_4(len); + bgzf_write(fp, &len, 4); +} + +void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) +{ + int j; + if (g3->rtype == GLF3_RTYPE_END) return; + printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, + g3->rtype == GLF3_RTYPE_INDEL? '*' : "XACMGRSVTWYHKDBN"[g3->ref_base], + g3->depth, g3->rms_mapQ, g3->min_lk); + if (g3->rtype == GLF3_RTYPE_SUB) + for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); + else { + printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], + g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? 
g3->indel_seq[1] : "*"); + } + printf("\n"); +} + +int glf3_write1(glfFile fp, const glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + c = g3->rtype<<4 | g3->ref_base; + r = bgzf_write(fp, &c, 1); + if (g3->rtype == GLF3_RTYPE_END) return r; + y[0] = g3->offset; + y[1] = g3->min_lk<<24 | g3->depth; + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + r += bgzf_write(fp, y, 8); + r += bgzf_write(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); + else { + int16_t x[2]; + r += bgzf_write(fp, g3->lk, 3); + x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; + x[1] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; + r += bgzf_write(fp, x, 4); + if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); + if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); + } + return r; +} + +#ifndef kv_roundup32 +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +int glf3_read1(glfFile fp, glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + r = bgzf_read(fp, &c, 1); + if (r == 0) return 0; + g3->ref_base = c & 0xf; + g3->rtype = c>>4; + if (g3->rtype == GLF3_RTYPE_END) return r; + r += bgzf_read(fp, y, 8); + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + g3->offset = y[0]; + g3->min_lk = y[1]>>24; + g3->depth = y[1]<<8>>8; + r += bgzf_read(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); + else { + int16_t x[2], max; + r += bgzf_read(fp, g3->lk, 3); + r += bgzf_read(fp, x, 4); + if (glf3_is_BE) { + x[0] = bam_swap_endian_2(x[0]); + x[1] = bam_swap_endian_2(x[1]); + } + g3->indel_len[0] = x[0]; + g3->indel_len[1] = x[1]; + x[0] = abs(x[0]); x[1] = abs(x[1]); + max = (x[0] > x[1]? 
x[0] : x[1]) + 1; + if (g3->max_len < max) { + g3->max_len = max; + kv_roundup32(g3->max_len); + g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); + g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); + } + r += bgzf_read(fp, g3->indel_seq[0], x[0]); + r += bgzf_read(fp, g3->indel_seq[1], x[1]); + g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; + } + return r; +} + +void glf3_view(glfFile fp) +{ + glf3_header_t *h; + char *name; + glf3_t *g3; + int len; + h = glf3_header_read(fp); + g3 = glf3_init1(); + while ((name = glf3_ref_read(fp, &len)) != 0) { + int pos = 0; + while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { + pos += g3->offset; + glf3_view1(name, g3, pos); + } + free(name); + } + glf3_header_destroy(h); + glf3_destroy1(g3); +} + +int glf3_view_main(int argc, char *argv[]) +{ + glfFile fp; + if (argc == 1) { + fprintf(stderr, "Usage: glfview \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "Fail to open file '%s'\n", argv[1]); + return 1; + } + glf3_view(fp); + bgzf_close(fp); + return 0; +} + +#ifdef GLFVIEW_MAIN +int main(int argc, char *argv[]) +{ + return glf3_view_main(argc, argv); +} +#endif diff --git a/samtools/glf.h b/samtools/glf.h new file mode 100644 index 0000000..12e5400 --- /dev/null +++ b/samtools/glf.h @@ -0,0 +1,56 @@ +#ifndef GLF_H_ +#define GLF_H_ + +typedef struct { + unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + unsigned char max_mapQ; /** maximum mapping quality */ + unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ + unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ +} glf1_t; + +#include +#include "bgzf.h" +typedef BGZF *glfFile; + +#define GLF3_RTYPE_END 0 +#define GLF3_RTYPE_SUB 1 +#define GLF3_RTYPE_INDEL 2 + +typedef struct { + uint8_t ref_base:4, rtype:4; /** 
"XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + uint8_t rms_mapQ; /** RMS mapping quality */ + uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ + uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ + int32_t offset; /** the first base in a chromosome has offset zero. */ + // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) + int16_t indel_len[2]; + int32_t max_len; // maximum indel len; will be modified by glf3_read1() + char *indel_seq[2]; +} glf3_t; + +typedef struct { + int32_t l_text; + uint8_t *text; +} glf3_header_t; + +#ifdef __cplusplus +extern "C" { +#endif + +#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) +#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) + + glf3_header_t *glf3_header_init(); + glf3_header_t *glf3_header_read(glfFile fp); + void glf3_header_write(glfFile fp, const glf3_header_t *h); + void glf3_header_destroy(glf3_header_t *h); + char *glf3_ref_read(glfFile fp, int *len); + void glf3_ref_write(glfFile fp, const char *name, int len); + int glf3_write1(glfFile fp, const glf3_t *g3); + int glf3_read1(glfFile fp, glf3_t *g3); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/kaln.c b/samtools/kaln.c new file mode 100644 index 0000000..9fa40d0 --- /dev/null +++ b/samtools/kaln.c @@ -0,0 +1,370 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + 
included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include "kaln.h"
+
+/* Origin of a DP state.  The same codes are stored in the low 4 bits of
+ * each CIGAR element.  In the backtrace FROM_M steps back in both
+ * sequences, FROM_I only in seq2 (j) and FROM_D only in seq1 (i). */
+#define FROM_M 0
+#define FROM_I 1
+#define FROM_D 2
+
+/* One backtrace step: DP coordinates and the state the step belongs to. */
+typedef struct {
+	int i, j;
+	unsigned char ctype;
+} path_t;
+
+/* 22 x 22 scoring matrix, row-major, in the residue order given below. */
+int aln_sm_blosum62[] = {
+/*	 A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
+	 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
+	-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
+	-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
+	-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
+	 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
+	-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
+	-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+	 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
+	-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
+	-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
+	-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
+	-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+	-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
+	-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
+	-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
+	 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
+	 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
+	-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
+	-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
+	 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
+	-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
+	 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
+};
+
+/* 5 x 5 scoring matrix: +1 on the diagonal of the first four symbols,
+ * -3 between two different ones, -2 for anything involving the fifth. */
+int aln_sm_blast[] = {
+	1, -3, -3, -3, -2,
+	-3, 1, -3, -3, -2,
+	-3, -3, 1, -3, -2,
+	-3, -3, -3, 1, -2,
+	-2, -2, -2, -2, -2
+};
+
+/* Field order: gap_open, gap_ext, gap_end, matrix, row, band_width. */
+ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };
+
+/* Run-length encode a backtrace path into a CIGAR array.
+ *
+ * path holds path_len steps in end-to-start order, so it is walked
+ * backwards to emit operations in sequence order.  Each element is
+ * (run_length << 4) | ctype.  *n_cigar receives the element count.
+ * Returns a calloc()ed array the caller must free(), or 0 (with
+ * *n_cigar == 0) when the path is empty or the allocation fails. */
+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
+{
+	int i, n;
+	uint32_t *cigar;
+	unsigned char last_type;
+
+	if (path_len == 0 || path == 0) {
+		*n_cigar = 0;
+		return 0;
+	}
+
+	/* first pass: count the runs of identical operations */
+	last_type = path->ctype;
+	for (i = n = 1; i < path_len; ++i) {
+		if (last_type != path[i].ctype) ++n;
+		last_type = path[i].ctype;
+	}
+	*n_cigar = n;
+	cigar = (uint32_t*)calloc(*n_cigar, 4);
+	if (cigar == 0) { /* out of memory: report "no CIGAR" instead of writing through NULL */
+		*n_cigar = 0;
+		return 0;
+	}
+
+	/* second pass: fill the runs, walking the path from its end */
+	cigar[0] = 1u << 4 | path[path_len-1].ctype;
+	last_type = path[path_len-1].ctype;
+	for (i = path_len - 2, n = 0; i >= 0; --i) {
+		if (path[i].ctype == last_type) cigar[n] += 1u << 4;
+		else {
+			cigar[++n] = 1u << 4 | path[i].ctype;
+			last_type = path[i].ctype;
+		}
+	}
+
+	return cigar;
+}
+
+/***************************/
+/* START OF common_align.c */
+/***************************/
+
+/* The macros below are meant to be expanded inside ka_global_core():
+ * they read its locals gap_open, gap_ext and gap_end directly. */
+
+/* Reset all three state scores of a dpscore_t to "minus infinity". */
+#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
+
+/* MM = best of the three states of p, plus sc; record the winner in cur->Mt. */
+#define set_M(MM, cur, p, sc) \
+{ \
+	if ((p)->M >= (p)->I) { \
+		if ((p)->M >= (p)->D) { \
+			(MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
+		} else { \
+			(MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
+		} \
+	} else { \
+		if ((p)->I > (p)->D) { \
+			(MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
+		} else { \
+			(MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
+		} \
+	} \
+}
+/* II = score of the I state: open a gap from p->M or extend p->I. */
+#define set_I(II, cur, p) \
+{ \
+	if ((p)->M - gap_open > (p)->I) { \
+		(cur)->It = FROM_M; \
+		(II) = (p)->M - gap_open - gap_ext; \
+	} else { \
+		(cur)->It = FROM_I; \
+		(II) = (p)->I - gap_ext; \
+	} \
+}
+/* Like set_I() but charges gap_end per step when gap_end >= 0. */
+#define set_end_I(II, cur, p) \
+{ \
+	if (gap_end >= 0) { \
+		if ((p)->M - gap_open > (p)->I) { \
+			(cur)->It = FROM_M; \
+			(II) = (p)->M - gap_open - gap_end; \
+		} else { \
+			(cur)->It = FROM_I; \
+			(II) = (p)->I - gap_end; \
+		} \
+	} else set_I(II, cur, p); \
+}
+/* DD = score of the D state: open a gap from p->M or extend p->D. */
+#define set_D(DD, cur, p) \
+{ \
+	if ((p)->M - gap_open > (p)->D) { \
+		(cur)->Dt = FROM_M; \
+		(DD) = (p)->M - gap_open - gap_ext; \
+	} else { \
+		(cur)->Dt = FROM_D; \
+		(DD) = (p)->D - gap_ext; \
+	} \
+}
+/* Like set_D() but charges gap_end per step when gap_end >= 0. */
+#define set_end_D(DD, cur, p) \
+{ \
+	if (gap_end >= 0) { \
+		if ((p)->M - gap_open > (p)->D) { \
+			(cur)->Dt = FROM_M; \
+			(DD) = (p)->M - gap_open - gap_end; \
+		} else { \
+			(cur)->Dt = FROM_D; \
+			(DD) = (p)->D - gap_end; \
+		} \
+	} else set_D(DD, cur, p); \
+}
+
+/* Backtrace pointers of one DP cell: where each state's score came from. */
+typedef struct {
+	uint8_t Mt:3, It:2, Dt:2;
+} dpcell_t;
+
+/* Scores of the three states of one DP cell. */
+typedef struct {
+	int M, I, D;
+} dpscore_t;
+
+/***************************
+ * banded global alignment *
+ ***************************/
+/* Align seq1 (length len1) against seq2 (length len2) globally within a band.
+ *
+ * seq1/seq2 hold symbol codes used as indices into ap->matrix: the score
+ * of pairing seq1[i] with seq2[j] is ap->matrix[seq2[j] * ap->row + seq1[i]].
+ * Gap costs and the band width come from ap.
+ *
+ * _score   receives the M-state score of the final cell; it is left
+ *          untouched when either length is zero.
+ * n_cigar  receives the number of CIGAR elements.  It may be NULL, in
+ *          which case no backtrace is done and 0 is returned.
+ *
+ * Returns a CIGAR array built by ka_path2cigar32() that the caller must
+ * free(), or 0 when a length is zero or no backtrace was requested.
+ * Results of the malloc() calls in this function are not checked. */
+uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
+{
+	int i, j;
+	dpcell_t **dpcell, *q;
+	dpscore_t *curr, *last, *s;
+	int b1, b2, tmp_end;
+	int *mat, end, max = 0;
+	uint8_t type, ctype;
+	uint32_t *cigar = 0;
+
+	int gap_open, gap_ext, gap_end, b;
+	int *score_matrix, N_MATRIX_ROW;
+
+	/* initialize some align-related parameters. just for compatibility */
+	gap_open = ap->gap_open;
+	gap_ext = ap->gap_ext;
+	gap_end = ap->gap_end;
+	b = ap->band_width;
+	score_matrix = ap->matrix;
+	N_MATRIX_ROW = ap->row;
+
+	/* n_cigar is optional (see the backtrace guard below), so do not
+	 * dereference it unconditionally */
+	if (n_cigar) *n_cigar = 0;
+	if (len1 == 0 || len2 == 0) return 0;
+
+	/* calculate b1 and b2 */
+	if (len1 > len2) {
+		b1 = len1 - len2 + b;
+		b2 = b;
+	} else {
+		b1 = b;
+		b2 = len2 - len1 + b;
+	}
+	if (b1 > len1) b1 = len1;
+	if (b2 > len2) b2 = len2;
+	--seq1; --seq2; /* switch to 1-based indexing of both sequences */
+
+	/* allocate memory */
+	end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
+	dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
+	for (j = 0; j <= len2; ++j)
+		dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
+	/* shift the row pointers so that they can be indexed by the absolute
+	 * column; undone before free() below */
+	for (j = b2 + 1; j <= len2; ++j)
+		dpcell[j] -= j - b2;
+	curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+	last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+
+	/* set first row */
+	SET_INF(*curr); curr->M = 0;
+	for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
+		SET_INF(*s);
+		set_end_D(s->D, dpcell[0] + i, s - 1);
+	}
+	s = curr; curr = last; last = s;
+
+	/* core dynamic programming, part 1 */
+	tmp_end = (b2 < len2)? b2 : len2 - 1;
+	for (j = 1; j <= tmp_end; ++j) {
+		q = dpcell[j]; s = curr; SET_INF(*s);
+		set_end_I(s->I, q, last);
+		end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		++s; ++q;
+		for (i = 1; i != end; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+			set_I(s->I, q, last + i);
+			set_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+		set_D(s->D, q, s - 1);
+		if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+			set_end_I(s->I, q, last + i);
+		} else s->I = MINOR_INF;
+		s = curr; curr = last; last = s;
+	}
+	/* last row for part 1, use set_end_D() instead of set_D() */
+	if (j == len2 && b2 != len2 - 1) {
+		q = dpcell[j]; s = curr; SET_INF(*s);
+		set_end_I(s->I, q, last);
+		end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		++s; ++q;
+		for (i = 1; i != end; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+			set_I(s->I, q, last + i);
+			set_end_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+		set_end_D(s->D, q, s - 1);
+		if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+			set_end_I(s->I, q, last + i);
+		} else s->I = MINOR_INF;
+		s = curr; curr = last; last = s;
+		++j;
+	}
+
+	/* core dynamic programming, part 2 */
+	for (; j <= len2 - b2 + 1; ++j) {
+		SET_INF(curr[j - b2]);
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		end = j + b1 - 1;
+		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+			set_I(s->I, q, last + i);
+			set_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+		set_D(s->D, q, s - 1);
+		s->I = MINOR_INF;
+		s = curr; curr = last; last = s;
+	}
+
+	/* core dynamic programming, part 3 */
+	for (; j < len2; ++j) {
+		SET_INF(curr[j - b2]);
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+			set_I(s->I, q, last + i);
+			set_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+		set_end_I(s->I, q, last + i);
+		set_D(s->D, q, s - 1);
+		s = curr; curr = last; last = s;
+	}
+	/* last row */
+	if (j == len2) {
+		SET_INF(curr[j - b2]);
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+			set_I(s->I, q, last + i);
+			set_end_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+		set_end_I(s->I, q, last + i);
+		set_end_D(s->D, q, s - 1);
+		s = curr; curr = last; last = s;
+	}
+
+	*_score = last[len1].M;
+	if (n_cigar) { /* backtrace */
+		path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
+		i = len1; j = len2;
+		q = dpcell[j] + i;
+		s = last + len1;
+		/* start from the best-scoring state of the final cell */
+		max = s->M; type = q->Mt; ctype = FROM_M;
+		if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
+		if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
+
+		p = path;
+		p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
+		++p;
+		do {
+			switch (ctype) {
+			case FROM_M: --i; --j; break;
+			case FROM_I: --j; break;
+			case FROM_D: --i; break;
+			}
+			q = dpcell[j] + i;
+			ctype = type;
+			switch (type) {
+			case FROM_M: type = q->Mt; break;
+			case FROM_I: type = q->It; break;
+			case FROM_D: type = q->Dt; break;
+			}
+			p->ctype = ctype; p->i = i; p->j = j;
+			++p;
+		} while (i || j);
+		/* the last recorded step is the (0,0) origin cell; leave it out */
+		cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
+		free(path);
+	}
+
+	/* free memory */
+	for (j = b2 + 1; j <= len2; ++j)
+		dpcell[j] += j - b2;
+	for (j = 0; j <= len2; ++j)
+		free(dpcell[j]);
+	free(dpcell);
+	free(curr); free(last);
+
+	return cigar;
+}
diff --git a/samtools/kaln.h b/samtools/kaln.h
new file mode 100644
index 0000000..b04d8cc
--- /dev/null
+++ b/samtools/kaln.h
@@ -0,0 +1,55 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008, 2009 by Heng Li
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef LH3_KALN_H_
+#define LH3_KALN_H_
+
+#include <stdint.h>
+
+/* "Minus infinity" for DP scores; parenthesised so that the macro is
+ * safe inside any expression. */
+#define MINOR_INF (-1073741823)
+
+/* Alignment parameters consumed by ka_global_core(). */
+typedef struct {
+	int gap_open;   /* gap open penalty */
+	int gap_ext;    /* gap extension penalty */
+	int gap_end;    /* penalty used by the end-gap variants of the recursion */
+
+	int *matrix;    /* scoring matrix, row-major, `row` entries per row */
+	int row;        /* number of entries in one row of `matrix` */
+	int band_width; /* width of the alignment band */
+} ka_param_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* Banded global alignment of seq1 against seq2.  The score is written
+	 * to *_score and the returned CIGAR array has *n_cigar elements. */
+	uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar);
+
+#ifdef __cplusplus
+}
+#endif
+
+extern ka_param_t ka_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */
+extern ka_param_t ka_param_aa2aa; /* = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; */
+
+#endif
diff --git a/samtools/khash.h b/samtools/khash.h
new file mode 100644
index 0000000..1d583ef
--- /dev/null
+++ b/samtools/khash.h
@@ -0,0 +1,486 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+ + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.2" + +#include +#include +#include + +typedef uint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + uint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i 
= k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + uint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } 
\ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! 
@function + @abstract Integer hash function + @param key The integer [uint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (uint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [uint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. 
+ @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! 
@function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More convenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/samtools/klist.h b/samtools/klist.h new file mode 100644 index 0000000..2f17016 --- /dev/null +++ b/samtools/klist.h @@ -0,0 +1,96 @@ +#ifndef _LH3_KLIST_H +#define _LH3_KLIST_H + +#include + +#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + } kmp_##name##_t; \ + static inline kmp_##name##_t *kmp_init_##name() { \ + return calloc(1, sizeof(kmp_##name##_t)); \ + } \ + static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) { \ + kmpfree_f(mp->buf[k]); free(mp->buf[k]); \ + } \ + free(mp->buf); free(mp); \ + } \ + static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + static inline void 
kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) { \ + mp->max = mp->max? mp->max<<1 : 16; \ + mp->buf = realloc(mp->buf, sizeof(void*) * mp->max); \ + } \ + mp->buf[mp->n++] = p; \ + } + +#define kmempool_t(name) kmp_##name##_t +#define kmp_init(name) kmp_init_##name() +#define kmp_destroy(name, mp) kmp_destroy_##name(mp) +#define kmp_alloc(name, mp) kmp_alloc_##name(mp) +#define kmp_free(name, mp, p) kmp_free_##name(mp, p) + +#define KLIST_INIT(name, kltype_t, kmpfree_t) \ + struct __kl1_##name { \ + kltype_t data; \ + struct __kl1_##name *next; \ + }; \ + typedef struct __kl1_##name kl1_##name; \ + KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \ + typedef struct { \ + kl1_##name *head, *tail; \ + kmp_##name##_t *mp; \ + size_t size; \ + } kl_##name##_t; \ + static inline kl_##name##_t *kl_init_##name() { \ + kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \ + kl->mp = kmp_init(name); \ + kl->head = kl->tail = kmp_alloc(name, kl->mp); \ + kl->head->next = 0; \ + return kl; \ + } \ + static inline void kl_destroy_##name(kl_##name##_t *kl) { \ + kl1_##name *p; \ + for (p = kl->head; p != kl->tail; p = p->next) \ + kmp_free(name, kl->mp, p); \ + kmp_free(name, kl->mp, p); \ + kmp_destroy(name, kl->mp); \ + free(kl); \ + } \ + static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \ + kl1_##name *q, *p = kmp_alloc(name, kl->mp); \ + q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \ + ++kl->size; \ + return &q->data; \ + } \ + static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \ + kl1_##name *p; \ + if (kl->head->next == 0) return -1; \ + --kl->size; \ + p = kl->head; kl->head = kl->head->next; \ + if (d) *d = p->data; \ + kmp_free(name, kl->mp, p); \ + return 0; \ + } + +#define kliter_t(name) kl1_##name +#define klist_t(name) kl_##name##_t +#define kl_val(iter) ((iter)->data) +#define kl_next(iter) ((iter)->next) +#define kl_begin(kl) ((kl)->head) +#define kl_end(kl) ((kl)->tail) + 
+#define kl_init(name) kl_init_##name()
+#define kl_destroy(name, kl) kl_destroy_##name(kl)
+#define kl_pushp(name, kl) kl_pushp_##name(kl)
+#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
+
+#endif
diff --git a/samtools/knetfile.c b/samtools/knetfile.c
new file mode 100644
index 0000000..994babb
--- /dev/null
+++ b/samtools/knetfile.c
@@ -0,0 +1,632 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li */
+
+/* Probably I will not do socket programming in the next few years and
+   therefore I decide to heavily annotate this file, for Linux and
+   Windows as well. -lh3 */
+
+#include <time.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifdef _WIN32
+#include <winsock.h>
+#else
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#include "knetfile.h"
+
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET".
An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* This function tests if the file handler is ready for reading (or
+ * writing if is_read==0).  It waits at most 5 seconds and returns the
+ * value of select(): positive when ready, 0 on time-out, and a negative
+ * value (SOCKET_ERROR on Windows) on error. */
+static int socket_wait(int fd, int is_read)
+{
+	fd_set fds, *fdr = 0, *fdw = 0;
+	struct timeval tv;
+	int ret;
+	tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
+	FD_ZERO(&fds);
+	FD_SET(fd, &fds);
+	if (is_read) fdr = &fds;
+	else fdw = &fds;
+	ret = select(fd+1, fdr, fdw, 0, &tv);
+#ifndef _WIN32
+	if (ret == -1) perror("select");
+#else
+	if (ret == 0)
+		fprintf(stderr, "select time-out\n");
+	else if (ret == SOCKET_ERROR)
+		fprintf(stderr, "select: %d\n", WSAGetLastError());
+#endif
+	return ret;
+}
+
+#ifndef _WIN32
+/* This function does not work with Windows due to the lack of
+ * getaddrinfo() in winsock. It is adapted from an example in "Beej's
+ * Guide to Network Programming" (http://beej.us/guide/bgnet/).
+ *
+ * Opens a TCP connection to host:port and returns the connected socket
+ * descriptor, or -1 on failure (a message is printed to stderr).  On
+ * every failure path the address list and the socket, if already
+ * created, are released. */
+static int socket_connect(const char *host, const char *port)
+{
+	/* Only used after getaddrinfo() succeeded, so res is always valid here. */
+#define __err_connect(func) do { perror(func); if (fd != -1) close(fd); freeaddrinfo(res); return -1; } while (0)
+
+	int ai_err, on = 1, fd = -1;
+	struct linger lng = { 0, 0 };
+	struct addrinfo hints, *res = 0;
+	memset(&hints, 0, sizeof(struct addrinfo));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	/* In Unix/Mac, getaddrinfo() is the most convenient way to get
+	 * server information. getaddrinfo() reports failures through its
+	 * return value rather than errno, and leaves res unset on failure,
+	 * so neither perror() nor freeaddrinfo() may be used here. */
+	if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) {
+		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(ai_err));
+		return -1;
+	}
+	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+	/* The following two setsockopt() are used by ftplib
+	 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
+	 * are necessary. */
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+	freeaddrinfo(res);
+	return fd;
+}
+#else
+/* MinGW's printf has problem with "%lld" */
+/* Write x in decimal into buf and return buf.  There is no sign
+ * handling, so the result is only meaningful for x >= 0; buf must be
+ * large enough for all digits plus the terminating NUL. */
+char *int64tostr(char *buf, int64_t x)
+{
+	int cnt;
+	int i = 0;
+	/* emit the digits least-significant first ... */
+	do {
+		buf[i++] = '0' + x % 10;
+		x /= 10;
+	} while (x);
+	buf[i] = 0;
+	/* ... then reverse them in place */
+	for (cnt = i, i = 0; i < cnt/2; ++i) {
+		int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
+	}
+	return buf;
+}
+
+/* Parse a string of decimal digits.  Every character up to the NUL is
+ * treated as a digit: there is no validation and no sign handling. */
+int64_t strtoint64(const char *buf)
+{
+	int64_t x;
+	for (x = 0; *buf != '\0'; ++buf)
+		x = x * 10 + ((int64_t) *buf - 48);
+	return x;
+}
+/* In windows, the first thing is to establish the TCP connection.
+ * Returns the result of WSAStartup(). */
+int knet_win32_init()
+{
+	WSADATA wsaData;
+	return WSAStartup(MAKEWORD(2, 2), &wsaData);
+}
+/* Counterpart of knet_win32_init(): calls WSACleanup(). */
+void knet_win32_destroy()
+{
+	WSACleanup();
+}
+/* A slightly modified version of the following function also works on
+ * Mac (and presumably Linux). However, this function is not stable on
+ * my Mac. It sometimes works fine but sometimes does not. Therefore for
+ * non-Windows OS, I do not use this one.
*/ +static SOCKET socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) \ + do { \ + fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ + return -1; \ + } while (0) + + int on = 1; + SOCKET fd; + struct linger lng = { 0, 0 }; + struct sockaddr_in server; + struct hostent *hp = 0; + // open socket + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + // get host info + if (isalpha(host[0])) hp = gethostbyname(host); + else { + struct in_addr addr; + addr.s_addr = inet_addr(host); + hp = gethostbyaddr((char*)&addr, 4, AF_INET); + } + if (hp == 0) __err_connect("gethost"); + // connect + server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); + server.sin_family= AF_INET; + server.sin_port = htons(atoi(port)); + if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); + // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) + return fd; +} +#endif + +static off_t my_netread(int fd, void *buf, off_t len) +{ + off_t rest = len, curr, l = 0; + /* recv() and read() may not read the required length of data with + * one call. They have to be called repeatedly. */ + while (rest) { + if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading + curr = netread(fd, buf + l, rest); + /* According to the glibc manual, section 13.2, a zero returned + * value indicates end-of-file (EOF), which should mean that + * read() will not return zero if EOF has not been met but data + * are not immediately available. 
*/ + if (curr == 0) break; + l += curr; rest -= curr; + } + return l; +} + +/************************* + * FTP specific routines * + *************************/ + +static int kftp_get_response(knetFile *ftp) +{ +#ifndef _WIN32 + unsigned char c; +#else + char c; +#endif + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if (n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); + return is_get? 
kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + + +static int kftp_pasv_connect(knetFile *ftp) +{ + char host[80], port[10]; + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + ftp->fd = socket_connect(host, port); + if (ftp->fd == -1) return -1; + return 0; +} + +int kftp_connect(knetFile *ftp) +{ + ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); + if (ftp->ctrl_fd == -1) return -1; + kftp_get_response(ftp); + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd != -1) { + netclose(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + netclose(ftp->fd); + ftp->fd = -1; + return kftp_connect(ftp); +} + +// initialize ->type, ->host, ->retr and ->size +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + /* the Linux/Mac version of socket_connect() also recognizes a port + * like "ftp", but the Windows version does not. 
*/ + fp->port = strdup("21"); + fp->host = calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->size_cmd = calloc(strlen(p) + 8, 1); + sprintf(fp->size_cmd, "SIZE %s\r\n", p); + fp->seek_offset = 0; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int ret; + long long file_size; + if (fp->fd != -1) { + netclose(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + kftp_send_cmd(fp, fp->size_cmd, 1); +#ifndef _WIN32 + if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) + { + fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); + return -1; + } +#else + const char *p = fp->response; + while (*p != ' ') ++p; + while (*p < '0' || *p > '9') ++p; + file_size = strtoint64(p); +#endif + fp->file_size = file_size; + if (fp->offset>=0) { + char tmp[32]; +#ifndef _WIN32 + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); +#else + strcpy(tmp, "REST "); + int64tostr(tmp + 5, fp->offset); + strcat(tmp, "\r\n"); +#endif + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + + +/************************** + * HTTP specific routines * + **************************/ + +knetFile *khttp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p, *proxy, *q; + int l; + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + fp = calloc(1, sizeof(knetFile)); + fp->http_host = calloc(l + 1, 1); + strncpy(fp->http_host, fn + 7, l); + fp->http_host[l] = 0; + for (q = fp->http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = 
getenv("http_proxy"); + // set ->host, ->port and ->path + if (proxy == 0) { + fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. + fp->port = strdup(*q? q : "80"); + fp->path = strdup(*p? p : "/"); + } else { + fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = fp->host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + fp->port = strdup(*q? q : "80"); + fp->path = strdup(fn); + } + fp->type = KNF_TYPE_HTTP; + fp->ctrl_fd = fp->fd = -1; + fp->seek_offset = 0; + return fp; +} + +int khttp_connect_file(knetFile *fp) +{ + int ret, l = 0; + char *buf, *p; + if (fp->fd != -1) netclose(fp->fd); + fp->fd = socket_connect(fp->host, fp->port); + buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. + l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); + l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); + l += sprintf(buf + l, "\r\n"); + netwrite(fp->fd, buf, l); + l = 0; + while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + netclose(fp->fd); + fp->fd = -1; + return -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file + off_t rest = fp->offset; + while (rest) { + off_t l = rest < 0x10000? 
rest : 0x10000; + rest -= my_netread(fp->fd, buf, l); + } + } else if (ret != 206 && ret != 200) { + free(buf); + fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + free(buf); + fp->is_ready = 1; + return 0; +} + +/******************** + * Generic routines * + ********************/ + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + } else if (strstr(fn, "http://") == fn) { + fp = khttp_parse_url(fn, mode); + if (fp == 0) return 0; + khttp_connect_file(fp); + } else { // local file +#ifdef _WIN32 + /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may + * be undefined on some systems, although it is defined on my + * Mac and the Linux I have tested on. 
*/ + int fd = open(fn, O_RDONLY | O_BINARY); +#else + int fd = open(fn, O_RDONLY); +#endif + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + fp->ctrl_fd = -1; + } + if (fp && fp->fd == -1) { + knet_close(fp); + return 0; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +off_t knet_read(knetFile *fp, void *buf, off_t len) +{ + off_t l = 0; + if (fp->fd == -1) return 0; + if (fp->type == KNF_TYPE_FTP) { + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + } + } else if (fp->type == KNF_TYPE_HTTP) { + if (fp->is_ready == 0) + khttp_connect_file(fp); + } + if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX + off_t rest = len, curr; + while (rest) { + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; + l += curr; rest -= curr; + } + } else l = my_netread(fp->fd, buf, len); + fp->offset += l; + return l; +} + +off_t knet_seek(knetFile *fp, int64_t off, int whence) +{ + if (whence == SEEK_SET && off == fp->offset) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + /* Be aware that lseek() returns the offset after seeking, + * while fseek() returns zero on success. 
*/ + off_t offset = lseek(fp->fd, off, whence); + if (offset == -1) { + // Be silent, it is OK for knet_seek to fail when the file is streamed + // fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; + } + fp->offset = offset; + return 0; + } + else if (fp->type == KNF_TYPE_FTP) + { + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + else if ( whence==SEEK_END) + fp->offset = fp->file_size+off; + fp->is_ready = 0; + return 0; + } + else if (fp->type == KNF_TYPE_HTTP) + { + if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? + fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); + errno = ESPIPE; + return -1; + } + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + fp->is_ready = 0; + return fp->offset; + } + errno = EINVAL; + fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific + if (fp->fd != -1) { + /* On Linux/Mac, netclose() is an alias of close(), but on + * Windows, it is an alias of closesocket(). 
*/ + if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); + else netclose(fp->fd); + } + free(fp->host); free(fp->port); + free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific; size_cmd is allocated in kftp_parse_url() and was never released + free(fp->path); free(fp->http_host); // HTTP specific + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char *buf; + knetFile *fp; + int type = 4, l; +#ifdef _WIN32 + knet_win32_init(); +#endif + buf = calloc(0x100000, 1); + if (type == 0) { + fp = knet_open("knetfile.c", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 1) { // NCBI FTP, large file + fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); + knet_seek(fp, 2500000000ll, SEEK_SET); + l = knet_read(fp, buf, 255); + } else if (type == 2) { + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 3) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 4) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); + knet_read(fp, buf, 10000); + knet_seek(fp, 20000, SEEK_SET); + knet_seek(fp, 10000, SEEK_SET); + l = knet_read(fp, buf+10000, 10000000) + 10000; + } + if (type != 4 && type != 1) { + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + } else write(fileno(stdout), buf, l); + knet_close(fp); + free(buf); + return 0; +} +#endif diff --git a/samtools/knetfile.h b/samtools/knetfile.h new file mode 100644 index 0000000..0a0e66f --- /dev/null +++ b/samtools/knetfile.h @@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include +#include + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) 
+#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + off_t knet_seek(knetFile *fp, int64_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/kseq.h b/samtools/kseq.h new file mode 100644 index 0000000..82face0 --- /dev/null +++ b/samtools/kseq.h @@ -0,0 +1,227 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" + */ + +/* Last Modified: 12APR2009 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' 
' +#define KS_SEP_MAX 1 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } 
else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->s == 0) { /* allocate only when no buffer exists yet; an empty field must not replace (and leak) an existing one */ \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 
0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff --git a/samtools/ksort.h b/samtools/ksort.h new file mode 100644 index 0000000..16a03fd --- /dev/null +++ b/samtools/ksort.h @@ -0,0 +1,271 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. 
+ + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 
+ return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) 
ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/samtools/kstring.c b/samtools/kstring.c new file mode 100644 index 0000000..e0203fa --- /dev/null +++ b/samtools/kstring.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + va_end(ap); /* pairs with the va_start() above; previously va_end() ran a second time even when this branch was skipped */ + } + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? 
max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +int *ksBM_prep(const uint8_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = calloc(m + 256, sizeof(int)); /* m good-suffix entries followed by 256 bad-character entries, all ints (was sized in bytes, overflowing the heap) */ + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; + } + suff = calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +{ + int i, j, *prep, *bmGs, *bmBc; + int *matches = 0, mm = 0, nm = 0; + prep = _prep? 
_prep : ksBM_prep(pat, m); + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i < 0) { + if (nm == mm) { + mm = mm? mm<<1 : 1; + matches = realloc(matches, mm * sizeof(int)); + } + matches[nm++] = j; + j += bmGs[0]; + } else { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } + } + *n_matches = nm; + if (_prep == 0) free(prep); + return matches; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + + { + static char *str = "abcdefgcdg"; + static char *pat = "cd"; + int n, *matches; + matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); + printf("%d: \n", n); + for (i = 0; i < n; ++i) + printf("- %d\n", matches[i]); + free(matches); + } + return 0; +} +#endif diff --git a/samtools/kstring.h b/samtools/kstring.h new file mode 100644 index 0000000..f4e5a99 --- /dev/null +++ b/samtools/kstring.h @@ -0,0 +1,68 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +// calculate the auxiliary array, allocated by calloc() +int *ksBM_prep(const uint8_t *pat, int m); + +/* Search pat in str and returned the list of matches. The size of the + * list is returned as n_matches. _prep is the array returned by + * ksBM_prep(). 
If it is a NULL pointer, ksBM_prep() will be called. */ +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/samtools/razf.c b/samtools/razf.c new file mode 100644 index 0000000..e7499f9 --- /dev/null +++ b/samtools/razf.c @@ -0,0 +1,853 @@ +/* + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan , Heng Li + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NO_RAZF + +#include +#include +#include +#include +#include +#include "razf.h" + + +#if ZLIB_VERNUM < 0x1221 +struct _gz_header_s { + int text; + uLong time; + int xflags; + int os; + Bytef *extra; + uInt extra_len; + uInt extra_max; + Bytef *name; + uInt name_max; + Bytef *comment; + uInt comm_max; + int hcrc; + int done; +}; +#warning "zlib < 1.2.2.1; RAZF writing is disabled." +#endif + +#define DEF_MEM_LEVEL 8 + +static inline uint32_t byte_swap_4(uint32_t v){ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint64_t byte_swap_8(uint64_t v){ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} + +static inline int is_big_endian(){ + int x = 0x01; + char *c = (char*)&x; + return (c[0] != 0x01); +} + +#ifndef _RZ_READONLY +static void add_zindex(RAZF *rz, int64_t in, int64_t out){ + if(rz->index->size == rz->index->cap){ + rz->index->cap = rz->index->cap * 1.5 + 2; + rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap); + rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1)); + } + if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out; + 
rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE]; + rz->index->size ++; +} + +static void save_zindex(RAZF *rz, int fd){ + int32_t i, v32; + int is_be; + is_be = is_big_endian(); + if(is_be) write(fd, &rz->index->size, sizeof(int)); + else { + v32 = byte_swap_4((uint32_t)rz->index->size); + write(fd, &v32, sizeof(uint32_t)); + } + v32 = rz->index->size / RZ_BIN_SIZE + 1; + if(!is_be){ + for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } + write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); + write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size); +} +#endif + +#ifdef _USE_KNETFILE +static void load_zindex(RAZF *rz, knetFile *fp){ +#else +static void load_zindex(RAZF *rz, int fd){ +#endif + int32_t i, v32; + int is_be; + if(!rz->load_index) return; + if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex)); + is_be = is_big_endian(); +#ifdef _USE_KNETFILE + knet_read(fp, &rz->index->size, sizeof(int)); +#else + read(fd, &rz->index->size, sizeof(int)); +#endif + if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size); + rz->index->cap = rz->index->size; + v32 = rz->index->size / RZ_BIN_SIZE + 1; + rz->index->bin_offsets = malloc(sizeof(int64_t) * v32); +#ifdef _USE_KNETFILE + knet_read(fp, rz->index->bin_offsets, sizeof(int64_t) * v32); +#else + read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); +#endif + rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size); +#ifdef _USE_KNETFILE + knet_read(fp, rz->index->cell_offsets, sizeof(int) * rz->index->size); +#else + read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size); +#endif + if(!is_be){ + for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = 
byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } +} + +#ifdef _RZ_READONLY +static RAZF* razf_open_w(int fd) +{ + fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n"); + return 0; +} +#else +static RAZF* razf_open_w(int fd){ + RAZF *rz; +#ifdef _WIN32 + setmode(fd, O_BINARY); +#endif + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'w'; +#ifdef _USE_KNETFILE + rz->x.fpw = fd; +#else + rz->filedes = fd; +#endif + rz->stream = calloc(sizeof(z_stream), 1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->index = calloc(sizeof(ZBlockIndex), 1); + deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->header = calloc(sizeof(gz_header), 1); + rz->header->os = 0x03; //Unix + rz->header->text = 0; + rz->header->time = 0; + rz->header->extra = malloc(7); + strncpy((char*)rz->header->extra, "RAZF", 4); + rz->header->extra[4] = 1; // obsolete field + // block size = RZ_BLOCK_SIZE, Big-Endian + rz->header->extra[5] = RZ_BLOCK_SIZE >> 8; + rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF; + rz->header->extra_len = 7; + rz->header->name = rz->header->comment = 0; + rz->header->hcrc = 0; + deflateSetHeader(rz->stream, rz->header); + rz->block_pos = rz->block_off = 0; + return rz; +} + +static void _razf_write(RAZF* rz, const void *data, int size){ + int tout; + rz->stream->avail_in = size; + rz->stream->next_in = (void*)data; + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_NO_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out) break; +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + if(rz->stream->avail_in == 0) break; + }; + 
rz->in += size - rz->stream->avail_in; + rz->block_off += size - rz->stream->avail_in; +} + +static void razf_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + if(rz->stream->avail_out){ +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FULL_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out == 0){ +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } + rz->block_pos = rz->out; + rz->block_off = 0; +} + +static void razf_end_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FINISH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out < RZ_BUFFER_SIZE){ +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } +} + +static void _razf_buffered_write(RAZF *rz, const void *data, int size){ + int i, n; + while(1){ + if(rz->buf_len == RZ_BUFFER_SIZE){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_len = 0; + } + if(size + rz->buf_len < RZ_BUFFER_SIZE){ + for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; + rz->buf_len += size; + return; + } else { + n = 
RZ_BUFFER_SIZE - rz->buf_len; + for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; + size -= n; + data += n; + rz->buf_len += n; + } + } +} + +int razf_write(RAZF* rz, const void *data, int size){ + int ori_size, n; + int64_t next_block; + ori_size = size; + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + while(rz->in + rz->buf_len + size >= next_block){ + n = next_block - rz->in - rz->buf_len; + _razf_buffered_write(rz, data, n); + data += n; + size -= n; + razf_flush(rz); + add_zindex(rz, rz->in, rz->out); + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + } + _razf_buffered_write(rz, data, size); + return ori_size; +} +#endif + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */ +#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define RESERVED 0xE0 /* bits 5..7: reserved */ + +static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){ + int method, flags, n, len; + if(size < 2) return 0; + if(data[0] != 0x1f || data[1] != 0x8b) return 0; + if(size < 4) return 0; + method = data[2]; + flags = data[3]; + if(method != Z_DEFLATED || (flags & RESERVED)) return 0; + n = 4 + 6; // Skip 6 bytes + *extra_off = n + 2; + *extra_len = 0; + if(flags & EXTRA_FIELD){ + if(size < n + 2) return 0; + len = ((int)data[n + 1] << 8) | data[n]; + n += 2; + *extra_off = n; + while(len){ + if(n >= size) return 0; + n ++; + len --; + } + *extra_len = n - (*extra_off); + } + if(flags & ORIG_NAME) while(n < size && data[n++]); + if(flags & COMMENT) while(n < size && data[n++]); + if(flags & HEAD_CRC){ + if(n + 2 > size) return 0; + n += 2; + } + return n; +} + +#ifdef _USE_KNETFILE +static RAZF* razf_open_r(knetFile *fp, int _load_index){ +#else +static RAZF* razf_open_r(int fd, int 
_load_index){ +#endif + RAZF *rz; + int ext_off, ext_len; + int n, is_be, ret; + int64_t end; + unsigned char c[] = "RAZF"; + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'r'; +#ifdef _USE_KNETFILE + rz->x.fpr = fp; +#else +#ifdef _WIN32 + setmode(fd, O_BINARY); +#endif + rz->filedes = fd; +#endif + rz->stream = calloc(sizeof(z_stream), 1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE); +#else + n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); +#endif + ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len); + if(ret == 0){ + PLAIN_FILE: + rz->in = n; + rz->file_type = FILE_TYPE_PLAIN; + memcpy(rz->outbuf, rz->inbuf, n); + rz->buf_len = n; + free(rz->stream); + rz->stream = NULL; + return rz; + } + rz->header_size = ret; + ret = inflateInit2(rz->stream, -WINDOW_BITS); + if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;} + rz->stream->avail_in = n - rz->header_size; + rz->stream->next_in = rz->inbuf + rz->header_size; + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->file_type = FILE_TYPE_GZ; + rz->in = rz->header_size; + rz->block_pos = rz->header_size; + rz->next_block_pos = rz->header_size; + rz->block_off = 0; + if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz; + if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){ + fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. 
in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__); + return rz; + } + rz->load_index = _load_index; + rz->file_type = FILE_TYPE_RZ; +#ifdef _USE_KNETFILE + if(knet_seek(fp, -16, SEEK_END) == -1){ +#else + if(lseek(fd, -16, SEEK_END) == -1){ +#endif + UNSEEKABLE: + rz->seekable = 0; + rz->index = NULL; + rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL; + } else { + is_be = is_big_endian(); + rz->seekable = 1; +#ifdef _USE_KNETFILE + knet_read(fp, &end, sizeof(int64_t)); +#else + read(fd, &end, sizeof(int64_t)); +#endif + if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end); + else rz->src_end = end; + +#ifdef _USE_KNETFILE + knet_read(fp, &end, sizeof(int64_t)); +#else + read(fd, &end, sizeof(int64_t)); +#endif + if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end); + else rz->end = end; + if(n > rz->end){ + rz->stream->avail_in -= n - rz->end; + n = rz->end; + } + if(rz->end > rz->src_end){ +#ifdef _USE_KNETFILE + knet_seek(fp, rz->in, SEEK_SET); +#else + lseek(fd, rz->in, SEEK_SET); +#endif + goto UNSEEKABLE; + } +#ifdef _USE_KNETFILE + knet_seek(fp, rz->end, SEEK_SET); + if(knet_tell(fp) != rz->end){ + knet_seek(fp, rz->in, SEEK_SET); +#else + if(lseek(fd, rz->end, SEEK_SET) != rz->end){ + lseek(fd, rz->in, SEEK_SET); +#endif + goto UNSEEKABLE; + } +#ifdef _USE_KNETFILE + load_zindex(rz, fp); + knet_seek(fp, n, SEEK_SET); +#else + load_zindex(rz, fd); + lseek(fd, n, SEEK_SET); +#endif + } + return rz; +} + +#ifdef _USE_KNETFILE +RAZF* razf_dopen(int fd, const char *mode){ + if (strstr(mode, "r")) fprintf(stderr,"[razf_dopen] implement me\n"); + else if(strstr(mode, "w")) return razf_open_w(fd); + return NULL; +} + +RAZF* razf_dopen2(int fd, const char *mode) +{ + fprintf(stderr,"[razf_dopen2] implement me\n"); + return NULL; +} +#else +RAZF* razf_dopen(int fd, const char *mode){ + if(strstr(mode, "r")) return razf_open_r(fd, 1); + else if(strstr(mode, "w")) return razf_open_w(fd); + else return NULL; +} + +RAZF* razf_dopen2(int fd, 
const char *mode) +{ + if(strstr(mode, "r")) return razf_open_r(fd, 0); + else if(strstr(mode, "w")) return razf_open_w(fd); + else return NULL; +} +#endif + +static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){ + int fd; + RAZF *rz; + if(strstr(mode, "r")){ +#ifdef _USE_KNETFILE + knetFile *fd = knet_open(filename, "r"); + if (fd == 0) { + fprintf(stderr, "[_razf_open] fail to open %s\n", filename); + return NULL; + } +#else +#ifdef _WIN32 + fd = open(filename, O_RDONLY | O_BINARY); +#else + fd = open(filename, O_RDONLY); +#endif +#endif + if(fd < 0) return NULL; + rz = razf_open_r(fd, _load_index); + } else if(strstr(mode, "w")){ +#ifdef _WIN32 + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666); +#else + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666); +#endif + if(fd < 0) return NULL; + rz = razf_open_w(fd); + } else return NULL; + return rz; +} + +RAZF* razf_open(const char *filename, const char *mode){ + return _razf_open(filename, mode, 1); +} + +RAZF* razf_open2(const char *filename, const char *mode){ + return _razf_open(filename, mode, 0); +} + +int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){ + int64_t n; + if(rz->mode != 'r' && rz->mode != 'R') return 0; + switch(rz->file_type){ + case FILE_TYPE_PLAIN: + if(rz->end == 0x7fffffffffffffffLL){ +#ifdef _USE_KNETFILE + if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0; + n = knet_tell(rz->x.fpr); + knet_seek(rz->x.fpr, 0, SEEK_END); + rz->end = knet_tell(rz->x.fpr); + knet_seek(rz->x.fpr, n, SEEK_SET); +#else + if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0; + rz->end = lseek(rz->filedes, 0, SEEK_END); + lseek(rz->filedes, n, SEEK_SET); +#endif + } + *u_size = *c_size = rz->end; + return 1; + case FILE_TYPE_GZ: + return 0; + case FILE_TYPE_RZ: + if(rz->src_end == rz->end) return 0; + *u_size = rz->src_end; + *c_size = rz->end; + return 1; + default: + return 0; + } +} + +static int _razf_read(RAZF* rz, void *data, 
int size){ + int ret, tin; + if(rz->z_eof || rz->z_err) return 0; + if (rz->file_type == FILE_TYPE_PLAIN) { +#ifdef _USE_KNETFILE + ret = knet_read(rz->x.fpr, data, size); +#else + ret = read(rz->filedes, data, size); +#endif + if (ret == 0) rz->z_eof = 1; + return ret; + } + rz->stream->avail_out = size; + rz->stream->next_out = data; + while(rz->stream->avail_out){ + if(rz->stream->avail_in == 0){ + if(rz->in >= rz->end){ rz->z_eof = 1; break; } + if(rz->end - rz->in < RZ_BUFFER_SIZE){ +#ifdef _USE_KNETFILE + rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, rz->end -rz->in); +#else + rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in); +#endif + } else { +#ifdef _USE_KNETFILE + rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE); +#else + rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); +#endif + } + if(rz->stream->avail_in == 0){ + rz->z_eof = 1; + break; + } + rz->stream->next_in = rz->inbuf; + } + tin = rz->stream->avail_in; + ret = inflate(rz->stream, Z_BLOCK); + rz->in += tin - rz->stream->avail_in; + if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){ + fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? 
rz->stream->msg : "", __FILE__, __LINE__); + rz->z_err = 1; + break; + } + if(ret == Z_STREAM_END){ + rz->z_eof = 1; + break; + } + if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){ + rz->buf_flush = 1; + rz->next_block_pos = rz->in; + break; + } + } + return size - rz->stream->avail_out; +} + +int razf_read(RAZF *rz, void *data, int size){ + int ori_size, i; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + for(i=0;ioutbuf + rz->buf_off)[i]; + rz->buf_off += size; + rz->buf_len -= size; + data += size; + rz->block_off += size; + size = 0; + break; + } else { + for(i=0;ibuf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i]; + data += rz->buf_len; + size -= rz->buf_len; + rz->block_off += rz->buf_len; + rz->buf_off = 0; + rz->buf_len = 0; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof && rz->buf_len == 0) break; + } + rz->out += ori_size - size; + return ori_size - size; +} + +int razf_skip(RAZF* rz, int size){ + int ori_size; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + rz->buf_off += size; + rz->buf_len -= size; + rz->block_off += size; + size = 0; + break; + } else { + size -= rz->buf_len; + rz->buf_off = 0; + rz->buf_len = 0; + rz->block_off += rz->buf_len; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof || rz->z_err) break; + } + rz->out += ori_size - size; + return ori_size - size; +} + +static void 
_razf_reset_read(RAZF *rz, int64_t in, int64_t out){ +#ifdef _USE_KNETFILE + knet_seek(rz->x.fpr, in, SEEK_SET); +#else + lseek(rz->filedes, in, SEEK_SET); +#endif + rz->in = in; + rz->out = out; + rz->block_pos = in; + rz->next_block_pos = in; + rz->block_off = 0; + rz->buf_flush = 0; + rz->z_eof = rz->z_err = 0; + inflateReset(rz->stream); + rz->stream->avail_in = 0; + rz->buf_off = rz->buf_len = 0; +} + +int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){ + int64_t pos; + rz->z_eof = 0; + if(rz->file_type == FILE_TYPE_PLAIN){ + rz->buf_off = rz->buf_len = 0; + pos = block_start + block_offset; +#ifdef _USE_KNETFILE + knet_seek(rz->x.fpr, pos, SEEK_SET); + pos = knet_tell(rz->x.fpr); +#else + pos = lseek(rz->filedes, pos, SEEK_SET); +#endif + rz->out = rz->in = pos; + return pos; + } + if(block_start == rz->block_pos && block_offset >= rz->block_off) { + block_offset -= rz->block_off; + goto SKIP; // Needn't reset inflate + } + if(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start + _razf_reset_read(rz, block_start, 0); + SKIP: + if(block_offset) razf_skip(rz, block_offset); + return rz->block_off; +} + +int64_t razf_seek(RAZF* rz, int64_t pos, int where){ + int64_t idx; + int64_t seek_pos, new_out; + rz->z_eof = 0; + if (where == SEEK_CUR) pos += rz->out; + else if (where == SEEK_END) pos += rz->src_end; + if(rz->file_type == FILE_TYPE_PLAIN){ +#ifdef _USE_KNETFILE + knet_seek(rz->x.fpr, pos, SEEK_SET); + seek_pos = knet_tell(rz->x.fpr); +#else + seek_pos = lseek(rz->filedes, pos, SEEK_SET); +#endif + rz->buf_off = rz->buf_len = 0; + rz->out = rz->in = seek_pos; + return seek_pos; + } else if(rz->file_type == FILE_TYPE_GZ){ + if(pos >= rz->out) goto SKIP; + return rz->out; + } + if(pos == rz->out) return pos; + if(pos > rz->src_end) return rz->out; + if(!rz->seekable || !rz->load_index){ + if(pos >= rz->out) goto SKIP; + } + idx = pos / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 0)? 
rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + new_out = (idx + 1) * RZ_BLOCK_SIZE; + if(pos > rz->out && new_out <= rz->out) goto SKIP; + _razf_reset_read(rz, seek_pos, new_out); + SKIP: + razf_skip(rz, (int)(pos - rz->out)); + return rz->out; +} + +uint64_t razf_tell2(RAZF *rz) +{ + /* + if (rz->load_index) { + int64_t idx, seek_pos; + idx = rz->out / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off) + fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n", + (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off); + } + */ + return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff); +} + +int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where) +{ + if (where != SEEK_SET) return -1; + return razf_jump(rz, voffset>>16, voffset&0xffff); +} + +void razf_close(RAZF *rz){ + if(rz->mode == 'w'){ +#ifndef _RZ_READONLY + razf_end_flush(rz); + deflateEnd(rz->stream); +#ifdef _USE_KNETFILE + save_zindex(rz, rz->x.fpw); + if(is_big_endian()){ + write(rz->x.fpw, &rz->in, sizeof(int64_t)); + write(rz->x.fpw, &rz->out, sizeof(int64_t)); + } else { + uint64_t v64 = byte_swap_8((uint64_t)rz->in); + write(rz->x.fpw, &v64, sizeof(int64_t)); + v64 = byte_swap_8((uint64_t)rz->out); + write(rz->x.fpw, &v64, sizeof(int64_t)); + } +#else + save_zindex(rz, rz->filedes); + if(is_big_endian()){ + write(rz->filedes, &rz->in, sizeof(int64_t)); + write(rz->filedes, &rz->out, sizeof(int64_t)); + } else { + uint64_t v64 = byte_swap_8((uint64_t)rz->in); + write(rz->filedes, &v64, sizeof(int64_t)); + v64 = byte_swap_8((uint64_t)rz->out); + write(rz->filedes, &v64, sizeof(int64_t)); + } +#endif +#endif + } else if(rz->mode == 'r'){ + if(rz->stream) inflateEnd(rz->stream); + } + if(rz->inbuf) 
free(rz->inbuf); + if(rz->outbuf) free(rz->outbuf); + if(rz->header){ + free(rz->header->extra); + free(rz->header->name); + free(rz->header->comment); + free(rz->header); + } + if(rz->index){ + free(rz->index->bin_offsets); + free(rz->index->cell_offsets); + free(rz->index); + } + free(rz->stream); +#ifdef _USE_KNETFILE + if (rz->mode == 'r') + knet_close(rz->x.fpr); + if (rz->mode == 'w') + close(rz->x.fpw); +#else + close(rz->filedes); +#endif + free(rz); +} + +#endif diff --git a/samtools/razf.h b/samtools/razf.h new file mode 100644 index 0000000..60a0c96 --- /dev/null +++ b/samtools/razf.h @@ -0,0 +1,134 @@ + /*- + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan , Heng Li + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#ifndef __RAZF_RJ_H +#define __RAZF_RJ_H + +#include +#include +#include "zlib.h" + +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +#if ZLIB_VERNUM < 0x1221 +#define _RZ_READONLY +struct _gz_header_s; +typedef struct _gz_header_s _gz_header; +#define gz_header _gz_header +#endif + +#define WINDOW_BITS 15 + +#ifndef RZ_BLOCK_SIZE +#define RZ_BLOCK_SIZE (1<mode from HEAD to TYPE after call inflateReset */ + int buf_off, buf_len; + int z_err, z_eof; + int seekable; + /* Indice where the source is seekable */ + int load_index; + /* set has_index to 0 in mode 'w', then index will be discarded */ +} RAZF; + +#ifdef __cplusplus +extern "C" { +#endif + + RAZF* razf_dopen(int data_fd, const char *mode); + RAZF *razf_open(const char *fn, const char *mode); + int razf_write(RAZF* rz, const void *data, int size); + int razf_read(RAZF* rz, void *data, int size); + int64_t razf_seek(RAZF* rz, int64_t pos, int where); + void razf_close(RAZF* rz); + +#define razf_tell(rz) ((rz)->out) + + RAZF* razf_open2(const char *filename, const char *mode); + RAZF* razf_dopen2(int fd, const char *mode); + uint64_t razf_tell2(RAZF *rz); + int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/sam.c b/samtools/sam.c new file mode 100644 index 0000000..ad4325b --- /dev/null +++ b/samtools/sam.c @@ -0,0 +1,174 @@ +#include +#include +#include "faidx.h" +#include "sam.h" + +#define TYPE_BAM 
1 +#define TYPE_READ 2 + +bam_header_t *bam_header_dup(const bam_header_t *h0) +{ + bam_header_t *h; + int i; + h = bam_header_init(); + *h = *h0; + h->hash = h->dict = h->rg2lib = 0; + h->text = (char*)calloc(h->l_text + 1, 1); + memcpy(h->text, h0->text, h->l_text); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + } + return h; +} +static void append_header_text(bam_header_t *header, char* text, int len) +{ + int x = header->l_text + 1; + int y = header->l_text + len + 1; // 1 byte null + if (text == 0) return; + kroundup32(x); + kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. + header->l_text += len; + header->text[header->l_text] = 0; +} + +samfile_t *samopen(const char *fn, const char *mode, const void *aux) +{ + samfile_t *fp; + fp = (samfile_t*)calloc(1, sizeof(samfile_t)); + if (mode[0] == 'r') { // read + fp->type |= TYPE_READ; + if (mode[1] == 'b') { // binary + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? 
bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + if (fp->x.bam == 0) goto open_err_ret; + fp->header = bam_header_read(fp->x.bam); + } else { // text + fp->x.tamr = sam_open(fn); + if (fp->x.tamr == 0) goto open_err_ret; + fp->header = sam_header_read(fp->x.tamr); + if (fp->header->n_targets == 0) { // no @SQ fields + if (aux) { // check if aux is present + bam_header_t *textheader = fp->header; + fp->header = sam_header_read2((const char*)aux); + append_header_text(fp->header, textheader->text, textheader->l_text); + bam_header_destroy(textheader); + } + if (fp->header->n_targets == 0) + fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); + } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } + } else if (mode[0] == 'w') { // write + fp->header = bam_header_dup((const bam_header_t*)aux); + if (mode[1] == 'b') { // binary + char bmode[3]; + bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); + if (fp->x.bam == 0) goto open_err_ret; + bam_header_write(fp->x.bam, fp->header); + } else { // text + // open file + fp->x.tamw = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; + if (fp->x.tamw == 0) goto open_err_ret; // test the member that was just assigned (tamw, not tamr) + if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2; + else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2; + else fp->type |= BAM_OFDEC<<2; + // write header + if (strstr(mode, "h")) { + int i; + bam_header_t *alt; + // parse the header text + alt = bam_header_init(); + alt->l_text = fp->header->l_text; alt->text = fp->header->text; + sam_header_parse(alt); + alt->l_text = 0; alt->text = 0; + // check if there are @SQ lines in the header + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} + if (alt->n_targets != fp->header->n_targets) + fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + } else { // then dump ->target_{name,len} + for (i = 0; i < fp->header->n_targets; ++i) + fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); + } + bam_header_destroy(alt); + } + } + } + return fp; + +open_err_ret: + if (fp->header) bam_header_destroy(fp->header); free(fp); // in write mode the header was already duplicated; do not leak it + return 0; +} + +void samclose(samfile_t *fp) +{ + if (fp == 0) return; + if (fp->header) bam_header_destroy(fp->header); + if (fp->type & TYPE_BAM) bam_close(fp->x.bam); + else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); + else fclose(fp->x.tamw); + free(fp); +} + +int samread(samfile_t *fp, bam1_t *b) +{ + if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading + if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); + else return sam_read1(fp->x.tamr, fp->header, b); +} + +int samwrite(samfile_t *fp, const bam1_t *b) +{ + if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing + if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); + else { + char *s = bam_format1_core(fp->header, b, fp->type>>2&3); + int l = strlen(s); + fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); + free(s); + return l + 1; + } +} + +int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void 
*func_data) +{ + bam_plbuf_t *buf; + int ret; + bam1_t *b; + b = bam_init1(); + buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); + while ((ret = samread(fp, b)) >= 0) + bam_plbuf_push(b, buf); + bam_plbuf_push(0, buf); + bam_plbuf_destroy(buf); + bam_destroy1(b); + return 0; +} + +char *samfaipath(const char *fn_ref) +{ + char *fn_list = 0; + if (fn_ref == 0) return 0; + fn_list = calloc(strlen(fn_ref) + 5, 1); + strcat(strcpy(fn_list, fn_ref), ".fai"); + if (access(fn_list, R_OK) == -1) { // fn_list is unreadable + if (access(fn_ref, R_OK) == -1) { + fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); free(fn_list); fn_list = 0; // neither the index nor the reference is readable: return 0 instead of a path that does not exist + } else { + fprintf(stderr, "[samfaipath] build FASTA index...\n"); + if (fai_build(fn_ref) == -1) { + fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); + free(fn_list); fn_list = 0; + } + } + } + return fn_list; // caller owns the returned string; 0 when no usable index exists +} diff --git a/samtools/sam.h b/samtools/sam.h new file mode 100644 index 0000000..0b87194 --- /dev/null +++ b/samtools/sam.h @@ -0,0 +1,98 @@ +#ifndef BAM_SAM_H +#define BAM_SAM_H + +#include "bam.h" + +/*! + @header + + This file provides higher level of I/O routines and unifies the APIs + for SAM and BAM formats. These APIs are more convenient and + recommended. + + @copyright Genome Research Ltd. + */ + +/*! @typedef + @abstract SAM/BAM file handler + @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format + @field bam BAM file handler; valid if (type&1) == 1 + @field tamr SAM file handler for reading; valid if type == 2 + @field tamw SAM file handler for writing; valid if type == 0 + @field header header struct + */ +typedef struct { + int type; + union { + tamFile tamr; + bamFile bam; + FILE *tamw; + } x; + bam_header_t *header; +} samfile_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Open a SAM/BAM file + + @param fn SAM/BAM file name; "-" is recognized as stdin (for + reading) or stdout (for writing). 
+ + @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading, + 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output, + 'h' for outputting header in SAM, 'x' for HEX flag and 'X' for + string flag. If 'b' is present, it must immediately follow 'r' or + 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX", + "rb", "wb" and "wbu" exclusively. + + @param aux auxiliary data; if mode[0]=='w', aux points to + bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM + are absent, aux points to the file name of the list of the reference; + aux is not used otherwise. If @SQ header lines are present in SAM, + aux is not used, either. + + @return SAM/BAM file handler; 0 if the file cannot be opened + */ + samfile_t *samopen(const char *fn, const char *mode, const void *aux); + + /*! + @abstract Close a SAM/BAM handler + @param fp file handler to be closed + */ + void samclose(samfile_t *fp); + + /*! + @abstract Read one alignment + @param fp file handler + @param b alignment + @return bytes read; -1 if fp is not open for reading + */ + int samread(samfile_t *fp, bam1_t *b); + + /*! + @abstract Write one alignment + @param fp file handler + @param b alignment + @return bytes written; -1 if fp is not open for writing + */ + int samwrite(samfile_t *fp, const bam1_t *b); + + /*! 
+ @abstract Get the pileup for a whole alignment file + @param fp file handler + @param mask mask transferred to bam_plbuf_set_mask() + @param func user defined function called in the pileup process + #param data user provided data for func() + */ + int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); + + char *samfaipath(const char *fn_ref); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/sam_header.c b/samtools/sam_header.c new file mode 100644 index 0000000..a119c02 --- /dev/null +++ b/samtools/sam_header.c @@ -0,0 +1,701 @@ +#include "sam_header.h" +#include +#include +#include +#include +#include + +#include "khash.h" +KHASH_MAP_INIT_STR(str, const char *) + +struct _HeaderList +{ + struct _HeaderList *next; + void *data; +}; +typedef struct _HeaderList list_t; +typedef list_t HeaderDict; + +typedef struct +{ + char key[2]; + char *value; +} +HeaderTag; + +typedef struct +{ + char type[2]; + list_t *tags; +} +HeaderLine; + +const char *o_hd_tags[] = {"SO","GO",NULL}; +const char *r_hd_tags[] = {"VN",NULL}; + +const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; +const char *r_sq_tags[] = {"SN","LN",NULL}; +const char *u_sq_tags[] = {"SN",NULL}; + +const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; +const char *r_rg_tags[] = {"ID",NULL}; +const char *u_rg_tags[] = {"ID",NULL}; + +const char *o_pg_tags[] = {"VN","CL",NULL}; +const char *r_pg_tags[] = {"ID",NULL}; + +const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; +const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; +const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; +const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; + + +static void debug(const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); +} + +static list_t *list_append(list_t *root, void *data) +{ + list_t *l = root; + while (l && l->next) + l = l->next; + if ( l ) + { + l->next = malloc(sizeof(list_t)); + l = l->next; + } + else + { + l = malloc(sizeof(list_t)); + root = l; + } + l->data = data; + l->next = NULL; + return root; +} + +static void list_free(list_t *root) +{ + list_t *l = root; + while (root) + { + l = root; + root = root->next; + free(l); + } +} + + + +// Look for a tag "XY" in a predefined const char *[] array. +static int tag_exists(const char *tag, const char **tags) +{ + int itag=0; + if ( !tags ) return -1; + while ( tags[itag] ) + { + if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; + itag++; + } + return -1; +} + + + +// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text +// or NULL if everything has been read. The lineptr should be freed by the caller. The +// newline character is stripped. +static const char *nextline(char **lineptr, size_t *n, const char *text) +{ + int len; + const char *to = text; + + if ( !*to ) return NULL; + + while ( *to && *to!='\n' && *to!='\r' ) to++; + len = to - text + 1; + + if ( *to ) + { + // Advance the pointer for the next call + if ( *to=='\n' ) to++; + else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; + } + if ( !len ) + return to; + + if ( !*lineptr ) + { + *lineptr = malloc(len); + *n = len; + } + else if ( *nkey[0] = name[0]; + tag->key[1] = name[1]; + tag->value = malloc(len+1); + memcpy(tag->value,value_from,len+1); + tag->value[len] = 0; + return tag; +} + +static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) +{ + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; + tags = tags->next; + } + return NULL; +} + + +// Return codes: +// 0 .. 
different types or unique tags differ or conflicting tags, cannot be merged +// 1 .. all tags identical -> no need to merge, drop one +// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated +// 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line +static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) +{ + HeaderTag *t1, *t2; + + if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) + return 0; + + int itype = tag_exists(hline1->type,types); + if ( itype==-1 ) { + debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); + return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code + } + + if ( unique_tags[itype] ) + { + t1 = header_line_has_tag(hline1,unique_tags[itype][0]); + t2 = header_line_has_tag(hline2,unique_tags[itype][0]); + if ( !t1 || !t2 ) // this should never happen, the unique tags are required + return 2; + + if ( strcmp(t1->value,t2->value) ) + return 0; // the unique tags differ, cannot be merged + } + if ( !required_tags[itype] && !optional_tags[itype] ) + { + t1 = hline1->tags->data; + t2 = hline2->tags->data; + if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments + return 0; + } + + int missing=0, itag=0; + while ( required_tags[itype] && required_tags[itype][itag] ) + { + t1 = header_line_has_tag(hline1,required_tags[itype][itag]); + t2 = header_line_has_tag(hline2,required_tags[itype][itag]); + if ( !t1 && !t2 ) + return 2; // this should never happen + else if ( !t1 || !t2 ) + missing = 1; // there is some tag missing in one of the hlines + else if ( strcmp(t1->value,t2->value) ) + { + if ( unique_tags[itype] ) + return 2; // the lines have a matching unique tag but have a conflicting tag + + return 0; // the lines contain conflicting tags, cannot be merged + } + itag++; + } + itag = 0; + 
while ( optional_tags[itype] && optional_tags[itype][itag] ) + { + t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); + t2 = header_line_has_tag(hline2,optional_tags[itype][itag]); + if ( !t1 && !t2 ) + { + itag++; + continue; + } + if ( !t1 || !t2 ) + missing = 1; // there is some tag missing in one of the hlines + else if ( strcmp(t1->value,t2->value) ) + { + if ( unique_tags[itype] ) + return 2; // the lines have a matching unique tag but have a conflicting tag + + return 0; // the lines contain conflicting tags, cannot be merged + } + itag++; + } + if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged + return 1; +} + + +static HeaderLine *sam_header_line_clone(const HeaderLine *hline) +{ + list_t *tags; + HeaderLine *out = malloc(sizeof(HeaderLine)); + out->type[0] = hline->type[0]; + out->type[1] = hline->type[1]; + out->tags = NULL; + + tags = hline->tags; + while (tags) + { + HeaderTag *old = tags->data; + + HeaderTag *new = malloc(sizeof(HeaderTag)); + new->key[0] = old->key[0]; + new->key[1] = old->key[1]; + new->value = strdup(old->value); + out->tags = list_append(out->tags, new); + + tags = tags->next; + } + return out; +} + +static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) +{ + list_t *tmpl_tags; + + if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) + return 0; + + tmpl_tags = tmpl_hline->tags; + while (tmpl_tags) + { + HeaderTag *tmpl_tag = tmpl_tags->data; + HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); + if ( !out_tag ) + { + HeaderTag *tag = malloc(sizeof(HeaderTag)); + tag->key[0] = tmpl_tag->key[0]; + tag->key[1] = tmpl_tag->key[1]; + tag->value = strdup(tmpl_tag->value); + out_hline->tags = list_append(out_hline->tags,tag); + } + tmpl_tags = tmpl_tags->next; + } + return 1; +} + + +static HeaderLine *sam_header_line_parse(const char *headerLine) +{ + HeaderLine *hline; + HeaderTag 
*tag; + const char *from, *to; + from = headerLine; + + if ( *from != '@' ) { + debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); + return 0; + } + to = ++from; + + while (*to && *to!='\t') to++; + if ( to-from != 2 ) { + debug("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine); + return 0; + } + + hline = malloc(sizeof(HeaderLine)); + hline->type[0] = from[0]; + hline->type[1] = from[1]; + hline->tags = NULL; + + int itype = tag_exists(hline->type, types); // -1 when the record type is not one of types[] + + from = to; + while (*to && *to=='\t') to++; + if ( to-from != 1 ) { + debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + free(hline); return 0; // no tags are attached yet, so freeing the line struct releases everything + } + from = to; + while (*from) + { + while (*to && *to!='\t') to++; + + if ( itype==-1 || (!required_tags[itype] && !optional_tags[itype]) ) // never index the tag tables with -1; an unknown type is kept as free text and rejected later by validation + tag = new_tag(" ",from,to-1); + else + tag = new_tag(from,from+3,to-1); + + if ( header_line_has_tag(hline,tag->key) ) + debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); + hline->tags = list_append(hline->tags, tag); + + from = to; + while (*to && *to=='\t') to++; + if ( *to && to-from != 1 ) { + debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + return 0; + } + + from = to; + } + return hline; +} + + +// Must be of an existing type, all tags must be recognised and all required tags must be present +static int sam_header_line_validate(HeaderLine *hline) +{ + list_t *tags; + HeaderTag *tag; + int itype, itag; + + // Is the type correct? + itype = tag_exists(hline->type, types); + if ( itype==-1 ) + { + debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); + return 0; + } + + // Has all required tags? 
+ itag = 0; + while ( required_tags[itype] && required_tags[itype][itag] ) + { + if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) + { + debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], + hline->type[0],hline->type[1]); + return 0; + } + itag++; + } + + // Are all tags recognised? + tags = hline->tags; + while ( tags ) + { + tag = tags->data; + if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) + { + debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); + return 0; + } + tags = tags->next; + } + + return 1; +} + + +static void print_header_line(FILE *fp, HeaderLine *hline) +{ + list_t *tags = hline->tags; + HeaderTag *tag; + + fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); + while (tags) + { + tag = tags->data; + + fprintf(fp, "\t"); + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); + fprintf(fp, "%s", tag->value); + + tags = tags->next; + } + fprintf(fp,"\n"); +} + + +static void sam_header_line_free(HeaderLine *hline) +{ + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + free(tag->value); + free(tag); + tags = tags->next; + } + list_free(hline->tags); + free(hline); +} + +void sam_header_free(void *_header) +{ + HeaderDict *header = (HeaderDict*)_header; + list_t *hlines = header; + while (hlines) + { + sam_header_line_free(hlines->data); + hlines = hlines->next; + } + list_free(header); +} + +HeaderDict *sam_header_clone(const HeaderDict *dict) +{ + HeaderDict *out = NULL; + while (dict) + { + HeaderLine *hline = dict->data; + out = list_append(out, sam_header_line_clone(hline)); + dict = dict->next; + } + return out; +} + +// Returns a newly allocated string +char *sam_header_write(const void *_header) +{ + const HeaderDict *header = (const HeaderDict*)_header; + char *out = NULL; + int len=0, nout=0; + const 
list_t *hlines; + + // Calculate the length of the string to allocate + hlines = header; + while (hlines) + { + len += 4; // @XY and \n + + HeaderLine *hline = hlines->data; + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + len += strlen(tag->value) + 1; // \t + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + len += 3; // XY: (the value length is already counted on the line above) + tags = tags->next; + } + hlines = hlines->next; + } + + nout = 0; + out = malloc(len+1); + hlines = header; + while (hlines) + { + HeaderLine *hline = hlines->data; + + nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); + + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + nout += sprintf(out+nout,"\t"); + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); + nout += sprintf(out+nout,"%s", tag->value); + tags = tags->next; + } + hlines = hlines->next; + nout += sprintf(out+nout,"\n"); + } + out[nout] = 0; // terminate right after the bytes actually written + return out; +} + +void *sam_header_parse2(const char *headerText) +{ + list_t *hlines = NULL; + HeaderLine *hline; + const char *text; + char *buf=NULL; + size_t nbuf = 0; + + if ( !headerText ) + return 0; + + text = headerText; + while ( (text=nextline(&buf, &nbuf, text)) ) + { + hline = sam_header_line_parse(buf); + if ( hline && sam_header_line_validate(hline) ) + hlines = list_append(hlines, hline); + else + { + if (hline) sam_header_line_free(hline); + sam_header_free(hlines); + if ( buf ) free(buf); + return NULL; + } + } + if ( buf ) free(buf); + + return hlines; +} + +void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) +{ + const HeaderDict *dict = (const HeaderDict*)_dict; + const list_t *l = dict; + khash_t(str) *tbl = kh_init(str); + khiter_t k; + int ret; + + if (_dict == 0) return tbl; // return an empty (not null) hash table + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = 
l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key || !value ) + { + l = l->next; + continue; + } + + k = kh_get(str, tbl, key->value); + if ( k != kh_end(tbl) ) + debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); + k = kh_put(str, tbl, key->value, &ret); + kh_value(tbl, k) = value->value; + + l = l->next; + } + return tbl; +} + +char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) +{ + const HeaderDict *dict = (const HeaderDict*)_dict; + const list_t *l = dict; + int max, n; + char **ret; + + ret = 0; *_n = max = n = 0; + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key; + key = header_line_has_tag(hline,key_tag); + if ( !key ) + { + l = l->next; + continue; + } + + if (n == max) { + max = max? max<<1 : 4; + ret = realloc(ret, max * sizeof(void*)); + } + ret[n++] = key->value; + + l = l->next; + } + *_n = n; + return ret; +} + +const char *sam_tbl_get(void *h, const char *key) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + khint_t k; + k = kh_get(str, tbl, key); + return k == kh_end(tbl)? 0 : kh_val(tbl, k); +} + +int sam_tbl_size(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + return h? 
kh_size(tbl) : 0; +} + +void sam_tbl_destroy(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + kh_destroy(str, tbl); +} + +void *sam_header_merge(int n, const void **_dicts) +{ + const HeaderDict **dicts = (const HeaderDict**)_dicts; + HeaderDict *out_dict; + int idict, status; + + if ( n<2 ) return NULL; + + out_dict = sam_header_clone(dicts[0]); + + for (idict=1; idictdata, out_hlines->data); + if ( status==0 ) + { + out_hlines = out_hlines->next; + continue; + } + + if ( status==2 ) + { + print_header_line(stderr,tmpl_hlines->data); + print_header_line(stderr,out_hlines->data); + debug("Conflicting lines, cannot merge the headers.\n"); + return 0; + } + if ( status==3 ) + sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); + + inserted = 1; + break; + } + if ( !inserted ) + out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); + + tmpl_hlines = tmpl_hlines->next; + } + } + + return out_dict; +} + + diff --git a/samtools/sam_header.h b/samtools/sam_header.h new file mode 100644 index 0000000..e5c754f --- /dev/null +++ b/samtools/sam_header.h @@ -0,0 +1,24 @@ +#ifndef __SAM_HEADER_H__ +#define __SAM_HEADER_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + void *sam_header_parse2(const char *headerText); + void *sam_header_merge(int n, const void **dicts); + void sam_header_free(void *header); + char *sam_header_write(const void *headerDict); // returns a newly allocated string + + char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); + + void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); + const char *sam_tbl_get(void *h, const char *key); + int sam_tbl_size(void *h); + void sam_tbl_destroy(void *h); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/samtools/sam_view.c b/samtools/sam_view.c new file mode 100644 index 0000000..06dd01a --- /dev/null +++ b/samtools/sam_view.c @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include 
"sam_header.h" +#include "sam.h" +#include "faidx.h" + +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; +static char *g_library, *g_rg; +static int g_sol2sanger_tbl[128]; + +static void sol2sanger(bam1_t *b) +{ + int l; + uint8_t *qual = bam1_qual(b); + if (g_sol2sanger_tbl[30] == 0) { + for (l = 0; l != 128; ++l) { + g_sol2sanger_tbl[l] = (int)(10.0 * log(1.0 + pow(10.0, (l - 64 + 33) / 10.0)) / log(10.0) + .499); + if (g_sol2sanger_tbl[l] >= 93) g_sol2sanger_tbl[l] = 93; + } + } + for (l = 0; l < b->core.l_qseq; ++l) { + int q = qual[l]; + if (q > 127) q = 127; + qual[l] = g_sol2sanger_tbl[q]; + } +} + +static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) +{ + if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) + return 1; + if (g_rg) { + uint8_t *s = bam_aux_get(b, "RG"); + return (s && strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1; // with -r, a read whose RG tag is missing or different must be skipped, not kept + } + if (g_library) { + const char *p = bam_get_library((bam_header_t*)h, b); + return (p && strcmp(p, g_library) == 0)? 
0 : 1; + } + return 0; +} + +// callback function for bam_fetch() +static int view_func(const bam1_t *b, void *data) +{ + if (!__g_skip_aln(((samfile_t*)data)->header, b)) + samwrite((samfile_t*)data, b); + return 0; +} + +static int usage(int is_long_help); + +int main_samview(int argc, char *argv[]) +{ + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0; + int of_type = BAM_OFDEC, is_long_help = 0; + samfile_t *in = 0, *out = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; + + /* parse command-line options */ + strcpy(in_mode, "r"); strcpy(out_mode, "w"); + while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:C")) >= 0) { + switch (c) { + case 'C': slx2sngr = 1; break; + case 'S': is_bamin = 0; break; + case 'b': is_bamout = 1; break; + case 't': fn_list = strdup(optarg); is_bamin = 0; break; + case 'h': is_header = 1; break; + case 'H': is_header_only = 1; break; + case 'o': fn_out = strdup(optarg); break; + case 'f': g_flag_on = strtol(optarg, 0, 0); break; + case 'F': g_flag_off = strtol(optarg, 0, 0); break; + case 'q': g_min_mapQ = atoi(optarg); break; + case 'u': is_uncompressed = 1; break; + case 'l': g_library = strdup(optarg); break; + case 'r': g_rg = strdup(optarg); break; + case 'x': of_type = BAM_OFHEX; break; + case 'X': of_type = BAM_OFSTR; break; + case '?': is_long_help = 1; break; + case 'T': fn_ref = strdup(optarg); is_bamin = 0; break; + default: return usage(is_long_help); + } + } + if (is_uncompressed) is_bamout = 1; + if (is_header_only) is_header = 1; + if (is_bamout) strcat(out_mode, "b"); + else { + if (of_type == BAM_OFHEX) strcat(out_mode, "x"); + else if (of_type == BAM_OFSTR) strcat(out_mode, "X"); + } + if (is_bamin) strcat(in_mode, "b"); + if (is_header) strcat(out_mode, "h"); + if (is_uncompressed) strcat(out_mode, "u"); + if (argc == optind) return usage(is_long_help); + + // generate the fn_list if necessary + if (fn_list == 0 && 
fn_ref) fn_list = samfaipath(fn_ref); + // open file handlers + if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { + fprintf(stderr, "[main_samview] fail to open file for reading.\n"); + goto view_end; + } + if (in->header == 0) { + fprintf(stderr, "[main_samview] fail to read the header.\n"); + goto view_end; + } + if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { + fprintf(stderr, "[main_samview] fail to open file for writing.\n"); + goto view_end; + } + if (is_header_only) goto view_end; // no need to print alignments + + if (argc == optind + 1) { // convert/print the entire file + bam1_t *b = bam_init1(); + int r; + while ((r = samread(in, b)) >= 0) { // read one alignment from `in' + if (!__g_skip_aln(in->header, b)) { + if (slx2sngr) sol2sanger(b); + samwrite(out, b); // write the alignment to `out' + } + } + if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n"); + bam_destroy1(b); + } else { // retrieve alignments in specified regions + int i; + bam_index_t *idx = 0; + if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index + if (idx == 0) { // index is unavailable + fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n"); + ret = 1; + goto view_end; + } + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' + if (tid < 0) { // reference name is not found + fprintf(stderr, "[main_samview] fail to get the reference name. 
Continue anyway.\n"); + continue; + } + bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments + } + bam_index_destroy(idx); // destroy the BAM index + } + +view_end: + // close files, free and return + free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); + samclose(in); + samclose(out); + return ret; +} + +static int usage(int is_long_help) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools view [options] | [region1 [...]]\n\n"); + fprintf(stderr, "Options: -b output BAM\n"); + fprintf(stderr, " -h print header for the SAM output\n"); + fprintf(stderr, " -H print header only (no alignments)\n"); + fprintf(stderr, " -S input is SAM\n"); + fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); + fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); + fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); + fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); + fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); + fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); + fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); + fprintf(stderr, " -q INT minimum mapping quality [0]\n"); + fprintf(stderr, " -l STR only output reads in library STR [null]\n"); + fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); + fprintf(stderr, " -? longer help\n"); + fprintf(stderr, "\n"); + if (is_long_help) + fprintf(stderr, "Notes:\n\ +\n\ + 1. By default, this command assumes the file on the command line is in\n\ + the BAM format and it prints the alignments in SAM. If `-t' is\n\ + applied, the input file is assumed to be in the SAM format. The\n\ + file supplied with `-t' is SPACE/TAB delimited with the first two\n\ + fields of each line consisting of the reference name and the\n\ + corresponding sequence length. 
The `.fai' file generated by `faidx'\n\ + can be used here. This file may be empty if reads are unaligned.\n\ +\n\ + 2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\ +\n\ + 3. BAM->SAM conversion: `samtools view in.bam'.\n\ +\n\ + 4. A region should be presented in one of the following formats:\n\ + `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\ + specified, the input alignment file must be an indexed BAM file.\n\ +\n\ + 5. Option `-u' is preferred over `-b' when the output is piped to\n\ + another samtools command.\n\ +\n\ + 6. In a string FLAG, each character represents one bit with\n\ + p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\n\ + U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\n\ + 1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \n\ + f=0x200 (failure) and d=0x400 (duplicate). Note that `-x' and\n\ + `-X' are samtools-C specific. Picard and older samtools do not\n\ + support HEX or string flags.\n\ +\n"); + return 1; +} + +int main_import(int argc, char *argv[]) +{ + int argc2, ret; + char **argv2; + if (argc != 4) { + fprintf(stderr, "Usage: bamtk import \n"); + return 1; + } + argc2 = 6; + argv2 = calloc(6, sizeof(char*)); + argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; + ret = main_samview(argc2, argv2); + free(argv2); + return ret; +} diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..652736c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[bdist_rpm] +doc_files = README doc/*.html ChangeLog +vendor = TDB +packager = TDB +distribution-name = Red Hat Linux +requires = python diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..098cb7f --- /dev/null +++ b/setup.py @@ -0,0 +1,79 @@ +#!/usr/bin/python +''' + +pysam +***** + +''' + +import os, sys, glob, shutil + +name = "pysam" +version = "0.2" + +samtools_exclude = ( "bamtk.c", "razip.c", "bgzip.c" ) +samtools_dest = os.path.abspath( 
"samtools" ) + +# copy samtools source +if len(sys.argv) >= 2 and sys.argv[1] == "import": + if len(sys.argv) < 3: raise ValueError("missing PATH to samtools source directory") + samtools_src = os.path.abspath( sys.argv[2] ) + if not os.path.exists( samtools_src ): raise IOError( "samtools src dir `%s` does not exist." % samtools_src ) + + cfiles = glob.glob( os.path.join( samtools_src, "*.c" ) ) + hfiles = glob.glob( os.path.join( samtools_src, "*.h" ) ) + ncopied = 0 + for p in cfiles + hfiles: + f = os.path.basename(p) + if f in samtools_exclude: continue + if os.path.exists( os.path.join( samtools_dest, f )): continue + shutil.copy( p, samtools_dest ) + ncopied += 1 + print "installed latest source code from %s: %i files copied" % (samtools_src, ncopied) + sys.exit(0) + +from distutils.core import setup, Extension +from Pyrex.Distutils import build_ext + +classifiers = """ +Development Status :: 2 - Alpha +Operating System :: MacOS :: MacOS X +Operating System :: Microsoft :: Windows :: Windows NT/2000 +Operating System :: OS Independent +Operating System :: POSIX +Operating System :: POSIX :: Linux +Operating System :: Unix +Programming Language :: Python +Topic :: Scientific/Engineering +Topic :: Scientific/Engineering :: Bioinformatics +""" + +pysam = Extension( + "pysam/csamtools", # name of extension + [ "pysam/csamtools.pyx" ] +\ + [ "pysam/%s" % x for x in ( + "pysam_util.c", )] +\ + glob.glob( os.path.join( "samtools", "*.c" ) ), + library_dirs=[], + include_dirs=[ "samtools", ], + libraries=[ "z", ], + language="c", + ) + +metadata = { + 'name': name, + 'version': version, + 'description': "pysam", + 'long_description': __doc__, + 'author': "Andreas Heger", + 'author_email': "andreas.heger@gmail.com", + 'license': "MIT", + 'platforms': "ALL", + 'url': "http://code.google.com/p/pysam/", + 'py_modules': [ + "pysam/__init__", "pysam/Pileup", "pysam/namedtuple" ], + 'ext_modules': [pysam,], + 'cmdclass' : {'build_ext': build_ext} } + +if 
__name__=='__main__': + dist = setup(**metadata) diff --git a/tests/00README.txt b/tests/00README.txt new file mode 100644 index 0000000..67b8689 --- /dev/null +++ b/tests/00README.txt @@ -0,0 +1,32 @@ +File ex1.fa contains two sequences cut from the human genome +build36. They were extracted with command: + + samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 + +Sequence names were changed manually for simplicity. File ex1.sam.gz +contains MAQ alignments extracted with: + + (samtools view NA18507_maq.bam 2:2044001-2045500; + samtools view NA18507_maq.bam 20:68001-69500) + +and processed with `samtools fixmate' to make it self-consistent as a +standalone alignment. + +To try samtools, you may run the following commands: + + samtools faidx ex1.fa # index the reference FASTA + samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM + samtools index ex1.bam # index BAM + samtools tview ex1.bam ex1.fa # view alignment + samtools pileup -cf ex1.fa ex1.bam # pileup and consensus + samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz + +In order for the script pysam_test.py to work, you will need pysam +in your PYTHONPATH. + +In order for the script example.py to work, you will need pysam +in your PYTHONPATH and run + + make all + +beforehand. 
diff --git a/tests/Makefile b/tests/Makefile new file mode 100644 index 0000000..5403750 --- /dev/null +++ b/tests/Makefile @@ -0,0 +1,32 @@ +all: ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz \ + ex2.sam.gz ex2.sam ex1.sam \ + ex2.bam \ + ex3.bam ex3.bam.bai \ + ex4.bam ex4.bam.bai \ + ex5.bam ex5.bam.bai \ + ex6.bam + +ex2.sam.gz: ex1.bam ex1.bam.bai + samtools view -h ex1.bam | gzip > ex2.sam.gz + +%.bam: %.sam ex1.fa.fai + samtools import ex1.fa.fai $< $@ + +%.sam: %.sam.gz + gunzip < $< > $@ + +ex1.fa.fai:ex1.fa + samtools faidx ex1.fa +ex1.bam:ex1.sam.gz ex1.fa.fai + samtools import ex1.fa.fai ex1.sam.gz ex1.bam +%.bam.bai:%.bam + samtools index $< +ex1.pileup.gz:ex1.bam ex1.fa + samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz +ex1.glf:ex1.bam ex1.fa + samtools pileup -gf ex1.fa ex1.bam > ex1.glf +ex1.glfview.gz:ex1.glf + samtools glfview ex1.glf | gzip > ex1.glfview.gz + +clean: + rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM pysam_*.sam ex2.sam ex2.sam.gz ex1.sam diff --git a/tests/ex1.fa b/tests/ex1.fa new file mode 100644 index 0000000..b4ed0cf --- /dev/null +++ b/tests/ex1.fa @@ -0,0 +1,56 @@ +>chr1 +CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT +GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC +GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG +TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC +AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA +CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC +AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT +CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA +ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC +AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC +AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC +ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC +CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT 
+TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT +TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT +GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT +ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA +ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG +TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA +CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG +TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC +TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC +TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG +TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG +AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA +TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC +TCCCTCGTCTTCTTA +>chr2 +TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG +CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT +TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT +CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA +AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT +AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC +ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG +GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT +CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT +TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA +AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA +ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT +TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA +AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC +TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA +GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT +AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA +AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT 
+AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT +AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT +ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT +GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG +CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA +GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA +AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA +TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC +CAGAAAAAAATATTTACAGTAACT diff --git a/tests/ex1.sam.gz b/tests/ex1.sam.gz new file mode 100644 index 0000000000000000000000000000000000000000..8dd2bc447cb504be23c29aa54d1a7b8ccfb8fa73 GIT binary patch literal 113194 zcmV(vK{E?NUCng zig)#1<5q;ja=RMt*iHNYH;j=0$prWTaMpQq?B~55lj%?3`Z-y&jTI&WD74@~}$hBtrD-G`&yZz`Wz9clRJ zY~(K4C|^czEHk#=muO?mYNOqMJF}2${7INC1O%tH`ToWE-5?l~AI8zzOBSN@ss1g! z3-O*#4#v5m*L=srVtU^37M{d3MVQf4cYLw)4}dtdX0)SGtxzgIp}L6Fc~3HQgAlzp za6j})uAW42kLQ09UDp&*?ie(FoQFFM(pu+ib}~hA&-p9EUml%zdj6N}3`40`l+q&< zqHRhL{XaeP%UJk+TyC%#WnZ=K&YRPCSEtNi6<>9`9$#VMmqk-V`LmC6z4XSXewr@d zAJ+bxPLcIry#uiR|CIrCeA)fRb-F3zJiX}rM*?&Pw7fn4?N7U3a}U#-HLP?1DLcOE zz2BRzDMIw_kOPVq^fv{tDdncO!I{gqTs-Z5TY6I(e~4nJ5P`BoBwNvl)_ai>%%5Gq znC*bawr7j@2Aj0=M%63`l>O|x>8DEHZFaGR{VB~TllyHK!0NvJzt_xuF@PwVDSDeR z`S%a=R~x=xb>BQ}!DZ~vhd(2|!ueg1num}UKL1@vvF)xc>+j)}+`Ye#0|KhiYODe=htzf7}9 zDZ+36`CV|14Cbh#bw`i*&!1`k7U=>`wR4n3_wUXhB8{*g!C1#~I)9>9oIZRM{vs*B z{Q6yS@^~S9AHJ6C;fui3bgX_H;N6f`6>VtXa_y3yoxU-sr^uew1P=#gbyiS%L zX{Yx~hn6IeSo_8X6WVSz=NJzE$Eba((7IM;K(#4HIbCc0iMPV%Gl+5pgPE+n57QhR zQwgPapX-wMnLo!#^2ukm@~SHycrs`2&FS5f&*NRs;xXNGoPwg;GR)uIJJHgyAL!2U zKU|J(y*Z`#^TMg2_e*b;V;fX+*W}&4Xl+x3c~~BC_i!IMe>?V9T~jcq89E#zf%}{L zCd2AD(%)4$^M@$E0?MDw?LE$JN{`Re;6|ZG>9UoV$w~52_Vy&Hf9oDMy!ZIVhvZJS z>(U#3k|dU1H8aYuJzJs}^}AX6;5nx?e0&{-{Tn_yQDJt0+75}glY{KBSGjXC{4n)! 
zPNq3q^Z=AhIxGh&@uj1}Kf7->>G@%bp@>hrTc6hUD2?$;-t_qvMNPJZ@)u{`p)er) zx7TshYOaR1?MpW)JKVFpyDwMpMC)%QF=&5tb{?#4c}i<|>fN_78Y$t-&eAZ4bU5>I zBaC@&Q}m%e8Zq#p5=Y-hcZif$pteizc~o4^G2$P1i`xBTKPf$FWykSv9yS%jJHJCs zH(2aUALk3yLymyY*9U*e=yvE8i_~4u98It3j`;?TGmKwp95imxh99#) z$kYTvM*3h~sX;j(tE)ai>YdGOPeWl)1=42ZklZw`_ckC7<0SJFT)??(PpSv{{b_kh z;aKHJ`gB4Fq&@{}vgjgt8D0`~dZ50vJ!#ny1WDoDp+}lzya>+Nx*OWAMUX%H6<(1b z_c`YN{+#V851$}V=op;#>6#T=tkINEzTu(F8rdlUci z54}ZP+@ZX`0$=^LjU?g#sEvX81)zJb{yF@E%k?A5&Uim;@-T|e~FdgrCNK{-%7?%HcWX-|fYIjnmCPf?dwp1%S3^WGu#@!quc zX^!24>w}ax1aUQRT~<7D>#?6dm&@DX4ambZs3%yDn8hT5cdsb_8{wnCGSJ_hSD?7^ zH~KoZngyVmzi~A8 zjU6vu`NR>IdD%!`O=$iZmo^~$#{Vqw2XCJg8a;3_C=B)yrt@dSB?maUyakml%P4o# zZ4(2_;2sBa*@e>!m8QlPsjLs54uSTP>X;XRj_lwr&IX0`!Ca0q!V#t_n>7wQkK^Qb zx6=f?46{%Ynkd%1dUwxt0qfHrAFVsXJ)%yxC&Kp{n4gEAWy4;DDLNa9mdMO~9Nd|% zV?b>dUE*!B1arrpbLYb_TXSR2^EylHYVz<j~w!$o+EEEBQiSu%0xl|p?l|HgS`M38TighBOjJiNoJ~77?RDtviYFK)G zHLHjlw4--(gN6jhO)4|DcHNA#3Z0QrkrNBMK9QEc6zy-$9;bK-#-huI6jw3lIy%NKhFv*12sjmw|+Ru8h% zaP$ZER6dbFAY27>O~u)Rk2wWe4nw5-cjx@l`(28M-Y&@`glUXzR2w#^q10lF^DD)P zuixnF={z_+-dmrLnZJ{1$2b266p8qZXz+QFKfizSkVK|k9=kmWlvYZ5{*gZYb?E}JXsCQGE8m&!hIZo$ugox9JO>f@gHf5M! zP5<$%<@?R2s^+Oz>Z$Gfr&jgqW=yMlt+mPDrfXsngUcQd=o)S{aZ>CW#TzS1OEoyR z!H9RC^v&;p&rdVvPt%oMqG;}qeO=12>)zuuY6+hFx(h=%zgL`JdpK|U=8RFdY1C#? 
z>h`AS!N)MSzh99R2Nl`4@YFD6JXW8nJH>!XcHjHx?+vx&vOPoIJKn>YANMo$%%D}> ztO}3lRv{=s<%^d(Wm4)?)HqXv^h1xK7l}RwKfj0H;ve{Zx@i!u4DXRu`t8~Kvm{wK z${U##^#@ZWsQ@#%?h}7cuJz`>y?1AHYw{=M6MUOSv|*^ z8ML-RQV+p>c1rTXF$ap**ias+0J`~D_7zi7xmzmdAt8M7calF$+Th6u(;iDv&O`OP zJOohqOtLzj0{UbF?>^JY7wQzM*4H1axWjd53cz1sAm!hdy#KI_BOZ;D`5ZJUUU z@gj(aR?CBV#h(RQG?dGxVke$HU`-L}|$@bQ~JJ8w>eVUp}rWs6_V;xWuF5YTo0@V9WLYF++5 zc@LCx*m(}~bk@qmSR|h$DeuxaHR)r@!5T9t%Mv4x)6OqgmI_UM6;5~{ON-7ternPb zXIH)@<7x6Cl5vC63?vIXjSJCS0U$1e=C5;3k59w+HjUJ^_qlx82Px}~iMfmp(|~-@ zd3cOw8z-u8=Kg(N@ZeBe+^Y|z?KFFgpi?RU!i(;C|mn8WVT5 z5r8Dm`1DClsF&I+L*M$O@+)zQk37x-bXV$eAaaMLn=3K(=yPn?}cb3yui=d+xrZY z(q|>_ezWBj(Q+U?`zya+ou$*ocL59gG%BL9Zz=#clX;P5t}ca$di{ire0A<^eXq0L zV>y8t=k*2%KgXa?IeG}MvFp+BJU$>(!aQj{Tv9=}gW2KEUk`&DRLfwR4#|9H|5yKi zbb_#IMehxJPX9LyZq0B{`xVm-yLi^!^4P|);&ss*)h|6dANVDJyQ`gdtn4Y_qj>6v z+w?2jtGLd7zM9)zPZ~8)f8lrj5PM5LJ0A#7AJ2YUh&O*5JbxJN_9Lr!8~BZpf{rhE zh^$%^0eqCh&zS#DCiegf8-m?;lus9|yB@OL58j-{o>$E3ZHycM^;{3qr~SwU62KPs zY}Wy>3tK`)&rm?S_S@0?+ZY=T?(|0=0!4E(z8_q+CtG5VHfQk&mi36CNcDzp+e3iZ z2lU5&_o_+bNTE6GPW_gf`z(XHKXR0Agp5gGy%wxmsRY0)@J3t$MVLR$ed?ySQM*vq z8|cNpfO9t}XtSUIl}L&7-)~#p^616-E!)k}i)lmjQUv;Q{5U>_9D6i6U;JG!n;@mh zO&`5lk5*&EG)KC`grE;|bx61EL}OAg@0Xz9;x7IE$E{lNH!{m>zP$>y=_1ZAe(Lq?n*3K04!!@pUT?7u*_HNp znO}8{44XYUCO|Q@AWRidZoTz1*^vfeC#9w#ulK|}9Zy`)ZbX!e!1#3nmoEL`?@`(7b#3nF%?w!H}LmB0KmQD)**B{0)0c8BOB+*ISEB-#}bn2WiO;?7Q zpLzBnf(HDdySnP3prY{DX#DM|I(3Ls&U_~qa3-J76E}d0F5jJ1yey8MPmoCa)L*nm zKryG^n#X`*z%!03y$;>@yva_>qd!}oFuMm6oMCUZ3vY}iOzwE2?z4!L42pQk5vQNi z{N2oF9c~Gl{pp8S`d1O>Z}X4W`^$a(%s;=pJo}$tUhYpXFIhgCb`$RKnPGUgrx+Bh z2R%>j9PfOyx(N?F1&lw$A+iq8;5+pdDhllEJ2ggKZY$d17+upP4it=|)KXf5lzJcT zZnQGca!59-1cdI{y?L2V#9@bRbfrhG#bs>jyo(e(G* z^(Qk*l)-G}f3>LhyCbC9P=HsDbmN%j^AEm`2Pr30Zp>xz!H3CCy$o{>Wu!g>9fpfl z>8fjq`pW#F3e-XC@Rj7`-xS=pC`y-L9zR1yYT-wCS$KrkVa&Tz{)=EU1hQo@NssOm zshTp_B0;5_HeDT6BQAy#CeeZ`kZ;o@B$;WBN+6dPDSX~JJU#i4$4u7dB!T6+>3oHP zbXzE#DaEXr;0zOObvREymygpf4f1-H_(&?Ka^J#R)Ct0`NWEuijBWdp{~w&CaP0eV 
zePTR=G3iUr_>P_wYSJv?JTT}8@MchE^ti5*9OVSV%%1#tM|SH&)EP!(Nl(v?`z9px z>pnvFd(a!9j(=DR@S(zPu;@n~kxWP?hInM=m}-4L*?xiU7xDD_TrR8^jwk(bUP zp5zmPpyzsn6D`98igU~ymLcc_x3xqpes_r z&%Do;-ojwpIn2=SJS>w}2d_|znV|vknzHa_X>HK6xkUuyt;N}Ta4De~b z-lxevejlXd0ZB4_MWYxcq+j1js=PLk)0fzAI8O4q-ho5;PN2)Iw>eDy;!OPfrrDtU z&QVfJB*d%wA<#Ofyfe5w{7bKXFeTFNZZ+U*!AdCCBX2^BnWhxhnn_?R@bw{r(3!(cc+>nPP&+e(O#+Im5U^Mq>CG*I*{K9P><*uq~UFGh=*NQ zVJ#QM!)@=6d@Ba8=$BFueqIaJ6xJ6SdOW0g}5RZO%kZE^noXdNhW z7?Wj+l-!ei3@LqLP-US_ln*mjk@###z?VP|vFRei)5l)JB2IsYYV)o!C^yiko!?vI z(4)#jl~D==@;=iFo?R+EdDqkATc|pdvd?xjYj2BIGxl+Hf_0B7p4rhvBlHQ1K))L|5;Z3c zW(0E%EVbWhfUKYcj;G$}@DSEpZqY8imGJgmgEJQIkS$pi6J`rm!Dl5L{@7lomYd7> z-aYzLk7g%*1l0eJhD)ggbkB>I8G4_2~Q>+6y7;Yb(%Su$jd&?o`$Xa zDxByN8kdsY&II%IHto6WgkEyI)OLw)O5!_1qfTX*yH$JInt~3#>8A~+WW^gpDC4;n zw)A)%+n*e!3=Izt!eq9DiSy*M__(m)wq6&VU-1r^k_G+4IYfG}nJH@i`->|%*n<$Mf#5^H0BHcoAI(_<;-j?q+fcU&GkKNZj*g-!J9D9T!OiH`E3IxTCbHx zpweC1o$HtjCL4To=P)tqQl!hMun(yMKhZJL?-?m)rMa8r z@AvWE_TT2g`|@ir}px7v`6G=_i%!-<_(pxDxeOr{6}g|F&(F!70-!bs70fj5iR#N zwWwOBuzFDY`TvZHwWk#Na09qtWapGqs?)h-Qu#=26b;rSB+iV6DxT{@h*yKR8EK`@ zKUe+fp0?Av3;5%r!An?I1+|%2R$K+NM%yV@3s_;YniJCb(B@XL0h%{A<6Q8D`U7Hs zw_TuRy5k)!q8zGR;`^ZPRDkOD2|-y-5!(gkp)(b-It+^X7_Zku;`Dj+1ZgTOYOJ~6 z>4YM=fs-;mpputh##j`Oo4E*{S~A##0gS+;?K^|>jQ=G9a8&c!8N<+vDQ?x zFXQ}bmg{}QJ+xwG;2$8*qQNz%JoXvZdI(07~5()JpGyEi`CuhDeq(_ zw@JvJ)XM6cxQcYE=Sbvi)rmZ_iEZ`sy6?3TJ<{B^5ysm^8m>(ts*oDvjL+H~=0D?e z*D%jhj^mDOQH3zxj@HT=-I5T^j?04v*5(0l`T1m{?}qr!%N{Vmcq+0 z{dxK@DfA2O8RD@x)e`18`YG_iuUe#ISX zKVBh(0I1^*@o084^*dv~Vc^>bdZ5$-)wJagONJ*Qn56D#lKRx3-+RyVYrL#c(>0rl zPWjn;o~N~fT8Mlkm+VrTMx7MBQ>#sMo-7h&7*>RcB4O@K2D>`a<`2Jab z?ZW(NBKPr*8(lv8x%{cbsLmu$j(}2BVw5w0Niqg+BOTHaPG@E z!_4=!Ih#s#7Vq!T-{gq)Z}m?cC<^;?=XD7uJ3!+=tpp*Ewo{mGkkk96tA0_Ha6YR? 
zOOE6(b(YDA^d!|wPss(4>1eVgxvEGvO%LAVveZ(9*0`GIg2pAkIAs(&(_#$6tuhQ zLF?}1HCxfeA&Vl)I+nj{Phj&gO68bxq+#VnTaC0KhW`1W=VL459?o*^IWy8(AA{;% z11&pAhCPLV{n2#^(BA`-{>H#*Rk-5!6LF@|$@h759zB(^rd>^1cB=SY-}Ye~th@)} zMD?b_1ej&P(P`}$9S@`uLlpjFjl zC^YzN)ZpW+6w)Qv=uGpOqx{Xc>JCmsrx56&0dR~AoL-}OGO{a}y)L}t^8pSEFI-bH z=3-CY^>9d$cCCVsEChVsGVLbeIC~ScEo)$IWqFqFhl2~Mign)Ny6^PhpeVv==To4N zc@?JKp7BGHe0B;~l#eAxOFv$3xu3a6{eka=&zv3advSSb+`FhU&_f!Uxmt$0!htA)x1A3iN(;f1gsiOUQPwU9;nG_>dG-6kW6rTZ=2BUxs!dAl=D$jP@S^Vx!#Q4#lnK8>y5>OM{+_1G3QID;pej-1qvcPd{>I|_Fz zP$qB(#}w(@H#mP&oL^LkZl1^CuHJ6M+Bx_yc+oLFE44$I3bmMWnDYB6Q@GnJJ;!Pp zx^M@l&CpxERg-rdM5sO?Et(dk=WOYBVyVlI;nY|F)uYxM7rg$0`ze>@Car49IQMvy zlAlMnAt|V;lx7llvq3R1vZSQA?MZl`0;cQdI+;ebgsNmVyoc_Zvx0&e7e2i-46?(; z&oVv5DQV#Ki6O59^7rRN9r$y!eLrr`R29!luEw@_eZ?7R$m`GYz~?}}8Z!dU>!)>u zPNJtEBAvfqCQjN>7-C3y-y4YrdLu7Dzm9-@H6U*s@y77A{)LHLun! zngG?Yx>K~MH>ty$=pq*9FH$*;!vC$uxmoj(G#}k=3ha2$W?O<%)p8A1u#ynlWC`TK z;Kh92XA!wezA0v+I77WRzQ6T*!*KGV2|9``iclq#hixVLhW9%U={1i@`$r_3)>Y@8 zUQsOqp1yz*IG6smAv}akB}Ln+Dile_bw*QI+OvBUf_Ln~Zv)qS5hupiS{hSb{romO zd6Z|A%c87Do>TllB!x&c%0O}C^)%=Ye?aj!rUdxK^w22FoY;dVttD|3wCad%g}PEp z``@%xIEnb)8ihGp>;tu_>t2M;!4c|uOwldOL3la$RbTG7Vd0rt`DqWpU z&o4#d;6;(Xfbl2Duf+p=MBQ_FKqw&H zK-p+TcpSG?D)ErC3A(>~HE1qnpt|Jc1Dn)G#ATCygR(13dCo(g|6SV@4T2Jr$eoAE z{V4BjupL!DYj947Od}RShw^`2&V%MBFQrhY!RrTWwoQ=|SzB9$bmkJ8qO=WqY$u0C z!NLD%{0*NG92b;oV=0-WA;G+@dv@qWn_Wvd^d*R;2d^M%difn`s|~qsP>wU6dP_m( z_t!bl+xYbM@%+}^NFzp2H=~%k=`XQP4A$n;=d(Iw*1)>&YY*Plg*`eu-+JK^G&cA(uwr5qdvkd1VTq1}I2X&-MpaN~SakeKP zQH#) z({=MEm2%G5=%*jjfTpw!GHVsWV5X1R{;8zttR4(x3{s?dK`((PJqVjK8Z058^-7}i`-QTH7ySwYrU#ol;b!>Rz*wcLQum_>S_;JGX-fG z$a&AGqfyIUGdH!~-=wS1Ht{XI;tcXo*bND2v@3sK(EBLu?15SqGqd%eD>onvepQ-p; zx1$5T%UXWVhV=Y9`teG`4X-B#J1)#}?W8SpiRP50`9q7!YG1Ahy z+N6OmMc>DWa^J=9LxE!AhAfg@-b`)%6} zKD8f+$Z2}n1^S>6G=s``xgiVvhES-+w9WW225G{liRKnH-vhf~tILomLy`WkYq`JT zq)xiiBoS1F^z>t_>$6F)x(U!!Q+e61J&9IFixLBB2kDjKR;Bn6p`(gblU+~n$lSO0 zRB8~|sg@ohxJi=?+QciMe5?vBt^ShDy+EbHp3m_i5rlAx`Tw3hwM1#)-a76aDP7?T>NnuG(`) 
z32@z`{JGKT@Frh|8KTm$qYe=?I7X6!Wc=cC_du9Y_Kpd2b%r>vS(4+Su5a?aq_nk+ zlY~r$IP$;$m~%CH-i2uy%$Q}+Puu#=Cedhy_}DbI8Ls=_z`jt#Ghk`-^)m+qW)C6ux(h`zb}(|^M9splTTnyD^Y5vv^W#jgS??7euS2T z<*y38ppMFdIk5#BwZ3?~gS-A|^jG@FD5+`fN#0tW@k#YnJ|`FNUSkb#N?9HjLDsVj z^tUm3{4*lQ@VkH%p3InUgv_Xw+~7zvt@N9~v7S0kfuyAh;-dwKU+mS3A@f)Vn)brj ztZ{j=QNzMYQ6^#`i}H$Kfrek6H5E2C2F694zi0;P!}iSOiH^=v)qJ!M-Unp|e{;7y z!t@^2^b1T%iSX)GLK#X6r@HMIqoHLy8l9BJM;TVKw@%G^F?1gog3BPwig|t$s9Z^> z?$ImP%3K$or5Di6Bqv%%N>X3zSi*mUMVHc7s^hx?wb`u{k5U1r2kORcm(Xwmv`|Ui z({x-=a!MiDM#+LA#?X5;%>7p@M00|&ifbDU$>8BsE!f~i0(1$5sP+{Od<9S+?utNP z9A6PjlXBJi%}lm?J@TCVNCYAjsw@?7{xrPidvMb~vO(#^$$te`6&82BJ57>Uv^3aN z#`)9C6>~GqJwqY=wHgv2K1SBp`C@nEJef(wW|N9Kq{|MqyYsx8e)O_QcZD4pRF+&w z;GKX*WSjdXxLBXG%K zw)n~2B%@wt5$JEDTkS@?k{vVKKGH!iED^K~1`d^%lnrYW)DXxGU4kW?=j~!+d^EaT z`=BnYo~|d9KLmSeHhD1-#Ah`YMZeuR9M2zaELx75ceKV=oGy<_g4TG!p5GuqddIn$ zf$sDNUnSGlx+n<)r z$Z3eryBeI$(mD`6>rXgV>cf3ppNyXQmMsSN2gyWyXOJ=k&6>)GmrhVr{24Sg4)t$+ z+oG6ym04zH5HtPYKpfagHWZyKc_=yvQ@Q5PYWg^!jnsO(Gb=76yvKn7#d{^ChCYd# zY!WxAK*L$aNqx3qUvfHx-3RN76Ni>`AQU7$1smt$K&j-=>MERDD+ZLqK>AOQ1S6r^ zB#3ESID=|hV(IeR&U_MN5Z+VJHO*_ z+#yDKFJAZJJ*;w@#iILDrgOWU-c6K($`dFsG-Sb zR^7F0oCIy$Kw?}(IX}Ko($NKiqh-10R-U|Vx%`^Br-HruPscO)z}6sYur_#N9rZxD zu~z(^Us|KGQfN)*7HMM|fk6e8Tu2RzyE&QJKrlT>Ei5zdYnMXuru79Hljc^~hSR8CzzNHdv;0FMM(gcS2koSQ`x!<7%~ ziZD~EqYb5qqX*)vbOPbPOK|H;G=OWSO_SpE|IJX+sKLpMDJDT(c5w5g=s4b-@P9o} zsqbk@0kj9g_&bTU)C1>IxN|0)_bQ-0?Zw{CxsgqF!S=j&wJjDTHVU;D>-vL*CyuK+ zoN>a765!rZ9R#LC0y-*bjIq}`EY=cuAFPSGeak~-$3~N6H;`fs-i&1!w|5ZFv2@BHaK1njIPn0Y_@w%}~HH1!l zTLW~_dr7_gh=-eDl+HUR)n4ABvp|Voeg^>vmK8gbW$#_r)WY0;63ad^kp|Tq??R~v zMj<_0dLG?@#1@21C7fTgmgD9#E=AGJaAyr?P%ec#yQZE~$3Vq3=qVIH(i4q8;wH%9 zuH&+L;sV&0o84)A$`Bim580DAGzBVbRCcNXs;0Db6K{HV1hi5yZKHX=IHwe=RP%61 z8LLuCT4^x?GFgKc?~lASvGDdnoCgD?#s-F-w zkG(hUUHBv~i<`QdIcXAkLJrcqHqawzxk60?CzKY;P-6#>A=vKa2cbB`@pi>E(Xp$ zWW|N%2cO@fI*g^1aqeYI$w9(6z~vSfQu)&8`xh8OZ&_T@b2=tWE}+g>hIvYfbFlK| z_qoW%)S>bKO3T(ToJmICN9E=c%p;tOUHk2uSCPnJD~r4OB~IKEH>hgLFuxk7oVCuW 
zchEyB5hHy}Rj2^CyB0m|3gd2YYKF6~!TA%Tl4s2HQt@_=6z74#DJnTh#Q9Jr(&EUb zGBR&6tiMM#Ra2l?e4ng+XAiV0M1N+3G^5_zo0bl4)Q5R3#-<=c$dSrCeClw0mW}%1 z#K-8v57HRbn}_5r(u%x(t1r%g@+4H*SZ^%BB&Heg2|4r+K}F-mO<|T|>L4{P@0-WO zs%JVHY4za~dd3uYPMpSrh*4}6%FKLJi3mUT@wwqZPvgvq1}QS>Jt^2mj65%J$7yxgB169Rxha-a$P{r)+DN!%_J|0aaIYUNecmo>TZxqDKr)1^Z8+P|qNUGP)gN zJv`tP{N3AzA)g{hp5WWi=SKEKtRm=|%C#M!QwiRtJqOBke}dn~sjAti;KNd&7>oCOA3}U*=c$h^)czdm(vN3DPY&+W z^&NDE8uzG)IeuyFzkZwd@%>$n_T5q`Vhp5BtClz3n=bp)_}~Dk3W=_)p2wwvzbm_t zJJ*>HJlyWycN5Wac%RBCM)Git(EH@UIC}_~wYY+D8F@CQ_!$ zTiq&ZrcV4@lB|*$G;$Ng!xW)59@QRV{$kek9tj5CZO7dA0Oey9tC5=eHMCRI>9}r$ zekE3spqHd+-y+gqBHPEf+-S@lGv2d}$3v*H40Z0->AEnH*mKtHJ|P{0E|}}9pq_8n zJ*Z_x(j7%_w*H#m0NvU*eh$?ZgHfUNDQJMQPi)Gd8W6crQ)cmOJ&>Lj zi@Zx3-O@C&?fKoWAr3HmRYl#{RWSK=`VM(l;+pjPR#EULj+k^x-8w!&kv;@~j%MGn zv_GbgP(Db+}N=F7;$n7 z-%#&WqjMW!kh)4r7hWKJP=$G%mQ~R>p_?QQZtrM|rX?ZGaY_|*6YM+2ExXkn>EzV` zfYNPg+cI6KQj~$l`)YlKTqoz(pDu5MFo9*D{n%N9cU_J&39MpcVHs%H0;4)r{k*$w-eJGhleeV*`^=)_c1A&HViamz@Smj0j^R#+T_H#K+lC`-*SaVKOj@KL@ zV@5Szn-;5Fd)cDz#NcJu=ctZ7MuLNKiC1+$i$IpKwls31;%w9VboZWBjp{f|k=1@R z)dFNkig{KUI6k@>tdKeRZuEy8h8OqoB47SuNO*8B)67x8_s6M$-4QzhqCkJZAg0av z5_|7WotY1p_g>VFPzIb4o2A|5O4$2k+shTec)EOUJ@h2Nk>Om<77-KK=KNFz!#4l? z=jT8FdH?iZr~zs6{@|Y=Nd54~>ZA9$=9{?O{~k5r1SHgbE@;`Pu(vStR-%oXaAlnP ztQq{1ze`q*kf0mE?^wL_^QYbfXBYdm%NzAnB?Odg_5l|#cXxE6L)~}KTn(^Syh^P_ z6tXH>)ZhN&(*9cKt*SY4Q(^MOStUp@eoeyU0qNHn(s;?Qw2D4FZFS5Bx;?*|o~BM5 z`c+Mn;!rZOjK{fGl865;)YOV5Wv@P)EOm%%S_kL8G>(ae1Vvkw_r>-;GSRF5SM%ai z$LWIeC&dYCH@7<)%YDi)-fzyqqN5UW5;|D<)u*lzC<-8S?lZ2^RQs9nLy)qr#xq*a z4BB%C75e0gPX9mxBca~#L7~1}{X@&XcTPR2(i_KfRFz~ZE-fP;{v-`! 
zn@A6FiZb`5^Yn1|SUZC2>nhAyC&uS~kUwFq+X+f+U5ZXFqf8I~K%_c(6|Ld|#lUDs z(>cP*0elGSUC&9q3^7oNri}BDcoNk3QP#v_VxB-O(9C#TB7dnJji z5zwzQpa}TeAfY=^$_zcQJgPn&f9a1ddlXUn0m&XnHkPI598^Z38(ud>d-5j~lNt0i zvJwtiUZTqzlqSW|$>)uU`h?Nq0I0BGdhiO+b%V0Ww_y}H&^k>KiQ*ZeJgD&R@dmuY+asX&hhC!!1VW?L9v$G2O)N>7(?l|m++GeIkfEWuYEth=JTT!ibuI{ z9Xs~SkwV7qjYbH6Bb(PD&hO^$U-S24{M-F$6)A$fP<7`e5bx-nW8bG?-pa#L}bSQHMSEo?x};S3G;MMq?3LJ^dh{REQ{}hFT3QS z-qNT>o#On()fINB=|cM{JkkwaJ^pu{_BZHF>ieHR*?EeWlR6$G+|_ztmtXT|Uh8IQ z&5}+1KDDn$tZoNJE%*13Q7wrn+a3eN?84Vw=JoyJmth{#7Y*p|F?%>=&5Uy{s=91a zPOb%k)zEKVgo(Ac!_N52i9M^>wI~-s`CJj;eV~#E{xMWm0rYNKBiqqm`XHGkV6lGc z_JKYoPQDM+&CTXj7VS%|9hpU|^%b>=5tL&TTM1GFa!gSR^zgGk1y$7*-5bQD;0M9j zPRE)gHBXmCNS9AxB9}%;x`^h|r^nyZYwxDd?sK}0w~b7X&h(jf0}~QBG?X#*!sjB6 zJw$P&k_Jg3q+GPezJ z;pphC-r3&Tmm5!M*coRDJ(4Xy^L^ueRVpi_8rSEl%vb^5-Sd3^i&rM03qo=1GVM#6_R(I9sisY+Bc~bA9 zl2jQd$WiS`f|2X)%q@C7#@}j%?gE8wD3$XGs9*oD6QDO~-sNqSe_ujO?|5II+W{&w zGW;EtoP`YEU=Yd-XxrL1snsFOt1+n}VQLemGZkG;K6V1zXULy6;o3{nlMDV>cdg*j!u2{ezV6C6DK!Nf{-Xi^nBry znvM&ZT?NkN#(Z|h-|T+9bUVnqG;h+}C)DI}e&;IjRcL#mAws7Toc?HWn0??5D1kgq zK$$1Vy?tdUN~Dke$ZK$dt)WwbQ+J`mqy}d?hq?ULqAwY6jwcBdOqhnO@XtxzHUe7z{0EIw$zXFR6)HX$!-r4(|I83dz zc(GXr3QvGm32^pd_RW$o2d41X9m>hTj1Ic>Wt?kqbbee0MxTpF4aq_O`Oq0HRGHQ! 
zJTPJO&2LVk{9HF^0#o&&G+GZcJ3Qe8VOW2h1_wcS%UX-9%EKn2)dS40 zW0>E+&C};ccmKQ%pSOi7CTEM;zmevUSsa!|Y0}+VpM`d;3(_W6rzS@CqCNN9ACkIL zKU_;f;-rSgf!FBNvb+U8@~efCIEs)?H${xU`*O=O%wdW;tXEy=j+8As^6W-}u_Tq=k5v!JL?l)K)MMyAY>hK91RNnH(6`uXq@CGf#*4IsX`>Hy{kJcK6b z*zP!Bo4Hd2q-DGetB$`$|JA*9xUQ(kBOa+eQQfhJ+owM0{s1Uw%s};91(b4R{Jo5n zMdG^Y{cW7?Ijd0xe;pkwG(UOA@4y7f6K2i`=mV^vK~UZRrv*}~oL#Ky#3ET>2h%N@ ziFp#rDFKn!cUCtw+jNiqa~hZ&o3%cqe_-yFauLiD3LkWFgK`2p%>UrVuK~K(k#~AS z)$#0sc8jRpJ2^yxpomlCUMi#_mySJq17Da<9!r_v_|0HJ7$aF-oYU7CB46s8GROnx zG%mj1-vW7HQe{mlI^reNht9T3HmFQf5l~jv0KLBKmF&q7od5-kA`iM=qyVay*tut% z*)!6w8R>6=)aRN{>eIXQ4>KSM4SJ>@^lcn>kcxDCxZCS2m;D8r-&#^va|`QP|>`GaV;#vX)5lN>XxxW1yV z080W)1bi6b>cwWlkAlG$cES}r&;xek9nP#LvgHjtvH zyGbRZ_%45BLOfE3bXn@yJGim$BekbY4SYRHyyr=}KxfoQyxKvC@DksePY@(3w{C-49MW0TSp zc|Iyqixk8rkS`q+mxR?Vf3@=&3C1tWIQc!lofg1YrrEH4SI)FZQt{|P7sGqo6QD}; zaj3llsV%vu2kuXcJ!&KdnFeLopjmT@J!2x5@;N|AhVHF<>>nDYY45eRmUcxGGnj=e zw%ou}VbwJ~GL&KNTuI}2U-{s+g{Si1^`h6MD52AQGAy;G6VyWVpf%Ia258NbG6>B4 z7lU9qg~^|6z_=t(a4dlQUUWPTK6={lLRI*A#@)BbuL9Gk} z_FA&rJ!%m|oVeXPqNyywJS~U?LFjHXY*TaIp_AZ&IsyHGwaBbF`(rrnvIghgAj7mg zEm~-aU8bj9Uy{L~ikEN44Vu)$M5NPvz@Bl^I=}^^ZVt3)PR^s-lSGC9Y9%>bmD#m( zO$lpIDd+YK%RV|zyzyAcL7;xu()^m_%inXWswS%U6i^;tN+tq-`VaIDm7@+K*RD;N zy5ku#(Uy?-b_wUVx4loRg5iR8AYptVlC7BSJup>`+4@fQL$A<~)iQe~LA?p0oCS$nS)F&KXtQ7Ur%29yXmBgze8k&V$=hm56;h~5FA;qa>WXMczYj-HD`YOAiUMou z7O%D3qGAKHYsx6U8&X@R#_-Sgr%(6Ur|IXoefX}WRl}Q7FQFN{_SmijQ==WN-w;~kQQ0~tku7O6if!Xy zow$d_?c`U6IMuk{=X6!7@4|kq*aeL!=E)Ce2V?m)_;fn~UIo$hY{&BOnFzo{70`2x z6!l*||4TXouj<`uYczNL$sL7e4G=1okJT#2`!WBZaiuDY<+u1;onAx@7j=6ufT0C@ zeGAc?_sO_E(ti~&mWoceS2v_z)jDI|S=qiH6yv8zVOp@Ko2xMWMtu*3QRuN!3~hFn zy}Cx?x1~Sz2%xs+RC9KRr=>E^!tQiVKV3o@m)s0Qn;yD|R*39wvS50FQz}}rSRg^w z`Eo0JxSD$OyODps0YM0UA8B}?x>02n5~%li~* zsL-L%6uF39&ZPGUC*4*jejlpQ^BZUxs#Gw80V+B&qnq?ITDpXTOIUjn%=)HW%9JsTc_U_ML z|Bsc*Qu+1O-E+JA3D**}xAqRbrVJ=Q z^Szi;rUFcFQp_&tg^9=*+)be?XCZpJ?4Iaek~C7SS|5-)Ud~fGi%swpR>XN!F2QuD 
z23d(3H_YDkv!MQ$B8J}soCf~|kQIFdCv}b9yfu5Y5t)r|s$-!Sal|V<}8ABUhP2*t2^G!!}N9F%-JA|X`$oEXrK@m?XH42wtqD(T7 zwqPuf^t<#Ampe3Qk5cC~^tcC%_OE-iPf}1DHF}Fs4|*}(=jSl_Up{Mgms9%Cd@To( zJceCaFv?-V*z~+u0h1s0@C$PS0}0@U&@X=wtD99u8;SqrLs99oCeh6>MPufMQT(QD zYF6F)%QE&290`X9kXi+h6@x(=jFw?+o0k>f$>*DzH3Hk;Owce(}a-nl$ZZH#|$-uS@Z`Ct`k0bvixx5(U6Yozuxg@qPxoK=Se|AY)XQ=o*2Bscb$L?%_4``Bsl84VjEykcMA%s=2jI^rF>OI&2n zcY&d9kqWq(i@as7(%bj5dtke6L^#>OVnERx{szXR8l+>K=Qo>YnWKYMS*Y`oyY~Y= zIFgb-Byk!%3f@uiuGj#L7(pBE_v-*j!!eQykH43XuZ|TdI&z_Wd0(H?c zhp>6jWhq1bZT>$0rBC?w^7r}W`LF-``T6<&>VLTZeR<7j5W7^KsXb!xNo7MBR+eQ# zM3ibvMWFjdu>3l>93FN5Mu~2a81^LsCw`=SuHyIle57n>2lq(60rLkizkfMQsI+37 zp?SLB|NOb%@6WII*Za%ee|ddYgg?XnnqpB{lQRH1AJAMVk*6<3%4RLxLZ`Pd+TcdZX4GmLbtPUkSRt4we zLl)z-81^q4c8~LXTpp5%*Ch|=YeOsIqmX{bf{1%PSSBfjl2~{hQelN%31^0ZzK+S~ zksfx{2WcKDF!$MWqP8ILLBkN?Wln$e)-RkV$e6}2tuHTyH&$C7;u9jjIJkcgEG7?u#1}i}&2mdXj z#LgGl>1STYFJ`1)GScsFbEnULJw4$PXP+DUISY=`dpwh@WlhRB!x)?5HABPLFf7pt zk@EZLcKb=#Fidpa;gP5(K3kt|-iIl_#%t8zEKTtmN;c4CwT2spY3f-;0TsclW9a%SHL?#721{v1P- zWU(gaJqjlYv)e_SU(GP=JHlh1Sk?a}@#N;SGE76mDXnKdVQ3<8#gJC`c+c#6I>Xzkof%%D3q_vMp;5i%nVU@tcW&WdEm-uY8Ct8oS^XVrhCtX6 z@R{X#>spc-FKy^*_&ti_#JYg|4iI|TpluTr)wUP`e8KpeFxOLI0oTp@bPI9&(>Ut) z1BzG~^Pp}g4~if1+gHr7d8fXlv(d3Gc9weB4?3 z6z8m^H1~h_?`HZN8iFQz8^jFyK`-qYQS6?S?&_gf*u+0K<+MZ~;d>z!?ot|;Q=(Md;} zK!rBIx5=oNr|H&rxm;1TM6smyy7Unwff>2EF2YPt(X{l`Uw%@c0e+|}pt}m=KCB&x zGhW#~jQSkH{90DOQD{dT9KyPRE_?JZ+38KaeZ1Okdh$VK=1fx}xjhZNIRQ4P(wPC> z0INYclTWfyoNPT&94AD;-XKp_12i_Lr9)Gv^|>l}vP|lm0aZ=c zE>22^4#2OU1C&4C-#_@xkB{}<`{DjR`W(F*M&rD}{N5VeUEhZ8b{p=)^K+Jl#*?JV zVU+CbAQj!CtKWCYH7a7tTyRbpjvsAwKx5=M8;zNNKkk{dFg>%BlF+&$q5iZUB$pg` z9cJbV0E-Imd-U}cg`~*RJ=LF}Iq48~MVCeQ$KKJ<;XF4Ih zSEZqBnRpv|6zGo)YQt<_g+UJ+fIgSzpv$8uA+KpyT>=3@tW!?YKfP6Kq#9GV?I@*|qwL0Y3+!I=g=~4{?r6jGgia2Apx})a2&ZYhr;PIJXC{{!*-KGm zUB;#QJ`F?PUOoDYJeAcx#`F8tNDh@Zd}EvwG9V{7gKYNUco(&~c%N_XI+v_S+c3l) zPjnEa^GS$7&Be15rQ?Sd)Jy@lV=AzeKwW5PjLMN=L{bD~*f zb9H1Yb_oiI(E3co^7%u<$tHq->QqyM^izLIBG}J61RsCV-b$l0ocwZ9edrP>!=z*X 
z_r#bFc+z$C)!_Ul1v}fi_Cj!=Z9Gh6oZn60yg6%WyDpzM^D(0R0wW(i>ky3(qw^)I zBIEWU&~E_!o{P?FzyGpX^=NN{ZAX(cFRN76_arhAKFW6}k_<|5r0m(&wPLQ5oZ9n9 z#2_B9Fr#EPHmw`y&6UU3<>$VX*l;>hp)DfS-mwEz)Fa$;0GA`^J)LZJq74j zBPi&7|9qXS{C9Kza~nSHPfC9(b2HrsHA94WN?3m=H}})*q*U5hvNwVXAs~U+lsAYP z&Nty&L&EdLE&9EUOV>eJ6r{S*xts>AmdacbnFE?`l|BGU(E!Af9{@T=P0DbrcGOGC zf$5GK>3ZxXZ)4Ub9Qz{7@88B~e=CpRS|39Sy&`_pLCw-ixSlZ98{2^+P+s0a4q#nV zhPhiN{pSjG*uhyz4$l=fmZPK??;+GS-f-UsfM5Fn(Je+D8~5A&<6{;5qJ-JbKkOM) z@9NOzKn;yrIS`$8+&OWiP8ec7JhkKubl-EX%WsW&yA+jj#sQ{An%gvNQPW1LDT9ww8Q#=*_9n3_w_KTwS-ZtU(+~CV0Z}NR7m3?-9~4dajRFv!M_6;+Wq5X1ymsYl~ zYG~9EA(C$SHtqRkFJlp^fHDN7;~aeQ-Ki(1<%l@2yIe8ZD!(y<@@gqjjrbd-2B;41 z(}#NY-NSoc&i?wj+5ojtDqD{di}9fa_J(Vz4AQUrs_8k-Uxq1*O0QP+!onlK_ou%1 znUmt4)cAa9IZJJvl#WQh7o@*iq|@~0@1JRUU*ga{ev+~e;c|AZbG>`#B7 zI$nf=aMAoA>O?z>)DhCHNx!}W(0DaH^&hTxSbRtBUQD;%-EKZgA8f9fJHLm0v?HBG5n>l| zk>R?4^v`IzPIgaEcn)#m>5fckoZjSQr^vsa)pJeBpgSVj>-JQ9d#Z&c{v0KZ{rEjN z_=seBUIk&!4pQNwk9~v^t_i}4QYp6VJQ_jm6vgo(#^0#qJqUKxJPrCeJ8T?-uV%Vi_xeRo! zb#+Q8EQ?WODaNIMw zTz1@f1F2P!>ab`~pU#g%V6Qj(XK1_h52~i-SYr#RHKG5d0;~Tc5BLq$L94n+jqw($ zLR~dIfS{D*vITef{HYO?O`UEkz@{Nd=+_uyzd=;8xb5FerRkPbk&@Y!m&Zxn8J#e{ zG$nG1^gbH%$EPE=fi$P_(b;C$kMmCM&|Qt+1FT^W^s80Bj(?mywRG#GU@v!W ztO^YMAo1|fGglRg&MNp+9a8)9PHk>aQvTKL`1beym*z%Z zK4{8YZioE~C5V_YOS(*z^f0+bozaRF$}rU?x~2&7>*t_Q#=GF!@p935Po8kuqdGbG z@i6@oBgF|$>e#tWTz{dw_M-T9PgxUGY~;{GyvVUIO0!onY+oXzD428ZEJ{iadhRC# z$_v0tf+&vuK{Y$Uzm3c`iP!4JM{@r8=~hLpm+ycGtY{M zuVOwl7gSqea>4}NMk<;jNQ^aMg++V`Z}(7MU-&uRGvH)(LvML?9Br<%cB<=T6Pld? 
zG-RA#zuy5U@Zo?FZKntNt|s@c?(g@%H*n5Q_er7MK(sH~-}+B(`jb(n%%1qj=BCoxR!W(4KpA(DXEFpQ1gjsrypG1iM;~>#~S|3AGwbPcpCtA#!TF7|6|DB#B9*ZXK(fBRT}ZmO}wOv+MwC@vATbr~lqE<|3&L;rMn`Yy5$ zbwUo#S%3OiG0!B)lM5y1B209Y4P#HdkuEO#f7qJWvc%+NhXBlbp>Hl}kXehZipJU*U^}oz`l-^g}^CVlP;6Y{D z^T>fZypkGOg>%|5;An;@r$@{_*Vi4jCyZKyRzvfa0>h&wWYVBVtXTGB9|ovz;rknY zcpG%i;ZrG=oT@sn%g#HA90emGQ$EKDirewy<*53ww>>M~?DjM?SJ%Y(6`-g|w`URP z&u?=u$aHezEBQtRX;XTzOl8Gzb_Iv7Ed!J#kgVDqC`vvDwf>15={#iytqGgLppXqT zV@|L#8D}j3foHFpGD?4p!f@TJS=27Uvu+Was1KdD4wo76wwg6;)fr5%y1fO}nlezk zy3Mdlhcz7uSLg#02APKDa*Qm?JHfd?d$SDE!)(W>u8`eBCw{)tf2LyIIaC&kh$^TJ zE==t+x1dZ&GlB)3jq2V(O|;fDy3XU}1S(}p#MyQX^=pQDr!7k25HlHoj=!k+xjlad z+X5n-iG&()!TDo1=p*f^jfj_9?44(tXVwN%hWrM_<8Qke8Ai1>eI}iI?w6Q z^HRi#HvISg-7othI<1mzI&L><6}{+2`D@*!9I2pzHc^Gt#|Mzn^qX{m@&Yz%sE>jf zkAl7Y0Yg0##`p zKU1*j*76fq&OWecYv$T`*`jz3vB`-q0IljI+{nHz7InP-Bw{CF87Ga6ls;Dzrx=+^ zTvT9;>p;uBi!_uWq^LuxVNo^NL%o3OWUV_=VC&*@+6oARkykAP{c7|CY4^z;wBqj| zt?ne$=+*#<0joK4oM@4t#*!k;8pYo~`!G-XAQpWsMM<;fvrSIq_bDjfisnR`_vTN$ z>%vju)tsZUPKA5gXzi>xc@PZ|Rg$eL!u;`W^zC^N2t}lvyW3CKy&~q@lO|R{)8bfmcnPw`Ygx^3*K?DLeQt22M0NNj6XJg{gu$Q;*ied`M|$A86l& zqf`TwPLDmkoUV2|;-3GX-?GJ9JrM&afqK4@2XH%z_+1rThZO3*E#uomxHa#gL8L`d zd8fB6^Z0yD3?$PI%yb2m>1;Tdq{n{x{IpbgH16~B{j>ku?`KeHrdO((Ea%C#%;lpv z@#fP#p!zwsSx+zaKHjF=+*lNq_B8$Ov_)^PyJ%0LWnTFRG{*ZqHMzG< zVby_@WD%O-J`6+uF*-ay+1XxyBcBDG&r;x|#K9+E(&g->#A}=Xm#T{ozgp zbZcMqsB^LB+JH~tQ1#H0aN-E4?ZT7zBS6U+aGIFQ>gjVK5*!A0u&rV!c~Qc~8R6pM zFNH|=nKkS6pQ8)fYq$NAf81aYxJwK*7AcEhkM)+MKwS{#so14d;nN@7nB-o)J$-Ot zC#xrUwJ0G(`)^uy2Tt*>>yPZ|BR@Pi9l-qNFb|x)ST?|v(QM?StVh$~M9AhaBRGIC zjTw{`Pr4T4;SzI##bdwec(!p11geeh^K;0XV6JOuudb9+!JwKNy4+9w5{90G?Qm+MJ_0VZ~fhb8)2!P^CjFL=8|<-e+YuiBFb(vlcV zh!(14X65g>VE>M6(*A_o#6GHJkMqWGqhq z$@Tc^3ep&B_L-E_G%U^`y8jAN+VI#<7YZe-qu%93)Ac6?a$mqUPP)kGWl(2|>nC~=TN*^UuC)?Jp! 
z6t9;PX9S@t4|Nt6aeg;wXpf4-V>)xa{Pai!ZD~%_ptLVn_FOAX;_D?*tegV;id`HT zrmUUZ%Het@x}%}KE2eNxvG)PupH>s?!bP@VW1S z{N*5_;6g~=teTB#C-aU+^3j;v@Mp@UY-Kvoy7!0|dY=8A*^59^Cyt>?zw%HDF50sc zkN|t0;uV|7HySSMIl}neBBX6)cyQtc^@8e|O#~Zlk-|6vzi_YY| z*(ozXXLpzZB3%)+MQFcOcwtZRBd3v+vYKR?txoQnTA}6%GGpIlN zP=A`i{*e`93h?mixNFm| z8R*ZF-1SrXh}+3nh@4)}M=&#Y+AURxRBe5AQoJpy@56guVeRR8bod;`B)Tg`dKEOP znFt#-ki$;8F}q=TSvUK3L}F|C&6+?5P#nbnrJS)&p;H)c`j1X1`EY{lt?tnBuQApnIXDXb#syY6eHW8!ELfP@zGJv^u9@%>bi9k z(aUw&BbPxq@?k(2#K|R;{>WQJ3>K6V52ObisNVVKkxodWdIf{7H)+o&2@1(h%yEExHMwNm^Y`cEBcecGTMDKEEm^#rQ2I#3jfS6|Xw# z%NMbOM-m=s_vaY*LqLQG~ zSHzhMZJmB*H-Tw^-fW32 z`52@R@ci<`U@mOUl0CELH?i0Kwxc(~49cN)7|XGJ6;uJX`M>>bqy%*AL)#86CX5u_ zanon*Z)v8ygOtJ%I8RHA$=0wAElyt1wa}H5q{V%{?DF@BVf&m*&(pB{D6xD~H63*}D z>CZGMWO3dhb#FfLFj`09kvk%jO`<%@2A|H2Xdvy7*#R)$p?wmMwBh-?XwRLCKBjk+ zcX|Um_fRp%qij2$clLweVUtO6Pl(FNzF8su>_Pkq)K8YwWDG9*=V?@vHU!eBH#5GD z_Vqx6{eD#<>r)t9q3oI7wz+4*^LG*D zSAHu^C_2ouj6Fu2$NS~hoVhs*GebR^p7_$INDb#oCEzn1R9;{Fm$)4ZWzv$B=GGCg{x&w&!;QU9 zp9n}lM~l2_*;DLUV<&4lQj7*$*A#K0t6++S(c|jqBXeyeh67rBzx>cTUQbg} z%mMdbXR3fEMbQMTAx>hSjy%vqMsEzHatt!kmWO%PWo0KkESG`S^gldQ&&gG;mtD%o zt)VhW&o)Jv{t}hL>+~Aq`^2NIfCtD&F*dLl ztR|J~ghMEm&=gJy=ofYG`Ts(+xu7;euQmgWYy8dhfiaC?7~FL0$MK%c4yc-~%yyKb*$l1w?jD_oSb2oL@5~%e?O|+Tq9#H5s2MO^CpwGjz8bYte zXo>Dw=bYJBBmQ};S;zpiPjXV%=g!JI+KNqZ67e53Y}7HW5E(t@oP>{54C~5Hk$Tq` zAMcAu_xva;M}L~pTu=6L1@U$1nS2w}#OL{z;-PA=($D@EvR!OSJ)jV!qlXR5|Cec@ zs2t(1RGcb^%CfZ7<kNu)K+FlGt`liI@4>bnHJ1d&gW%MU_!L@EwB-nVfGqj6C$ir2cgtu$G@W? 
zyaHvq%+M1K0@5XzR5Zb%`5QvvP-6jU=I!Ih zMQ2KZ!1b;oB&5CHpRL6I`QkxcoA@Pb0}pixqem%{_j)@7zx0I0_hGQ_!;qIE3b9E< ziC%03QoWPRlqpCZi{oXGxz7wtK$b|h5UJSB15PDeqLYiuQ&e{-?O|amE!K)(Z!jn+^)ySWPu(=FpB2%H@vo^kHrG(wB0}O zj}Vt=0*dOmH?CBCe@aZ&T9A$(r!Fl)3hI0+S@rwtk?g6QknU()FFT;)*%PP=qmXC;r@xqzWgqu4=u1y{ zj8u!sbFMg<7li~S70HVzecbfn^P~4ky)S%!!=_^XNR|h6>!@$;lBPGur^9{nFX;95 z80dEd`rC|-wGDT3-5H;5{G8gs$)h?OJ?3PSXG5|q(L?e2`r!3t2K_|ZWKt4uRq{s^vTSk+@X>< zt7&nZet*tT{(d!YANT(4{dODr`{2ZN_@& z78R`dyEk>Y`8-0m5Z*<7vLJ12%UxWlXtRuon01Ock4M6NMnCziWE7k8RKo$5$w{tD z&v6=>Y-(i{PQMj3X$gkX@#WO#tO9qYcGhUi8b4euk$%CZU!*G+N!`cDMm$nUXPP=>{cr6-<3_2!#Bb6bQO*kL0tUQ2=h!{>LH8SviTibJ5S338Gg=+`08RRBxBq8TQ9ZuAdc{f<3v zgz5GwWb`y99CbZ*UC*v1rZ!_#_XyH*ZV*+}@}0sm*|%TQr^~5}eTrqTK5iMp&=;jg*dRtHH7Vdp(Qcs5{;3VKLS?A&Nt`-!^%6H}<9|0k32*2pm#20$1fPgX zpwTG<<<|;RBrr8yOZy;ynLXE&6lF)^LBEYRs^yL!nanziH1i{L7JPe#4q4&|rN&OlA98KCDH|2&~ z9*=d@jV)DsfU|WnP`o8LzcY`tEboJj|Y%t5g0`{!iEzaW9x z48z~?%|6Ejxr-)EKBqle`x6)@Qt6<-U9nq}!|Wh2h~DAzjTvAjfG~84$FP=8k)z4r z{(34ukF*%vMCj0R2S5?kqbaivXi&o#ZI}}=cT0^#*%PN^^VDoy6%Re`&8e%Aq?+c-v5yobA*s`9Zq7T$2+8wiKHrBK-4wZSql22?H)*%h};p6A}vi0B+H+?FNzpB7l9sx zdc)#Lk>XwTQ9HVVb`K{s25C674ks}Y8X94Hk6EZ%J;5dT7VAO8fl|9N&aY+|`~F4*FI^;} z+&tZ0wPzs6^}!xp_ekv=OO^X1eD<4{W?L$d?rhJ2MOA()3oG7c(gjN?kY0 zAC_?bvZA=$V%{@pEm;MgzkyN=U?5s|scy5tggUZ|KsP2$SH7qHdc8CX4Wxl42HHV$ zwrQu6OB7yFM16dTP!evD6Pu1xsQ~3}Uhg(xjdvXTIqY(382Q zd82N|SSuUe(A_E!^6_qQr_OOI!K?A1%lahNm*;$E`8Xe^%E$CfsJNdt!-;Bsx-m>Y?onDeO7PSNLgHYRhH zWqQc$dg3g2A4RUzRnw!P={cnJ!IO_ngyyey2@ga^Y)@TN2FmL90y-6HtAM|C)Kxc=)yf*j` zx`(`jx=m0S(L>IUT`_t$H{~i7coSK%#3b%Xn{b5|VHQfcKHcBup_erJ5^0gu?Q=Q^ zv>63e)*{-kCiLQ6krb>fnV_2p8L8g@hSf?66|)TN_l{k$ z!ghMY4O+KRv%cItLXfQn5(6sui86sv>&&R%(WqT#|7Zg)_g94^a#H^s{5JEaavX0y zlltd--T+f1Puwc>9pI$F+fJO~oEM*8pr58@=&nkziyF10zfuwP8%DO^Y+Bp_zb7uK z6QCF>Ik=>jsn(H{oc58PaFoR()#KgXYCCAb4ph=nhV+SAGfNgdm>U@8v-M;!@Z%nM zdJ6PUwk2-YWQ;Y+Hu3uu9c3k`-}l<=V_E0h$Jx5(3?}IyFP#ue7X+VokT-W9$DHXg!@1M>7W#6*&I2@_bYlITVu=bFCXGn<(8?E=D 
z|DITB@A&#I$I(cWcWh)2PhGp?N%vH`mLjBO)jcS7ksx=`q^L*OiFg%G-XF4)Us*KE zRHY7(irMRzk@~fE^Tkv*dRfKx7+Reb8~~R+b8eGDwSl1S4CICeYWcYM)l6a&M%PWR zQyi_Kd}s!x*nvus_OWJGg>{X4Aw>o~sN|4#RhyK{erzxxMDoeZ~aRb=p;iP=n}$~gDZQXRQTFYaa5x|{{O z=F}-VQn~ZuyeD?dJSls7vc#}~a#YEnKGH7{`EN~Hc2F}544PsTYG}MQ+qvv^3T!}| zY-xiu!Q{8WYosNmV*iUbtjfl)vd-5|?c7dV3DI?za+8YKV^(9*=jh^xk=_`Wyy@7a zWeL%ZB5L|7L|R_L?7p&e=i^EZe21t*Mmv77grK16%#^`Ft|8(|Hlp1fS1NwV(P zb4LS>0p%qVsLD}GOv#*dH(8suii)P-)hprrX}ZXrx+r2y zOAp6G7ZfA-2H7u|3i(r6inQx&8`^g&ke+({afgq(jC5|_=|J(V(ZsdB?0BR}bmH2; z{8h?;!TuFq^)4c<0b21`Ld{10_~SV7M?KQ!<4fNMlkPwVWE73`X z`uRDB#ZHB1M^kV;_#7z&2H%2oQ^}ye4Fb~Ol_nA2NzU*5GQInJoCJpPz*{9Gx2ed} z62JCEoK>Qd*8;v>Rc3ZSajF%FGzU_k0#0;Ab@yeZA&?M$>Q8{yJd`Xvl`RUvJTf*MrU!>u$XpJ6mUIm_?T z8hJULtDo{6kl&_(O1S_O$A6fh;beyCP}}1XUKkY3&Otu^9I$PI1GOws5KE?VMP-bsCv_X?_|9!gUa-Ak6uR7oe z$z(;D&*uy=P#L9kuV_uojB9@>5985i*pzq+DfFt;i+3Hk(PV>)K1oe<5E4Jf_;rQpxu`7Pr3@*wTSG^51=87Vj>$KC>}Ra2T{mb) zLA1Z0e0u77e(Dpl2y{gu(Jx>mPCKSbkp48T4?MZHPbFCdVpELgTJKD|E!H_RRUtAd zV@+LD{3tO{2oS^SM_AT!&2zT=oG}6t=rrtJHYzBW%&4X)O|d9cP<`ScHz@~E@d?sj zl2nhDn%gwVOo-$_HD#P!F9~hYi&vX_Q;|OF?zQslC^cJ-H8-i*V$BWe?|eI;A8H)HzcH$D zxG!>9M{8dZ=VxM*3?^gFhe+aEXyYoO{GFG6>f@C(?5+M~uF@SzX@-?Q>(X>~m@O(o zo&Dzt3+w}R(_E~X`%*sqsC58(K!v~LtSkckY6fpkklZiCaP-B7AG+&{zl`T{am3Z4 zoaq2ci9rpwgc6(#RzRo`Q#$bi-5mc1-=Q@1ZY#`fNPhMcQU){}#irO(sA2KvyM2W4 zl`Z~gYTJBmMsTJ5jcyn&rX8HrdAkbS(Gqrl% z`6@Sdf%8OmVA{1uNOhE$uh;~QRWz>^Ex6l2^T@1OE}nLP)Vk0(QG;}x#*;gB4{6o( zUM8NyF4lDQZPco$Kq}u2RZONp5+;jv$qISoQ0YZKA3CPIBTzl}SvE;iTe`4neL((E z9=I|w$olZP)m83AJfRUisKbfUN!vu9T&=={$=xE(r7k%{=2aiK3hGA+ffhJW@(#o@ zndP1&9oqmN>mo`Ag*w`*3s&khCe98h|0P&QXuaKTR@_>3zulA_L%Za7%4uP;!@U8^g@rm?f4+8<|5l%ZZ(Ym*ERNc9>B;8jxQNWT}P z-z!q`qxYqM|4C2d$A>>j_6bDMqi$`^(GypH)~eZ=Hk>=U4c<;$sHch?D~nhy@q*V_ zb!WYkXXy>2Oic#Z1^V*d^gahqBI{mrM}-{K~&k^j__$ldMXFe#sI@Jf7X&-(Mh*+%0v<{=E+beb7w)9Qs^SrIIX` z-gkT9e|5aWBO$*LL@6Vyfaca?ooF^Df)6*Na5){e_?!d<>N;-_)F}bQ^rkUdcOh{a z@OOMnvqOA7^IxwPsldmnsN(Ol6df1@Q>fM^*FK&n_Mv`rs5AjTE_L&fXhr(z!yN6? 
zoCjsEUwTK`n{8E{AARE5FWA&asK)I#pd26lN@^K9xWW2JHNByP*Z_m0HxArR{afNLxbz#CF_r{=)&5`^tv)NX(S|z#*H?!ki?!~qFV|Fv1W!|*YI__leqWg`&!~m2MhcLjv|8~@ zSyiXnNP+Qv%Ai8X>J4IGRZy*El8-e|E3~WnFg2lwos~bS8Ihm-Fd$t(`O|n)V#2(O z!QbBLzn-I#=!S0nKB3i5HffSf6CGA1(B04L0{V4B(pjF8JGr{?yIP)UzL3nOkmXt_@xfA*0^?=QvC(4?W}*GdvDafCpux!9|7Of zB;76LNpY7$zquzF!K&3+PqIl`6T#mMMZm(@2hvaJ=KkZ6{&KOK4qx9s@1JkEtWELw z$AJWYtT5W`GZqprWEU0hQ*nI=t!uDO2?bh>t16c(D6{VA?Z+qXK3++WRbtjYFpO40 zofxegeg00LlRpXXKEHNT&gD{3S=owiv7M^KmXh=ePvpm2$u3qYBxt8@BbyBKI$KxQPM0DrOZhkY|?UT zY9K>wyTJ$u9yR?k+hzxHc zs%D25V3mu&*3R~5D*A%9(e_)U@zy6cJXeEs_rZ6d{fdeOcEK=S#v0b&P&19Qphn|e zN}LSQwIr(Sni{CDk@_0?^xf9B+nrh=8$=@>Aer(=rB6>4@Ai&JRkUgwl@=db{y9AH zF~BHa_oLHdjT%I#kOcLPp9$^-pgOsoFo;xPQeu}gBlRcvul@kNy>$MYcJ9OfZB#yv zQXWLwlTv5yloL!sSv^dl6_NhTJ!)s4>ugh$oobrrhlC8j@e|TU%>INYiW1JVA>^PS zq92o3rPnFQ#aWM7@ma|XsSYeo{{hlqW`F#py;?+>mG{t`(%~sJiJ}tXM)cB=_FfU_ zK{0)p?nHlmH`7aZO>=Q!?h~N#@xfkFK(RcAYWrmzp@8zY>E8eJy*1`;@7CJZ|1x)+ zE_COEy74w<2JK~KTGD}5R_Xk8!lpHU7hZZx=FG3sc{^HG-;aTP~(}GRZzJtH}x~M)aEq3q8lXXyWLlbi^Uq_gzW%-khZlNilNt{w8_jx+LP-Dwg8}7Fg7Rxd z`Gt3>NZ~Qv?rxOeI>wjj?$qdRsvBGTC(H8Z36HsMo~HSUCuJI@%&3lEqf!Sh`Ez>$ z=>zmAb;=yL@G*iWaGADAl`r9!ggzTp&WlL*uBUz)U#BL&+q(nuPH#Hj6l29YahS=$TqM$C{L+OgL6$s9%lgq#1^t8PvlL z`iJz&In<;kVWW6d5$RWRyGfz%LK4gDeV{rDQc{0$??ct?UiECptlp#@gqeKWClkl& z(k`#iJ5{#o*FpJwj59RIhMpeFqcDYxPLIGUzTQ?Y=|{%2hm1Mj8P4C|22bX3Dup}M zlPRw<(jDntz5)0ZAZ?uXsDP@*`^Vk*#LwPunD`OIJZC3yl=79ucY_N+(>!#}29Zus zxa3|nGElWwAU$`f&!f10uP%f>XiP%4Z>tWL-vw^GI;5kXS`p^w+)|iwjL3PT##hxs zMV-^snd0MB&AAbs<4F6E@c-@r{Yrp-H@DFlztT64?sXpfxw!xuB`SGTnwqcGy~)Xt zNu$7v(Ih+2k>@)g>7DS0ACD+yvwk;$v3+4}XCF|e8hjogAEHUWPHNH&m4XKWUEA_w z)yC1;BGT{Xy&qn+*3+{;N*MamjDsjVPF>fH3|l}4sTRKvgkyu#2~_V^b$FJjfO>eO z{C0Q=JNNp4Kj~k9;6WP_v0wIT7a1otJ5_yCM2Ty4rk5O6fSZu@`fG290Zlhpu(vCo z{B>>eF6}}?dIi=qU-hz&v|3OKaR9Y@>J+MM{D19|u9(P0P_`=cZT)>k@4J1>r3^?& z``7jrkG=WpX;&`!((E(@1Uqu8rZ*W>ZdDuGyvtBG+cW(=IzKO2HSScTE4_se5D82l zN6V$+YXVFe){u4i@|DyDug-skP z8%GEZ71KyXn4v(muW!0Mr|AcnDCqqRoCMUAvIjN#elb_ki%>B)cVFjaW9f&S*Func 
zwTd3x6p}xmRQKrf;vO}5>C>T@R%g;1>MH)+zmNU&*TZ}_dYT=;4$#8GRD3^EctLUg z2i+^aMU%h}{6jR|Q$o3E@^SndUMZ}-PB<6eqD2dWEWb;W77su*c6FvUw0Ims{mr4O z=;!gvyuT0Lo4yR@-Awn5>Q&k`vD=D*m*^hksOX?gT~rcDSxaOoXBPb?n7gNbY{Qbj z-JsN&3o};83C&zykYGmWC(}rZhp1A8V7k2cKA47T{bWNvJq8ic>~lHjBP}USJmWq` zwU#A)9Vc;heVjooG|w2q|CrnOU;ZbYNt`i?fVVtqp9>zrC4Ec!I`{G@^7Em=Zna`ww?25fGudZx`rKW>leGX;glI{~TlwnU2?BTJIG%J2lrr==R&t zP@f>FONqNbiTM)3VTwkhj%`XR%niXxNQ7hQ{;ikaq=l5yH*&ImLW>5L6V9t%f?DSoKlzIlS~ksR@z@Y+^P=s;RqA zS0SZ)^ZZ0z?(&$)ef+JD7j(s&ks{K_?IWCQf`)7n=`V!4nPKN4Py7Zit4lOE4(?CZ z;H_)==ZJCK`V0VJi*p>Z&jZ)CBpaM+1>z>?AuPZ`v9B}z*cnMfzHcAdjx9e?#i~yS z5&|emCnHl`N|lPLPfI;Cb%s6ppjdMKky?X?5D{QYV>3P9vD{nrF;ywSB%RJ_7*u2HxjDg|-N<{Gjy&m0dt|^T6BcBSRq_XFVxTgrDFclr zHJlv1gtTh3D{v)=7UcLm_?Nso6&NUOQ$+c*-c=`aXb;Z!L%9i(w;)WFtQ|SQ8tzqp zS=C@|r;t2Z9f${R)b=wz%h8OwS8vK>YQlYGD~@HHLXA#5) zk^U!6D&~AXN(Ga%&ge%wmE5Bso?2g|4?+H*j{4y@qJ3Tt&@{eV=d&p@y^oNXlQM6h zD!=m)?I2!xG1Y4aWqH7tA@xy|-y2m(^N+^DzbOh?JbGoXbI(%q3^C*P-nNnr%Op6R z?&o)&_#AzLb9^6_zwRZuC6G)_Gg&SbCU-bJGoBQ?SDSLEqIz$yEJ_m`XA+p0sC-&9 z=T|enz-~Tr62`cF_Mz*C9x6B(I#;2LW_XKF$5}j`(Xkz^i1fEH&tb?^?_W`J3$;={ zUf?+SgdfFzmqbyDx6bu8OrDpa`lJtRD3d>@K{&YL;8!VLQ#*nTGQHABz5#d~Wl5~j zHf5k+%@Q`b!%X@z(ojQ(7j=VEph@JJv@&@REilVlLi&Z~StOz3Qj?62D3RTL2Wnlp zU&IL{nnKixL*;LMvNa(uEh0TB_Qh$`jjC+jXh*Ji&bBJ&#|!~K29E>KpF^Pj+SlV7 zEc!(7?9^Fyx^}Iad8!@q;h7#$cu(c`tg|UnUP%dDZe1|y*Re%^e}9^%QQoJUKj$;s zR2uMqcsI7+k9OANDdPHT>vKLIrl@1AE>?X?tN$B^kYtLlke-wa6Epu1f@szE1L;bv zcWPkvi*+9fMpAT6m5}~~d#nk(2;tQ%zE8r@g3I~k&(u%y{%(ht{<8Q)tosn1Z#!r% z+T8X0D$qz%gl#GSH_E7&2$bwPBo-bY?MxIT=g|$;Zw4)reV|yshmArBHKuLwVm7X` z(xD7s&UQitQR>|8>#r>5Z-kD!*Iwr#i=gMU(jI7h;C$$|_P1<}zOW@JWK4NEp91#0 zg*t&sSrujzYM_Rf9lbvCjDe6~?8DFF16tVei4xtu4tmHUIa>)*(KSJXpp5j~p6#?D zpRFCA@;ems(!zfcNe^|$xiX#8(Px!?zysMzxlBRr-snp9@wLT_FW;#=QzfYk1Omr6 ze-~d>pmf*(ts%_E_#B8*1U05=ojg$XZgXe3rtDRJnrWPS=PLyhDn?V{p@PsTblj3= z{|srY`qm&F-8zagnpJve<@U%ay($^q(4J(xp8_4BIYMe=$XTk8!kv;bhDdgSIqLPV zdZIL_qzlNRI*fVq>`EuGm576BMWAs0%TB$Ar!;lG`ZZJ2M3ZaiMNPrH$p}q`)rHev 
zgB6Tk1Es0&$zDBX_N<`w=p$Q(!j5&kl&ke3bTAzm75iJOt33Y9W-A zWPu4()6Q?{WvDA9;(esu3hrS8%_K2b;9g~Sl+>X&r3A8ErP}t< z)Vxkp51qeDHJ*Gr-bAHYEamS~O_~E`n48Aw-A~@|RjjAMpka2>=hP}mR5c#amn~{T zq7s1q?gRZLpv|P!)VrI%e5WV>88&3RX<*rgo{WL`csk^Rv7Tb;$H2~n<$l)goq@+##lJT%Rc zgYkxXI-r~{y`aMFtDxGPbf)O!!AFJluR%%%Jol(qW4?L3UuqAwMWcUa)ugCG=U+Bu zs5|lLnEW(sN$0Duf*#-BsGuMAX>$@vbEu9cCtFtq^{^@B(5#h;Lxz;c44!^uQmRUr zd}XhRd$NplCqv=MwCxbGJkpmyXBs4+4MP9k{uGW0$JrAsCBI!xmJa!I_XVJ_(!DC!&JRQQHY#KC2kMHJ-+3k#kEz1?pehlb=@LrMwv%Dhw~Wq~!7g}&gRT$H)Y=iz&oR{2#h9f6Dhz6u zb6*3sA_&BL1!`#ylb%T45SY_~7n&97FKE{_e+pMWd z>fX>t)JXI0p9Fmbry%SSVn3%Nop-^YKNv|Jrc$zUx$B26EQHpW^FtDtJpOa2{#1#% zxL(E@VCtB!_G>LwA7L$D15wXO)5OQ1lQLz(?biklozR|kiW5xv@BJvI!TRMkA~0nc zQltGiLRu^6)l*_6J5qS$W=+9%I?9e$fSo?3v9q|g#?hK-o}U}7JOEp+{)WacIiR5q z53PMK4(-~-0zeH^%q9&JTkrqS127gH$hsnMHOZ)V!kQ@k-X{{3BG5b#O!+buy}N&0 z-K*hs72Rn*v_^!T94pR5=#6S48C?f`OQu)#ZCN(h+*QqGYuX6vGsM|C*QS zhFMRPt61abosO5?6awZ{FzS1BdllfBn7~4VRzTQ~o7Lr;72Qe&o6_6|&r(o>#4^Sm{u!MC(5<|2jtX;BitLNZS-q&IAfaQ=@%;y}pA|l@cKr3%C3QgKh&Qzc>NFcL zM%VZmxjSszPNPp*GWCH%<9D>F_x+BTDU$azXGwK7npF04kV<;9C|Mt%^f{H&H|;M| zGyLNp)9Wx|SV6Db)J$f~E7mhFq1t^>O{bKY1bKpX?N))bchGZQZTh{S{Y}wAJE3{D zw`bRy=eN<_Uj3oR5t~}-UWv80&=I~*{4lG0?KN4>)obeiw@^*x2TZhWUbmn^TVW?4 z!_wL){ptqN&7C#=mJj{@dXa@51JPwm(+Z@Uvg})t@cu}n!mLsW=y4~&CpCc!h1$@= zgY_u1DM78v%We<}mrkqN*uO}D;2F}NM&0k@)024eIiZ@~KRBz>Wa;Y7#;SBS1b))B z;6nm|I!oVkqpBpp>KhsaESYqltOB!is1P}236J=m!6)9+jbcX$(Vaw@J6pH^9{|c6fI8m+V%M zyKZamA%n|2v*VSNg?bTGG)mTmjH-OU(P#y9KkFKq>&O6dNqV+%BOgS($CM4epa>PT z+IKg+-KU{@BXK7P(ch_)aU7P4ab6oNw_VQjOQ5OudlQ^>eD8Ki6$1mI=n<~~n`hbR zUHcs7ZkrBzx{r*pP!FSRq4omu5^mU}iXr8!`ddQ!)962cPzlt2v%W;Nk8Cp>^o1VN zl?N&)f4rbcEui?!4q1YfvKSE67frf5WNRlAa&B#eG5Kgv;e}|?(2}65^Ou6_MPgFP z_o|FSx@DmLI_RIgTlN1_lkKJ#`}P*3{~TQHc#q1p8P@8VPz8B1w1^7Uax;VN*N_XB zSq0cezfmwX)RRIK3vTl(F`frNu`QD`#qU_vitvzCVbl$*kyQsB7%cN4>#7z;uvg%L z0Lh_AfTkChqiRbT=l1YVOP8G;)n2i!TKYqvnE^_&os^27sRXKb)2P4HKsCX7@BS3& zpa3#ZZ+q|V@9*!%pR2rpRZh2dRO$JtO0O#&qzOwv5xDyG3>sQP?c7ZbxgP1Mp!LT4 
zfiP(mfeM-IQyJ(APa*z-zV@Wn8a^A}U%^{53+PcO2+Cp}wumHZb)pGVS0$i3b)lus z9$H$=CTpn_dsHL?&#8>}M6tp2ozKj|Dnw zlU<;bUC;;mx)R)n_jQhte?gbDcM?*4(frIaQIgvJ8`ELnx^h$4D;)flIj+ z)J34#=y@(E_^dUZe#Eb3n4vcmYd%anT^Hj%(#n`B6{cjqN+iAttyg#|s6gBF;={fL zZL&J6R-CLI#Ycu!CY?8?k7+>3tUo{VBZdy&z^JWBzaqetJDoShLRNgl@OP>OWx3^r zi-ZtFb%?3ErVR7+L}61P$a^1CC*Ed_3KN(CqQ}^gl&Gbq?-9Jq1YoGg@_p5AKg(c}`r3ofGxWTq0Rg^$w zn^c?7!cev-dKvq10CCkY4zVHZyw+Y>wW7rjjZ9t7HB?iqO8D?k)NWm~~kJn*dUrT^0VW}(N z63{V5)BcCyQ_?t?o=B|rYD^Vu3WaxiX|>IgIat z-MN5rfsJk{Dp?Y%dY4>M3DrhRla@iFu7LEXfrgf%s)YGujQ}iXxjbn%PY0vE*7{h5 z&x8rpGJ!&>R1sHw5hzWEYr6H3NH3ji@6oCj9YWVBR;j$Mh`G?TJ_k#Vl;TeT3Ii87 z{S{CT&y%5D3#<=iDXBauTYr^Y{I-kDmPMq$4W_}}+RYqyYBI(2KnG(_WouMfy=85N zRcxvhl6F7=s=uiWX??siZ0WdFds(q+N2QSM)dZ>SgSM8(_WfHxdjIokv<*EL2SN3| z1BlOU=7bJbLj1K;K2~9U0Yb@*S8bcJP50vz^)b^e`2gT0e+sJP0|kW%-UQJlo}8)# zX$OvFQAQfy0_NM41b_WzeKtam5QdnZZ{sb0Y)#{oy}nS&RH60mD5%2o8e9J)Q>6*U z7MzeJn|?RrYe-MVTg2O!*1txrNQ1VJgmqdFSRNv!n2Ic%NDu3v9njI^~Nu5%<-(Ixw|-|W*Wu)w-@ss`r5$Ou3b z4AblLWwko;{sIhZuCv5g$<{te`-u#$LrZwcroYW+`+2)bImc4?yk{5I18fkvNGU^_ zL@7Qj2xY$#PAdBkvAaSCpFeDxt}chuJA%*bmg02^n5%TPlq%}2qCC#tF=2xHc= zvqWAdEw^f)-2GHeflf^k=y%f(G+SrPAtzui_+57WdxG!WtZASU$AW9X-p6~3NAz*( z&8$-rJ_l+gbq|#rEo}~#i)qI_01YUH^>+{Jck^aU>$7kOfZdcB2QfE*_%9#fhto;M z-S*X}1p;?v3}nl=7wAP^`Uy8c$*zzr9o8xKs{L9u8@u?&Cb8QF(gn880xV1wj=M!&Y<-lW6H4%-#k$jj{=yrmQLyf=VV0BkgW)s2P7Ym#-dUwChND${01 zT;8vt!~)tq>6gK~*Q8}B!)j-NkTO;(|0GQy%Ww8yf-5+}X67}TxpM=`Dtx;pZ2-i- z)1N~9icn9_)E?B@d6raVm08tzW3Ex4WaBKq*t7}K@71V3&4+z{vmg4&*~w{x)8&i{ z{lQUjHG?{0pNg1i_1}2?vLgcnpS&$@%h!9BbK>i}R1^;4uA}#x4&7>R1A4{2rPs+WRe& zG_IK>xY9j|*Orm~pF$G2r?|ClfP^jt@>_2~aqE2bXQ&|ue6NFA<6Fa%5Y)isU@_R3 zJYFUY97=Z$+L?+*-I_Jb-A&S=%L)Xwqq(16S)Dkdv{ZDvTJNv4>JFjGo@ltrP!A>r zU-?^mceyM?mw+VlP2oKOpA-@)QwZ^Jfzq(F{CK?zSyVGg=&p^!cQeQ@ybJGv{Dt7&q>TKpztg=R z%?lbsZj+2EyMg9*$Q43nN$8ldbh#vS6WZ`IX?%_H&iLQs$F_ai{h)YK4+}%dRjRLsC1 zOgGgJf&C6P4eDDt*yi^9{4CyXO0*u<-?ulYTD*l`tHDgV-x{?$H!9BpR-JW@wNDCx z{VZj*>qr-)!UIe`O$#}ztrHBW&R9~_)zw|4SOckzZAu$}us%83rQ=%tGs(9HCWWab 
zNYIoxTCKabJA&F0W1PPLYM}o4{QNLC8{gH?aHBuZAKS{Wf=e+NgI1t7O!Mo6DTIO* zNN*N*zxt!IGzjDcarSkh~ zA&col2Mbs^9nS)6(!hoZ!6;_C*#KW0>24k<7kbXUwy->0$%}z|vg>TG+QbF8Fb-DQ zsIcj&j@_P3>!_l0iSQ$+bX}orSp^kQz^hM&50ye6d=D~IViTCOXn9X)f>rrcyVj~< znZrZSIDZeXLBrc5!$$>bes;4fB3Aa_4T+q zc3Sxez#`#MykL)O0Do|v^9vF?c4vVg^LbrWdcga_=xUPVOvi(-&@eYpuc=?=Vc?}oA}J#~|h|9M-!n|NqICzt2Mf(Laoo%Q9VU zXN$st3LO^?eRq8_806Bn02};k#mQ=3{Taem9o@5AxmTyPcoUMNnViR2q37eOp zj$=QJBhdp?k{nk<6>T?k5E1FkAyhnHp*a3<#AGxT?senZs*tb~rq37bpvh6l7V>S2;4%m)lyzZg|&>dU{CRgs&$A~1u?Nh-3lIRb^Z})3|D98e~$DQHvL)~ z7!E&w{5GE4Fl@Li(l(|Y(v?~50>;Jrb+QYR_aI}5k-NMJ3aTt0)Oquubu+2MHDHk; z<*Y+@~KcgEa_F{T&;htNq(xI*HP@82|X4gBo2SmSu$ zl$I~Y&ree5x;)!9PhocGxo|$#>XUuTL&6?&7&6Go7WTpMzSU7#_PUhxOwHYS)3&G$ z%QizQ()G7{NU1ECAr%?>LQ7n?-o*+;+s}EIU|Jdw?KZ;-{Q=tJgn&HY9@$i(eaP%0 zt+#2vlnEYs$;u>p?Yd8SDA)zEFx6CefpdwLy=rUOZH2H^lbVP^PotE{nM#5|AHy^Z72*jF4U};zJus(*CjZU%Agf9g>=cS9xZ>;H<&lFl-K{?^g7&=n3UhE zEWnZp_nNXMs7MrX&H)yzN5z$Q9Qn6dPdY@~*{opIK2>{=264U= z=y%}XNPsRImC{su4;1I~slPwHe&n^aR7tMe>9^Yxd`cQ{Y~#JzrSC*+q5;~-MC?#o)PTb z?M_$o{&l)f8=Q?a`Rv7C{nqe8pzQca)4WeBj?_*n$VS@RxLY4)it8ZDK}5&0&277G z0;%JBTZIb%2bUd|aW=pt>6R7uKf zz;EjBSexzxn*^&MHgru9MxD75Y(>#^)7DoSNOF4m;brNq|w{_cZ76+o`G}T56>V}0v_!*KwCV46vq@0sb6fsI4iB*~N)!f^cLav|L-Yw&NmR!*Skq0J zw}~m#vM+~LQS;pUd4IpjLGWq(45@bs%U-QV%6hH2kibK1p)ycLVA{gpvF#C3gjPgu zbqT51Z4+tb_0nZ1T|n@fWbZneCe|V;3=viet_x6qiR2;z=J(Z1#XHW#H-o)z*x4(e zu{tVTIOXtI%Mpob zs1>SGj%HhYGN^pSp;)V!6e77}>A{seL1`m2Bk`Lp45xcDS(kfSmdAH+vVf$&b%{|0 z+qFFr^0qZt5rHwwGA4m_FW|pp{9}isU3*;4422<$B37?3la<&vjbTI>%d0qM{sJR> zaum*ct-hSa6cn4dPy^aqyt5Ulb<-{Z)kJc>BGMJmwwlr7&+a*uNEs9qM*TcbrwBwQ=~F*mFjX=wVmGqT(Jk*=s}{R zd}(=y^JP!n$Tq&BZ0*3XYcK` z+>Sp>GpII5b)*%7gzy;cPBiOWwP8QUvHu+2BH>|0DWg`k5u-T%o<)y4e;N{L&oD-v zRTa`Lu+76uhP13N8k;XLk|zv$TxFVVnIwZTf%N>scHRG&ZEc{UI!LT%J@tthaP%b! 
zjLp+RNQ@P6<}aG*>6%^<9Xwx33s<(|7IPSQhEHx)ndXEYrP@W9haoEiym@-9xC3C% z&c`Vd&q6)xY?m?_#)0-YJ-?S=9=%`?&YnomFsXkzxDJ`NMnDDHcpp{uTd>inKK>&m z#B;FU-+Y^nQCk@IsX=Xc00qMPv@vSdB`S~dE5A=$ze#(XDRl_74bK=Ac0DocvIj2D zw-7rk=+5!ClS%TLuw91}Mbr3K)?Y_-qgDQ!f^f6;zw#4RMI&oM!?EQL#Iu^>-6|6u5RMG@e94TM{9wi!qUkB>=j1_?fOU?bH$(sAN5d-idrzG|497OEt*FD z{zmY>ia2SDE~gtdRUQSzrY_Rb0<({j#Pcu&U9BD0kxDQYj`ol3RrW(VMLIgR=T=?I$b-R>~jMm@~ z)^w8aQZ@XY%anSi4Ad&%tWlKS!1TwR;cS~14)Qx_$dXX=@6reO_4D;FeAc^Z0D1JQ z{yRa1^&l{5{!89wMiN@XnHOUGY4ho@fXes;%p(=)uy00%}fzI4tw)9`ccAcGz}SgUz_tdi20=;GQRTsaHOUoNt< ze%$Bh%ZSFW@$=Oj2p7h~?~V>XD2o;Kq~Xn$g~$oi5PGQY5bF=aIt)E}j%~2z$Tq<8 zd4&B#5&s*;Q$m-Q+=2SiM5lfNG7C8ue8gKg-D!jVfnz--{yLP_Rd~x)G?AR7ohpqu z<=lW!u;!sV4bk2Eo;fRedqJe(|AJzLza(wP&wbzKseHTK<_AFc9aQK>HGvqMIwG}-g2XlHb2m@V29^*c9{eGkUIyS4|C?FkqWtzh|+}2SgB;;6&v${-W~~RmaG|Yrn9GWiZMjs zmx~SYEksMAr|OdlUzt$9QS%+Ii+n?#EcuV0LO&&N<@>h8M(A4>Ba=WsHksOg>Uq#9Qt|3ow4~U9s1I9n#bxd(Qa2yG<#3QlwA{q9={^*wv4&Sm(R`eAyG2dAH%snVc9~FYzqdx{~=brNlu%Drv zu!s}>^YhdAU+$lu#{c|J{GEZVndP9xcOChCjCFIUJRw1; z^{e%e(6E7`-C%q4!tt9l;iT@-HEltaQo@;^C{CV-y*k})(`|n~=2%Q^9**shHluBW z3+YOoW=?%*+bjW%s~@LhhtJ9H0};1@7(#0Dk>90t?@O9EvrK0Okpi_@HoC0+DbBR3 zV9bqsd-Fk5#+zG~CyZ^INJo9FN5pv>q;u1z=t=VR+>_D|Uq4^p`*7)+qV<2BZZEd$ zx*=~c%tmDn?kWi`8imr73SGR(HIQT|LN1&3+`zk&=)*L>Io-!MH9TETqUTW=Z(Y?v zuUc~E9GUQFqY>tfs#8U*M4Mu)@we+R`URxhRMW3;uL`I0xt>aHN=!;Y{==o>FES<4 zc5>1FC?JLQ&$;fy)@xO%e;XP;wY?#6wRPVn%+w(aASS8;%#&fBK~IQV+~!TZroo6b ziaJDB%111To>B_6BdIE=6<1(Q2RL@se;#UxK|eIc4zLldIl&qm&TFtr>vyKK_HCkW z4IAHX5Uko=tmQW2SxUhWCmrDO(quyUjozI=q+0^IdH_c2LusnNawv1r-R1jrY2EGw z4a$)SC^xH0f&meTbIMqM8-MUMcyl_C(4bOPe8Mf?TvSIzCnV;2dx#Zr&zfNEWoR_7 zfx2^~<{xg85l%kvHWrEVr1z@Uo7TBx5|GnKmZh!4v{QcYA|LU&$d(-%%`}^(ge@f z71TR!%C|RCCRTxKM)j+ozi+KtN06wAW(0Z}C{gfZmC(4MWFEu_e@=1wfArJ1k09qZ z#{S$Lj_=d@{;mrMSxH9{1QL8BGBIodmZr(tt|=Imz`pj~6U}8tWIG&rkGw)roG(3G zL&ZsM*Tf6RSw^{tlr%GDNNK>oj_+J`SMxXtUQW=b#S#*gFpiZ?{J&PoQggc<1fAy5jOXzJx&tDdX&3+O3TcG84^YgS+o z%BGl;A{7xjdJ~sSi;4>Jq-ktd+8GQ!om#gh*xP8SuN$FDl4#XOt=ChOK3IXm58`}@ 
z;EuHuuAgB8wTi~Yc45@qfgh zkJKU3DrTfq=apgg23&<+Ay7(MbR^`EBaOwhfb`cVCAYg#FYWaAz83Q3D!0ot1{Ft($9REZ>?c>82)m?A6tAR9?v_p&DmrV*o_MiZBvFyL|&yW`&ct1 zTDIK~#mEk!h5<-jgo>lp&xF2YPeT$qGCI^KrgnO5?Mvs)5tbz0v932NL7T=c6lO!4 zykys(#!Miep2qRWz?-U16|HYBN3-tNWV`^{G;ffjIR%M7K#6(}U{d$V<~h%mQZgViBvai7kuS<93Smp)ibQzSxQTJJfy8%QP$glfYMC?oxC-pAqe;eUIy zDq24_Vy{O?wGR4eHAwfMWz;B_L)RTqQoaEaFF+j?t!j!$e~4&gOwBsvq4wo-wIiXo znpGtkfw4iDwg3}>LNg?Xf848LJkp$&%ab#+ChfBn48&CX9O*B`Enxt1oZ2*XJyInr zk8b1&dK@U{7ZCJDK^l4mn0_-iF3xBi-&>n$$Qo(l4PibH1Jr&xcVFxMmuf3@AwH+vHfGG;|6PHA5c?emu9To!+ep z5i7_&-Q*Z-myKj_IJ9y%-qR2@0?C}=gcmz>xGrmey{^^+uEJ>3U=54GQqB-Y*!2QNx|aKk270lojU)U8^; z`rBxbPfq*1SP%MvJyimZwND&D7xmN&KoJelQ~&15_2is>UzyAv~M z?zF1d{WGbrBdE~Rq9?5?sN0|=_Xv+nf93T48CH?T14Qs&#(JOnR=Ydpf%jGa&&}F18PeDaGKYlJ3PM95r!a{naW7;!w`&C%eXdQzp~~F1(uDL5(%%ee@KbgU zvhMg9!wM|qKAltScS;x=v7aNP(EfP~oO3lcwc{8hNDe~6yl<(h7ic3+wz^is`KZI~ zN(9d(s6X2#yZ;W7_gyue9BphH2z#Is?GI{%i>B(Eqs{tp-`9S)UMS4fbog$t$-WhF zDyWP%JQh@eH~DaHg&TyycSd$H*{FyOmT;MQqOo$Nr9nF)axa^979W?2==S7 z+NUxkQ)hx+-SV+xzkAo#>XxR2zw6!&Sm(Vf=Gd@|6}>r_VH}1?n&8g%!~jh|vcDry zVos=hXkq`)=%u5^!U2w4_{leK^Y=%h?|hD(*_S6Z7;1mR9D9X%8{$W#3&O&(vCu^% z%Cn`fl6~x|Dji`BjYM4)R^zdHRQ)tqogA(sna#SAQKA6DaPsf~axia;0!(4YHe(0^ zgGV^nu>T#rGz@GKAV{jki>3xE+O5$>fSlH@*5nNZkB}3kRIf)sMIWd$Yk*1$4Twe6 z%oVwFANvR)F+-{nq$-~NvcGCxUps&B&F#XPcV zO`l>+V5X^Yt={2_k+`VV_%&1x4y;-+kFiw)W>)PtsSM~yE09h*h4=sTOl`$#xKTB! 
z)yx-cq@^k&{SJd;MkH<@!0$&vcSXT)tFX#r0!ke-T#Pp zgZZIbAB?tK<9N-i%8r!@T4gm@&9*@FIjoH%J7uC8mE}xUiz79g0Ly=Yk z?Q1)Zx6ynW^NG0XY3zoXP&;pPsoAPIiymA51gV5m^?y`@^aC&4no3#n;8pg4gdkO} zW;oMl4ISC&i!i4!)MYT;+UR`5muKdEFIwt%A8JIV4QU;{IGg9ip zPm%sKy?yE5!@$L>KS?hylWSjI;tino4!=r!TvBSFL$!wlD?d_Q=prs#^{4U2S+05{ zQ#5E2tUfS;pHt_rNZsc4C#2^n@s!jI+2MD*HUMtgP#r7+1=M>) zI)=`Xy?4XQZ*Yh5Pg~6Mo{hu96_}gR_%KippeJ=&j$EK(KMSa-YHwvA*+xh_R%_#n zQN;OmWLs8wd0Y4ExpC)!W$q;v+ltl>^A z-UEl~w&%+=Wu#%P=qVGms%8EAc9U9DFL4p)S0JfV_m)SjZ)ZR&Xjrxc zg-1wLECMj3Uq?uNs?zyv6)68qdy8M_dfuam+wf8!0S(Rhvd*M;BZdL-s%|4NNAVy1 zuG=wb4KH<0NxyE>eD!T8XV%vRUaZ@a{9#>Aqvpz4b4)y6=frZtNQ4fO$Nl0;*3~ooPqq#z3>T8wp$&t=yBS zVrWHDXBG2kVG-6Np&)_*(cL4^uR471VGM# zV!F^c*SWVmT3I*n19sgJfLg7ZA`Qb&l}P6}fF9o`u>w8NS}!2o<*Vj4O=|7b^9%;~ z5>6r$nwGPF__@w_mWkIgn~LP>qQu43AZ^epglh1~_`_rTfhn!}2C9kXG z=reT3)?5G;&ny{*97CBne@wlALat`zXYHK&g)b+qThh=Yq{_vTKo!#B=OH4oTu+T* zP0AN^^eMkCK}8>i9UWeUSy^alCq5_!R1T7)8oT`Xt4#nvhpLD+kX>GS6sRj;CAumc zX}<{oI;yl5%}Q8!bW)twt*Hg%Y?p^ngND5dP=6Ar-#Ju&`MdG&$1ugwcOSdl`r$U& zS5nrwZJ^4OS-R&K;~Fq3s&TuxPvrIq+TNHf8H8F?rm_J>Q4NeIY- z>jV2%@Q7^)nwfDEq`g{ZNs35!Pt_^XTDihRVv9lTw?++V;5pLR9DwEDORjdI)~bOg zw32)Sa26*SG+<2)R*0N@H{CFBKEYbz^UK7-%(%(LrDY6X%W&6yFzhn)?HKpmBkVFC za?>VA!?12!iF8H>uf<9ihaF;7-E6yJC(N>&2O?D6D)vDGQt`K_2XFD^Ie=uyFF$K< zXMF(2|C1?C}EBj;|Rl3lT1OSABg@J!@#4IHZ6+_N7Mp#pY^%ddJ&i!)+#vnDY?&|H;G8R~cA7idK_%oWy? 
zdCi(S7WS$dtU}XB8ztBLfJGB-r&VNm zjpGCx2zZ?H@n_#HHq?!N9ARim=><;581%M=$^Sb(0>WzD#88LE7!9&dpM+SY7{W(v zh+ZT*5#$t&7URNYrAOV_OQolUBTS(q+4*?btZQsCXIO++Of@n1ab3xK=2#Gzr^6tq zSRJ;>j&d1e0ir;-adjUq&kg<1_ZX~4n?!2H|3-+k-dEol=F$TCqI$eDYW;sAL>m6S zbm_&{Iy}~3Y8Iqg%!Zj1-G96jhtY1_Otwe!YQ}28SCV6LWi4k{op!gMxoL} zeG2Y};HMB{24Lwg`u(Uzb@fn(^J4w~CEnW{O}{U(tdA9K3bBCjtFZyooTu1nhDYO~ z(8tRJ#>AQLUg!`7HHlNCLtVlP#5~X`BK%3Dwg)-I5)g!uWaw?QIaSlybI{y#tFh8j z4H6_sVl|>{GrxF_?)&#hj@N%AnRV}uu4=DS!7+je7pzBYGfJ6z#&d*dvs}i(cRv=N z3!AZH<@8WO1?w3~|0~Fa++cA$1&sK^pDaahjxq7`PPH|3cd$o&oi| z?yNv9NV48G6>!r;AxBL<^Zl|L3Vk3QbyZ0e|HCv8$EgEOsEMf>oIkspbSd@44yMP3 zEfp!q7~!`osG1jxI&M)%Nb8q>4{4yg@E7@oXsufudp{s`Fd}fE&nOjprWT}WP7U^3 zkY5}mubqdQBxwfS!p}WO@rPkQ$v!4w3AZ@9>jU)8tEs&h?yy!cZqWyz8!o+*C% z&6=wU#^Ypcsr7jnD1(Tbb1`ZH*Sq~rKLU5rIE&Yl_t-f*ZxwZ%AX+}WoLC9zR0NpZ4lB=*%RIWTNwj%69AO%iNr*3gyA7n`m6KS^aX}6Z&ZI z#7CtxBlIpxJVW~Y{0d6m^WO*2s26;7-IPU1s)|^H#Lav#EFDJd1zU5)t*PiNm0_L) zoHSxZ`}9(%6w_R^dmE^4wVGk%6>?`9n*A+8JbTeI!<&aG^GvrQoQqvvV2gEl)iY`z zC)Dc$vGrw~{BrN3pTCMT#B>;bA8>gJzT%p%xFL5`2Kk)@B5N==d!r1_)sxMAYDJ->jjMK0TPs(@g5&~ zEr*twGM7WpnR{JeV4{U!1T&`x-7ns~58k~VXwro`t_`#PkFSTyGs~g|d5-izrC2PA zi+&zV#|fhSSwe~Svrz=+L(P4&h_#JM*2XA#-JUZ&yrV%>@8H???0Kvt;W=FuPIijS zI^5I7vm~A#+MZA`26c+)TjC>GQ3~#p+7Qu|Fs1?WNE7Kk9qzl)_T#e;k8ST%UL`zx zHt7^aSYO|`c@9w3wf3GQw@I&NIrSW+|6dB-{FNxrKUdd(*{43&l$erZmuGLYaH%Ej ziqg`b7mLFL{R3BgPa_AQ)x_o5u>{_KH!Ds@IE%ZprXzC5%$fGTh?vNUZ32(cIKS)& zNV+vl){-Ug@LR3|XI<-N?x6PFs7NcGJt`uQ0>-!@$CZ9 zQpHv=Z=b+K4(i-BnA^S=NfTYUN_7oPl(^~X4{~5i%Y9?^E#>f!8j}OQ-{l3AYWABw z&BnWq=C&4$eVS!tt-1^S=ZL)^NoJ%(B&9*6nO!sCA7i1;)p83s++A(##S+b@EOqRXb{0RB~ z*&@h;+j7CI$6-`|vMjS|6boCD9$fMM@Bd(p<2dIjBytcNmvLeU-;I&!D%3tA*Lb)t zoim#>vySNm+Y^q-YACsIn8q#<5^^@Zh54_qfBoxU9_GJRm>WC(WBSKG{z2d8PlP#( zj4jDaF`V4XyN>S7BFaKbB=y)|?+BR{QB^xQ;qOi*4qzG-kn3QkGz)T^4AeSe zC2^RfczmFSn07$GYgw}HlDnv-C)YB}!tXP?Wu%V z2KmTfK$VM!1o{BYGcW}5)GfhjL@3Kv;4HjcgMayv8DMHxQXTk#VNrVk6tNNrwkm*L zZduZG1|?!hW6`ubXvgkMr@5bZbZ3@vp1!-ToAr_Anm`OGsbf;>H67+$tHOqa<3XJ1 
ztkMQ%RqrVg7C6Y9wZvDoz|4yas38_n9-EW-s|We&dT^KeJhL5WWNumcj?=(FLJ%A< zryz;PBpiB$IrT2i`A+wVrqn)Y_SgTT=WLIbFL(++y+v`NETzUAXrEn4LC>aubHD#< z`H&RL<Zz=G9_=Dflsh>A#1R9^W zjHX+u0x0#2y8Yv{dL&*cV0dH<@w+x+8R{-i*= zG`Qd41)Zf1{q59gv;?(|nKtoLhF;+-EW28Demz{C572ou4coxv($Ig0_c#TvP1Pr7 zi~B1R8wM)udb(r7CV2R4DTs&poTSqnDDSOwn(Flx`!xXVrt5^=2TGQ_a6qp~yIj*!i)27#|RxdKV98w1cvt|}WRqwAE5T3iS~Oy-2%8CLQRhy5$5p>Zv@3_;vcMO^)9X^+WcoefFTx zC-plFH7W&_mEJINM53+_qRiGx5L{QAUQ-agkB{(VJoMN9vXtvnoPR=-Y z3Fq1p2u*9D2Guc;iYBaeaP)>6!wxY1>t74THP$m9oY_OYpnyv>wf&}Q-zKz#JH=Z% z^}=WN)=}XeEO~6%Rbw;{N?UP+lv(pT?9+@yd?yQrn$vifF&-pQY(uMoJjRD(J2na^$;-2}eH1`xwsl zp34M|at?frQWW|L!mMNga&bSn%^lzpPtfm3EQS4GX%GO-O?>b2?n{YQYkFBOne}|% zekS`iYim2Yt75K)IsATb8VXCRBhc+ykdoAVhdR#*zQ6&<+nd%>{y6W}@SE$>%k+$6 zh(lL`_nE(kcDm!?X?S|5LOKU2G1tr3{*?dY2OgK{DaT)>n#RhRLFGIn3m+ijvKKI( z1pCx+OZ^Y|YiO-lnWFNuzzx8y$twC19R>0~!a0x+pthXLR0nZ*uJhNG5BZV{fcIkI z5JC5|h;grvBs@Eb)uxvRlV&0TftE|90}IhUQ0ZdR*fG%GBcQ`TF%=Q7P(yR}e5d=} z)4omely_i~COm~=>613xeWJU~Hbr~Jbr0uzF{JF%dv=*P7|Btf7 zIlCMqE4Vok;)VD6Q;0b3E*(n?>5~ie9M99x<;|>RkaKRYu;w7k?)VnvT|`iHAbsx= zOgK<|r!h|Qo$Gh-&HEf*Pkx7GK&!-^aJBn&w?i+Dw<>d>T#Oif!?A>0!1=|(kj2%H z!K|@Xw&`&zzl>3%1`7Dd?)-WVNI_}s839ymoLjUaoHM$-BKWv^-A=W&$tkK&QiXxyC! 
zH!;SmPJclyxpTruG%3*SW4p75LXLkeirs6r}6VV zCas&WHxG=6oB!u`w>d}63qK5c>igmSBDdcjNPlszDBI>`f%p?jqSSBwLqta;-tir7 z{9kA}HShi1JhyIoe!jik{Qsou<@EOcal74~pPt6M|AW~ZIk7iLZkK;Ib2DldkuI7D z9LB+2@5?aI=dzm#truW2bwdvlNyBFLW~3q2#nZfzHAwx1{ZI4H>7V02@BjSg$N0~G ze&Vm;pZ<@?BaYA0r@b5o?a&-V{#rrrL#s`RzPM}3I3dE|t@cM!aog%F)7A9HSBVbx zC~fAo7hIAvs7FH8q68BPe8K;{;`_b~%OjLF-pN4C%;)4C-d@+G2T-%T8$hL7s4Ad4 zz?gdYxFsH}qSu1@mhkJYAa%buMZpCw=_ZWT!*h&34Hb$%?0i+fe16_W{dxPm4WHWo z8qlC z+?q+M>y~lNQ(GQ41;wrDd;pXeVroTVQ<|;C%g;#l>^)H_tU&tJ{QqnFlkLcHEo%@S z+hfF9=?Zt58lneCD*n<^eWg`{x+Bf={omrpJ^;xBb^t9hEh6NSZsxr^4!YCsO_>*C zhpn4FUjl3DvPPmUQ2xb#F?5Crw(ySX!>O!+UO*i9I?P>rPAnIHzqpt6DQaaA(4a69 zO2i$YyRN`Fw^=E))e7H$GZhZz{8sLpr;uz?D(GXFCP22|U!t%(WH4JE(DcwJ%Z9uK zs(5dAk^)QBcQqH`EqF82M_f+9PSpQsva!7}ftfY%`@SP5v%#p<7WMHlm5L45)@)5K zT}px~sq|V~{2QH{ozujAb%|?hnSq9GT|nzM^Fxu?@Y8*-K?$bP$YEuRZbv;PQ837A z$ieva4OJ&ONidD*L#KNc1V>3dz!@dAR)G?%?HuhaHO`V8?H=(`TXQ8Q69OH3`1gE3 zAMGma0ZknY6{M%BVw38bubT}zz4McZxjek0NC;MdJH+s551HgZ^A+VBRfpl9(?*((KeT$o{gx5CUHF$H(&@jqu}+35&N+5whYtekc+6fKTE&&R^e?~+Pg z<=F4G+|VA)UpIC3h6{C%K8Brnzc{9LN0Qe0ohxmo!@vK9>y8POZRf#f}n}Db;8P(p1o@fZW`me*Q|d-&DqafV5%FRJ-iY0w-weV>8EvxEbV zF%Nhl`Kq`1@ROPK7*74Wu=9Wt!>Y+LRb3yPD4{Zgs?@|#wdZ<7e#k7V4wRPtkhof| z9;jWCRlf_=V&!EU(B#mW{%+NJ$>n1we~+hl!O2SQq)ARZ#Wk2T)V-ymKii@8(#;M( z;4Vx<4zp;2IWboD$0epqFn1{-7nCggFj;W04)TYx;$U~)d61^-|0*S9VmtzCnCP92 zyHpmpJ&%(@nw4$=dfcrdrhLg`StrbP^&udcqnR^eS^hyd`b8aO;Vzf)QNE+py2?;@ zm-krMmUU-tJ!#Mfd#l6zrX=p`>n4Od-knY1+=`%j7iGGV8c~M$*Xw##ra1Z!IrN28 z!h+IscOHt!)MZft30sQ6Pl|-}C%z1RZem)~DYxhmZWtkVVOpIev0Gq9Ab++u`@dT9 zeO}!a(v)s8`6Z&x({rTso7?L!sHbA->4*39{rdrCcV4_XAX!}yhlOO-s&v`cO&fjD zVX!C4&D$Jq%RXH*moC02*KWoZNW248tKK?jkDh-Xaqc(7-zCzi^liFy@>`JFKgzk> zONo!z@?%^=y2~8LU^3F$xB*DKys;{lHzC2w~&v9(^WE^K*xpD>mycF`l)Ubhn~7*F@|i+=hFs?|u(w+M=NeCv3SJ z$GdFK@R+0_DvSo?ErJ};*(FgtBMP>E49(S0{w$rJl|7O@Kn>5*>qEHTD` zqA`xx*)AKD9r}fJ;K5W5W~4w}45>p7lsAr%ic?Z`Ifa?&Va;6cjDM>1T^L?mCXBrr zDsA->=5^*$tx#=}^^)SaJ5{_8Ne z>%ID{z0qoNd55lgZB0Gp8S~^(ZOJEojQ+5DKauXkI+TNHxqKJ=*p4ooNLxP 
zL@flJxK%M8RK3_vMT=V-m-@Df82xzkB}Wh zb3$A$$hWOL8X3c6P&Z6;Y{|ZO<*zz*JJKVI6T)#)K1{aZq{!mRfo~9>Auk#En|Re@ zoQFHfnrBA9sE(|173Rs9HkG^Yx*t~y-@t6S!@J51%7Dga>NPR1wg3$?wu9L7YO~-D z-7dq)vLoK5gxCh=_ZnvAWO`)bPb%_>MkqSUXdKe$9FhEi=jkSCsXM1`(B*FO=_v8~ zaX;uGzSQe+fb*r_r3Ri4ykQgio+9JueAX~x!Xo(m2=SQHm)~3;eBUULEGy5M87L_t zMGMko&nvl`)~-t^M0dSJ7rSAN_J(;(T1{O8r#QbR$~cw}L$ync@1uT}gX{01`@Pzq zvfz)14$onxJ|eWqYbsGLJp+m+W1FpcvFz#Tl>ZLsFlGXz z%kKOxVfk{B#QJ7Rd4}3SBIVuDhPd1z@Omfq1J#h%dcjSzu&`)|XDCoAl)p))a6+@t zX$|K0JYlgdcmZN`Q^tNhqu)TXO%!JqC*BGZ4-#|k40rC3AI(?&Da!8yFEm}EGCH7d z+FzEf-(&t~H?14d8~3`_vRYcgC%}We_gcP@ZtxEz5uT zjh~**fb1d~n##+`FO+d~<}pxJ#YsSy*T`eA)Q`CaKZd)v{!;iMSWPxwpV|g=3o(?% z9G#+8_BzN@E|%;sU*=lgrG;K@CPx1sY`&T5K)-H%*}AstI6ne4KAWHlq_J|tXF*Dn z%1qDjkV&UoD$_JvCW4=bzf>Yv7sc=wXA9CgP1Ds4`o)I7!m8X6h*G1=HPIVVKC6oAcfBOdFrlbm1&qF5f7kqQ8Nzc6;7Y$ z`5SO%aN{zT7Vr2(wubv^SW^Mox;=Ac8?lk~2>QXNJj1k{s+y=3t04dSYv@z!{5o5+ zIzub=5P1coXBZQA2m_ct-#@?95jK$s;#Y zdB-2RTODLYTt?GbxyV!f`oxrM|phQoWwodqp4ORt?dW>bQI2i!!Pq(z=E#YQIw| z{Tc6nao4Z@vb#q((W#BuhmvohfJubH>N?Erv5$e0x%8a!nhKir!7Z!#WNVgrw;X80 zjeqb&;VJdKCsB(nX=`I2bSbsD941cE9Nd(?Y(Y8<<9WB#<#W#{wxUmh!BI}NlaQvz zG#S;lAjO~LsvM_U^YPlsTQ!gcP~TKM{2@A4NWO()9~314OGNSwh|dKXM(;NbbEZWS zcq9AHb^!zGZ7vEl0IjvlZFfPd@OvSx?e`2CDB12Ban9Yb^Ou!nFZ=yljb&y}vx=i6 z;*;t|y`>!c|AswR+OUg7E%&A=A%S9L^+-+Bid0^+DXXc6W&ULgwVnEtt8FryI*!z( znenEx|M%5R+VVcv#fhpiQy9 zMp`l8X6iAY8CQS88J&}ka6aFVFFy0{;_`pnVpt(_SpCfU?S3HO^XrjjbD$Mdcl2!9-XltWigqqH5YJ=Fv&}b~1Jc_f_Uwn_IaW@J2FBzZBqQ@uL)lH9DT*IO>9UFWd^0y}2R1TxL zYRWwgZNG;TdfKl^AgFzJy3wj)WMldMfvet+Y^g)Bvq00g(dv#G)s|eH{P9#WtgRllp4F zu_vO3T?oO z&zuaM{w(@WSFN1qFX!ANDTvRT=;;=miVHYUo*5TY6=f1eKsUpgZp+mQ_{0VzF^}0Z zwguG{bvvDC$7>Leh6_$Xp5LCkLdwjTdLd7Qta73ytit@d^?g6g41ay`$3D(B@`{db z`S`BKCy5700J^&6yC$5=#k9NRTir5QckBK;jW&ZEAbOxoi(Ym7aYvGr$mS~;(GID_XS>ray6Gu665&=y2^vb_ z$U*9#^70QXgR@KD-%3^my2M`TK;q8Js|l=eQIK2CxsV2!n2Ngf?|nJ)bP#}+@26<& z2op$!pZx`AL7;(w-5Rt4lpUyaI|cSAiH=8{I?<&zY|=w>E>O+& zx%Nq|N6_>=hQ=i;)~G@c!=;L11K3-cKxUR)$Sp1nLo5R1)MpM-%5DxhRXkMAkx8c% 
zXv>iwCvpwDAX^>#t3f9fJ@G0*i5Ox%%%xZHbQghf9V2DJsClwK9K2s&NzEFn@aeQc zr<+~o3jlKFdISCAE0Jb4AywFjCXfqqC$p}e`>W0}XZgbvwqQM3D&9de%b$jn6vlL_ zFrn14Gbo;Z6~7K2_BzYoB&_J@M)Eo^bK13^tlX^%^6Tqn)?IB!3xB}{U5ijJD0xZ7 zff&MzTOH)2cob}TbhxU-H4*CUC=>r^47Z6KU|DguD1Mqw5sCX2*!{D*;@ z(e#~5U3h=}HI9G%^;dlHJtc6afKzoj$&e3`DNuHs^pZ~#c@Pck0=tf)7ZL+bH68TN znk8@d82tA9xJ*o>rx($rA+y5@klOV@@ulKHkL&K#!E)Qo1JbzM-j04%TTwnZ#_B@aTR;uvL>y;=Ie z6({NziKAcLDge`K?y5!iRc}B&+L(tVL#2N9kIn3sw2T4>$qA%|?K?E|Oq zRGFtc#TvGoh%sv@e?D)P`dH=>Q9U1ecbM_47vVIVV$9&hSi{CUi95v+s`c~xW2bqn z8d6@{B5L6<#D`K2)Dt*_XJghvuq~->>a557I+QM)Sm9h+tijG7o#Zy0J0H$ev-aSc zxLYBJ4$0DZwRg;Mm#hk7K#NI|)74e4g~D_(Jv3)eI@^6ZXDmb z@ws>IWjz?$+Ea%y57gMHom#N#-A{QBT<={T3$8;=Lf(USnUliI<7LHe-IX|LYE%v+ zx#bc`BG~fghaD{4{9$hZ8y};8m`T&;quircqoJ9YAEm;qb5LLu7cov;x5f>xllA5| zXkcOt6UK&IXX>rNj2A|OV}!{tMxQkgh%W6vl{LS)kB{j4$`3fd6#x9=qa*wwKmLi| zzQn)&@mK9Qm99yA#k-|>8UW{cLP2VE!Zo4nv$=GC=!9I)!V`VCjv&#t!fV1x3BsB3 zGVzMftcca~GhIqM!yO>&rm>GpPx{Q|Cc&jZ>|>$C$==E^(P1{@JoumsY|T*7&^?*; z>L8@5wS;Efa>HkfvKzjNa(X6N^t)O7)$yq(JVbgX(GLK+R)j(vqP+?cA9vc{c@-kX z7ZNEM4RabsOo2wF;O7WKO>~_J=4Wla2Q@rRsNOendiy!JQg?xqA*OywaB+Le#93Zj z2hydD$`a3@zcug7f%6bm`1f1utfCJf$if-o4?BSB*(dIj;9~>MrItFCSUsz+V$@?b zbyRL5u zsU{gxaHN{KWdIhMSed3XuNcb1D)Z3*<-Z}PGOYQ5+f-X6sVPKf(l{AkLdRZ09^GO{ zL&T-71N}yH8?+6I@$F7%msjV3-e+pvH1RQnJdI!a_mna`Xf&C^x{h-(;`%eij8_kG zx^XOx8KCZf6s8`>m%&>-K3(lAPMLpA-4fOah`h56 z%0&uf;s>sxY^d{vi_;GC_%LaTqatZJO27DNI$qpZ^E$|rW{HoF>F&9n&_L8p@ps(! 
zJq81pLljmCHzKRA!u-7bK~3Tr2_X;$IxM`#HekFc`%zhb4*gM<)sS{d@UpjT$_+s0 z+Z+BQ&9*n-JYRE)(=OUGMd7%V5Q`CU5AW)TK&_nf>Rs-VIRpAbK=XLRh-g=h?kyGf z-Dur-f9rnf=W*QtTi8>d^uL1bE=FSzCRp4HE~X$X@s|5r-JoB$J0(~=C9ks;kxx<~ zGMKjSeRQ2{V7b;p0Kg;(kUGrstZ*XVxttVE-OnCfhC&+R0%9xAANUS^hSbPafw|9U zN?q^{e6kg!wmz1M03G>iFlDe!3kno2Ci!QcDD2Q^yg^6c&00gsBR zDynOmy`d?0r0lH$?(GES^tn1gR8pEyMjN~)MaEbMeg!0I+ZMQe8S!F*nVKcTl#lgm zt2W31U|>*L%4-|LOG2g)Pf`hGeG8-wpyw^%iN1ca^lLm+Yl=Qw?K=eXvZMwNuRspO zjmf#z@ghK)s$rwv=HBvbOwaeMciM+*O$nSQn0P_ZE5+x>3Z&rJC(on>YH&VS5kt)& z2B@Uv8s?A~A|mv)?`)g6E|Ev-RlH3Nq)G}W%1G{5`a>J3Pj&SVAM(70g20!?WHDQL-iqXXl9#UIOj$4T_1DsA53c+|VH2@xAMY}r?lhdi#-jmE zpgE-iGTQSwUO(|9CHAjzqFFDx`W|{rsc21&hZ;PD{DNXojYtu53(}L79}}hhV+2+} zYC3A{Go?jEH(4~uB#67h`SV3VU5Qh?{(6=2>FLjt5HC;fPfx=^+#6Eww%nlnou{giYQ)*AerfXA{ao%#OEC!v zwhL&@qP=u&(HxX$ofxC*NK=r%B2z0rla`HMi`HBtFm$Oq9>gnfL=N@>sEm|2K>l;I z7m6|`^?7^3z2C@xqEkjsKUx$Ko1jcm+*iCrVkQF1s)bK|lc$nvER0i@tvYX-_T4n`+GJ3yx$9mg`VANVOpD*8pC9x~p}c^UE@}_V zqJF*b=2nMU^P&2k-PtfzNL0$!$2G?^46`OnlmKAjsjPyG)_i#xMz6g0YU%)7viN&v zsNub*{@J8MX=v1>a7lSeCtmm@W6=UMHT35vXnynil;P9;h7_m{l&F-^&#(I6Iub%( zaB`||WATcHTOH;B$24DDtOETL&kQ)L@+o2q zK3}B!K#%KpDHu`FlDM2iJV&g$?27bLnA!>|Fn`J#c%RbEwB+$8J#^icqptcIO;Uf@ zoC5EXQwKP0m-@j9oZtI6$<3P*Wlz1FoW9}Zj%1Hs=bp5vl}D1AFX9l)7ZK?vTX+rC z?!55Arb&sKH9u3IMhrVo>GZBQ6{Mz5Jeq;X-q3WCai(X3 zDPp1}t(dl!)(Lw+8-%>mnSu_jQ`Ggen3g*wB0D8@n4h=d&p5u%27KQeC4MC7th%Xd zRrK(ScY&rq!N6|1bI@2+N;#i55M>IQf937uRgtKu6$&5p}?FJM9@rE6OE+k!CTe9Chv@#?<=<#m|U{$hGK z_|VENm`%V{IeR3DiI!SytJ`z955I>$Z*L#!O?LN+1YoSu|j8oNQ0Ke}not3OZaAtfDJZz=c48b=G9Hf2Zu-GT4N zN1xFoJR`yC9x$1p`O>TP4XTKcp@Q@KW~Y2(xR3UAOd(!BBi#aRs5I!@;%fSU88mf( zGy5g33Fm9{M68WiUtR7Y_+8rdwTaeQ^5!QZpa;(Z$@uh0iLf22S7C@iuTBN$Chedp z2;AK5+lta`R~^Z6O5 zUBV|zhbwxi&XzFMykAu!i)w+1PWFMjqcxaf_grz9GxIBIB=H6a9cwKE(AhO{$cT3> zICp*0jex=hq7%b1k7yt7dap6{Rd^?UF{-hNf2jrPZUdz0^Pl?TuI?+OXP4@Xlp0~E z@ijjGjDsFHy_^?cCcftupk9U4{SsqjKMd;GQZ2;KjWC?g#Ta_g~#p;WJb1iy}Rv4al_Fat5Ll{TG6 z9bD=!xoFd&rwgga!2-q1h{$BqP~2uyqAB)to&+WZ77rz`fK(D^ZD((Fpr9$X;l&+u 
z6t~e7XQXZ!ZKA}PjgAizI}B&^PiLHHB}e;X73JshHibE2@*fyKXP?#Ew3~_0#g#vi z*?IImV_J}N%Yc*K6t0Q1w>r|#TaJ~7S$jA0*AC0^YJr+-~8O%9Zth_pX3??8VYfkf#xjQ9nW5kJopb)t7C1%G|3)6aLz@Z9M{ss>liA7Z_MB>CNI1<54)2NEj(&YV1iEX` zd6bnBkSL8J?Bm7o!1JUa+Km5U9p=woKgm$+&u|5&Scc|l(nUzL>^RShIh_0(ly_7Pn1 zf!Gy{P;7(pi(cmNvko)j9A1t&Yag^Hy-6bZb{%|tM$A?qHQv&N-vo4HztxZZpUaPD z?$-b>8|mW|j}o`2$zrlCKznFqxN#Z8p-5J3vnhV!U%4blbl7ajTNb z=?qm0zD>izfOrD)i)L;!9zQHidMlNZeL|4q1C^ScK_%&WhjtitpGsD|mk!B1?{0ON zUrV8>pSxg&kB^gNUR^&v`OeYJk5{RM^DNj9R)I9#r(F}!)8q8mCPgZb#@bB88gEnX znMloxw%DUu-Kr?(UeX~2%zD_%>0NHyASIdz5D7Ak4mh63)tuH?CYDZudJYt-N#LP! zY1z>Pv#mH@Yj>Wh3uxX}#4XBQ6Io=)BndZGF;*JNc280p^(9pr(5vbDP#$?6hLVn( zmQ6sT5j{UjI~KjnuaU=Z(Eq05F4g=bDLyt3d`di>p@bulQ;zedbSwAidDv<*yORAW z91O1$vvKcyjO{%OLlwV5UFKA(858-&9D(ZQP^7`0sazX<{J8KshWC6Zyg-T`F0|*i z04*!D!z#j~E~D!UPaROgS?VbBPBViNS7-SD#Q!Z*gy*I#*#z|n>D05s6t0hrIw@-G zAE$25sxcZN4k+HL2e`~F35j9jC!;&I>Q5qqd9oBNl_s5l`~Bu z<5U}*iTmqx@uNw>l~)mjxOzcN4yjNgz7KJqa?YXxDLtW$K+g~Yzj#0cWY(>a%3*r- z$e@W+wjHPF$KfQS_W%c!L1ZrwHKSk07D)MHPx+EW%v=FVC%=Ch)t{ME_$^^RBqlUX zNa-%^qGQs@xCW9hJwqBdb86LV0J_^xgpd13?9Rn9s z?TeMSOHR)Uk()GYK)=5amp-}qyYvN(_Y;cX(ai^~4huy%WvbLJrFL3?{&pWu0+yT& zy4L%;JPS@C6LIK4q?u+$m`PpfKxOvKD$ukKHyUXJbM`N@>P#~u=pY4GvJ%DOdw%2F zoPMkUT_XGK3z~lAADC{T8$U*%ApDD))Gfn8B~08-5Y}mcxp|53$%!2>t8^Q#f;y`& zoq9*&a}tANM)3>3NaW|zNF6B1RcQME50*MWbmKWm0mZdX?y6WP^6Lgo{SOXZIw&*> zTP}UzHg(kT*o1WYNHKq_#@APBeTe_qb%&N|5XPjGPG@_Bc@UAG4?QP%l}@LMLmg$@ zq4(_#zam(;y!0f9iXK9$Hx;R;0;eS=E!Tw8E_udKLQTp2QC(*G#3f2|O}^_bB(PQB zNV(e_U#1u~Wv2TxoYeU;^^8xQPL=ldN_lyf6F1QrF@jnhmvt|EJZdE}w>rp(@-^Gh zrfUe7=PDFkp=`1zX>U+vBwnSQs-{4SYFTb1sslaxq@{NoGJtw9bne7PKZ&8F&XGm) z*hmY|5I=re_6+c!67p4`$rlwp<^)66hmJk-oQYQYtoK$$xm3Z2k#EkkCj5HFG3Uh{ zRc(EUdu182FMy_nH&<{Ttv-|_F0(03Y{}4ShBw0F&%&SqDcsOzP>>>159ajgHoo4~ z8sp5Hh;tM=c=-HH(oFNsV&+o$6QrLLQc69o`U+?QtMB`#xA+al64#qlRhbJ|RSW@M zbV-mn5pD}O?*Ww`(FyQ(!yfpxw+H%dPfBBTdL(VB#~eyS!)2#wwn z74Lw+4C+|tFh%ytHR3ct<>Y8Y1ooWsq-KUJ*X4#bk|MN|L3tLuC02LrnsV$+kwH}p 
zP`*b4zeYp;UPW&}T{Y=caR_)x(c)gTjsxp5eQW%jFKJr+p-;ix0F{Y#t?A=ROIbm% z0`IxQgDSKwp{_iAb#bE}XZEf>Q^_$E5UevQX=5zQUxZFNkFj4_{pl!0Z)#~Xa$A@U zuT-uwBX;sg@2KMBbEMISK=uA1r2I2rHskU(oqO>iKHPPp*GwcqbFOR)elihmRhV-$ zI=@p%YKDG9_p2H)=6luS%hVMZ4aMpgF^O3hNRZF1O$7346xli8ct6pu{%k*SP zQxjZo4h;sp#$4bGBGnvPa8hz~aqjE=1#SAiWKn3M1WT2%W#M@pN2WSX6VChz^Ops0 z)=7UL0p&<_&4N5HzVlu)r}AbA{$ zaeHL(>&E=K#k*rL@$|(1-Xi^kgbuZpZ3xoqJ&&G7glMY82kzjRq9kIo-s>U8?}U-l zR#4O&0UI%llrbTX@5QWYOe-d>JqM5cCMcqGP!;MeqthvIDwpo0sx51mC-Kr<=FZD* z6KIM8bmoh*&ZAH_MWO!|=C3lomV$uP95QbT_sE7m_w5{X(ydhp1S^#I8qRc;LUu_2Mp5@ejhor+-?x2&R!&*szk7T(68pnf7E!v4UZ^)CEbt0{ss zI)=;g2xu@f$zKD|VYp9B{nO4YWBb#_JhKHlbrNBMO+pSQ@bo=L3YRJMy4Hc7ceWD` z*kjL;0ppYIeEfglnJ44cXMUbqK7rYAls3;%J*Fd`3NXHzUZKOl@C*be#m{?G-|9eP z^!Z1PKJL&z6Ea8yoE`eQYIb5Q!7lZPKY64SVa}7l7A&dJo-HWH^osZCtFm-}6a-q{ z?5KR->qp2411TJ0uDT>mEUifivKY~|;2dJe_NRE)#hP~fQs#aMJ<*6w6uBViUrxMd z(*j;J1O2`Y@x8>X$kP50*s|}kywy`ri~vR=lhZlB#+vD0lEj2UB?s{4)hlcVnojWXn_)%9iXCJwZ?af%MH8=0(}U!52y6w7vsw^Eku8 zIZTk#@$&jbue;pqR8>AHRAaIQp%V{4Imf(()I=9<4g2(gg>O-9!7K)(nr?)a+P$;~@A=GT%Jdu;~V*5zsv-H(fI zUeJ8lk^{9yXOX5>oV*t^j7cqe=+UJ4wjuFuDuPQ#CGkbJ{u6s$E+J;F)EI#ro#En`s zxEA#7A!K5jD$cLldwj-+-#%$?4-*H(4jI+;-q2k%R70u;MOTl;gCZAdUaE zEiezPNRK_R_x3U!esb_p|A@Df3?&_vc7+?FDd|+lN%_$Awp{>)6Xz&ighiWgEX)XeQ@W|F3#>y$NeQUj|QEi{63H|Uk;TXwLZ-} zh`ou1iI9l&gTRjNB#T2h)4US7`=JK(6O-W9xnk+JhyS#qy!+0lf(>q__@5&+$~S3| zeGoKuQ-%4(2E$Tk>pau)agYCKE%c@*ugr|fJ<$dl36?Z>BdnknGIl@li!`_?=Xkh1 z6Hxe0UX9am^}*+nVQ@wIrdWS1d!8KSKvkzRf6^Y%PiD~wbnvLn#dqDNh)Dt;+5bSt$!<;su?>4U9s`q90=+AOOib^-u zv-^}IwHOA^Z8}w;`}Zh6VAo=iOjnU^C(KMM0CwTSwL(xC8I!Ctr)z zftm^4q)WkF;8&?<%C_lw65Xx3@GB?Eb9_5z|56|tXzof-u~zzG`FU>^l=wO--JLG} zp}RM6rpn%r-P$>6I`>Iw075-b3|2=#At!~Fr$>PDC*O~EziuQ{jT6C;8lc8Z*@@zY z;?7Cp$X2^ZH;Z4a^zLhVdF_);Q_EvK{h)bRj#H6@@RC7J7%>S>mZ@w~xf|SP z3QD_l52N2HdZtN9Y`I8zFly5HMa`Hz!aMx92hLKWNk!YZiksB+pdMsv?>8njO;`s% z3{SM`gLV9>KPunJaE6sJCPKR*hECC&4uYp5JfCa;ah@P8#`C2=;wm9y&UBxbXD}wd zacQE5f+ju5CYUo&JC{VEI8Zm~C&@-?B5`$Z=oELkP3LT*w%n#^I21Et8secCqNpA&tzFGSc) 
z;OrdV9fTyMoc$`pW7JX%^+X3iykFck0Y|bek@{P0hyMJ0-#!NQasL?mh5Umz8SV5H z?bFnRd6UkAcSN8EhmY0n6hu%-|2pZet z4#igGosRKDxk<7^DUp=#XwlDXi5+Ug(-{2{P|aE>!rzPCIaIf3*;N#lYQ3Rh%Yc}< zXxj)(3qh%nELvT5+@q^Z-i}EdEb6jFCF_L{G=&ksjJ|j!$nDRq5K6oBfL0ft%H+|m z;!hv96g7$me+c-fmbbe5cltLhl}kFga!(4>aHW@->juyWik>mU$SVPpJFm#kPtsJnO8q=+QBO9cne z!?ET##WIyc4pm~wuWj7Zc*2B4;0u?mszVMRJmg;9c-~ld4~is`sY0F`RPAid`SA>^xHKZP*Zsi)u{NY zdS{}$VvcZzj<~=YuGasRPL5PK@#xwJc1C+^W!gcbdOM?>2vwPH_Yr3WD2Bk*W80SNe9ffksSKh3F+FR4}RG_kD z$2OHI9@3JfiDBHK#I0ovTl)C@97Ai78J1FIHzNH$MjB1{^zt_R8J>b2y^gCNA0woH zLY7EcU-+)LpoHC-!_5j)aQ-2e&?&a`yj458DQM!U8}zg*YG1OksRhzv&_zJMB?f#U z6id%DeLm^6Rbg&{riBml4_onpHjW)6PBA(Zz*A`gGiZ@JbBVX83X|asdw};0+SnfB zL7pq5h6tWLzzL58tVLIFjvDbk~=DK*Sor_@80oF=bT+@V-~)*@j9YdXQ!mD1np zI1f_1jA3U>uGB)DO45VD|&gkFOAg+dT%C?jA)6=hWCcT~h+<1XH5|Tys zB;NGFv6_x4NV9#QWv}4;`aUChOgkQIm=mHt zI|dzCX7}ag>P?@c=jew@sK^FPkuXP8FunyvvQpAg0{R?g^qAdPQ*t<($67s}dtFfx z3MHeI4&0{f^>p!}zOBM3VuP0rhn7<5N{Kq(Ul=zDT|J0*ES_8S=p~IX+ zi6E)Ee{F(2m&OI?>6DFIKokqlNC6ZCClbj@4JBGlFq?60;hV~%xakGjp}eIRY4%0p zf6Ywl4T>O2N34;4?;yRu%XjpTK4E6)eU$J*mTnVdq1C=Ga!d&_ONuFk$RHyQx3VkkC@~lA&sYm~fi_bn%Yg6$)kmJy zk-TLyh$*12BHf`SOf6?NXHWfg;x|f^llqxM$+277Hwu_aQ{_Q4;jE}I?-;Z`>Oeh4 zkmQcWjBZmf-O8YICMVci1`4dm>+b{o`n=hAc6vHMF{*#U&7WK2Ab06|NQtj9BxU24 zC}Slyl;Mb$5F$g$aFg=3Ej0}dRV}*sBxTGr@?T%RpJ>>g(s(}lqa7|fXZ8SYmeh)4D0IHK*gKLvwp9li8VhdZL*!M|mLBonBB)M8qzQjCyHT z4n8Nzr+^-QNfqWcmosNa<0k0s9?tsgHC1pG%`|I~WE^HUPX_LcN}468!`$Fx6y$?V z!c&*0ho&hf0j47K>a&SPqlyRAC5^r+%q>oQSvX+&?dWC00B%Y-!Jw|U$mRr|7F9){ zEU6~mg7oOA|M&<#RY}(|Xp?)7JhMxLHC{4S9m6T8l_gf4tay105}Aox*Xb^^xpwqN z{Nb5Q3PDjPwnx5tlu~`kLlH$Ihc7)#WKeml!{iqz>xG-S9}zFDQzT6URTvM3 z?*xL%j+Gmg<@W57?^K8RlO@oGqWh)j+t^IFHTh0RjX{am#mh9fVu$jHhcJ!e)o`9y zmvWH50$OJTnbWt?!vg`eV(ld*P&9OqX3s^S>l=UABlJGEXs&{x*BKK|J-knLko@K# zL43Q0GTu$z48-KV72=&fmrwlM?jGRZp?rUcl*oNe8`9|$4*3(WuKe_RqT}6Q?AmCy z@cdPh)2md2SrKqZwYvu`YEZ6sE=h{*;zU@74cdY;O*GOMeEX;GeWpwQ6XO!Aypor_ zrYwGi85c~Ea3&bTS)G$T6V7v(c#}5#gVS%~){Wnp>xw-Sb1E`<8WZB0my~tK6FNH4 
z05w4Ux_NI(W{T5f^rvWNOZCLHaU8j3c$Xjvp%%4|^R@ zm4_*(Nx%hR(CXe>$KE9khK<; z7oca(FDK6Dt(?u8tW|3woAcTa-RsgiP(#^5RE1u5j1R;^ufU$?3WF{^ddZ-+3Fz24 zBp$Ym?v-Z3dZ>gF7%Zy!+>p^cd9Msf306^lamQ)k$$P@t51@PWR~@|E^b~y(Ok#QX zqpX1_0nMFSbdpg1tRqD?G-Yz{XwAnHdS8Ig<89M74-)M_RV>0r%R76a5@B+J?79?> zGy?v)RdZ!TS+{ws>TVwu?NKBF!CE|IOcl%MjeknDoGRkmcpv-|Wp;~EH~!}XL=iMU z2z6!&1^FU&=13iScu=%&0Xjq47?|8Fze$$1(88p7Cr>0hm=9hI>TcHZdw?iu&+k<1 zfZNmSpFgk9@s@vILOguYl@8dcw_tB5FVWXX8X}6Gs$@=zj!uf7uK|BUn!M#)?e5E* zeZtG&*BkUyp>r-ZID%U zM#GbYEDoD%Hl|3Bl8`9H%@v^O7Oj+SFSnFTozFx-9>|emOV5xxt>`^%0eX&+`1T&D zHc<-7Vyrpt0>@n`qgEaYsa#iq#+N^^T#1y!(8;v?L!htYs4jXabB8v@axB_QSo2FF zQJo%9LlZe1=GQ*VuT^@7rke52FZ(o*?<#!19vxk^zJ-;GIu}V?nslR{a$d;eY^o02 zD=LZ3oEx^9I~hi~rr@aOOQ*p-J4iq(WJfbZ?myvt=<8@58ee|cE)35%% zT6Tb_q@-CVuOQXQ^7)2O;u`E!l^gUZOP-RCK31 zmWbjErA~%hK2PTu%Ql$erb=$=`@3DI_L>OD9TAlF23-c48`G^LC}A(cbVsh~r8Vaq zTS%7$qRr+PRDslFCX)+D0|^VIVq-b%ayh}nYj`fP*;st(I%ZkF4)v9|)#1*URL?87 zr~qR z$CIXoNu5~aW@;wIe1jc-#RJrN)J5aZqEoEJU~@e+93`aA9OiJH$3GnBCMZ$)ayo?> zBYSe^;>muQaZSwz<1Q~$ShexrFrSbNv=gZ98$31}%F|1!&@auY4%f1r4VpR*B->Hvt`J!J|*M-1dJDFU3oGH4B}p;_<#mYZ(Sbd2N&f zMM;M!o|_EHuT71=aYaeuqnFR|hl5u4^V%rn&w7vk4a+8);_|yW^<N1boF*HGnUttFh1*vTnUs$nEQgqt#d?=nXvMg;8x&b^7Q zokBBqyptp!XD(={9p!IZadwC|UgAoZ3>FJB_0_q+nSLuMat%(@QZjev=Rb(#Zrme8Om{O+!x=MRwnlE0a zsh=3NGSzDh)#Cyv?}>rHB;u#Kg?F#hX%h?7`W|IdM!t-KJ+6LhEGLN|-&r7ATHOaX z@|ngxuX&zqe5h1LcI+)!Y*b-S9^ezRs%gTxXU{n^LGIrwt1DYuVl++Uc#Xz)QCKvn z`MuAKU@b^psy8o3DqR5__zG#$7A00y5#K0EM)3E^uihsgZVeZ}AwTuB?D463bN~;M zLkf34zg0bBv+yAo@gKvv{#@Uo7z>SW8U6|JK(X8oNVF%Mx}%hC(jckbr`UoaZ^n&L zch|j%4MHbMRbjzZY6>Tx#r0ZW_u&>6;tv+t@uljZdn7qW-P!0Hr9Ic1*V&AehWlvKn2P2{}w(oRb2(+4lp4$1OdRl`C`L;Rh&`)1$ z;yBfWUJ~&_;1t09+3_S9$Ek^}uxrD)UFZtV28*V2o{8p+K@&=kK{BLv5uHj;J+&7A z=;sO0Xi=1w`lmEBQruGK%vHPmMU4CtGmVtdLofdF(gUZNUwWTCrq^B`w3qqQPQ8IT#Aha4bbu#{Kf)D!P(gH7Xqjq!fi`OLq(nkJh^ zTBXe%R8es%DCuYJ{HA^SI;9?_leE4`ukO<+0VTnuOiWdLpK_je2R$F~>Qz^NeL;~J ziJ$1ehF%ZxpVdBX7`(>37|jPmIiYa#ICr{E-1*^FhxvWWH$1|-qwbAM-4IJxDf~Qb 
zF(|y8CZD=rlQnI+?zbp^wkUU)o~M&!5(V(-RO2x(Eh={;Ii)3U(e_1iNtf>X7IUsX z^T8K!=c4GiCQdg^gKYsBy^y@8APal4-Pzqz7_x@>y@9#g@>Fy_ukE?wQt~H^Uywfy z;H37JbwI>7${QZjci~ny==UvEZQ=)l0Tsk zzYKEf4unRe?}I_REk7Ol<^@kAcO=1AG15MST-~C6CbOOaYtlFisS>A=_X;eN9JiaD z4@ZnJssEx1vOrmcYd(fS^b5VJ8r3t&Wx5lL&64_W7SD~7;A^3BlUN(tejf8a+j;Aw zT+b+Y)VIJ$(sAh%<1LSZLP{gnk>X7Z=x3yB+s%u90=4P!w)EP>HwTR(wiv1`2>XN2 z3c*#NpEqyqAp3W!YUb+p+9$q@l+trV@PQUF?ThEeFPT{fl0gvWRAFwzpf$+oF+BKs zgOtL8y^)Q6at3^)xksL$n3xCmLf?!K`xGza7}iPd245(3SxT__Sde%z z&Sn!*6-Lu>Pi)=lc7XC{LK%IGGojVJBh(@$L0Ke`o9dx<<#EN&IP;T8-~H6z#Pe6% zaHYT?kN)>#bh7}^(}HFUr@IYSYEV-3mpup4_@!$*J~dtZlZ_t)bJO$5a6ZSgHIyf6 zh0`8%>lpE@1~`*iN@Uug@(WILy?9}91I`yBegNB54P(7Mp}*{f?OHF$WCDPi62aslR}h5@qX1v2llVq|_}?cZ?n@2>%a&x-_EK0`zclbUdn+w~Y|l)K_RajFQf01O1Tf z6-G2AZdH(#N&J17_0fk(AQPgFI_^Kl2xhyE+cVSCw*ejK6)RN~rq5pER-=|Y68fm< zU^I)_Q?TSN*^}7dDk!U0B9xIYFFVyq66=bJPq7Ou$zyTzEDJx;!SL^EbO(#*Cn*>M*1yWCZnb|B8)Eg~8udjQx>_k1!lK8c>?0UzK z5$}c|8dYx z)@0^85*w`J+*2H!Zl^>BIQ;^Hs_Rg;Np^6`x~RAXzz2srKoxgC3u=95r&ib+@LN7a zd!;h+hPUJa*aU$44j35ee5pKMiw)=S+ypmJlpZL7YhxG zRyMDF!`W(C{L8*6KUU;RLqiF3Pg5ukyeTkB-?aor0OIpgKM%e7(LmPaIsLu}j1ED} z@i_js=egSiq`_pjDc3isfdnVWA#uOrcW+2lzr9UEw{0?H5pHn*rC4fr&si}qr`YKc8MEVkQD`muEnqePm_ouTnFWmRxeG?+J5E-BU> z^z=V^pe8?X*r88uN#IffLPPUjb{2(n4wP3?gS=%H@(Rwc6ISBB|1eK@1XeyuQ%_+$ zPtZV%7eIhSj^S@VZz?1P$oK}dKn>BcOCR|mT`qI{^E7_EM7QrB{qfH~tMk-4f$8a? zG7vy^M<%5^UpC=9yGv=$_LqsC=cWW`a|TOQo5iAB)j^v%dtU>ZRM7s(Y$fEd$W0>u z@^K<+0LUTrK9D>+vnbafpqD?jzE)s9l5nV>CVKACs3(c}4byK_sGb{<(qMM z{n${Hw!Zh^1VApAJEvTI00mFQ*|sb^2xg-}YB)|5vCW=f% z!iKskhTmlk}_-!E~cW&~9YI<7!hp2rzn*Btq9=%M(0}PHd z-B0K$y#RGas8_esqlBYqdXHa8n2E*fs_@(Aa#0QE*KLindgL3r1*Yf{gFp_QA`B<@ zhMh~JKy{e#hMjVwq**1FstLz9n|KGQW~@TW)o8AAW>t}|B|?gDtKkIH?(hE1iDBpi zY6(N`Un@N1%eN?XP0}Um!H!x2E%2aQUJ1q8w#-gDvRMQ4AWO-~nTT>LuBuUjAfUV0Z>F4A^B zWN$#~h4*QBLzA3Lu6zh}%iJy%l+^RRq?O86p>m9Ih}X^Ke#RKpanEY}wCU7%E5gH

ZoyT4hP62x zqz@)CbaXUS6ggefhO?q6aXp=!N(lm9snc=wH7g{QI4zAy)rl_8s2!O5fuLIXJc4D(90<1sVBJ4H4pZjS1f0uxvNio1d62YDoDS;;Yx}pC;efU`v{1YL{MoC4Y$_v`iw5Zr-hxO z2K0RC{T67O=g68Jb8*@@ma0TY)?Rh#c7!Q6Khi?0tUVi2?E20zB*k+jQh2^3L z9}KqQC`c)3fl4o z^o&83=2^9iOEhS2vVwVi0}!`(@NvbT#-q&1gH;FU2gXOVi+}Q0)bdvp3Q|;aXxOI* zr~f{cP5OHb+(G{RAXNg^g_YtrlTLlJB6*mWc0N1KCOHO@MsJLO4Z)M!jRCmzMLXwl) zls%`OIGd@EQ^9#+(;<7%8%DpTN~0kLvKZhI-4FMkIT^A6-AXEu)PR28enH_h%rb`K zj^y3tv>ut%kv_OHGHz8_BApBOI+#=;X+gR}ud@DJP3$2Mm6%Y4_-24WJiqhZdhJaU z&e~xbuM1kw2`Hp=Rp>);ns~sR=#Y^nufQaiPltI#$?w>9)7aGDKBdu53@2%^^7ueC ztLK&PQaQYa5=Fdcx;{_lDL0h3_0vSXVwOgJz3QbYVVe0UiKGqR0&@^kmqy;Gi)e7z zhgnaRWS9!<=U{~LT>*5y@O6+mMcx3^Q>`8W$go41_sB__)PsbE;Hv4tCcE6Cj`xb$ z?v`V`3d}v9WP0-PhC*%k^7Ld*k;dZ=8YsvI6}`dW?)Cf)r8bxf&hzZ}!O}NX#%O*? zxVTH>Ev5OxWHuIe+KD;syvvmlH`#F%gUmh7S(h#1dUTWpgRU}WPTz5JYHHT0gZxQ}2v3O!F$K!;pmzxOM)B0$*YyV5G&if^ z;X81;;8bz#lM-q4_`rhQf^-<{%zcILgU+AtF+s5=}b z(Rj}z70FD?D>+(_s%;C-%V)nKN1kpeOAW#?bi(#ryyt=elC;Crl-E7o0Ssxsq#nWlw?AZICw0u_=WKa459c1Cvtdl;-BI06la~ zFVBOLSfmFVdT{aX?-4YiIv9z4G^U=f-e^vq_#sP=zEZgqM5NZXPJt z1ayY;>&Twhl~*ZpLKlRfSTqD~(IRN}lMr9~W$Jbsc6Qm48phQ#M=h#&7HvMvo*br- z`N403nZP{tFB_Qq3*Vp{N8Qe7+tDaFF=KjGWA-X(!cQQ-4?)Irg!=N(jrY4C^SQWD zhxDy*`#2jWY~o3BB*2JNghTSIZCa;y+Ogz#f67d z{c@5Dq!^{fNoqs-I{wnzeF-o9+AIFoCP6F;%vUHPVlN9n0Q*O4TXR{Uz> z9c4BNAT$82H)=gH^F2AE7^R1HT*v2d3Y&H|w?j186qV%33q9z~zB)dT)*gk>_|SXZ z>mmUaiUgFQ=BKJb8?xiuoL&>r7?s7M#|-yA7?vajPFLJd@u_bCst%M+hS`f=0hNvK zr;k2KN4;;V7*Gz}%}Y$Z@>U1>bJG*1*8=6p9)d>*Nsw(0eMHi0o-GN?&-2mD`TeYb zECg3~M+S$jMOzxmU{Oa(@KU?z7;1LxJ@-g#F4|X-Zno(2f-&D)E1b6FEYUay6bM}O z^Mm6qiC?>T=!AIy=+0+l(0Xg`lz_q@HF}9<=hqg^87Pp^WX<~8+M^n^@l5s#PM42= zCc_n^ylN0OGi7kcIgpK*oxLjDWLdL3OFOV1YA~_HF6@Yv1jWw8qrM(_AO_s@25$J zgSM^fQ-hOyT<9gL5PG>%UB~mJXg~n=lKuoc?86%({t)8hckxP{r*hCz zY++o>EoxYIArWCCqgw@mM{=28)`IUfsG4m+Z6+ zNI?lx23wQno$Im+UfU*UDfD5RY-S2uQUMlaDb+c87c|QSIL?x6G;wN%TLa*P_Kl<4 zjF&z#6``|=gBC7Hb318X<$`n%HWZezwT_E70h=-W%38Q2_POX!oHRXR$5W zANxg=w%Ta`QNG8_J4pSRLHXrMl;gzvTEmII=yDxG5_y|wPToe*&&q^Hk#=jj7hAPg 
z&}SD;@$JfEL!+mIRmB{wE1yA$Em_`jRi=E9qXVvXAHY;8i*3LP6(PD`6l&FNd*;66 z@2@AUsddUzGRQ&jJ}@_nGwTH+tTVTUEl|H#*XpvBwa<~O>rn$xpf2O<&e`gW$!Yb5 zjhdtEdCpAkl&CHTFO#T)piwh}#wi|WC9LUS1&AX^(*hO$2ghbzV1s$@?eYlx+}9nQ zSf@PZH@SKFD`2w{a?3RUqPpRpxsLY9S{`Wb^#}@di1#blCD0NC=iqYaBP&2B*VekLFMN>OwU-kS zWW(B{Uv|i)^tbWFE~S=(q)=Uago*(mfJ9LhU%0M#0o4be2h5w0?p{#GHf>4^(11DA z&pj?x$D+c_%w?xoi)1S6r$65Q7=Q~HiK zsyG2e>|Up7IC9g|-<-IVRI;#Lbg$B6#1<%G72$Og&{;%~z8OCB$1vpI*bm{{+vb95 z=$fT+x{Nozj$u_-9D3eHL0m#f#-#!?5ghtxH#_pQ%E|TA;Pz9K7ZIt027%3T0Om~j zZ)`Wy{(b`BXPQ3QhuPozzwb}u{qOjNQ^Dwn=T@lXcL8Dq_En^dLs&MywDgvFr{P{V%Li0tM zCOq~Z(212^9@6q?jKvw(^?}Lv6hMug7@v2LeisHMm&EpVeCk={|E&bPU-qA_CbjLJ znnpK0I^{cbcP}d+x5_M?51kN68UYs0YO%SEX>N2Oz&n#1|PYTG^nVpEc>e?Daj?;HPPRPeC__HP4|M!2CQa zR8g>hjx%1PPvg_e=-+a-&qIEGjdyiD**byPBqtle#($LOKzUZe`5Z)7?1?bXx$t2W zY5EKwJ@ZGG?=DSAuc-YV;QZt!{Wbp9 zS*)ushPID@$_9z=G7%=Z_f?#AvhU3pblY^XN75AZ`GG{eW`k<}GI=>`yH{ZT+`3;M z>KDmJe~j6T9$cIXMzwnewR<=I)^~#TNJsSPK`^IT7jJzUIjWnpE+zbkJyR7Gt?|7! zBnGb-P|1PHOv+S&i3!sI6`r1AjRBj#8 ziRfPtF?i@PSi0;|UPCuBJ)w1*{@kWa&d@x@(WpJ=lbSHvJw(SAU-QLRnKrBj^MNu{ zB@)RANZAKs3S}dPPV-kO5Z$u4)d2pGk7swMi`)^$iVQ)H<`vbUl?9;$??owa`Y_#0 zb)5b98gWhxvAd7{BSaDsNI}kocgflVgIY6<#MFVhQOCb7F!Hof+2T#o5JiLb43Cq| z`mQJbs0zwYByvn)#`-f~CtF4u4~0A-dkZ1)j-?%pD4$MT^>vhw4C^+L1WvLZ7=T9R zsVg;_3%HbbA~&xAZ_e9<<{7hAN=r?GQ%)n9$J#p`Cyv8FFJ_IZFl%M!p?}@P%XOO2 ziF;ETWW{~&8PWQLQ_qDlsZw|6IndiAiFQ1AusZlHvS?lqF^i$(C=SyClTW~UktwdK z=1lhN)9Mqm*<57q4}dlf>JuNEKv!3Moq{ZoE~zJ?5$U>Sr1|Lg!*)ty$E5YKavz9b z-}k|wM~0y^H!o(CdKx#WflSH!A*GlPV!V5aGMq&rYI>O<_|kiHb=!_#qvq!3Xvvaz zylaN3h?XPd;X=o|(Bei|!4#?iC+Q7=*(zA*3-aU668AxbB&kt_+0LtWhiPhSpgxG@>rjE!4RfZxd_?E$^U87FPmr#`5*-)xa056ay;UV#KQrFY=qA`ie(ILB zPcP4Eu!FTj_-&lDK}P@Hg+c!bL$HGhYs=={sWzXc4oC(q8<0VF@HrHY9>R|u#@eigo)5PX7v^nG4 z^-s1qEeT<3jPE*@%#F9!OtT+UNb-GEoH>oJPILKdpu3J~0d%4-r(RC1ebHwGsB_*~ z+RP+VhsxG?hfZK*jTaC*OuQ8*;4;F;HE%ABKCp2v>M_{nYINlhFaRL>#QSF1m|R zI6BiU;L_ejJ9W}mtHO=Dp>(MBDv;~VyPKWZu`l`#du>U&j$H@=G~5hs&^a;Lg0}pu 
zgTVWC@9tN1H)zRgrE-2WwJ*?uR+OOy`BadPpq)Q%0&dr6P4!|Rxh>?+b#G(XB8nWk zBi)S%G@>Qnz!EP?yvOQj>228Q;yJx?t6O1WoghIt%+H7tZnQ6hYkX!Mslok;a2+Jf zrl3{v3{VQey3Mt_ZkaHLn?a)LV_n?CmTg^_S8=>(oIq-|#kvQ3VY+jX-dJ0Aw`F>p zuY~$a{C86+`Cn`XY!0AxX)}l^=;Clccj4L(=N7WgrACs}nuOP?=lwA!HkywRGwH`w#-ZQTsAZ``u- zw(1GtQ6waL;%)#n010?31~%vm5nY5V1sT{l8V{>k7abQU5(Q45H|Y3tNc_%0dH>|r zuf_RxnDZVoKw2)IIo?<6h=dppy3~tLHj&evA!YSfcg9J3{VAUKKB0vO8X&`pTh7C_ zj^S1LsI|b&35Bv>aj$o=(YA~nf@Z=X@4$PmB3c(1+TbKT6c=o}g-J>YVcfaKb_)sX zcUa?I=v42c*8r^q_7Hn8<}YXskx~nN9qz~2bF)70+EzCO>}h-^Kam$=)X6 zYXMzhwq?V@^KD3p1T9Uc)=Wr2n+z)^qI{EAUU6QNNBmq%Nl>sbAWtv5eWX_Se6U)O z?)0kv+iPG6eiahS@gDH3aR5-wy!x7|GfhZ0Cu$lEP7*r9gZVGaiOU#9s<;gd2O1r* zS_L+NgaN#Lz_X^tKy0<$tp)9|57SQ;UiY@iRmOgu2VaE8 zD6@oji)TJKQu4hiW4A#4JcK%J&JhYP=O=5`M=&Lp%gaj4?QKb58jXX+5VPl1Pn;`E zTQ%79elNCXkK8@JHz~z*-{5R4EkHLT{{MJtYIU4UouE#or2uv?DO#b@5)XNN5&pJF z!veOq(cK#KejnZ$)c>Y!szd}SsQa1&<$W&J-;yS7^-$gJd#hb?mGAQ|95oEilXk+O z{kC_U&w!Ja1``s`UiIJ?@yo#zaGlI(Au%)5dF>20C$H4$T&b%AZE1P1kJ0EOlC=R2 ztHMOk25X)O!~nYzO;~?^HUps(aY(+U^4I9VUy)5J;w^=Yax|~BEZ*YVz(S8mcBX9~ z^c~K+&Ub&k@1<_siGK&pIR+!Fhd4uGh-kv8i9r2*x~axRANv4ji&h%74aVh%IA+l} z$T@3mH)(e_>{c&MJMngKw#)_5uub#;(R^=soQ>Z0#rJb*-RI}&^BCvzYxJVWyGSA* z{mfu-h;!ns&DW{ywlCB&A|U&KVLQb-p14JW6X`nrA2|N*0a9fm1Tlm&MylTPJWGMo znlv7A0Vlgnld2!F7?AkSr8dlE5HLE@60eiz*FK-GL!JUE=qr$)+xzLZzU`j|qF0!I zYmOX~QXaE-n*tG3$2U}Wgp{G=4dO-;w@Sk$x~ap!iF#sJ)6l!ky2~ zI;YgNaw7kzU-f9rFm3VEm0@a(SrDa$3+NMPS(E~(@OcQZ1t{H?6Q$z7FZqN&JV2&E zBFOQuOX|cEjI`2m2H1Sdr>?Q%3yHclZvN>{DzDtR`p}6*ZRt15y)o4(s~R0=7k8=9 zynGzg(l%y^rX*UN8}xG*>1W*B5uqDjF%k9@&rNQ2;uq8Y1$uIl$jgI>TQsOCO7hX$ z_wz}LR}^Y7HBThDdAVWGFZl-={u3w(d>{q5@|x>@2u@Pkp~_(No2|R%CrPFa+5&S( zje^5k3i0?PHPPsxxS~Wyl!cnkj{kSJ};R?#{TWVsokn{+2pltjuiQ?vu*gxzw3T+Vk_dbx?+`R6-D_$g#(z!K| zbdG#$Mz47mWL80jJKq_w(u+0HoPMIXLnfp<&Id<|Uw_@yYu~@Rak%#(r5&Od`qD?A z>~-jKzMQhBrx&T%n^Z;{101}#2?X*bj9bxnf?GFC3XWCCi0qG*J$9{IBHft>%Qav} z10tTTewHmSpyF7;fjZp=UkRy9+d!td#nI6=jX%+!zg}_u?~m1<=80{iSd4 zNT+(FNAo2Tdur(nDv&0Fc5Oi6-(^ABF@K%J#)rW|)$8hr-Hxt2vnYR};+CKWg_g4) 
z4>zmlHHPV=-{HgG0d|<+Mg|j8ZLjhV_ z5GKWk-yXrFi&Xv_nEl7cfaWFVAE0?j`Eew2;*Bp7i~a(X2lnW458bVfv+kL0^SO1; zG>f-N6>q5yawO^qC?b-Xp7vh=9dLQAGNWx1Z0Ng&ECDik5RvzLIIYg(^esR`;J9`_ zmt51O+T=SlEFz!7?$N-CO(0T0Yfl~JB9Eqvdx>?54_ly3UsVN)SkgM0sY@NOdwJK2eG9& zU}=_U)N}S((RFAxakrMG&v;gu+pH%)TC3OIQyZtNBi9)Uj`4Z&VIOGlepaMt!+C1b z%hixcx6>tAamj@7yx<-_=_cUytl~_fW75vd`NhW-ZK=)k-{2xgJ0AutA&|y=5~0@S z>~2+zyOZ?)tbcKt?_uI1_XcDif_e|<%P9fPQJ$LS!RknfK@Ys%-qPjd^YK|L{0uvo z8!oBbo=g&X0#i(G-?s5gndwYDVBB%-rmoIZ^ZR75{;pS=KlsHaApUXnD9-ug(*1um zbCSd+4M}##qc5#wM@nxleRpfv0k<52jJ@Y|hLg?O;y|YYW57^7sL~z3^f}VtyEJFj z1a$8+14hTeW0w@=%KCr{HE<6~@Q?(pr_gh&AE}1d=}{JE0iV=quu%nm{OxA#!>!@$ z-!P<3d7PPQ)Xraa*S&8Qs?2d$9I$W6JX7M3ZJJj+{aK+3e5t zsNYU;M4qJ{B+unLY)xpMduB|9zUO=^S&Eo^6V3tT8J2)0eF3)_tX**w?4Bc)D|fy) zNS!1ZFJU`wQd1^L(zIsU$a>u!sq zB(jHJH|K)0-}|=TShJ2Zr<*OjCS&2~7dw|Lgny?AV)S5-!a;U=pqvA@Shuoo$-frTwjn zGfxmu_JL&FT5NjuehN;C3zlI6(yX{CSY1SkTl_JfeNcY|(iyA(jZb}%BpHZ6DINQe zI^IKA3pVXz%{w(L0aN1ebWrh9+`m^c2FtTI>T@7cM>^y=p>ZTa+*H8n#*rEzN9{K6 zs5tYQX9}HEW7HZ7p4^~JT$q%xg1gm0Rwc$#3hJuY44j%q$PBVOPB5H_-zi`EfhhVL zU>><8ZanaQ@s(>LBXZ=-B5f`yemg3hLFv5P?8EO*NR)!58VR z5A>%FVGPy^8}GR=<3t*}cJSxcVbhb?Vf=%!A$v~WYJHcwU1@Ci6eFrP_wjEB=N_KE zyX%@UZAtBB53|myHxVe6ngqW{H5^4b*q+o4ZGyslWMObq00#npH|R$-M>^N^0AXA(}rx+I2x0Lq8*yXfGrV=l>AM|0IzA`OnUwy7hli z8RsvY8`J;wSO4Z*-z(+4w~CUnYnlU7&WfH92Vm*JGq2fs;(We&>^Ib!Cs`2` za1CM;t}X$nh)c5Pr)(m*AXKMNe{Rpu&rdHeFZbWiFVEBG{`}|pclq3PRGDc>RSWgsoBV{E6?_$cjOTEtF0iL#IM$B=*+t8wt_bLE0O#)Aa&k{kXTMO zA5VX;8xQJ4{**zDP8Gsv^zfO*r;3iWHO!}KSp1uR_M`cGbR>2(5E-p|9p}bK(0rCY zm2FveXa}NSFFH5Ts)1#DY;jYmM3LPB^d9DPR2CPi?azA0-ZF_Bz3Bjp@`pb|O+iXf zi6p)0K^Bhs8@4kNk3_f)Pn_cVgudi>y{e^GKD#Wkokzi`Ux$(Dq z>DBN4N%p_H3^B-q|M}?7E&uy7u||f_vYK>61d?I7ha~k5lT@Ik6P_Sg-4Y;8Q%n}j zV73EG1&fd@kC4K|d0xYaFfMy*vw-2Bas9d4ESg%HHT8|5gCM@6AOoR@A&O#S5YU)D_4Pj-l&2=H~8RE5Bq&c^VKd88PG;DrtNmL zcxvMF?K=l2WloD($4!bxah+Pn&T(!hnL;jtLHb~zSa}_3Ha*@cDFQBWkNnLF>EdRn z)Oy{BbiNViNEio955Lk=^mcjiCq4xj6@4zYz!}$QisGX+r*voqXZ-c9R@CMVg0lIA 
z#Aq=jX3(6YHEBb#2hwrBXkrCy+yup(GPI6Lz*n2HCi-g(i^1n8wt(lGRhg_}S+(q@ zt$oTlE<(%e^IvZq%`y^*;^U2~JnzW@P&WgOk0s_;TYxThOLuFpNmatc6+`xB(06@x zug+Jh>7aJvwJHqimNKa6Zj)YhD^5fkykv*Qr!<5mFYeIE*_4Pe0=n>v9m;#KgG{tf z=Q!!>4P(E<=twl`H+EPfDzQ<0ZW^yx6{!ySkteK3Ga~(!WF+XD z%QII~l8I#4qBF%)cL`{6Q0PRZ9Ag#e7hR}uO?&@jj2U0fQuvR=@L(p47eDmC4UG;g zo7NZisu7v;t%~@BMvtT!$M@lV{{g6`8zjj``Dka56)zjXi{*T|#IGW5O}4dncaL%L zJav7Kf5D#)6Twp0WO3^gL(`-K%t}ffj>Ucn>$DeK$~U%Pr4m`ZVD5jON6kX1RqkPT zFSNSX7;}pE0a$B0KAw??fpU_RE(K`YwqcGDmbG^qZMx|+I?wA=vt?=O!S#?B>0(f5 zH0wU_l5bzoK*qPBWlJ2wo8}DM`fX^^VUG7LoS1l#X%@A5#S>*dMXfD%#sBMqZ}tz= z0(yABEuhX~PgF~2=P2l_J*O*-7IoCN+XD6QLRdENL!1#JHJM6ic4@=9^jAm)^&0+r zq&u6mtwVxricV-Wn<~kG#>ZBcKp`erzX$dX5;lPxrNyH=4A&y;+_3FvkOW;^;QMCh zzewY|tj@Uy@`oM#iD!KcwNz}`$2k?9TQ%+H6BStM8T$59ne6n6!uPAx^T3lA2Vk3Y zZi$0MBA#?IK6QLfBfe5BXw_NDpaJTRQRh!8o2)~Z7GAI<(&#`|2=Pm|Lb|$5yEddR zV6-WC;9VGBOb9Pye5E7Wl>nM;k!OvI?y2i2o-ElN;&dXXf-YG!(7-EIN-TBus2-%S z_Q8$=4L?9g1<4T4)ME;)$t%QvcQe*y&DX<@+kIO1w5L89Yl%`_d)62kn9CB4DsL&Z zS_S$;j6nxGl}~R(Uuiv)UwQrKj#CNgTN*FoyFl}JE5aw~(+0FR4lNB8j3 zR-M;$vZD9u1Wo3bnLy5K<>bs)dcm$CH*!~ZLEWS&%@RChgFtS>ZCYdkJ!q9lT;Uw$ z&&H$$5vYj2F#dAQB6S^t84ig7d|`^R1yV;GaWn(fh_s(%R(9KY9?7jHxhqLsr-lj2 z`cS;8sFEslpVe@F-cp6i4HKFAxb;;ZqlH95prcV|7uY5ZoGuJ4o~e9I!tWLP-kIT{Lr=G-2E(%*4)S2UjEyKDHOw*%B1PL|I|OXzv`c| zOgaUkXdscKxyEkEz?s#O^NI0+y0H$2nQgLE{(zr3y1b8&*Jz}tJXDq9jR~b6wi8)c zhh{@!F=>Dt9WCXYoqI_7zU(Pc4>XiaXNKloJbdPkBUK!!&W>o^wzv2Mzw~oknOGTF zZqH1Gb7M|UD&H%G>B0F53Z3TOvdfJcE$i<=N_s}8xq(?97EYaym}u^z`E?(J;0D#) zpi^IQ9qUcHK>{iXfDOfcx!+CRFxEU3PQC0y^hO1n8x5QZEZ(IlsBf{U@g_f!jV5i* zOQQV+&3YvJB^gxK{pN@VQX2G2RFHmu{d~u8xqpI}{5)=Cp7bf`vai@2SQi&pKtCBW zXk2G6bVUW-CkE}Rb!SPnvmEL=wbK&dk#na>+#$NpiW!$XB5E4dsGIio;Xj7w5A*Rc z3=^65K-cYy(w1yGkLAgjl|NRfJuZ%Ex!eskm- zfKFFGO49I_FwD$WzMu@4n2f4Ow~4YDsUM`L8AyNi*(ahcB=lqq06dA7&PSdzuDHZw zMEncqD8Gp59df1oemZyQFEB0{)oZ*_LP>5?!`#1kn&Rr~7O4Nh#jw9uhjH=${P>s9 zNA90EW!}+fd`4W4gT)GdrFD(SJo+ux?HR&-QNl-vO_K8$sOsx^1|r0Z&CU~=A|5mx ztr|{NJe#~KSko@Q>bs}0J589DujwS~geyRy$p@DKo6QaJzeO;dmNF&#Fu%Tje!j?@ 
zpET@D<(z^32%||4Kzy<9_c1*G(!;YUaf*ynC!G2~yI2zC8uELXE8W%^r3?N zl?e5Cfc?_l7i8i3#s5Sq--NVsp2C)S>n!pUlSd_~%?AopCT>%9MM=3Z2slh;VlJl& zlUnutT8VlcuC};-azh0hLHg+L`z~%%{*sntQ9%-~b{%Mq#BPVQwjn=3LNAg`DpAc7 z?Bq*|{P>eE{|(S#TKHCzZi$sOnN%tW|liV`Sm>J2WnN;LpI+?~sry4cj4QzsC7 z&;flhp7rqnsF?$O8!{n715h*^IYZSEKzrSRygE$vJAYz?q!uYOAF*y(GUpY@(LO}RJq8=&M2!7Qg?TmFuQ zg&fcsB46unEjU$#XA;W&+~n0)bZ(1?4=6N00?+fRqS%DGxN^5DO3)gGaZp%&cbMaQ zXg~&6emwHlf!9ozNN1)n=Q5>w3sSHfs9YLLP&amcsGAc6;Fur>T~xd*mrkRgL~-ZK zEJ#&D8W(>Xh&WPvNxxT*QSl_bG)d{`o*$-&nH{F7zEg$yd7IzLq2S`-8{zU{A`c3a zCSX&KE;Q?i2{BHd#|%@J4|V%0&iT$~Ju$2=(nsTlU%PkyiNsHqQ9$pOwxr|97GAHD ze4W2lVSewpq(foOVYrx$%>4y|`p#`UQ=AhtZ~~_2JelM@)j-ynb%(7gX4|2cxJ9+a zs#Ao=pz`vA5FxW0)R6wL2D1SSH|nm^0SZs!d;jj;_`c3XwnBwwZ&JuqMTuAWbv!~^ z=ryEO)wFHev_+pFjhTUaO+yvi@e+h$(P(<_(H30=`gzkq2Y$#cKmN~q=f0&E!Vy)N zNkUXk@$Zv_*K1>D&$-ntcrNq`lQHq=3;NzVKPYhpr-EDR$bkl0~ z{JtgAZC$7#I{`z#o;Ya}q=5s4K4>Bh@g>qF2iZaUCn!StH6#80`rboID~y78Kk9g* z(D|CC_sGz4So1A*1yhVVNI}w!u|e?U(G155CR(g7sraAPO-dsic~~bkXU0cA4E>OQ z4guXZvufcXG(`7>&TqS8fY+dhTr^UaCV)GuvW%;6sK--@{eWUALx*Xo^K?dv6a))#)h*=*CM zf-Vj1n&|MwH;Sjo3uZ7O@NKkFM9_E<^9+2zHW`GYB@4~ z7(+b=ZjpDIfTsG0pX(#F!$e5YS#lrGnm8^8PV+Mwk)pE8L|%d^U(v({+=f&Y&T{?+ z(C>Js)cZAFxP0T2!bh~fNTet8+MA-3Ppoh>r+R0=4;%WzbN6Xi zu6?=jIZg;ay-q}{Rh&I^+fHxO%WqSz;7Uy`?@F@1D1+4f9EOHzVw1s)ul^S1_s#V8 zz8jvOpYCJVJrR+m86{2Rj9%^FoFodSj+j0Ex{0ma#s>flFkNEc=pj+2iVLblswc&yt4Oz>j|T?G{yQt>LAy6vl=Va!-e>4l=7OxE zj4(CYVeVxz=S&$>ww0TxFs6L(p_u1+H*ADDC!W(bO&Pr^)L|HRMzi{3sBN5pME|cf zIr);y_Xblqr9zRDUk5tV|9c|n@r(q{qnX#)4gG%`5uBm9=eR>5_fp!_GN+V?gy57` zfkvDD0)fzlE2?#HRALkp0ZHMMn!+iSXx~QnRt@G}W@q(L578eGbD5$Gzf_)_kYz^7OjZoP4y<-6T4c(Y57}UI;1dYXo6X@=XT*+Srwt60s%tA%p$NW z%CUw`2tObN)dKY?9+%XiMYR7QxS`;gYUD`^;L>TzNj)(CBkvfubg}S!z{T6db5`G= z>ru=Wi?+-*r)K@G47-uc`m*n{<-=&`9c88{JZWXw#Xb^j{%8Sp3#!v`|Ez z50hnXiBEht-K_!=-`}Rn4gS>oTlRXAbgeMGz|vZo7QC>x@4(iK4U za*YyaU|>Lh&P__6o!Q$xlTNB`6UUX>j$ z`Rf@dYSz}##qXjI8@DT0eF&U0(gmIU=PJ-t8JlMA4_<()=P40gA#{>RnRw4ijRj8c 
zCMAtFoR!bjaAaZ&MDsh&{SnOIriEX_{MvyTPv-cpM(>^OAvO4iQch%A>Ly-q(y}|- z@6BeQQ46vJDtb}HxSO>+se`FoNgetXW5UBf;IKKir>qXVZNxDbb22XvLYc2<&}@Nu zKI*oE@`5-fQ*S*0YEsu@3(&*vr{!~bdT1)(fW8LpkCMOi1*0j&^)x=jk$xYTlz)n= zG`&|FakjCtlst+kz2M3$>m9&dC85ril18W;G=v*z)X zd6J^Zq$zuE%yM!nD}UOO&8LSfBwU4|vRK$I|1>_Nk_vVM*7#C=@AbTf<#Z)BqaG*Y z@Os`q-w%`d(w-oh4)bJj6y`~y|7{j$}$^S{h@|xJzqI06bWW*eO4%Bxc za|1sN^kf$_`A=xqApa?Vflu*ytb!>tc~F?hE`k1Rcff7!U;`F8cr1M)Oa7wqr!`_ z%i)_}r(33_2f-3`KB|s&6S^{!Z&RtUe()RYxvDwT%Q)%CQyE7K(+93U5?m3Anv{BM zx}b)cQ*E{EY%2l)>@rq5&aDW6v>-i zXwDBTYU13mlr2fGehuc9xV-K?Hx(M!(%t(&Wp?vhfCloKJ{TuJ@fkNjTZgbURPy?` zL5<=$(g@VrB}eLLti1*3vhLWGJ`aja?*9Ae^okF*p|K>(&S3~gfo{2Q^j#5;m?@Fp zj&$FRcH^c2s*wZ^FQ^UZd{khsT`zv7-=GA!5-ms*jq#tMPvTSk8Zb2%tLdi;+dxdU z97WPz5Jx7G%K7;F&Bae->ovS&T6gr|n{lv5Oc7@=k=P582y1hJF{5M+CJ|+zX7?7O zwhWD3(Jh(AFfAxTYyv+}(v(Y_JJ?fx~MP+GJIBI_ zzI^1VgFKoNG>T|4VsU7ueAdjM?$!X4*>YWUcpAr-VfcOZC?%0WNt~ZCBq1`0W!=px zy+d8u6_B4_`{|u?2dasjkmThBbaZSVirzP0xTGKqia*wYKH};%HNm8Q9TM*jebyJ9 zkup{g55gTwHswgaf%Nlh`o+%|-S*U4()GO`Uu{2jR_fR1@#!_F*B5{P{WLe{PO7N3 zq}#Bim(L3KRJ^mj;nU#}nJ+6i5(*$SgMt;r&7%X5ei8BSXO=b>n>~O^XwNk9 zC>f&)pk?Wnh8!Az4&I1~x@ZoU%1-j@gdhr0eTbf(fl9yA ztdK&@4Yx9;dr zeouctl5n1w7OX+7(H#MG>Xx+HF9TiU-txOQD14-je3bAE#dXqfW`GDa{ygnd>e+C6 zP`_7DX_nb#VVZ7?VVu2_n#yvSZX{Ghnatgx_@9(L;v*_Fm}?|Um4!0-7}7XK)u^Ah zy*;`dt6}h)@VsoA6znBP-Q&rmwgu5YJDkN(7~tMKN>r%D%d zL!lTGCdzc#0yg#MGG5*Iy@`L3pr%EC?)$nwx3$`AG2JF;yiHpkl7zxFMYmpVFln&s z{OOBtQ0E$)UI)$1B>z+*m4XE7`4+JJm9h(O_BqfQ=@Y0nUdJgKb-V-N5h_TUal-g@ zj7x2?{6)$P{EE>}spCwF;%?7#&cCz|_|TaTNYN!l{QewinSqM`kpl-{4QS;~?I|eT zPn6f*IyZ8qW^Gcj#QF9Gv%HQdHhsq&n@LQM|^_8PK#_0*sMH8*=zPZ%jr!Dkn0xRy1KJY_3(I&(kQz%Yz~hD^|Ss! z6VM*JGtJ&M=?~OJGRWam6q9%Y8xDY~96+@IwL_RxlBZ8{s@(z5=6BSfF6vRt@v3#IMtKVk~cq4jxQd*T6R1u~mr(mG5`4(XwIKDT9^2e>q8*g@KOnF+AMEK47e%G)X? 
z1vNw!=;zJJesEN~?Z;QQ_$GZc=)K=T+E5i2{~ebdaY-PVB*jC$5~{$6cq`PyE6M>i zX-_?pGmDXlA`&=>*$lUbHCJ9Z+S@iD_R?ps3opcAcSpRdd{EHZYwSVPrk}nrNJaJQ^9UK2joeM}S(BtE81!sh|gztro0@mpuKzkxYS@06<0gwQf6nU=4Iwl*zYR_01tW=CksYWa*h5+hFMkbY zeej{*zWlBI5$H0Cr(dD<#PH>34H914TqdnUB?--2f5)J30{3zP{ILiX1W{lBH$CHr z65vT$c$KNhBQWh+MSNi0#TVG@*gD@D^+$oC8vhi>D%{L86-Z2xIqQpG*{nNAn+m4HZGi@v_@);6xNE3Bk#PJpK)r8} z6{+bhoBFyPb}geaM}CO+W26%8+wo%+&)7`=?3KxRsS3<|102wU9T>GeFO5-%PV|O9 z{aNtAE@!FpEjSBx#2M{WPqFFmbzkK96zM$uBp^{>&}b@d!Yn4W#9|hqKHo@nE`j_k zK@vPvcLDPAX3Xnr{O>ZhQ^x$4$)7jloN@o<@cs0jA{$GuR9V+G<&ai4CC~q#FBOd^ zlK6+vU9Y7mwrWu`CoKhiyJXka4bVubj+qEhF(9&1+^vd7EZF%~1;vFjw=K+{0%opD zjBZopDSmPJP^DtDBIjR0`$ZLQl`QWnKND*o+VgW}U|K#+6uXV=HlhccokAZ%vHsD8 zv@?4tP)eB^;?IEssrSPek0F9694ISo7=j+3bSGY|l~3})j7nVi=u#Qm+cNws2;k`f z=-Qxi&vcbo8A32yUsbpBgcuLp6(Wp_IRj;0^dMY)S&lS`Z;y>t>;8xY^ z1VD!|$QshKtZ$m&*?1|Z4uD*!_s|{fY59w!ieyg!<^Cv-Ud8z}X^&?C-#G{zPHuqH zy{f19&k_kdDicN(u1W;P+I!%}c+~>6!p$&jxf%9>ssvOmQxTd{RTwQbLbFR%^1c~s za%mDcKhI1Wk;0C@E^JhF#$LS0xm&c(xcFKj^{dOB%D7JQPL;RPrr*0*KlBIw{_r0k zA$&Z2_cjRi%u$J_oieIWyX=1ihrf>jd3{{pEd9Vt3BDOo-r(cKi zv(pwB!t3jbg-x8&>(BZh7UwGZD$0Q#R`DIc^VB*_9$+2nKqPT}d6XVLRD#8Th6rK< z(tyQVrMUdK$m$AJph*?%AZ8720*$u1=%cJz?oL%pYp1_w{m%dET^=ifRp* zuXldzCxbFlp_g#g(9EgqNQ0U~1y{@L3J^0qy#zJu?m}Je+#NL#xtk5hx0bNX7f?Ns zF=B|*`!(3bZ|MvL1TA29vQ?Q)%oCXy8nsHWyU*4H)^kQ{NEs+l+y?$|DV`3pJ_)-w zY}rH>yw5fu_fJ?84X>oxGo|np^v>|-AU><8tfn{CWICya!}XPFEUeXUkI3OdnoSgx zEt2wqMwPb#k6hpirsv?ni=g~mZ~RNN`vDboQUhW(W%O*&2Basp!K9z@I$lmv+cfv7 zH^}GC!Q5cuU1g9FDmJTICeP6sLgRAa;QstfEZj#M;QcvXgumW?578@7C0Ak_U13HZt(|-agklFI;j+42_r2^{)L?KBkhrh|B%{64aI!!t$2I`w zkLSSEiuA%(LB{oWjD>Z+1cp4&nSQ&1*+kd_avAIUfLpcGENY2BUAM%7n(ovVpta*P zYzh~x>hCrNEn0nX57n5zN&K@5E2pQ=B<@(}z)oOLQlKM&=^X(}yWtiUh zZW#OS^`CM0C%)bPgz+DT^46Lo;ndd_(|7`!z(~3So2~%awcyNBA(j4Ko3xf71Aki^X;UW2==4s^}a=@T}3s?uAvjaKTbPYv}p zrGo?wsqm9uy!|rCz_uXekJ9$j99MCqDH&|28WbS7W$X&+V$%*K_U;pZ4!yPjp zv{QGVr%3_2%&)VCT7*&pRINprwli0eW@a={VEV2jbzQzw`G^9P&L(EjbD*a)srLt`XVWkZQdBWeL>7pRD6Nr_ 
z!i8)?%5Kl~s!vhN<-Q8KQ5^=7J>05Lo_^l80-buQuGMfJ$0>oKzwG-{D-enn9p!#Z zSqL<|r(R-0O+kFChV&7>sk%@UxqUWq4r$3FKid@+O%s1rkae1jM^1jqr=p({;c5J0 z-62vf(j;9g(lU|?B&HrxaX1XG58P3n^p_&s4(gDW-TRPUbj52LpOUYn3weU``({Sj z{}&3X%Fny9$)x#-QU{e4;!Dt&Pzq(IDh9sI)LS8Sd`yM8)h)W3QfPJMt4Xm79MFRLbm(_dolSAjdtnQ|K?yyNWq=CD-{(Fo(#$Hmzc5)D--8}0 zG`4&B$YKzA+@fLQUe;`=tD8<^z)<1sob8P+S_2BGL^++^!5UPHj^N}@nUCJ?_*!!R zc%2&6(eXkp5PB_wH!)>~WW{bT8U#NFO2TYz+%P^IUIL92gdQvv0WG1{6!v8Fvq6MJ zja}cUU$;@+*EQgC_J=zCY-+U>a9YvB+!gLs1ZsVG2dLX7>e~VOhR+jyeR{qKM zvP7}dXh{hRM@=?F=~=w8oOv|9vMISYWHS{~l^?IXiLW*8XOax&h!0+PB$KEvI ze4-~*v>*FMa_o3NfxtI6;(n;XJol_EEV|)-sD~?4?>Y#S;uD~~;KwOJ z%@qJLsT3_xzlc<$k1t>^W@`Mxz{g|Yjnh`tZWGPxp&DHj0t>=aDqCoIYk+*~(QIS- z51wCL{qW#yeSD=9*m46}ZcK5w^yYAxCa{M);V8S_-W&|M%fwQ|HZ5;sM&z2h?d#j1 z&PTYJU^7jlwuacWqWc=LYFqSMxiWQO$fylq53w@V$4B^T3{`<38xOCQcZbLK&MloQ znTz&cSTzIfx69weoT?$-qKW<~>s&54nMFaLi0A6xul|;=rbYBJpn=V~p6I`Hk+mw= zJ*CZZ@zuMkA#Jf445~Aa=aO~2-%0pV`tBGih-!m+7%JPLejLPAH#Of=1`X^Y1WK~< z?d4j-gk0wh@-1022m+MU`h^^)n;^*G6AHqyX>r%9R z*_ocGzaVxB`k%mEm56j6uu5)fB6{2+{oWwm`q59tPy24_L6LgtSCG96@y^Gby)JEP zmpV1LN`h&!sc?wx?x`>p6s9eye?d_9rE@O6br&zFxg9)74280?60PfOi4-qX4d{VQ zi*5SI(3*#qpd?;zZqDMlCthb1RBIT|uczE=^T^Nw27%(U6kTeNQ5W4`p1t_s)g;lT zI@0f4Qk^pu!)_zsBO|s72`kL2YvRo9wqVnN>aq}41K1mROAEZd%KFz3p(d!VSLpr& zTT$`kN2qL{8lQWW>QGBme!`po^joOygj$xm6hhfa<`_RR|2Kpy{ z%rw0c@C)ON?JmcC)Cq1V;bSls4o(fW{Ol&epDbD2%4;HfI8CCJ`Be% z>Ld3)F{=ou-V$84_<{4PNY~M1bpz6`Tgc65xt(GX69@0#)VRDFu5I9_Oo0hFV3fv7 zrFmL{(-Q`n)i3KvX`}ug{nHrMtsjRcs>PL}_!-iIy6b&81Xd6!N|$;R8j)^3=Z|TV ziZnf@AXdO6nBZa#Ypm82w|fg#y!rFHU-~D5Pu!K4tU7DX#2YGDa8_{tg!4Uy#H-ZZ z>L~H(`@SD1vY*{j+STI8%%H2ls}!GoBUVU@3_4mAT96K7H}p><(Uh$Gz5Q!SHVuHL zxk`s-&t9)ilv|P~`9zUM#yQZq_HX{3KyH+8kJ$9u^alb>e<3j;RMDCm4LTS6g6@)b z*9zz77Uw4kb|za2{P`4cw~83w6DzSh44;o%k}>`1(Mn+)G#z;HL~uH_yh%xB@YI-a z_~T=V_j2~0HkbdUsRXs;PSBbFin8ePvjQK0x@m+eD3#`tDAY?Si9?|G@#!Az_mZgH zGdS^*uK;U7?!OMTT9c76(kwO0nW`>s(dhSiTiA4+kHY*^9cZmLX(zdRAt*&hBDhIqS9DJlU<25* 
zahd*n{#3yZ_rJ`lWdamx1A7DZ&6>LsM2Xfe45-sRG*Ih{<WJ>bLoKJG^hFSk01=lypgX zgUKPm*WcXyk~DXM@+WnmxTrMQZqRt8^w>Ygw<(@__vS+A<0}=KM!oq(mFv8Y^u(bv zTyUv~#QT=Jd=zGqb=Ly*2krDN`?c@;uxd!DD>qLH3b+P3%qSN=fZgm_eQQ8`1Dom| zqt0WA6w4?zG_I-+1K)*hi!_=~J--c>J?l=MetX%gqLvh!ay}iZ{-JmS9QRjbV*YJ` zYKjh@Vc_ZC>rmNfiX~xrZ6V()4xuU%AM;I6PjV9vEI*r}GF0O;P@&?rGZAsI*#x3g zL*rESU>&?x#hm)8)90+h@5{E&=wA`}kCOaMF+psQ7`~$CyX6YNkWF-2`y~XJw!(?M^8Xv)35K{K z$Kh=zQlRGP(O)OkM+-%^Dg4(?6{nwPR8=QPf3)TPuUx0w_o`8gkf?PHIRTLCWfy9Z z`wgbhrg|Ld&PHv~^$d;@0T};@-*FkGh%~-XdOlKqsr~@x*M#$Dg%ds?>ZaNZF`j;U z8;$j9@ES>$;Z8|vzs{_QRm;EHw*0541(lTEBgbaS%fIOK3LPcXt0t{rp10pNBZ(`u{y@ueU0_)`zMb_z`<#rIW0$ zjLx5hJef5|Sd~lp)v8T@Ao6*S7@uBLh*&3>)J8RQf`lshRzsh6Xw8cq|h_h5?A zwB8ISFl|%Dk}AKrQw``pH4m#~kp-NHYO6?bq^8((;N-ePz7Gbu^H4RZ6~v7y!Tg9( z53s552Y}P628mOpd<$A4eS15i&e=0;16lbr_78P34ErcP`&U9$K9wq4paM%cpQE_+ z{^t%Ak>uW@$HKKLgya>~M5?N9_0^9S?S{2)n!8kvU^W%0sf7|J-+PQw6nT{bs}_06 zs{yO`%AucPPpKk*E*+~(_vJvLL8^qs^pudijY?B##S>5I8A#;R2K7Yczdnf-`JG2~ zzsvhXP$iKQV8p{f|Ai~)l}H5!)u=tGVoo64=Os9C{Of7Tnfnvn{b*1U#guZ!I*m|b zz=ry~t5`8?zvBRG!m0qGtyMFvt7<5lkP?AN0-~9|cn0{pYGj&JZUB7ipEgj}`L>59 zXaaR8f=UBT)am8`_L#4vf;w9_`YrK20MmZ4 zbqiiq;)kH#?<(Gt4!~hlV?Tt!UvRLwZUwUJ(HS(h$XIZo*=q@nK$jxW`%Xjc1G^y2 zrm7ULdQZ)YM^IJBx;0Hu`$bxG~qYstOOw<=3^oS#KXti3%F? 
zXv4>c9alqY+u^jT@-NC3Xb9+uRJY?Oxet(tU(v2qzpcV_=+UJ}G1jK_`=L=vL!LG1 zD2JjLtkf!3iPGgm_v(C{@GtO+*!t?qC+43d1LMfMW@Dd16{B7uje6Cus7mYquw&E4 znY6g{G5Q0Ws^$^^Lb>8?)rnH2igRnyVf(`)Z3{%EOr3uir?%x~>NxQy8PzpmJ)zP> zHQ3wWA2O=SDWK@E@MuL9{)!YtP*sO{viM8faw!2kIK#(PHMc2Z7}aQqRuzE?XQmA< z>y*}U#>Iahhr1cQF+=K7%mCRdx;jrSIW=b6^$^q21O$D?{ z?sq&=eWp`fovW)vY7=t?m_A~lqJ`wUl=sq8EiF$cRe|PD3RAQ9zXSpCmRfZzv@}z1 zCA60ZO7mwF;|V08ond|-z@*kH=f>g1j4zxztK_NMH|Ys|LZYMqeL`ZWLehjSe}UDR z)UEhcF)6U#6&ea{8U4Rzz~dkMaE?_ThE?%}^6`=?B5JKR+cexDr*sbU^Va*6J9BK+ zed$f@Kl;bf-=y>oS^`wmg+Bu7b1kKAPh`_aZ4P+pPl48xY0-^~PAORQZL#)6*(apP zzXfVfgA~*C8S13;4NxiXZ%OViUYPL;;MHlZ*AX?bC)I#{?X{_Jm|=^wal9YXa}@vU zBAKFjG9~f+qC8o{`Ewh3WGVNZ{PbCS(2YPr_Z7{lZ1niJBc%%(f{?{@8jzkq++X~* zVNYsMS6#d&KucPjC~i~hX71!VQc!%&H~qw<)*mBnNT(#E2Ceie1}Cb>P!~weq)^&| z6j`*%Gg-zaPYzU5Dov3dg^Q0A=2G}J<^5?ckIyDhXE;A^zs#!~o@{ZDPKRh#{j}kj zhycMTC}~9zLg}L7hZ#_ZHszW{3*U62`yUptJK4#$v3dA>UP44G{;SDjo2pcvXhHf* zYhS8;KCNGYt1J|gc#ej58a*mLmsXloiKOsx(WE<{ozCxiWjYPfpph2r-W|#OFr-DW z2R>S9Z*`yzg~@)s_;r$BGO13DUh(C(wz#AG`YDzARtNf(n-jSeH|gmW(gvm+F-d)k z9_<`_lLl7ob75+vufdG5o=kJHb9w4sO`J7l_>hSVaP{LGC3!}oU?Rsgi}Gm$(u1{M z+CRe>uHO0+PagVbDa|AOj#0C}uTqD43)F!AbabALAJbPf{P6y?N780Gwa|umOL01< z%=J7Ue3Q7n&SBzSxQ%b65%GC`#*HeGOM7~jPx}Xde~N-R(Wvq1c=g*p6iI##HF%rb zCmOctZe!Q!d8*%0nKK)jq|xc~UGYG1BJ#LTH6>5#K&u?T!{&}^=t(D}N!h_@lF>Vc z3NJdo`fX6xK?x7~z@^|XCr*V#o!3$0oPxSjk>ll1Asv&#y#}b7$IPz{G`}t75^A16 z_YA71ohxjrS*4$HXYiydC~3(1XxY=()SrC$Ii*8|@HxIuUCHb^L1nM02rW>nWy#R5 zW0OWYPfI-X4(*;r6va8ygC|$YC326NO@EIvdQX4G)d20=(dth#{<}H^ydgn@+b2F$ zGVn}K2~^}NRLkP`(c6dk2#V)p?)Es_rVUMg2`EBn-z$xIzhX8iVx4K7;^`MujA+3+ zZAofFNcB7#QKoq19bJBpr1395nJ0yTQ9 zO}e{PTdHPR=G8}!1}d+`HB~r%5T8m3>2sJBlZb))H={k6{ zr4(f|VD-y8l~fa2unvC)t=>#GDB^U}#OW4Z-P`aS|Frhmt!VQr6^XeaDY{Y$TvJx7RN-`x1#_sbx{{N-@%+jwX+9jLfdC75$K_izDKbe+aNHBDHbwG|GQ z6vN=c@a~>oG_hs5ne^i_R#Nsrb5D#>!I#EG9ntc|p~_t1t0Kkd-1~HO4!e_Gk2?My zBc8#<4QqKPA}2>r5x>qf52rYPZuh&!Jx@2iM@w>AeR&Af>auHIAEqTOViZBWbK1%y zu)(CTzi!l@n|~YUlq7KvJ;c*psoK2Am5@V{hn@t 
zH;(v>IYsWc{SzmFMz?ezO2;CRRxCeV|57#?6{3^bN$ES*?R~ubb0!AnlPL zil^W68PxdiidpnJ&@_pXTz-a8dZKge(|Fg4=cRi2Pl+_Bc?a1`sJQZ-S~Q}dsWyaj zsQ*b&=Lya^Z#E^-+{f_O=(frJvdDYZFBQN&szI8@p}NAM(l&MY*KA6Mio2-u;U7rf zl@!y%BMFME=Irc>FAcLEEI|RKG!phXM;ZSv5>$Q$y3pt0tG#rkW~iwm>2$nW%YLa9 zR5NKgG(lZX5RoIl@e6CO+S8OtwZ!g)4iBTan-Y=EQ>dZ?&kw4n4Nz}HND|-vV2gu) z+%>bGMVCc~M*#D%FvOeGn{v9(sp3VQ29X<}{z7Sge)M+-kxviR&O?>xM-zFHHKA9| z9bru|YP`w52-rDPF%Y?v9ju*P>UnI!@39fM@=e_&sB6YLz}GY&L2e6A9f^GTu@aGL*+i!JQQ&s{a9`F!5uYDbe-o;tSw=9f}Nv3pYK_ zp;B#1k)Zn-O6rq$sT*Z~i|%byv3vM!rXaAhN8OPCCC1PpmQq+XEOK5Hqo#@JI@X#| zH!4};%&KC(sT`V69Z8Ne0$SWkZi%I# zaew}_JpUP~Hj`8R|9q)V7?H?x1PPJyWR)+{D#9k#U!x{q75ouYJ z(|nQoC!GWoDQYl}?o~p5k0D`vbPghbTKJE#9z|`5I0 zDIPe909(ZpG>AT37x#c!Azj?+j+m3snN&ymee3!=GX7ykm-zIC-0ZC`FaPqgTIxzg zrrMis1$SY1A*D=Zo9r6yW*iV+%NST=%=5YH2W5+Z6Hlnr0LNcY}c|#naT1k zyIA9yYq09)Ep*Fr-;+JrmaycrJ~GtjsdHXqSsdxiyw-sUrx%tpVeLZG~SU&K~Hjw|HIT-b3BW#-FZ;86sf_Q z5&^po{K>`rA2u#Pq9v_?-CX^w*)#v*f!V%kU`{~Wr(n$pVry1?tKy7E$$px{{GY1- z*tqbH=b-SgRe-3f18Pa3J~91#u?F8j=!) 
zY=Y|IDGU?qRHjeu(L*)%KHGS|s#0&FHSzj~>a6D#6)r;&-2{}jH0p?c6`T*Ont{19_mReK35<01>D653BRw<`x=^E0X z+jNYkA+%xJ3V8WurD}zu0+2xyP%$=HK!r+uJJR({HOvNzZGiFbH3+BZDV*z0;zHF4W#n)3afPsRyXnI6V~rjtVX-%@aO5t-68GfrhTA} zu&xSgJAwqDc=4n~;UZAq72LU@$gDxlk%lO0HvL)G)s|QQeKynwa;*fw`G}zJB2la^ zp8#9t8=q6U_@mJ_2?D5C0R= zO!CrA{P_(~@yj0?N5?B_h%#YA@L4%s^|~cmL6q}7_9O%Ci%UypErmK}dI|i(NLq)g z8whDowv3SgL=%0{RxPtKiDmDAytP=i-U7Zs+Hkfg8o&%)^vC0_EtK75SClPn8Ubj) zdYj`Y%K!DFOmWumfp(w|64HBcQ~^FKn6z|h)S_42BM=a`t2VWzDbsn|R013OKroXI zDw5vt7GOM1!7PrTOmAz;v69eB1E`nAvW09)S^23Jj2=*-R#2JGnw4E!pt?SFR_Qfu z;^T)u`K-FPn50G8QgEb&!%75dT|QH%K!0xYp!aSE*keegAo|Q0t)hUA{%U`SRjbrp z*M!wkgN4(in>0ExroKAX4F1@aYA6>FAf#gW+$WO0VR5cB?SM1_bPn}9I#;+;&HG8M zWyQ1B8VVH(xquy@-yG=56CC=?pO8N>61cl{jnxz`tK6d{McM*$v>5)=Wj#<9 zdJih7bgkAb`g6dZzI>c>>lu(%{$yl>1hoD#x6g#M$OL(vyuf{&pEJ&5Qkvh{sL{b% zm#s>h?wB-rfv#7IOzOBk4Pb4~RBo<3G3XbytkTAOb!!^BWlKDArjjD1i+eT30=m%q zAh3(C;u48U1J?7ge+s#`!xc?8E80Oo`r;u>R~{=-IxCinK`n+o!@AG))Wl1Sf-smg z6>1jzxr%C3rKHv#fkKsD~KZ12**7_(&YF$Jcu-6v8V9~l%LhTZV{w2Hi zYk>4|doe&0i59FG!1UWoYYL;P;t4P`SY5~2Lj17`TQ4N8HL3yV3D(8cs%}m%SF4x? 
z0qa;p^wC_PWLi%iK%a5|NmGlyyKt^7=c$N6V1T#M>0uHLohp!fLzB z0>r@Dft9#-g+w9xf3#YTzoKCE80yyybxK#EBV4E_Vts~v|GYoF|M@fiZC}@2HtIAJ zI7e3TR+S<`MR?6Os_bsYFQ;E*RqC2eLHUcSrK2yiGc|lHwakyTw<6C9?1Cj&(PGoZ z#|lgUg;@dCF5ar5Myn>Qthzb@G}HI*^v9LnH72F%67>s6ii$ovWKx)C9WPlFgsVvN zbajOa;KBtk3F=uA{fcgt7nohCa)-9{|0nTeM_?(FhptwCJ+Hrh&<40V$8G&=P1aAkDGfFo%8AxHyht zaPMns@>X93dQ`|;3(O>QqZXqJK@fpz6=+`k6UsNeZL&WPR&&(?+=TJpO7}s_VwZ+R zl!#_p65V?h=g-ZXU-yBE6yfM-T}@Y|%!GuZdn&e-P zul;zp$4C#T35!ZlNMt1zdSt!lyHqg&jfm5eB3ad*oA7`B%)@qZ>Fp!kAjJ76Y@oRn zTjo^@f^XE`r4}t<>$&QShks^yk7zYI!8q9`s{WSUQJUP8W|Pte=zBp0pkSf!6Mxnt zGI-*0b6q<4;8p0+_}#0ZN~Eh7N|<(}N1ufEGV@a^R|?;I68z;bRRPo8(y_0hWH9?_ zM70c1#CqLL)4URHP{gJNbv1rLDLw?$Ig_+g8<2k9Ua5RO&U!SKk?HLpd~^1YmLfl3 zfB|W~E{(cerYg8;UaiHb2=OT-NwGe@4KFAF4o}a6dU;#jQ1uMuG*{iZ7)8Z@eDUb7 z)*sun$*SdO)&A##O<9uKfj`y9YZWpBejjK@%>F_&A&pCI=%GJuv40?!V$fCzaX(F+ z^=$DCbE8y{lt`)X(jq%0hq>}gO?Gk|q#XUz?iF<+N0O?)=&1qYBWA%w5uGY7q1;(D zO)J;HBB+Xp`<^HZ%6(8D2WuDy_5Y2N<*3mfEPz6+U@z*lUDEpSEwFFnA3FMHY#>)l zD^vV05mlkK<~zP^i@|m0vxZ{bz&DT;dCo^&gW4eXV5D}FGk8X4V3sKS)QkN-fT96Vy1)u-i+V*;}f(k&M+Ch=4^u?H630BKg zSZT8=`%IXV^cM!~CDi$@-dfaQr;=ICN|ZttZ2!8`LhI(DJUnC5 zicvhjJd;_`ZvVu00OVu2Su-bqDsw<(lTC^$QdC&Gk z->;eK>(drgw?H%qsj~P_FolH!)BWfsNreW`x4-|kzbtiF%+Kwg)7zkRiUk}th_wPI zhLJj6wsYRHvm{eSRracJZ^7K@#xm3J`tbK`SeLJ|t4C}mTyd8CLSZdy4F%dVbVW(9 z<)z(S{&_~6W|}YO*i-ysK)52HSwSqy%NB@O-BQ1FxqkT=*v;obVmKZ8fcmvBnwNAL zyXrI_*sue(hsnn+-WM)_S*n^U!ZAK{4r=A@5NgtZ zF-=hG=l|NnI+N`}`U`~%&S5CErE*}-v&Lr&{layWb4w790;b^*5?Z48`{MCW?EsJ_ z1dF9Gy6~~%m9@O7S|!S@cBK2C)C=G)KdICVpah0(yXdd*#+8C(ow^o~b+5t%<4U5c zU-ej@3^8Beg@iPr!>!5{)d}ky32fmGAXM#_t#%8j{>wGC%K2y7dC}dk&g})@31sQs?AT- zKf~8 zf4)ep?&pV0d-L)ahU{Nas(5~;-c>*Td6(~F|Gv9h%K%%;od6r88xdvz>7MAzZa`CJ zSh`eUJ+~f`PJ-Gjyn zKRi|qBa9iUiZ_6jkOknY&ykA6x7=n_I!gVTwftXR0F+}#FCfN(@aFR)Rd@i9uK?%L z@ngCm&F6jkGIdu!kuF5M82VqQ?I3&Ifa&;)L=F+)=#M_ ze2{4k*J+GXP%Xe=98hV~MLS?YxlJ7*UEBfJu+qn1OOy(Rmm3%$t4FblKw(G*zEwGm zks!XEG`n~X6OD^u45u4)2kLq0Vxb2`w!+0bH9~O)B?c_H69qKeKl{`yH+@pC*%y|_OOgN2tb 
z{P1TO{`4;*yoLVH+n@NvzCEem*^WULTu-3UjPK4$lSCGTzgmVgLfv31*vZdf{6zOK z=-i_mJ=4p>D6{(^L7+jB-i2sl3AQ70k}fo%HP3JTo8d$sJv%48+-|>?us%=#5*=v} zRPcthf}2=J*l4P#qMX_$=V}9eUC0UUz1a-T)n(@wM+q$YxPgk|gX4Zu7cz}b#`<-F zH9B>k+Pv<=fCy<}QTI!jcd!Re_=z}|9yGLikh}IFO(N}@SEqizX7e5mrV5%^zk0W) z)IM6LUGJg#{2h^`Ks9(2tDwO__gSEpuqU0Ft$IBzWoeJ5L5Z3R| zKAFZgWA1xTGYS1FW#)?Fj#l&x)s4D1%uAgVUtTF4;YMhITxWhtSrfP-LTPXnnPaH* zymw7dhn)|6Q=h(b@v|S^x5#P44U2A%qB+r(RR~Gk?^T?YJY+Wa)k<^{cSm%md+ku- zDYq);DUEwIOa2km&l9LPR&RgIb9|T3hA_UEQK|kl5C7*2_v%)is(a9U(-3<0hL_9= zPGvM`P_s|fsuVU`cRHMYeKIlm_lB}25`wQ~7qu&LQKloexsG&`i^^60Ak#}5>>w_) z)TA3AhD#lhs#>o5v<8}+{jUREu7SgDpL%td8q*Z`)P!~Dnvnj)sESDSFJNV?Z?G#- z6di=3jYbtjRckzP!1Jx@iU3cfcFHDBsrxBG@!sM%8gq1 zf-3ofb)2` zxhXCFm|v$Szps$2AT8}W(CAy=V?Y4`sDmxK zLD~?zBE|+!VNd1W@?Zt-t`e7$H{=4=fd2Vo%%4AZ`4oNC7*fA<*0|{k*mqY|QIHh& zsLRtLb%8qo&NOqaOA270l{E-x1*^P2OZP!}{^(`LwVw#Uj#B(nEh4+tVZU?+*IHBQ z#0!16`huJl)wKHcA(iVVDi6}3*B=evp=_JP2GGmt_gSo_0jqe`q929H?Q%rWBlz3!Hmp=Y8ha6u28ss? zjJ16ILszW7O&PWppg!IvgPTS^cE=O0ws_#`2T7%&QMw`%STHK5R5a>4RhU1w5M%vi z*rTDvqt#bUqFM?Q^rD!jE4Y7!&QeuXq%l?zgZ_O-MqiKXiP{Hjs-}rg4ACJ3Fh!hF zP}GXVr;O&_8p`D~yf-`Q^bC7<^`r6PlpIp^=`A4TugRH!mpeaa7k;cMI zRRh$o&r{#er+?tVYRp9Lj;Ku;;hpztPGBqWf56DPcn1QPfe^&~lxa#Uut8IDkGtV* zsbdtNlld3D^8%oS7xRcX2oWNP?3h54C!(>8OJ zXEtpk)zf$mB2=%Dxq`@=6b$TQRs7U7A@wf)MU?tOZu#t!2tX!PoUenn+TTa*Az)WK^ABQIRTs z6{BKyp6B`foJ02n-i%Mb@2im16_K7)hxr~<`=VWC+%JM(MSRXX)%$dJYuT=p^pkQ3 zVOUC(3vX(1N3Bq=WMB3;NmK2|l01>OXzyi@k}Bv^DV8Qy%BC}>d$2q_fh}VOeRhF1 za8jny4`#YdeE3}mYpiy0{XFdIzayw1 zCKsl~tmoNDa;?@EY`wE9k&}R$CVHqyR&y;C75&gfqD~}EN^z^+1D}&f^;7bdTuM=z zA=iTT%-Rz7BSRfc8sZ-4dRH_msDeII7&j49S8gnj=uQgd{pj^Lu02$<-2oK}<$SJ? 
zlqQKf!{R9=I6I9dO2VX2d}`0*0EJ~L`pu!LbCPC-B2;}NWLxysD52I_-*m0kO!|3C zbCTnnklaxzyJk~Hper!j1$x3lqDYA{{&$(7q{*x2H$#}9#XHf9F%78zQg#XSB`t!(JrJBgT zJh_32+(E%lRrFchf>ThHvSQEGu=Ejyp1XYgXa8Cl8kT_W6JP!!LoIz4{%e4WRsUyl zt^Q$O50}5et;%IIq$Vt;=ETaX%fK8ZR8lhaf>k%4^dk6k{=5SZ&?43+vuYAnJ~V6M zPgTcQKNqYqDqXMru=@6qh6>84IYVW2$h-5Ks=%r#``0yLg@)dJ!nL1Bx#R)NV_lJgP@Vo!-itu8+T7av1Y3`V)>VC@{Q2Va1cNFzbD;Ox{Vq{9m@&yQ!jMDMqPtK|yhUepf=?9Ph#6u!U`djxd@O)6oOpCfO7*i@6 z;FA9{ibFyUf`tQvmr)eONQ3SlKl5@K# zFz7AWio`>uZs!)ok0}LkX#Zb|+4ZpA~O3RXffQT=^Z z7aua^)z5y5+WDl4>JzV%jyo2Me~49Q_4oQl{k(OH-`_79Km7^Th6n(SUm;MBl9oIG z(8WdNEzkcQV$40Yq(l}Lc$vx1e2QY3QX&YT7lh4J413S;(exe|)CE_OAGB zztV-X>N+k-acZe)!%)wE;*}t{j)Nqvt>)Y$~;7T{c)dGP@>`LHaXM-hk0HD zYRaoin?>qF8mhA#>_YHiluA_U3)asAthezWJ^m9v!$1E#{qy=y_$LfH{uw_nI{x%B zihovKwcER?iYGOd10{ND1dS^e73>Jrxv3s#gdn-MbF+SZX103gq?aZ1; z6H7>2*fr+T4~zj;!&cb9UB_4cx*wSJDO6(l)A9Cn>peTatB*ijed3ezC~Ha*rXdfY zV=U#$bVUj{{>p=itbVQzmi**cHKc$|9tHQ{Bj8Lls$k6$L_(LYstQg9^td%?-&3vA zkRn-$9DvdMvNC0@3Uw8gZ`D(z1oaNqLHKO~U;HwpQnNyN1JKj8 z-*C`0GP}m3B#49r4U@}JTFAY!i&_I%xt!uj4|enHH+6iY^p8A+2d`5x_6wA5R(Uj_ z_$FSre+AX`Zig@Z4D|EX{|4#9J2F(%=l`0}afyo4;pDyNla7)^2axeykA9>QDN9aJY~q@b ze*8&7zEk&?|4{ft9n(3Lag^V=@RJ@5ST$t4^gA{^r3&6D@Z(W+q5?AU&K4eiOAmd$ zu=Q1y$iL)@yI&`lKtIjHth$+wCVCIw|%ms;tZ8>3B!HMiU4(1f(;(e@|fY4W`* zLsRsBg!1r<97vr)O=Ql$(&;CY+7-3u#;US`O=Cyjm)Vs^Co)a^lhR8SE5o!!xw``z zl?dWXDx*&t|2B;ALs?Geih^*e`PXpnyrXRA-f5C${)yy(=(ho4(@)eR8Eq@ey034LS5Z z)og~ZjC4wcP}Jc3mASeOONe)k9{^6R0XhB3-Rd|$Z!X%y|q?Z52wr)D~|)k2Io-2q2f zeWvwn!HOr}yiTfqX$PEdfkyQKun&{73IziG3aqfwM1pU^dQXQ7pY`$I$J)&4UW$A# zb}feK>B@q1?sv|m9^m|;U^-=fUnXEXRe=4%-)fL{OP5**5;LfZjZxxV%?xzxvj;TZ zS^tHle-#Pj^Ap^bmt0>_Dt%db*5%BvTsv1w;4i5I7zGtoIM-emUQ3uS*u%w7SoahN zicf!N*YieY2SF;(W&5J_#}}6?c~?@-kLw}I)KNS2_#PAL?B^(^UwjMK_QQjwh26PR2>bHZuHzeoE%&McHW#CnYpE0_FZs~@;l3Wn&D ze^mvVyq4VjvFcBof4n}EMjZ;!$uWAJ1t`XTC6QB*%JXLu&pHJ9NrOPo6r0#e$jt*eTQqQ|}g;Y|T zl50b%S*~KxtFh0Wl$CBRQ)wc1mvFDRV21A0%`BId>~crtjPz$8X~e1%BsHq1_(jC) 
zG;Z37Hd*37df9KNI2o|pY7#P}g|9-rpPjhXO!^J=bOU~Z|B8nR*S>J>$u0b)dhZTs zhy#d)+Cqa!$9upPNXuN1O@#S%pouTr;RX=&g>Hl72dK%im%OTEQ)GZhg~AAn+In%S ziu{VSNir`{)SN;6jCa7FKW_A36s4wAH2%49lBFD<)h5K&)l}eGKb-^RBlK_$UMEJ? zQ?$1e(a`IJmK2zlrz%nc@F+PF%gEv&^Vx|gCbp&f}HIpW1#qXw751)Vr zrPn17J9YrV>C?AmL|9A?wvt}-?$`&EIal$FP;pGGJn zP*<{PY=}xKoVrWdLc|M@xPJn5;oq$NG8G zuh9d5MRmWZQ^QG7(+uln7hq44c!M?f|0Kr>lNEKWKa^pMPfNe%f#2H~m+#j-Spc>j ziYi8Gu(r2J;`t2lICTB5u@7V3!D-HF=n`DS1}pK*RknRE*}g-t%`99rgM)9IW-Mx! zO>OJpkXP$byKd!k7poS%;1H;#q;M4{!hskMpRE2CXWvS|5zi;VOO>8SeKg_%<<58}qDJWcX*B@Cnsl9ehmem+e z8S~J}&>QMmAMy-K`g-Y|-DiR2b}e4oB=#GjHkj2eeamgdp(epMB7+n*P@k>kxz-g_ z3Si>-YT5zIPHlu*ZwTCx`#FMYO-f+2KwZJg8FzC6wc#Du2tF~SzCb$3tyFYiI zCoxy%h_tpL)v{qOXF!YR%K0gKRl}J>mvt;Kmon@l8+AvQwm`bng{9`rK`lt9L)EfR z8x&MmLci?j>y9b?N@A{1sdjY)^M_#Oae(w#y~#Ahu!1cg8wp(rmn=3dgue5Q>TVT# zQjkU6K_6~z2Lj9P+!{3QUH+|aRVhozuXO!99qP}|ExS_({Z@NBpk5h17Te)_ zp|}Rp3d3rke z-nSdN;q}^)2w8ST(IPJ0#JXF@eAD6YgwhX3q3!h@yrte}Pz!I{(_yo>2IJm_uu``A z_g38%31L{(#;hXK@0{-gx@=EF+DAn~3gs2HZH?VRHMi=1#Tn~I*H1J-dqsHnKcZ-{ zSk;#CR>(z-Ey&LqE{Ehu4`~|`!eisF*2z9=2DYj;ejO+ zNYs&{RfT6y*S=*hpR0{t99w_8y2%HPdlT&n+opckru8noJt9uFQ;Tk~bf)5&s{w^K zpgeKc7HxQ|jj&I~dEV8;Y2r9e&J(AotAT7kj?mvR8|)rnFb_Qq^!qq*m~sZ0Rh~{wVum;=9zE8XUlKDj(2nskr#n`LX%zFwxM?%IeZ0Nh@7Mq??tv$^ z2}0Q043mpBb>yzeEp|C>8B#tro^Atp169M~=?YXF=u=uqx0tozJy=s5F54EsQJYc! 
zpZLF}WP3OM*B_d7LQ7QAln!#L+LSr<51bS79M2AO%*C|;vP+sCt{E48AqB9*&@9SP z&ui{+UCvv<6wNM*I^JaFx;lnQn|8dzvQKBFm=>-}xfv$104f zl>mQ;*nYH2p71H|18RNj7d=6-%gHvikC${DLKb~aG5(`g7irZpleO;!a$Rsw zzn=6g$kq27i|*?%%VOoB7Kr(APj{3n)J!tWiQho4k?z_uK2AZt4<>qnf9|9CW8L^3 zMxwSNZ;;gZ63$aG6IsRSOPtRO%5QWCe&zHU0`2dw_?x73@r@1p70=QAx9Xq%m@#z9 zjOtJSIvO?JpPpXte<02NH1{%OF_ZtMRxl~CsBrLi7Sx<9=-0+fiJmZ}c!^@s`N1*4 z%JZvImH9H|I0tMZRztotR1tosh5XzuTzWu?V>@no)J- zHSH+=L5r7zq*q}^bGX#>=Iz|lbSj3?5B~i91K7~ap$jLpfzV?5;3afsEyzx^`m zt^XtLVNfc#LAYQ{=k7|G;qA$YKW6v2UF|rz_FYX}$QbFxE1C#9_zie=ET-k0KlnG8 z9y|KrI{EWDVCLmdh|gW%L$3#M!5-o99*ZFNu=&Z*CiRiFoD4l?9$}5jD@h36={NH% zMAOL-h5xd{$yQ=rn51$HBX$kD2AeMG>73w;dmwU=8`m|id!RAxF7PvKkNno}v_-cO z7InGyDC*dDfmEb)WJSrz`!F@)X%-Z*OD2TMyVh>g5Bt4O7=ReTnm# zq5P)w)c2ALAd4=Z1dR+l)9p9(V>iKy2Pn{37HoR&*Pb;201q%27lP%Bxyh4Mj1>R( zyD_|t=FOxRPLTWR4LPwPnQwP7PlI{#TcfLBd(x^l#mf;EBZZVGS29E0KguOz;{!a?eukOCrh7a`y}EYcdPC6d6Lt2XS*pJ?Iu*0 zX$p~;DE`e#4nuVC;)?x_izY<8-|_}x(k?A4DT~Vu_gXx$MCWa}oP&|;d{+(N?4zva zxuy$l7}S$$K!`WE%*JTsH{(0u(Z=mu;9ltg%-zsqHss+azHon>orv-GJ?Aysc;W1E zI={s)jZP}q{8UbG>65G+`+Vp3j7o=3yKMy5^KiM|C8F;f&_xn`0p`raC{_AZCN`hk zjWH>skcfuw!|*~+#%9)RZ(mAZ21`4s*kn$)_(&B8BE@zbWk(8`9OTY=r%Ut;8Ul%B7*73Gn9RAX=Ha+g`F*>%r%&yG3MATk*JQOWwwOSD1H!N|cD*p`P#r zlUl~^fB*eA<%C^7+@rg&BQn2B-?qvxt)=Bx)Y6MlqnoxcOD8W|gd?O3=;uwnyu9SX eI|}e)>LY$VF2l=xso6QK|NjFqE)HdPaR>l(AaSh# literal 0 HcmV?d00001 diff --git a/tests/ex3.sam b/tests/ex3.sam new file mode 100644 index 0000000..bae2a22 --- /dev/null +++ b/tests/ex3.sam @@ -0,0 +1,13 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:1575 +@SQ SN:chr2 LN:1584 +@RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 CN:name:with:colon +@RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 CN:name:with:colon +@PG ID:P1 VN:1.0 +@PG ID:P2 VN:1.1 +@CO this is a comment +@CO this is another comment +read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U +read_28701_28881_323b 147 chr2 88 30 35M = 500 412 
ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R +read_28701_28881_323c 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< + diff --git a/tests/ex4.sam b/tests/ex4.sam new file mode 100644 index 0000000..b2282b8 --- /dev/null +++ b/tests/ex4.sam @@ -0,0 +1,9 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:100 +@SQ SN:chr2 LN:100 +@RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 +@RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 +@CO this is a comment +@CO this is another comment +read_28833_29006_6945 99 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 +read_28701_28881_323b 147 chr2 21 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 diff --git a/tests/ex5.sam b/tests/ex5.sam new file mode 100644 index 0000000..f1f8aad --- /dev/null +++ b/tests/ex5.sam @@ -0,0 +1,5 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:100 +@SQ SN:chr2 LN:100 +read_28833_29006_6945 0 * * * * * 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< +read_28701_28881_323b 0 * * * * * 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< diff --git a/tests/ex6.sam b/tests/ex6.sam new file mode 100644 index 0000000..7ae90f3 --- /dev/null +++ b/tests/ex6.sam @@ -0,0 +1,5 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:1575 +@SQ SN:chr2 LN:1584 +read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 +read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 diff --git a/tests/ex7.sam b/tests/ex7.sam new file mode 100644 index 0000000..12befae --- /dev/null +++ b/tests/ex7.sam @@ -0,0 +1,2 @@ +read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 
RG:Z:L1 PG:Z:P1 XT:A:U +read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R diff --git a/tests/example.py b/tests/example.py new file mode 100644 index 0000000..a1ca7a0 --- /dev/null +++ b/tests/example.py @@ -0,0 +1,121 @@ +import sys +import pysam + +samfile = pysam.Samfile( "ex1.bam", "rb" ) + +print "###################" +# check different ways to iterate +print len(list(samfile.fetch())) +print len(list(samfile.fetch( "chr1", 10, 200 ))) +print len(list(samfile.fetch( region="chr1:10-200" ))) +print len(list(samfile.fetch( "chr1" ))) +print len(list(samfile.fetch( region="chr1"))) +print len(list(samfile.fetch( "chr2" ))) +print len(list(samfile.fetch( region="chr2"))) +print len(list(samfile.fetch())) +print len(list(samfile.fetch( "chr1" ))) +print len(list(samfile.fetch( region="chr1"))) +print len(list(samfile.fetch())) + +print len(list(samfile.pileup( "chr1", 10, 200 ))) +print len(list(samfile.pileup( region="chr1:10-200" ))) +print len(list(samfile.pileup( "chr1" ))) +print len(list(samfile.pileup( region="chr1"))) +print len(list(samfile.pileup( "chr2" ))) +print len(list(samfile.pileup( region="chr2"))) +print len(list(samfile.pileup())) +print len(list(samfile.pileup())) + +print "########### fetch with callback ################" +def my_fetch_callback( alignment ): print str(alignment) +samfile.fetch( region="chr1:10-200", callback=my_fetch_callback ) + +print "########## pileup with callback ################" +def my_pileup_callback( column ): print str(column) +samfile.pileup( region="chr1:10-200", callback=my_pileup_callback ) + +print "##########iterator row #################" +iter = pysam.IteratorRow( samfile, 0, 10, 200) +for x in iter: print str(x) + +print "##########iterator col #################" +iter = pysam.IteratorColumn( samfile, 0, 10, 200 ) +for x in iter: print str(x) + +print "#########row all##################" +iter = 
pysam.IteratorRowAll( samfile ) +for x in iter: print str(x) + + +print "###################" + +class Counter: + mCounts = 0 + def __call__(self, alignment): + self.mCounts += 1 + +c = Counter() +samfile.fetch( "chr1:10-200", c ) +print "counts=", c.mCounts + +sys.exit(0) +print samfile.getTarget( 0 ) +print samfile.getTarget( 1 ) + +for p in pysam.pileup( "-c", "ex1.bam" ): + print str(p) + +print pysam.pileup.getMessages() + +for p in pysam.pileup( "-c", "ex1.bam", raw=True ): + print str(p), + + + +print "###########################" + +samfile = pysam.Samfile( "ex2.sam.gz", "r" ) + +print "num targets=", samfile.getNumTargets() + +iter = pysam.IteratorRowAll( samfile ) +for x in iter: print str(x) + +samfile.close() + +print "###########################" +samfile = pysam.Samfile( "ex2.sam.gz", "r" ) +def my_fetch_callback( alignment ): + print str(alignment) + +try: + samfile.fetch( "chr1:10-20", my_fetch_callback ) +except AssertionError: + print "caught fetch exception" + +samfile.close() + +print "###########################" +samfile = pysam.Samfile( "ex2.sam.gz", "r" ) +def my_pileup_callback( pileups ): + print str(pileups) +try: + samfile.pileup( "chr1:10-20", my_pileup_callback ) +except NotImplementedError: + print "caught pileup exception" + +# playing arount with headers +samfile = pysam.Samfile( "ex3.sam", "r" ) +print samfile.targets +print samfile.lengths +print samfile.text +print samdile.header +header = samfile.header +samfile.close() + +header["HD"]["SO"] = "unsorted" +outfile = pysam.Samfile( "out.sam", "wh", + header = header ) + +outfile.close() + diff --git a/tests/pysam_test.py b/tests/pysam_test.py new file mode 100755 index 0000000..c2ae6fa --- /dev/null +++ b/tests/pysam_test.py @@ -0,0 +1,841 @@ +#!/usr/bin/env python +'''unit testing code for pysam. + +Execute in the :file:`tests` directory as it requires the Makefile +and data files located there. 
+''' + +import pysam +import unittest +import os +import itertools +import subprocess +import shutil + + +def checkBinaryEqual( filename1, filename2 ): + '''return true if the two files are binary equal.''' + if os.path.getsize( filename1 ) != os.path.getsize( filename2 ): + return False + + infile1 = open(filename1, "rb") + infile2 = open(filename2, "rb") + + def chariter( infile ): + while 1: + c = infile.read(1) + if c == "": break + yield c + + found = False + for c1,c2 in itertools.izip( chariter( infile1), chariter( infile2) ): + if c1 != c2: break + else: + found = True + + infile1.close() + infile2.close() + return found + +def runSamtools( cmd ): + '''run a samtools command''' + + try: + retcode = subprocess.call(cmd, shell=True) + if retcode < 0: + print >>sys.stderr, "Child was terminated by signal", -retcode + except OSError, e: + print >>sys.stderr, "Execution failed:", e + + +class BinaryTest(unittest.TestCase): + '''test samtools command line commands and compare + against pysam commands. + + Tests fail, if the output is not binary identical. 
+ ''' + + first_time = True + + # a list of commands to test + mCommands = \ + { "faidx" : \ + ( + ("ex1.fa.fai", "samtools faidx ex1.fa"), + ("pysam_ex1.fa.fai", (pysam.faidx, "ex1.fa") ), + ), + "import" : + ( + ("ex1.bam", "samtools import ex1.fa.fai ex1.sam.gz ex1.bam" ), + ("pysam_ex1.bam", (pysam.samimport, "ex1.fa.fai ex1.sam.gz pysam_ex1.bam") ), + ), + "index": + ( + ("ex1.bam.bai", "samtools index ex1.bam" ), + ("pysam_ex1.bam.bai", (pysam.index, "pysam_ex1.bam" ) ), + ), + "pileup1" : + ( + ("ex1.pileup", "samtools pileup -cf ex1.fa ex1.bam > ex1.pileup" ), + ("pysam_ex1.pileup", (pysam.pileup, "-c -f ex1.fa ex1.bam" ) ) + ), + "pileup2" : + ( + ("ex1.glf", "samtools pileup -gf ex1.fa ex1.bam > ex1.glf" ), + ("pysam_ex1.glf", (pysam.pileup, "-g -f ex1.fa ex1.bam" ) ) + ), + "glfview" : + ( + ("ex1.glfview", "samtools glfview ex1.glf > ex1.glfview"), + ("pysam_ex1.glfview", (pysam.glfview, "ex1.glf" ) ), + ), + "view" : + ( + ("ex1.view", "samtools view ex1.bam > ex1.view"), + ("pysam_ex1.view", (pysam.view, "ex1.bam" ) ), + ), + } + + # some tests depend on others. The order specifies in which order + # the samtools commands are executed. + mOrder = ('faidx', 'import', 'index', 'pileup1', 'pileup2', 'glfview', 'view' ) + + def setUp( self ): + '''setup tests. + + For setup, all commands will be run before the first test is + executed. Individual tests will then just compare the output + files. 
+ ''' + if BinaryTest.first_time: + # copy the source + shutil.copy( "ex1.fa", "pysam_ex1.fa" ) + + for label in self.mOrder: + command = self.mCommands[label] + samtools_target, samtools_command = command[0] + pysam_target, pysam_command = command[1] + runSamtools( samtools_command ) + pysam_method, pysam_options = pysam_command + output = pysam_method( *pysam_options.split(" "), raw=True) + if ">" in samtools_command: + outfile = open( pysam_target, "w" ) + for line in output: outfile.write( line ) + outfile.close() + + BinaryTest.first_time = False + + def checkCommand( self, command ): + if command: + samtools_target, pysam_target = self.mCommands[command][0][0], self.mCommands[command][1][0] + self.assertTrue( checkBinaryEqual( samtools_target, pysam_target ), + "%s failed: files %s and %s are not the same" % (command, samtools_target, pysam_target) ) + + def testImport( self ): + self.checkCommand( "import" ) + + def testIndex( self ): + self.checkCommand( "index" ) + + def testPileup1( self ): + self.checkCommand( "pileup1" ) + + def testPileup2( self ): + self.checkCommand( "pileup2" ) + + def testGLFView( self ): + self.checkCommand( "glfview" ) + + def testView( self ): + self.checkCommand( "view" ) + + def testEmptyIndex( self ): + self.assertRaises( pysam.SamtoolsError, pysam.index, "exdoesntexist.bam" ) + + def __del__(self): + + for label, command in self.mCommands.iteritems(): + samtools_target, samtools_command = command[0] + pysam_target, pysam_command = command[1] + if os.path.exists( samtools_target): os.remove( samtools_target ) + if os.path.exists( pysam_target): os.remove( pysam_target ) + if os.path.exists( "pysam_ex1.fa" ): os.remove( "pysam_ex1.fa" ) + +class IOTest(unittest.TestCase): + '''check if reading samfile and writing a samfile are consistent.''' + + def checkEcho( self, input_filename, reference_filename, + output_filename, + input_mode, output_mode, use_template = True): + '''iterate through *input_filename* writing to 
*output_filename* and + comparing the output to *reference_filename*. + + The files are opened according to the *input_mode* and *output_mode*. + + If *use_template* is set, the header is copied from infile using the + template mechanism, otherwise target names and lengths are passed explicitely. + ''' + + infile = pysam.Samfile( input_filename, input_mode ) + if use_template: + outfile = pysam.Samfile( output_filename, output_mode, template = infile ) + else: + outfile = pysam.Samfile( output_filename, output_mode, + referencenames = infile.references, + referencelengths = infile.lengths ) + + iter = infile.fetch() + for x in iter: outfile.write( x ) + infile.close() + outfile.close() + + self.assertTrue( checkBinaryEqual( reference_filename, output_filename), + "files %s and %s are not the same" % (reference_filename, output_filename) ) + + def testReadWriteBam( self ): + + input_filename = "ex1.bam" + output_filename = "pysam_ex1.bam" + reference_filename = "ex1.bam" + + self.checkEcho( input_filename, reference_filename, output_filename, + "rb", "wb" ) + + def testReadWriteBamWithTargetNames( self ): + + input_filename = "ex1.bam" + output_filename = "pysam_ex1.bam" + reference_filename = "ex1.bam" + + self.checkEcho( input_filename, reference_filename, output_filename, + "rb", "wb", use_template = False ) + + def testReadWriteSamWithHeader( self ): + + input_filename = "ex2.sam" + output_filename = "pysam_ex2.sam" + reference_filename = "ex2.sam" + + self.checkEcho( input_filename, reference_filename, output_filename, + "r", "wh" ) + + def testReadWriteSamWithoutHeader( self ): + + input_filename = "ex2.sam" + output_filename = "pysam_ex2.sam" + reference_filename = "ex1.sam" + + self.checkEcho( input_filename, reference_filename, output_filename, + "r", "w" ) + + def testFetchFromClosedFile( self ): + + samfile = pysam.Samfile( "ex1.bam", "rb" ) + samfile.close() + self.assertRaises( ValueError, samfile.fetch, 'chr1', 100, 120) + + def 
testPileupFromClosedFile( self ): + + samfile = pysam.Samfile( "ex1.bam", "rb" ) + samfile.close() + self.assertRaises( ValueError, samfile.pileup, 'chr1', 100, 120) + + def testBinaryReadFromSamfile( self ): + pass + # needs to re-activated, see issue 19 + #samfile = pysam.Samfile( "ex1.bam", "r" ) + #samfile.fetch().next() + + def testReadingFromFileWithoutIndex( self ): + '''read from bam file without index.''' + + assert not os.path.exists( "ex2.bam.bai" ) + samfile = pysam.Samfile( "ex2.bam", "rb" ) + self.assertRaises( ValueError, samfile.fetch ) + self.assertEqual( len(list( samfile.fetch(until_eof = True) )), 3270 ) + +class TestIteratorRow(unittest.TestCase): + + def setUp(self): + self.samfile=pysam.Samfile( "ex1.bam","rb" ) + + def checkRange( self, rnge ): + '''compare results from iterator with those from samtools.''' + ps = list(self.samfile.fetch(region=rnge)) + sa = list(pysam.view( "ex1.bam", rnge , raw = True) ) + self.assertEqual( len(ps), len(sa), "unequal number of results for range %s: %i != %i" % (rnge, len(ps), len(sa) )) + # check if the same reads are returned and in the same order + for line, pair in enumerate( zip( ps, sa ) ): + data = pair[1].split("\t") + self.assertEqual( pair[0].qname, data[0], "read id mismatch in line %i: %s != %s" % (line, pair[0].rname, data[0]) ) + + def testIteratePerContig(self): + '''check random access per contig''' + for contig in self.samfile.references: + self.checkRange( contig ) + + def testIterateRanges(self): + '''check random access per range''' + for contig, length in zip(self.samfile.references, self.samfile.lengths): + for start in range( 1, length, 90): + self.checkRange( "%s:%i-%i" % (contig, start, start + 90) ) # this includes empty ranges + + def tearDown(self): + self.samfile.close() + +class TestIteratorRowAll(unittest.TestCase): + + def setUp(self): + self.samfile=pysam.Samfile( "ex1.bam","rb" ) + + def testIterate(self): + '''compare results from iterator with those from samtools.''' + ps 
= list(self.samfile.fetch()) + sa = list(pysam.view( "ex1.bam", raw = True) ) + self.assertEqual( len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa) )) + # check if the same reads are returned + for line, pair in enumerate( zip( ps, sa ) ): + data = pair[1].split("\t") + self.assertEqual( pair[0].qname, data[0], "read id mismatch in line %i: %s != %s" % (line, pair[0].rname, data[0]) ) + + def tearDown(self): + self.samfile.close() + +class TestIteratorColumn(unittest.TestCase): + '''test iterator column against contents of ex3.bam.''' + + # note that samfile contains 1-based coordinates + # 1D means deletion with respect to reference sequence + # + mCoverages = { 'chr1' : [ 0 ] * 20 + [1] * 36 + [0] * (100 - 20 -35 ), + 'chr2' : [ 0 ] * 20 + [1] * 35 + [0] * (100 - 20 -35 ), + } + + def setUp(self): + self.samfile=pysam.Samfile( "ex4.bam","rb" ) + + def checkRange( self, rnge ): + '''compare results from iterator with those from samtools.''' + # check if the same reads are returned and in the same order + for column in self.samfile.pileup(region=rnge): + thiscov = len(column.pileups) + refcov = self.mCoverages[self.samfile.getrname(column.tid)][column.pos] + self.assertEqual( thiscov, refcov, "wrong coverage at pos %s:%i %i should be %i" % (self.samfile.getrname(column.tid), column.pos, thiscov, refcov)) + + def testIterateAll(self): + '''check random access per contig''' + self.checkRange( None ) + + def testIteratePerContig(self): + '''check random access per contig''' + for contig in self.samfile.references: + self.checkRange( contig ) + + def testIterateRanges(self): + '''check random access per range''' + for contig, length in zip(self.samfile.references, self.samfile.lengths): + for start in range( 1, length, 90): + self.checkRange( "%s:%i-%i" % (contig, start, start + 90) ) # this includes empty ranges + + def testInverse( self ): + '''test the inverse, is point-wise pileup accurate.''' + for contig, refseq in 
self.mCoverages.items(): + refcolumns = sum(refseq) + for pos, refcov in enumerate( refseq ): + columns = list(self.samfile.pileup( contig, pos, pos+1) ) + if refcov == 0: + # if no read, no coverage + self.assertEqual( len(columns), refcov, "wrong number of pileup columns returned for position %s:%i, %i should be %i" %(contig,pos,len(columns), refcov) ) + elif refcov == 1: + # one read, all columns of the read are returned + self.assertEqual( len(columns), refcolumns, "pileup incomplete - %i should be %i " % (len(columns), refcolumns)) + + def tearDown(self): + self.samfile.close() + +class TestAlignedReadFromBam(unittest.TestCase): + + def setUp(self): + self.samfile=pysam.Samfile( "ex3.bam","rb" ) + self.reads=list(self.samfile.fetch()) + + def testARqname(self): + self.assertEqual( self.reads[0].qname, "read_28833_29006_6945", "read name mismatch in read 1: %s != %s" % (self.reads[0].qname, "read_28833_29006_6945") ) + self.assertEqual( self.reads[1].qname, "read_28701_28881_323b", "read name mismatch in read 2: %s != %s" % (self.reads[1].qname, "read_28701_28881_323b") ) + + def testARflag(self): + self.assertEqual( self.reads[0].flag, 99, "flag mismatch in read 1: %s != %s" % (self.reads[0].flag, 99) ) + self.assertEqual( self.reads[1].flag, 147, "flag mismatch in read 2: %s != %s" % (self.reads[1].flag, 147) ) + + def testARrname(self): + self.assertEqual( self.reads[0].rname, 0, "chromosome/target id mismatch in read 1: %s != %s" % (self.reads[0].rname, 0) ) + self.assertEqual( self.reads[1].rname, 1, "chromosome/target id mismatch in read 2: %s != %s" % (self.reads[1].rname, 1) ) + + def testARpos(self): + self.assertEqual( self.reads[0].pos, 33-1, "mapping position mismatch in read 1: %s != %s" % (self.reads[0].pos, 33-1) ) + self.assertEqual( self.reads[1].pos, 88-1, "mapping position mismatch in read 2: %s != %s" % (self.reads[1].pos, 88-1) ) + + def testARmapq(self): + self.assertEqual( self.reads[0].mapq, 20, "mapping quality mismatch in read 1: %s != 
%s" % (self.reads[0].mapq, 20) ) + self.assertEqual( self.reads[1].mapq, 30, "mapping quality mismatch in read 2: %s != %s" % (self.reads[1].mapq, 30) ) + + def testARcigar(self): + self.assertEqual( self.reads[0].cigar, [(0, 10), (2, 1), (0, 25)], "read name length mismatch in read 1: %s != %s" % (self.reads[0].cigar, [(0, 10), (2, 1), (0, 25)]) ) + self.assertEqual( self.reads[1].cigar, [(0, 35)], "read name length mismatch in read 2: %s != %s" % (self.reads[1].cigar, [(0, 35)]) ) + + def testARmrnm(self): + self.assertEqual( self.reads[0].mrnm, 0, "mate reference sequence name mismatch in read 1: %s != %s" % (self.reads[0].mrnm, 0) ) + self.assertEqual( self.reads[1].mrnm, 1, "mate reference sequence name mismatch in read 2: %s != %s" % (self.reads[1].mrnm, 1) ) + + def testARmpos(self): + self.assertEqual( self.reads[0].mpos, 200-1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].mpos, 200-1) ) + self.assertEqual( self.reads[1].mpos, 500-1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].mpos, 500-1) ) + + def testARisize(self): + self.assertEqual( self.reads[0].isize, 167, "insert size mismatch in read 1: %s != %s" % (self.reads[0].isize, 167) ) + self.assertEqual( self.reads[1].isize, 412, "insert size mismatch in read 2: %s != %s" % (self.reads[1].isize, 412) ) + + def testARseq(self): + self.assertEqual( self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 1: %s != %s" % (self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG") ) + self.assertEqual( self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "sequence size mismatch in read 2: %s != %s" % (self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA") ) + + def testARqual(self): + self.assertEqual( self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", "quality string mismatch in read 1: %s != %s" % (self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") ) + self.assertEqual( self.reads[1].qual, 
"<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "quality string mismatch in read 2: %s != %s" % (self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<") ) + + def testPresentOptionalFields(self): + self.assertEqual( self.reads[0].opt('NM'), 1, "optional field mismatch in read 1, NM: %s != %s" % (self.reads[0].opt('NM'), 1) ) + self.assertEqual( self.reads[0].opt('RG'), 'L1', "optional field mismatch in read 1, RG: %s != %s" % (self.reads[0].opt('RG'), 'L1') ) + self.assertEqual( self.reads[1].opt('RG'), 'L2', "optional field mismatch in read 2, RG: %s != %s" % (self.reads[1].opt('RG'), 'L2') ) + self.assertEqual( self.reads[1].opt('MF'), 18, "optional field mismatch in read 2, MF: %s != %s" % (self.reads[1].opt('MF'), 18) ) + + def testPairedBools(self): + self.assertEqual( self.reads[0].is_paired, True, "is paired mismatch in read 1: %s != %s" % (self.reads[0].is_paired, True) ) + self.assertEqual( self.reads[1].is_paired, True, "is paired mismatch in read 2: %s != %s" % (self.reads[1].is_paired, True) ) + self.assertEqual( self.reads[0].is_proper_pair, True, "is proper pair mismatch in read 1: %s != %s" % (self.reads[0].is_proper_pair, True) ) + self.assertEqual( self.reads[1].is_proper_pair, True, "is proper pair mismatch in read 2: %s != %s" % (self.reads[1].is_proper_pair, True) ) + + def testTags( self ): + self.assertEqual( self.reads[0].tags, + [('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U')] ) + self.assertEqual( self.reads[1].tags, + [('MF', 18), ('RG', 'L2'), + ('PG', 'P2'),('XT', 'R') ] ) + + def testOpt( self ): + self.assertEqual( self.reads[0].opt("XT"), "U" ) + self.assertEqual( self.reads[1].opt("XT"), "R" ) + + def testMissingOpt( self ): + self.assertRaises( KeyError, self.reads[0].opt, "XP" ) + + def testEmptyOpt( self ): + self.assertRaises( KeyError, self.reads[2].opt, "XT" ) + + def tearDown(self): + self.samfile.close() + +class TestAlignedReadFromSam(TestAlignedReadFromBam): + + def setUp(self): + self.samfile=pysam.Samfile( "ex3.sam","r" ) 
+ self.reads=list(self.samfile.fetch()) + +# needs to be implemented +# class TestAlignedReadFromSamWithoutHeader(TestAlignedReadFromBam): +# +# def setUp(self): +# self.samfile=pysam.Samfile( "ex7.sam","r" ) +# self.reads=list(self.samfile.fetch()) + +class TestHeaderSam(unittest.TestCase): + + header = {'SQ': [{'LN': 1575, 'SN': 'chr1'}, + {'LN': 1584, 'SN': 'chr2'}], + 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN":"name:with:colon"}, + {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN":"name:with:colon"}], + 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}], + 'HD': {'VN': '1.0'}, + 'CO' : [ 'this is a comment', 'this is another comment'], + } + + def compareHeaders( self, a, b ): + '''compare two headers a and b.''' + for ak,av in a.iteritems(): + self.assertTrue( ak in b, "key '%s' not in '%s' " % (ak,b) ) + self.assertEqual( av, b[ak] ) + + def setUp(self): + self.samfile=pysam.Samfile( "ex3.sam","r" ) + + def testHeaders(self): + self.compareHeaders( self.header, self.samfile.header ) + self.compareHeaders( self.samfile.header, self.header ) + + def tearDown(self): + self.samfile.close() + +class TestHeaderBam(TestHeaderSam): + + def setUp(self): + self.samfile=pysam.Samfile( "ex3.bam","rb" ) + +class TestUnmappedReads(unittest.TestCase): + + def testSAM(self): + samfile=pysam.Samfile( "ex5.sam","r" ) + self.assertEqual( len(list(samfile.fetch( until_eof = True))), 2 ) + samfile.close() + + def testBAM(self): + samfile=pysam.Samfile( "ex5.bam","rb" ) + self.assertEqual( len(list(samfile.fetch( until_eof = True))), 2 ) + samfile.close() + +class TestPileupObjects(unittest.TestCase): + + def setUp(self): + self.samfile=pysam.Samfile( "ex1.bam","rb" ) + + def testPileupColumn(self): + for pcolumn1 in self.samfile.pileup( region="chr1:105" ): + if pcolumn1.pos == 104: + self.assertEqual( pcolumn1.tid, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.tid, 0) ) + self.assertEqual( 
pcolumn1.pos, 105-1, "position mismatch in position 1: %s != %s" % (pcolumn1.pos, 105-1) ) + self.assertEqual( pcolumn1.n, 2, "# reads mismatch in position 1: %s != %s" % (pcolumn1.n, 2) ) + for pcolumn2 in self.samfile.pileup( region="chr2:1480" ): + if pcolumn2.pos == 1479: + self.assertEqual( pcolumn2.tid, 1, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn2.tid, 1) ) + self.assertEqual( pcolumn2.pos, 1480-1, "position mismatch in position 1: %s != %s" % (pcolumn2.pos, 1480-1) ) + self.assertEqual( pcolumn2.n, 12, "# reads mismatch in position 1: %s != %s" % (pcolumn2.n, 12) ) + + def testPileupRead(self): + for pcolumn1 in self.samfile.pileup( region="chr1:105" ): + if pcolumn1.pos == 104: + self.assertEqual( len(pcolumn1.pileups), 2, "# reads aligned to column mismatch in position 1: %s != %s" % (len(pcolumn1.pileups), 2) ) +# self.assertEqual( pcolumn1.pileups[0] # need to test additional properties here + + def tearDown(self): + self.samfile.close() + +class TestExceptions(unittest.TestCase): + + def setUp(self): + self.samfile=pysam.Samfile( "ex1.bam","rb" ) + + def testMissingFile(self): + + self.assertRaises( IOError, pysam.Samfile, "exdoesntexist.bam", "rb" ) + self.assertRaises( IOError, pysam.Samfile, "exdoesntexist.sam", "r" ) + self.assertRaises( IOError, pysam.Samfile, "exdoesntexist.bam", "r" ) + self.assertRaises( IOError, pysam.Samfile, "exdoesntexist.sam", "rb" ) + + def testBadContig(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr88" ) + + def testMeaninglessCrap(self): + self.assertRaises( ValueError, self.samfile.fetch, "skljf" ) + + def testBackwardsOrderNewFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, 'chr1', 100, 10 ) + + def testBackwardsOrderOldFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:100-10") + + def testOutOfRangeNegativeNewFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, -10 ) + self.assertRaises( ValueError, 
self.samfile.fetch, "chr1", 5, 0 ) + self.assertRaises( ValueError, self.samfile.fetch, "chr1", -5, -10 ) + + def testOutOfRangeNegativeOldFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5-10" ) + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5-0" ) + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5--10" ) + + def testOutOfRangNewFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr1", 9999999999, 99999999999 ) + + def testOutOfRangeLargeNewFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr1", 9999999999999999999999999999999, 9999999999999999999999999999999999999999 ) + + def testOutOfRangeLargeOldFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999" ) + + def tearDown(self): + self.samfile.close() + +class TestFastaFile(unittest.TestCase): + + mSequences = { 'chr1' : + "CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCTGTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACCAAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCTCTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCAATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGCAGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAACAACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACACATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATACCATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTTTCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAATACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGAACCACACATTAATACTATGTTTCTTATCTGCA
CATTACTACCCTGCAATTAATATAATTGTGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAGTCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGCTTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTCTCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGGAGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATATTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTCTCCCTCGTCTTCTTA", + 'chr2' : + "TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAGCTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCTTATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTTCAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTTAGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAGGAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATTTTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTAAGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATAAAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACCTCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATTAATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCAAATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGTAAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATATAACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAATACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGATGATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTGCGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATAGCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAAAAAGTAAACTCTCAAATATT
GCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAATTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAAAAAATATTTACAGTAACT", + } + + def setUp(self): + self.file=pysam.Fastafile( "ex1.fa" ) + + def testFetch(self): + for id, seq in self.mSequences.items(): + self.assertEqual( seq, self.file.fetch( id ) ) + for x in range( 0, len(seq), 10): + self.assertEqual( seq[x:x+10], self.file.fetch( id, x, x+10) ) + + def testFetchErrors( self ): + self.assertRaises( ValueError, self.file.fetch ) + self.assertRaises( ValueError, self.file.fetch, "chr1", 0 ) + self.assertRaises( ValueError, self.file.fetch, "chr1", -1, 10 ) + self.assertRaises( ValueError, self.file.fetch, "chr1", 20, 10 ) + # the following segfaults: + # self.assertRaises( IndexError, self.file.fetch, "chr12", ) + pass + + def tearDown(self): + self.file.close() + + +class TestAlignedRead(unittest.TestCase): + '''tests to check if aligned read can be constructed + and manipulated. + ''' + + def checkFieldEqual( self, read1, read2, exclude = []): + '''check if two reads are equal by comparing each field.''' + + for x in ("qname", "seq", "flag", + "rname", "pos", "mapq", "cigar", + "mrnm", "mpos", "isize", "qual", + "is_paired", "is_proper_pair", + "is_unmapped", "mate_is_unmapped", + "is_reverse", "mate_is_reverse", + "is_read1", "is_read2", + "is_secondary", "is_qcfail", + "is_duplicate", "bin"): + if x in exclude: continue + self.assertEqual( getattr(read1, x), getattr(read2,x), "attribute mismatch for %s: %s != %s" % + (x, getattr(read1, x), getattr(read2,x))) + + def testEmpty( self ): + a = pysam.AlignedRead() + self.assertEqual( a.qname, None ) + self.assertEqual( a.seq, None ) + self.assertEqual( a.qual, None ) + self.assertEqual( a.flag, 0 ) + self.assertEqual( a.rname, 0 ) + self.assertEqual( a.mapq, 0 ) + self.assertEqual( a.cigar, None ) + self.assertEqual( a.tags, None ) + self.assertEqual( a.mrnm, 0 ) + self.assertEqual( a.mpos, 0 ) + self.assertEqual( a.isize, 0 ) + + def buildRead( self ): + 
'''build an example read.''' + + a = pysam.AlignedRead() + a.qname = "read_12345" + a.seq="ACGT" * 3 + a.flag = 0 + a.rname = 0 + a.pos = 33 + a.mapq = 20 + a.cigar = ( (0,10), (2,1), (0,25) ) + a.mrnm = 0 + a.mpos=200 + a.isize=167 + a.qual="1234" * 3 + + return a + + def testUpdate( self ): + '''check if updating fields affects other variable length data + ''' + a = self.buildRead() + b = self.buildRead() + + # check qname + b.qname = "read_123" + self.checkFieldEqual( a, b, "qname" ) + b.qname = "read_12345678" + self.checkFieldEqual( a, b, "qname" ) + b.qname = "read_12345" + self.checkFieldEqual( a, b) + + # check cigar + b.cigar = ( (0,10), ) + self.checkFieldEqual( a, b, "cigar" ) + b.cigar = ( (0,10), (2,1), (0,25), (2,1), (0,25) ) + self.checkFieldEqual( a, b, "cigar" ) + b.cigar = ( (0,10), (2,1), (0,25) ) + self.checkFieldEqual( a, b) + + # check seq + b.seq = "ACGT" + self.checkFieldEqual( a, b, ("seq", "qual") ) + b.seq = "ACGT" * 10 + self.checkFieldEqual( a, b, ("seq", "qual") ) + b.seq = "ACGT" * 3 + self.checkFieldEqual( a, b, ("qual",)) + + # reset qual + b = self.buildRead() + + # check flags: + for x in ( + "is_paired", "is_proper_pair", + "is_unmapped", "mate_is_unmapped", + "is_reverse", "mate_is_reverse", + "is_read1", "is_read2", + "is_secondary", "is_qcfail", + "is_duplicate"): + setattr( b, x, True ) + self.assertEqual( getattr(b, x), True ) + self.checkFieldEqual( a, b, ("flag", x,) ) + setattr( b, x, False ) + self.assertEqual( getattr(b, x), False ) + self.checkFieldEqual( a, b ) + + def testLargeRead( self ): + '''build an example read.''' + + a = pysam.AlignedRead() + a.qname = "read_12345" + a.seq="ACGT" * 200 + a.flag = 0 + a.rname = 0 + a.pos = 33 + a.mapq = 20 + a.cigar = ( (0,10), (2,1), (0,25) ) + a.mrnm = 0 + a.mpos=200 + a.isize=167 + a.qual="1234" * 200 + + return a + +class TestDeNovoConstruction(unittest.TestCase): + '''check BAM/SAM file construction using ex3.sam + + (note these are +1 coordinates): + + 
class TestDeNovoConstruction(unittest.TestCase):
    '''check BAM/SAM file construction against the reference files
    named by ``samfile`` and ``bamfile``.

    NOTE(review): this docstring used to name ex3.sam while the class
    attributes below point at ex6.sam / ex6.bam - confirm which is meant.

    The two reads built in setUp correspond to these records
    (note these are +1 coordinates):

    read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1
    read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2
    '''

    header = { 'HD': {'VN': '1.0'},
               'SQ': [{'LN': 1575, 'SN': 'chr1'},
                      {'LN': 1584, 'SN': 'chr2'}], }

    bamfile = "ex6.bam"
    samfile = "ex6.sam"

    # attributes compared by checkFieldEqual, in comparison order
    _FIELDS = ("qname", "seq", "flag",
               "rname", "pos", "mapq", "cigar",
               "mrnm", "mpos", "isize", "qual",
               "bin",
               "is_paired", "is_proper_pair",
               "is_unmapped", "mate_is_unmapped",
               "is_reverse", "mate_is_reverse",
               "is_read1", "is_read2",
               "is_secondary", "is_qcfail",
               "is_duplicate")

    def checkFieldEqual(self, read1, read2, exclude=()):
        '''check if two reads are equal by comparing each field.

        exclude names the attributes to skip; either a sequence of names
        or a single name given as a plain string.
        '''
        # A bare string has to be wrapped: ``x in "mpos"`` is a substring
        # test and would silently skip "pos" as well as "mpos".
        if isinstance(exclude, str):
            exclude = (exclude,)

        for x in self._FIELDS:
            if x in exclude:
                continue
            v1 = getattr(read1, x)
            v2 = getattr(read2, x)
            self.assertEqual(v1, v2,
                             "attribute mismatch for %s: %s != %s" %
                             (x, v1, v2))

    def setUp(self):
        '''build the two reads from scratch (0-based positions).'''

        a = pysam.AlignedRead()
        a.qname = "read_28833_29006_6945"
        a.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
        a.flag = 99
        a.rname = 0
        a.pos = 32
        a.mapq = 20
        a.cigar = ((0, 10), (2, 1), (0, 25))
        a.mrnm = 0
        a.mpos = 199
        a.isize = 167
        a.qual = "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"
        a.tags = (("NM", 1),
                  ("RG", "L1"))

        b = pysam.AlignedRead()
        b.qname = "read_28701_28881_323b"
        b.seq = "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"
        b.flag = 147
        b.rname = 1
        b.pos = 87
        b.mapq = 30
        b.cigar = ((0, 35), )
        b.mrnm = 1
        b.mpos = 499
        b.isize = 412
        b.qual = "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"
        b.tags = (("MF", 18),
                  ("RG", "L2"))

        self.reads = (a, b)

    def _writeReads(self, filename, mode):
        '''write self.reads to filename, always closing the output file.'''
        outfile = pysam.Samfile(filename, mode, header=self.header)
        try:
            for x in self.reads:
                outfile.write(x)
        finally:
            outfile.close()

    def _checkReadsAgainstFile(self, filename, mode):
        '''compare every read stored in filename with the reads from setUp.'''
        infile = pysam.Samfile(filename, mode)
        try:
            loaded = list(infile)
        finally:
            infile.close()

        # zip() stops at the shorter input; without this check an empty
        # or truncated reference file would pass without comparing anything.
        self.assertEqual(len(loaded), len(self.reads))

        for built, from_file in zip(self.reads, loaded):
            self.checkFieldEqual(built, from_file)
            self.assertEqual(built, from_file)

    def testSAMWholeFile(self):
        '''check if a SAM file written from scratch matches the reference.'''

        tmpfilename = "tmp_%i.sam" % id(self)
        self._writeReads(tmpfilename, "wh")

        self.assertTrue(checkBinaryEqual(tmpfilename, self.samfile),
                        "mismatch when construction SAM file, see %s %s" % (tmpfilename, self.samfile))

        # only reached on success: on a mismatch the file is kept so that
        # it can be inspected, as the failure message suggests
        os.unlink(tmpfilename)

    def testBAMPerRead(self):
        '''check if individual reads are binary equal.'''
        self._checkReadsAgainstFile(self.bamfile, "rb")

    def testSAMPerRead(self):
        '''check if individual reads are binary equal.'''
        self._checkReadsAgainstFile(self.samfile, "r")

    def testBAMWholeFile(self):
        '''check if a BAM file written from scratch matches the reference.'''

        tmpfilename = "tmp_%i.bam" % id(self)
        self._writeReads(tmpfilename, "wb")

        self.assertTrue(checkBinaryEqual(tmpfilename, self.bamfile),
                        "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile))

        # only reached on success: on a mismatch the file is kept so that
        # it can be inspected, as the failure message suggests
        os.unlink(tmpfilename)
class TestExceptions(unittest.TestCase):
    '''fetch() on ex1.bam has to raise ValueError for regions that lie
    out of range.

    NOTE(review): the module is called segfault_tests.py, so presumably
    these inputs once crashed the interpreter - confirm.
    '''

    def setUp(self):
        '''open the BAM file used by every test.'''
        self.samfile = pysam.Samfile("ex1.bam", "rb")

    def tearDown(self):
        '''release the BAM file opened in setUp.'''
        self.samfile.close()

    def testOutOfRangeNegativeNewFormat(self):
        '''negative coordinates given as separate start/end arguments.'''
        for start, end in ((5, -10), (5, 0), (-5, -10)):
            self.assertRaises(ValueError, self.samfile.fetch, "chr1", start, end)

    def testOutOfRangeNegativeOldFormat(self):
        '''negative coordinates given as a single region string.'''
        for region in ("chr1:-5-10", "chr1:-5-0", "chr1:-5--10"):
            self.assertRaises(ValueError, self.samfile.fetch, region)

    def testOutOfRangeLargeNewFormat(self):
        '''very large coordinates given as separate start/end arguments.'''
        start = 99999999999999999
        end = 999999999999999999
        self.assertRaises(ValueError, self.samfile.fetch, "chr1", start, end)

    def testOutOfRangeLargeOldFormat(self):
        '''very large coordinates given as a single region string.'''
        region = "chr1:99999999999999999-999999999999999999"
        self.assertRaises(ValueError, self.samfile.fetch, region)