From 6a7c3f175b210cc16d09a5e8e4c1d47333dbe1c6 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 19 Nov 2010 11:43:43 -0800 Subject: [PATCH] Imported Upstream version 0.3 --- MANIFEST.in | 10 + PKG-INFO | 2 +- pysam/__init__.py | 12 +- pysam/csamtools.pxd | 84 ++-- pysam/csamtools.pyx | 646 ++++++++++++++++++-------- pysam/ctabix.pxd | 171 +++++++ pysam/ctabix.pyx | 881 ++++++++++++++++++++++++++++++++++++ pysam/pysam_util.c | 229 +--------- pysam/pysam_util.h | 69 +-- pysam/version.py | 7 + samtools/bam.c | 55 ++- samtools/bam.h | 107 +++-- samtools/bam_aux.c | 2 +- samtools/bam_import.c | 26 +- samtools/bam_index.c | 216 +++++++-- samtools/bam_maqcns.c | 33 +- samtools/bam_maqcns.h | 5 +- samtools/bam_md.c | 34 +- samtools/bam_pileup.c | 342 ++++++++++---- samtools/bam_plcmd.c | 250 ++++++++-- samtools/bam_reheader.c | 60 +++ samtools/bam_sort.c | 4 +- samtools/bam_tview.c | 19 +- samtools/bgzf.c | 76 ++-- samtools/bgzf.h | 25 +- samtools/faidx.c | 2 +- samtools/knetfile.c | 6 +- samtools/kstring.h | 34 ++ samtools/sam.c | 1 + samtools/sam_header.c | 38 +- samtools/sam_view.c | 43 +- setup.py | 78 +++- tabix/bam_endian.h | 42 ++ tabix/bgzf.c | 676 +++++++++++++++++++++++++++ tabix/bgzf.h | 156 +++++++ tabix/bgzip.c | 201 +++++++++ tabix/index.c | 954 +++++++++++++++++++++++++++++++++++++++ tabix/khash.h | 486 ++++++++++++++++++++ tabix/knetfile.c | 632 ++++++++++++++++++++++++++ tabix/knetfile.h | 75 +++ tabix/ksort.h | 271 +++++++++++ tabix/kstring.c | 165 +++++++ tabix/kstring.h | 68 +++ tabix/tabix.h | 137 ++++++ tests/ex3.sam | 2 +- tests/example.gtf.gz | Bin 0 -> 3778 bytes tests/example.gtf.gz.tbi | Bin 0 -> 260 bytes tests/pysam_test.py | 148 +++++- tests/tabix_test.py | 225 +++++++++ 49 files changed, 6938 insertions(+), 867 deletions(-) create mode 100644 pysam/ctabix.pxd create mode 100644 pysam/ctabix.pyx create mode 100644 pysam/version.py create mode 100644 samtools/bam_reheader.c create mode 100644 tabix/bam_endian.h create mode 100644 
tabix/bgzf.c create mode 100644 tabix/bgzf.h create mode 100644 tabix/bgzip.c create mode 100644 tabix/index.c create mode 100644 tabix/khash.h create mode 100644 tabix/knetfile.c create mode 100644 tabix/knetfile.h create mode 100644 tabix/ksort.h create mode 100644 tabix/kstring.c create mode 100644 tabix/kstring.h create mode 100644 tabix/tabix.h create mode 100644 tests/example.gtf.gz create mode 100644 tests/example.gtf.gz.tbi create mode 100644 tests/tabix_test.py diff --git a/MANIFEST.in b/MANIFEST.in index 11fb9d1..4bbbc8e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,8 +9,12 @@ include INSTALL include KNOWN_BUGS include THANKS include pysam/csamtools.pxd +include pysam/ctabix.pxd include pysam/pysam_util.h include samtools/*.h +include tabix/*.h + +# pysam tests include tests/00README.txt include tests/Makefile include tests/ex1.fa @@ -24,3 +28,9 @@ include tests/example.py include tests/pysam_test.py include tests/segfault_tests.py +# tabix tests +include tests/tabix_test.py +include tests/example.gtf.gz +include tests/example.gtf.gz.tbi + + diff --git a/PKG-INFO b/PKG-INFO index 3e3b745..b95ed79 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: pysam -Version: 0.2 +Version: 0.3 Summary: pysam Home-page: http://code.google.com/p/pysam/ Author: Andreas Heger diff --git a/pysam/__init__.py b/pysam/__init__.py index 3062753..9f257c2 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -1,4 +1,7 @@ from csamtools import * +from ctabix import * +import csamtools +import ctabix import Pileup import sys import os @@ -50,7 +53,9 @@ class SamtoolsDispatcher(object): # Note that there is sometimes output on stderr that is not an error, # for example: [sam_header_read2] 2 sequences loaded. 
# Ignore messages like these - stderr = [ x for x in stderr if not x.startswith( "[sam_header_read2]" ) ] + stderr = [ x for x in stderr \ + if not x.startswith( "[sam_header_read2]" ) or \ + x.startswith("[bam_index_load]") ] if stderr: raise SamtoolsError( "\n".join( stderr ) ) # call parser for stdout: @@ -96,6 +101,9 @@ for key, options in SAMTOOLS_DISPATCH.iteritems(): globals()[key] = SamtoolsDispatcher(cmd, parser) # hack to export all the symbols from csamtools -__all__ = csamtools.__all__ + [ "SamtoolsError", "SamtoolsDispatcher" ] + list(SAMTOOLS_DISPATCH) +\ +__all__ = csamtools.__all__ + \ + ctabix.__all__ + \ + [ "SamtoolsError", "SamtoolsDispatcher" ] + list(SAMTOOLS_DISPATCH) +\ ["Pileup",] +from version import __version__, __samtools_version__ diff --git a/pysam/csamtools.pxd b/pysam/csamtools.pxd index 7dac38d..b614a84 100644 --- a/pysam/csamtools.pxd +++ b/pysam/csamtools.pxd @@ -27,7 +27,7 @@ cdef extern from "stdio.h": FILE * stdout int fclose(FILE *) int sscanf(char *str,char *fmt,...) - int printf(char *str,char *fmt,...) + int printf(char *fmt,...) int sprintf(char *str,char *fmt,...) int fprintf(FILE *ifile,char *fmt,...) 
char *fgets(char *str,int size,FILE *ifile) @@ -50,6 +50,9 @@ cdef extern from "string.h": size_t strlen(char *s) int memcmp( void * s1, void *s2, size_t len ) +cdef extern from "Python.h": + long _Py_HashPointer(void*) + cdef extern from "razf.h": pass @@ -120,15 +123,37 @@ cdef extern from "bam.h": ctypedef struct bam_plbuf_t: pass + ctypedef struct bam_iter_t: + pass + + bam1_t * bam_init1() + void bam_destroy1(bam1_t *) + bamFile razf_dopen(int data_fd, char *mode) - # removed - macros not found + int64_t bam_seek( bamFile fp, uint64_t voffset, int where) + int64_t bam_tell( bamFile fp ) - # int64_t bam_seek( bamFile fp, uint64_t voffset, int where) - # int64_t bam_tell( bamFile fp ) - # void bam_destroy1( bam1_t * b) # void bam_init_header_hash(bam_header_t *header) + ############################################### + # stand-ins for samtools macros + uint32_t * bam1_cigar( bam1_t * b) + char * bam1_qname( bam1_t * b) + uint8_t * bam1_seq( bam1_t * b) + uint8_t * bam1_qual( bam1_t * b) + uint8_t * bam1_aux( bam1_t * b) + + ############################################### + # bam iterator interface + bam_iter_t bam_iter_query( bam_index_t *idx, int tid, int beg, int end) + + int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) + + void bam_iter_destroy(bam_iter_t iter) + + ############################################### + bam1_t * bam_dup1( bam1_t *src ) bam1_t * bam_copy1(bam1_t *bdst, bam1_t *bsrc) @@ -138,6 +163,7 @@ cdef extern from "bam.h": int bam_parse_region(bam_header_t *header, char *str, int *ref_id, int *begin, int *end) + ############################################### bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) int bam_fetch(bamFile fp, bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) @@ -145,6 +171,22 @@ cdef extern from "bam.h": int bam_plbuf_push(bam1_t *b, bam_plbuf_t *buf) void bam_plbuf_destroy(bam_plbuf_t *buf) + ######################################## + # pileup iterator interface + ctypedef 
struct bam_plp_t: + pass + + ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) + + bam_plp_t bam_plp_init( bam_plp_auto_f func, void *data) + int bam_plp_push( bam_plp_t iter, bam1_t *b) + bam_pileup1_t *bam_plp_next( bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) + bam_pileup1_t *bam_plp_auto( bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) + void bam_plp_set_mask(bam_plp_t iter, int mask) + void bam_plp_reset(bam_plp_t iter) + void bam_plp_destroy(bam_plp_t iter) + + ################################################## int bam_read1(bamFile fp, bam1_t *b) @@ -209,15 +251,20 @@ cdef extern from "faidx.h": char *fai_fetch(faidx_t *fai, char *reg, int *len) -cdef extern from "pysam_util.h": + int faidx_fetch_nseq(faidx_t *fai) - int pysam_bam_plbuf_push(bam1_t *b, bam_plbuf_t *buf, int cont) + char *faidx_fetch_seq(faidx_t *fai, char *c_name, + int p_beg_i, int p_end_i, int *len) - int pysam_get_pos( bam_plbuf_t *buf) +cdef extern from "pysam_util.h": - int pysam_get_tid( bam_plbuf_t *buf) + int pysam_pileup_next(bam1_t *b, + bam_plbuf_t *buf, + bam_pileup1_t ** plp, + int * tid, + int * pos, + int * n_plp ) - bam_pileup1_t * pysam_get_pileup( bam_plbuf_t *buf) int pysam_dispatch(int argc, char *argv[] ) @@ -233,19 +280,4 @@ cdef extern from "pysam_util.h": # translate char to unsigned char unsigned char pysam_translate_sequence( char s ) - # stand-ins for samtools macros - uint32_t * pysam_bam1_cigar( bam1_t * b) - char * pysam_bam1_qname( bam1_t * b) - uint8_t * pysam_bam1_seq( bam1_t * b) - uint8_t * pysam_bam1_qual( bam1_t * b) - uint8_t * pysam_bam1_aux( bam1_t * b) - - # iterator implemenation - ctypedef struct bam_fetch_iterator_t: - pass - - bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, bam_index_t *idx, int tid, int beg, int end) - - bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter) - - void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter) + diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx index 0da8d9e..242e68a 100644 
--- a/pysam/csamtools.pyx +++ b/pysam/csamtools.pyx @@ -1,8 +1,12 @@ # cython: embedsignature=True +# cython: profile=True # adds doc-strings for sphinx import tempfile, os, sys, types, itertools, struct, ctypes +from python_string cimport PyString_FromStringAndSize, PyString_AS_STRING +from python_exc cimport PyErr_SetString + # defines imported from samtools DEF SEEK_SET = 0 DEF SEEK_CUR = 1 @@ -36,6 +40,14 @@ DEF BAM_FDUP =1024 DEF BAM_CIGAR_SHIFT=4 DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1) +DEF BAM_CMATCH = 0 +DEF BAM_CINS = 1 +DEF BAM_CDEL = 2 +DEF BAM_CREF_SKIP = 3 +DEF BAM_CSOFT_CLIP = 4 +DEF BAM_CHARD_CLIP = 5 +DEF BAM_CPAD = 6 + ##################################################################### ##################################################################### ##################################################################### @@ -48,15 +60,17 @@ cdef makeAlignedRead( bam1_t * src): dest = AlignedRead() # destroy dummy delegate created in constructor # to prevent memory leak. 
- pysam_bam_destroy1(dest._delegate) + bam_destroy1(dest._delegate) dest._delegate = bam_dup1(src) return dest cdef class PileupProxy -cdef makePileupProxy( bam_plbuf_t * buf, int n ): +cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ): cdef PileupProxy dest dest = PileupProxy() - dest.buf = buf + dest.plp = plp + dest.tid = tid + dest.pos = pos dest.n = n return dest @@ -127,6 +141,7 @@ cdef int pileup_callback( uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, p.n = n pileups = [] + cdef int x for x from 0 <= x < n: pileups.append( makePileupRead( &(pl[x]) ) ) p.pileups = pileups @@ -185,6 +200,7 @@ VALID_HEADER_ORDER = { "HD" : ( "VN", "SO", "GO" ), "RG" : ( "ID", "SM", "LB", "DS" , "PU" , "PI" , "CN" , "DT", "PL" ), "PG" : ( "ID", "VN", "CL" ), } + ###################################################################### ###################################################################### ###################################################################### @@ -229,9 +245,12 @@ cdef class Samfile: cdef bam_index_t *index # true if file is a bam file cdef int isbam - + # true if file is not on the local filesystem + cdef int isremote # current read within iteration cdef bam1_t * b + # file opening mode + cdef char * mode def __cinit__(self, *args, **kwargs ): self.samfile = NULL @@ -251,12 +270,13 @@ cdef class Samfile: def _open( self, char * filename, - mode ='r', + mode = 'r', Samfile template = None, referencenames = None, referencelengths = None, - char * text = NULL, + text = None, header = None, + port = None, ): '''open a sam/bam file. 
@@ -277,6 +297,12 @@ cdef class Samfile: self.isbam = len(mode) > 1 and mode[1] == 'b' + self.isremote = strncmp(filename,"http:",5) == 0 or \ + strncmp(filename,"ftp:",4) == 0 + + cdef char * ctext + ctext = NULL + if mode[0] == 'w': # open file for writing @@ -306,11 +332,12 @@ cdef class Samfile: header_to_write.target_name[x] = calloc(len(name)+1, sizeof(char)) strncpy( header_to_write.target_name[x], name, len(name) ) - if text != NULL: + if text != None: # copy without \0 - header_to_write.l_text = strlen(text) - header_to_write.text = calloc( strlen(text), sizeof(char) ) - memcpy( header_to_write.text, text, strlen(text) ) + ctext = text + header_to_write.l_text = strlen(ctext) + header_to_write.text = calloc( strlen(ctext), sizeof(char) ) + memcpy( header_to_write.text, ctext, strlen(ctext) ) header_to_write.hash = NULL header_to_write.rg2lib = NULL @@ -327,7 +354,9 @@ cdef class Samfile: elif mode[0] == "r": # open file for reading - if strncmp( filename, "-", 1) != 0 and not os.path.exists( filename ): + if strncmp( filename, "-", 1) != 0 and \ + not self.isremote and \ + not os.path.exists( filename ): raise IOError( "file `%s` not found" % filename) store = StderrStore() @@ -337,15 +366,22 @@ cdef class Samfile: if self.samfile == NULL: raise IOError("could not open file `%s`" % filename ) + # check for index and open if present if mode[0] == "r" and self.isbam: - if not os.path.exists(filename + ".bai"): - self.index = NULL + + if not self.isremote: + if not os.path.exists(filename +".bai"): + self.index = NULL + else: + # returns NULL if there is no index or index could not be opened + self.index = bam_index_load(filename) + if self.index == NULL: + raise IOError("error while opening index `%s` " % filename ) else: - # returns NULL if there is no index or index could not be opened self.index = bam_index_load(filename) if self.index == NULL: raise IOError("error while opening index `%s` " % filename ) - + def getrname( self, tid ): '''(tid ) convert 
numerical :term:`tid` into :ref:`reference` name.''' @@ -394,6 +430,22 @@ cdef class Samfile: if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend ) return region, rtid, rstart, rend + + def seek( self, uint64_t offset, int where = 0): + '''move to current file to position *offset*''' + + if not self._isOpen(): + raise ValueError( "I/O operation on closed file" ) + if not self.isbam: + raise NotImplementedError("seek only available in bam files") + return bam_seek( self.samfile.x.bam, offset, where ) + + def tell( self ): + '''return current file position''' + if not self.isbam: + raise NotImplementedError("seek only available in bam files") + + return bam_tell( self.samfile.x.bam ) def fetch( self, reference = None, @@ -428,16 +480,24 @@ cdef class Samfile: if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) - + region, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) if self.isbam: + if not until_eof and not self._hasIndex() and not self.isremote: + raise ValueError( "fetch called on bamfile without index" ) + if callback: if not region: raise ValueError( "callback functionality requires a region/reference" ) if not self._hasIndex(): raise ValueError( "no index available for fetch" ) return bam_fetch(self.samfile.x.bam, - self.index, rtid, rstart, rend, callback, fetch_callback ) + self.index, + rtid, + rstart, + rend, + callback, + fetch_callback ) else: if region: return IteratorRow( self, rtid, rstart, rend ) @@ -453,7 +513,12 @@ cdef class Samfile: for rtid from 0 <= rtid < self.nreferences: i.append( IteratorRow( self, rtid, rstart, rend)) return itertools.chain( *i ) - else: + else: + # check if header is present - otherwise sam_read1 aborts + # this happens if a bamfile is opened with mode 'r' + if self.samfile.header.n_targets == 0: + raise ValueError( "fetch called for samfile without header") + if region != None: raise ValueError ("fetch for a region is not available for sam files" ) 
if callback: @@ -528,11 +593,11 @@ cdef class Samfile: self.samfile = NULL def __dealloc__( self ): - '''clean up.''' # remember: dealloc cannot call other methods - # Note that __del__ is not called. + # note: no doc string + # note: __del__ is not called. self.close() - pysam_bam_destroy1(self.b) + bam_destroy1(self.b) def write( self, AlignedRead read ): '''(AlignedRead read ) @@ -542,6 +607,13 @@ cdef class Samfile: ''' return samwrite( self.samfile, read._delegate ) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + property nreferences: '''number of :term:`reference` sequences in the file.''' def __get__(self): @@ -567,13 +639,7 @@ cdef class Samfile: property text: '''full contents of the :term:`sam file` header as a string.''' def __get__(self): - # create a temporary 0-terminated copy - cdef char * t - t = calloc( self.samfile.header.l_text + 1, sizeof(char) ) - memcpy( t, self.samfile.header.text, self.samfile.header.l_text ) - result = t - free(t) - return result + return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text) property header: '''header information within the :term:`sam file`. The records and fields are returned as @@ -730,6 +796,10 @@ cdef class Fastafile: '''return true if samfile has been opened.''' return self.fastafile != NULL + def __len__(self): + assert self.fastafile != NULL + return faidx_fetch_nseq(self.fastafile) + def _open( self, char * filename ): '''open an indexed fasta file. @@ -758,8 +828,14 @@ cdef class Fastafile: '''*(reference = None, start = None, end = None, region = None)* - fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. The region is specified by - :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. + fetch :meth:`AlignedRead` objects in a :term:`region` using 0-based indexing. 
+ The region is specified by :term:`reference`, *start* and *end*. + + If *reference* is given and *start* is None, the sequence from the + first base is returned. Similarly, if *end* is None, the sequence + until the last base is returned. + + Alternatively, a samtools :term:`region` string can be supplied. ''' if not self._isOpen(): @@ -770,59 +846,85 @@ cdef class Fastafile: max_pos = 2 << 29 if not region: - if reference == None: raise ValueError( 'no sequence/region supplied.' ) - if start == None and end == None: - region = "%s" % str(reference) - elif start == None or end == None: - raise ValueError( 'only start or only end of region supplied' ) - else: - if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) ) - # valid ranges are from 0 to 2^29-1 - if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start ) - if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end ) - region = "%s:%i-%i" % (reference, start+1, end ) - - # samtools adds a '\0' at the end - seq = fai_fetch( self.fastafile, region, &len ) + if reference is None: raise ValueError( 'no sequence/region supplied.' 
) + if start is None: start = 0 + if end is None: end = max_pos -1 + + if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) ) + if start == end: return "" + # valid ranges are from 0 to 2^29-1 + if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start ) + if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end ) + + seq = faidx_fetch_seq(self.fastafile, reference, + start, + end-1, &len) + else: + # samtools adds a '\0' at the end + seq = fai_fetch( self.fastafile, region, &len ) + # copy to python - result = seq - # clean up - free(seq) + if seq == NULL: + return "" + else: + result = seq + # clean up + free(seq) return result +########################################################################### +########################################################################### +########################################################################### ## turning callbacks elegantly into iterators is an unsolved problem, see the following threads: ## http://groups.google.com/group/comp.lang.python/browse_frm/thread/0ce55373f128aa4e/1d27a78ca6408134?hl=en&pli=1 ## http://www.velocityreviews.com/forums/t359277-turning-a-callback-function-into-a-generator.html ## Thus I chose to rewrite the functions requiring callbacks. The downside is that if the samtools C-API or code ## changes, the changes have to be manually entered. - cdef class IteratorRow: """iterates over mapped reads in a region. + + The samtools iterators assume that the file + position between iterations do not change. + As a consequence, no two iterators can work + on the same file. To permit this, each iterator + creates its own file handle by re-opening the + file. + + Note that the index will be shared between + samfile and the iterator. 
""" - cdef bam_fetch_iterator_t* bam_iter # iterator state object + cdef bam_iter_t iter # iterator state object cdef bam1_t * b - cdef error_msg - cdef int error_state + cdef int retval cdef Samfile samfile + cdef samfile_t * fp + def __cinit__(self, Samfile samfile, int tid, int beg, int end ): - self.bam_iter = NULL assert samfile._isOpen() assert samfile._hasIndex() # makes sure that samfile stays alive as long as the - # iterator is alive. + # iterator is alive self.samfile = samfile - # parse the region - self.error_state = 0 - self.error_msg = None + if samfile.isbam: mode = "rb" + else: mode = "r" + + # reopen the file + store = StderrStore() + self.fp = samopen( samfile.filename, mode, NULL ) + store.release() - cdef bamFile fp - fp = samfile.samfile.x.bam - self.bam_iter = bam_init_fetch_iterator(fp, samfile.index, tid, beg, end) + self.retval = 0 + + self.iter = bam_iter_query(self.samfile.index, + tid, + beg, + end) + self.b = bam_init1() def __iter__(self): return self @@ -832,29 +934,21 @@ cdef class IteratorRow: cdef int cnext(self): '''cversion of iterator. Used by IteratorColumn''' - self.b = bam_fetch_iterate(self.bam_iter) - if self.b == NULL: return 0 - return 1 - + self.retval = bam_iter_read( self.fp.x.bam, + self.iter, + self.b) + def __next__(self): """python version of next(). 
- - pyrex uses this non-standard name instead of next() """ - if self.error_state: - raise ValueError( self.error_msg) - - self.b = bam_fetch_iterate(self.bam_iter) - if self.b != NULL: - return makeAlignedRead( self.b ) - else: - raise StopIteration + self.cnext() + if self.retval < 0: raise StopIteration + return makeAlignedRead( self.b ) def __dealloc__(self): - '''remember: dealloc cannot call other methods!''' - if self.bam_iter: - bam_cleanup_fetch_iterator(self.bam_iter) - + bam_destroy1(self.b) + samclose( self.fp ) + cdef class IteratorRowAll: """iterates over all mapped reads """ @@ -866,7 +960,13 @@ cdef class IteratorRowAll: assert samfile._isOpen() - self.fp = samfile.samfile + if samfile.isbam: mode = "rb" + else: mode = "r" + + # reopen the file to avoid iterator conflict + store = StderrStore() + self.fp = samopen( samfile.filename, mode, NULL ) + store.release() # allocate memory for alignment self.b = calloc(1, sizeof(bam1_t)) @@ -895,9 +995,18 @@ cdef class IteratorRowAll: raise StopIteration def __dealloc__(self): - '''remember: dealloc cannot call other methods!''' - pysam_bam_destroy1(self.b) - + bam_destroy1(self.b) + samclose( self.fp ) + +ctypedef struct __iterdata: + bamFile fp + bam_iter_t iter + +cdef int __advance( void * data, bam1_t * b ): + cdef __iterdata * d + d = <__iterdata*>data + return bam_iter_read( d.fp, d.iter, b ) + cdef class IteratorColumn: '''iterates over columns. @@ -922,79 +1031,138 @@ cdef class IteratorColumn: Here, result will be a list of ``n`` lists of objects of type :class:`PileupRead`. 
''' - cdef bam_plbuf_t *buf - # check if first iteration - cdef int notfirst # result of the last plbuf_push - cdef int n_pu - cdef int eof cdef IteratorRow iter - + cdef int tid + cdef int pos + cdef int n_plp + cdef bam_pileup1_t * plp + cdef bam_plp_t pileup_iter + cdef __iterdata iterdata def __cinit__(self, Samfile samfile, int tid, int start, int end ): self.iter = IteratorRow( samfile, tid, start, end ) - self.buf = bam_plbuf_init(NULL, NULL ) - self.n_pu = 0 - self.eof = 0 + self.iterdata.fp = samfile.samfile.x.bam + self.iterdata.iter = self.iter.iter + + self.pileup_iter = bam_plp_init( &__advance, &self.iterdata ) + self.n_plp = 0 + self.tid = 0 + self.pos = 0 + self.plp = NULL def __iter__(self): return self cdef int cnext(self): '''perform next iteration. - - return 1 if there is a buffer to emit. Return 0 for end of iteration. ''' + self.plp = bam_plp_auto( self.pileup_iter, + &self.tid, + &self.pos, + &self.n_plp ) - cdef int retval1, retval2 + def __next__(self): + """python version of next(). - # pysam bam_plbuf_push returns: - # 1: if buf is full and can be emitted - # 0: if b has been added - # -1: if there was an error + pyrex uses this non-standard name instead of next() + """ + self.cnext() + if self.n_plp < 0: + raise ValueError("error during iteration" ) + + if self.plp == NULL: + raise StopIteration - # check if previous plbuf was incomplete. 
If so, continue within - # the loop and yield if necessary - if self.n_pu > 0: - self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 1) - if self.n_pu > 0: return 1 + return makePileupProxy( self.plp, self.tid, self.pos, self.n_plp ) - if self.eof: return 0 + def __dealloc__(self): + bam_plp_destroy(self.pileup_iter) + +cdef inline int32_t query_start(bam1_t *src) except -1: + cdef uint32_t * cigar_p, op + cdef uint32_t k + cdef uint32_t start_offset = 0 + + if src.core.n_cigar: + cigar_p = bam1_cigar(src); + for k from 0 <= k < src.core.n_cigar: + op = cigar_p[k] & BAM_CIGAR_MASK + if op==BAM_CHARD_CLIP: + if start_offset!=0 and start_offset!=src.core.l_qseq: + PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string') + return -1 + elif op==BAM_CSOFT_CLIP: + start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT + else: + break + + return start_offset + + +cdef inline int32_t query_end(bam1_t *src) except -1: + cdef uint32_t * cigar_p, op + cdef uint32_t k + cdef uint32_t end_offset = src.core.l_qseq + + if src.core.n_cigar>1: + cigar_p = bam1_cigar(src); + for k from src.core.n_cigar > k >= 1: + op = cigar_p[k] & BAM_CIGAR_MASK + if op==BAM_CHARD_CLIP: + if end_offset!=0 and end_offset!=src.core.l_qseq: + PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string') + return -1 + elif op==BAM_CSOFT_CLIP: + end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT + else: + break - # get next alignments and submit until plbuf indicates that - # an new column has finished - while self.n_pu == 0: - retval1 = self.iter.cnext() - # wrap up if no more input - if retval1 == 0: - self.n_pu = pysam_bam_plbuf_push( NULL, self.buf, 0) - self.eof = 1 - return self.n_pu + if end_offset==0: + end_offset = src.core.l_qseq - # submit to plbuf - self.n_pu = pysam_bam_plbuf_push( self.iter.getCurrent(), self.buf, 0) - if self.n_pu < 0: raise ValueError( "error while iterating" ) + return end_offset - # plbuf has yielded - return 1 - def __next__(self): - """python version of 
next(). +cdef inline object get_seq_range(bam1_t *src, uint32_t start, uint32_t end): + cdef uint8_t * p + cdef uint32_t k + cdef char * s + cdef char * bam_nt16_rev_table = "=ACMGRSVTWYHKDBN" - pyrex uses this non-standard name instead of next() - """ - cdef int ret - ret = self.cnext() - cdef bam_pileup1_t * pl + if not src.core.l_qseq: + return None - if ret > 0 : - return makePileupProxy( self.buf, self.n_pu ) - else: - raise StopIteration + seq = PyString_FromStringAndSize(NULL, end-start) + s = PyString_AS_STRING(seq) + p = bam1_seq(src) - def __dealloc__(self): - bam_plbuf_destroy(self.buf); + for k from start <= k < end: + # equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c) + # note: do not use string literal as it will be a python string + s[k-start] = bam_nt16_rev_table[p[k/2] >> 4 * (1 - k%2) & 0xf] + + return seq + + +cdef inline object get_qual_range(bam1_t *src, uint32_t start, uint32_t end): + cdef uint8_t * p + cdef uint32_t k + cdef char * q + + p = bam1_qual(src) + if p[0] == 0xff: + return None + + qual = PyString_FromStringAndSize(NULL, end-start) + q = PyString_AS_STRING(qual) + + for k from start <= k < end: + ## equivalent to t[i] + 33 (see bam.c) + q[k-start] = p[k] + 33 + + return qual cdef class AlignedRead: ''' @@ -1030,8 +1198,7 @@ cdef class AlignedRead: self._delegate.data_len = 0 def __dealloc__(self): - '''clear up memory.''' - pysam_bam_destroy1(self._delegate) + bam_destroy1(self._delegate) def __str__(self): """todo""" @@ -1046,10 +1213,12 @@ cdef class AlignedRead: self.tags))) - def __cmp__(self, AlignedRead other): - '''return true, if contents in this are binary equal to ``other``.''' + def compare(self, AlignedRead other): + '''return -1,0,1, if contents in this are binary <,=,> to *other*''' + cdef int retval, x cdef bam1_t *t, *o + t = self._delegate o = other._delegate @@ -1062,16 +1231,20 @@ cdef class AlignedRead: # oo = (o.data) # for x from 0 <= x < max(t.data_len, o.data_len): print x, tt[x], oo[x], 
chr(tt[x]), chr(oo[x]) - retval = memcmp( &t.core, - &o.core, - sizeof( bam1_core_t )) + # Fast-path test for object identity + if t==o: + return 0 + + retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t)) if retval: return retval - retval = cmp( t.data_len, o.data_len) + retval = cmp(t.data_len, o.data_len) if retval: return retval - return memcmp( t.data, - o.data, - sizeof( t.data_len )) + return memcmp(t.data, o.data, t.data_len) + + # Disabled so long as __cmp__ is a special method + def __hash__(self): + return _Py_HashPointer(self) property qname: """the query name (None if not present)""" @@ -1079,7 +1252,7 @@ cdef class AlignedRead: cdef bam1_t * src src = self._delegate if src.core.l_qname == 0: return None - return pysam_bam1_qname( src ) + return bam1_qname( src ) def __set__(self, qname ): if qname == None or len(qname) == 0: return @@ -1088,7 +1261,7 @@ cdef class AlignedRead: cdef char * p src = self._delegate - p = pysam_bam1_qname( src ) + p = bam1_qname( src ) # the qname is \0 terminated l = len(qname) + 1 @@ -1101,7 +1274,7 @@ cdef class AlignedRead: # re-acquire pointer to location in memory # as it might have moved - p = pysam_bam1_qname(src) + p = bam1_qname(src) strncpy( p, qname, l ) @@ -1112,11 +1285,13 @@ cdef class AlignedRead: cdef uint32_t * cigar_p cdef bam1_t * src cdef op, l, cigar + cdef int k + src = self._delegate if src.core.n_cigar == 0: return None cigar = [] - cigar_p = pysam_bam1_cigar(src); + cigar_p = bam1_cigar(src); for k from 0 <= k < src.core.n_cigar: op = cigar_p[k] & BAM_CIGAR_MASK l = cigar_p[k] >> BAM_CIGAR_SHIFT @@ -1135,7 +1310,7 @@ cdef class AlignedRead: src = self._delegate # get location of cigar string - p = pysam_bam1_cigar(src) + p = bam1_cigar(src) # create space for cigar data within src.data pysam_bam_update( src, @@ -1148,7 +1323,7 @@ cdef class AlignedRead: # re-acquire pointer to location in memory # as it might have moved - p = pysam_bam1_cigar(src) + p = bam1_cigar(src) # insert cigar operations 
for op, l in values: @@ -1159,24 +1334,16 @@ cdef class AlignedRead: src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, p)) property seq: - """the query sequence (None if not present)""" + """read sequence bases, including :term:`soft clipped` bases (None if not present)""" def __get__(self): cdef bam1_t * src - cdef uint8_t * p cdef char * s + src = self._delegate - bam_nt16_rev_table = "=ACMGRSVTWYHKDBN" - ## parse qseq (bam1_seq) + if src.core.l_qseq == 0: return None - s = < char *> calloc(src.core.l_qseq + 1 , sizeof(char)) - p = pysam_bam1_seq( src ) - for k from 0 <= k < src.core.l_qseq: - ## equivalent to bam_nt16_rev_table[bam1_seqi(s, i)] (see bam.c) - s[k] = "=ACMGRSVTWYHKDBN"[((p)[(k) / 2] >> 4 * (1 - (k) % 2) & 0xf)] - retval=s - free(s) - return retval + return get_seq_range(src, 0, src.core.l_qseq) def __set__(self,seq): # samtools manages sequence and quality length memory together @@ -1186,9 +1353,10 @@ cdef class AlignedRead: cdef bam1_t * src cdef uint8_t * p cdef char * s - src = self._delegate cdef int l, k, nbytes_new, nbytes_old + src = self._delegate + l = len(seq) # as the sequence is stored in half-bytes, the total length (sequence @@ -1196,7 +1364,7 @@ cdef class AlignedRead: nbytes_new = (l+1)/2 + l nbytes_old = (src.core.l_qseq+1)/2 + src.core.l_qseq # acquire pointer to location in memory - p = pysam_bam1_seq( src ) + p = bam1_seq( src ) src.core.l_qseq = l pysam_bam_update( src, @@ -1205,7 +1373,7 @@ cdef class AlignedRead: p) # re-acquire pointer to location in memory # as it might have moved - p = pysam_bam1_seq( src ) + p = bam1_seq( src ) for k from 0 <= k < nbytes_new: p[k] = 0 # convert to C string s = seq @@ -1213,38 +1381,32 @@ cdef class AlignedRead: p[k/2] |= pysam_translate_sequence(s[k]) << 4 * (1 - k % 2) # erase qualities - p = pysam_bam1_qual( src ) + p = bam1_qual( src ) p[0] = 0xff + property qual: - """the base quality (None if not present)""" + """read sequence base qualities, including :term:`soft 
clipped` bases (None if not present)""" def __get__(self): - cdef bam1_t * src - cdef uint8_t * p + + cdef bam1_t * src cdef char * q + src = self._delegate - if src.core.l_qseq == 0: return None - p = pysam_bam1_qual( src ) - if p[0] == 0xff: return None + if src.core.l_qseq == 0: return None - q = < char *>calloc(src.core.l_qseq + 1 , sizeof(char)) - for k from 0 <= k < src.core.l_qseq: - ## equivalent to t[i] + 33 (see bam.c) - q[k] = p[k] + 33 - # convert to python string - retval=q - # clean up - free(q) - return retval + return get_qual_range(src, 0, src.core.l_qseq) def __set__(self,qual): # note that space is already allocated via the sequences cdef bam1_t * src cdef uint8_t * p cdef char * q + cdef int k + src = self._delegate - p = pysam_bam1_qual( src ) + p = bam1_qual( src ) if qual == None or len(qual) == 0: # if absent - set to 0xff p[0] = 0xff @@ -1259,8 +1421,74 @@ cdef class AlignedRead: for k from 0 <= k < l: p[k] = q[k] - 33 + property query: + """aligned portion of the read and excludes any flanking bases that were :term:`soft clipped` (None if not present) + + SAM/BAM files may included extra flanking bases sequences that were + not part of the alignment. These bases may be the result of the + Smith-Waterman or other algorithms, which may not require alignments + that begin at the first residue or end at the last. 
In addition, + extra sequencing adapters, multiplex identifiers, and low-quality bases that + were not considered for alignment may have been retained.""" + + def __get__(self): + cdef bam1_t * src + cdef uint32_t start, end + cdef char * s + + src = self._delegate + + if src.core.l_qseq == 0: return None + + start = query_start(src) + end = query_end(src) + + return get_seq_range(src, start, end) + + property qqual: + """aligned query sequence quality values (None if not present)""" + def __get__(self): + cdef bam1_t * src + cdef uint32_t start, end + cdef char * q + + src = self._delegate + + if src.core.l_qseq == 0: return None + + start = query_start(src) + end = query_end(src) + + return get_qual_range(src, start, end) + + property qstart: + """start index of the aligned query portion of the sequence (0-based, inclusive)""" + def __get__(self): + return query_start(self._delegate) + + property qend: + """end index of the aligned query portion of the sequence (0-based, exclusive)""" + def __get__(self): + return query_end(self._delegate) + + property qlen: + """Length of the aligned query sequence""" + def __get__(self): + cdef bam1_t * src + src = self._delegate + return query_end(src)-query_start(src) + property tags: - """the tags in the AUX field.""" + """the tags in the AUX field. + This property permits convenience access to + the tags. Changes it the returned list will + not update the tags automatically. Instead, + the following is required for adding a + new tag:: + + read.tags = read.tags + [("RG",0)] + + """ def __get__(self): cdef char * ctag cdef bam1_t * src @@ -1270,7 +1498,7 @@ cdef class AlignedRead: src = self._delegate if src.l_aux == 0: return None - s = pysam_bam1_aux( src ) + s = bam1_aux( src ) result = [] ctag = calloc( 3, sizeof(char) ) cdef int x @@ -1290,27 +1518,27 @@ cdef class AlignedRead: # how do I do char literal comparison in cython? 
# the code below works (i.e, is C comparison) tpe = toupper(s[0]) - if tpe == 'S'[0]: + if tpe == 'S': value = bam_aux2i(s) s += 2 - elif tpe == 'I'[0]: + elif tpe == 'I': value = bam_aux2i(s) s += 4 - elif tpe == 'F'[0]: + elif tpe == 'F': value = bam_aux2f(s) s += 4 - elif tpe == 'D'[0]: + elif tpe == 'D': value = bam_aux2d(s) s += 8 - elif tpe == 'C'[0]: + elif tpe == 'C': value = bam_aux2i(s) s += 1 - elif tpe == 'A'[0]: + elif tpe == 'A': # there might a more efficient way # to convert a char into a string value = "%c" % bam_aux2A(s) s += 1 - elif tpe == 'Z'[0]: + elif tpe == 'Z': value = bam_aux2Z(s) # +1 for NULL terminated string s += len(value) + 1 @@ -1377,14 +1605,14 @@ cdef class AlignedRead: pysam_bam_update( src, src.l_aux, offset, - pysam_bam1_aux( src ) ) + bam1_aux( src ) ) src.l_aux = offset if offset == 0: return # get location of new data - s = pysam_bam1_aux( src ) + s = bam1_aux( src ) # check if there is direct path from buffer.raw to tmp cdef char * temp @@ -1416,7 +1644,7 @@ cdef class AlignedRead: cdef bam1_t * src src = self._delegate if src.core.n_cigar: - src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, pysam_bam1_cigar(src)) ) + src.core.bin = bam_reg2bin( src.core.pos, bam_calend( &src.core, bam1_cigar(src)) ) else: src.core.bin = bam_reg2bin( src.core.pos, src.core.pos + 1) self._delegate.core.pos = pos @@ -1427,6 +1655,27 @@ cdef class AlignedRead: property rlen: '''length of the read (read only). Returns 0 if not given.''' def __get__(self): return self._delegate.core.l_qseq + property aend: + '''aligned end position of the read (read only). Returns + None if not available.''' + def __get__(self): + cdef bam1_t * src + src = self._delegate + if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0: + return None + return bam_calend(&src.core, bam1_cigar(src)) + property alen: + '''aligned length of the read (read only). 
Returns None if + not available.''' + def __get__(self): + cdef bam1_t * src + src = self._delegate + if (self.flag & BAM_FUNMAP) or src.core.n_cigar == 0: + return None + return bam_calend(&src.core, + bam1_cigar(src)) - \ + self._delegate.core.pos + property mapq: """mapping quality""" def __get__(self): return self._delegate.core.qual @@ -1585,9 +1834,11 @@ cdef class PileupProxy: If the underlying engine iterator advances, the results of this column will change. ''' - cdef bam_plbuf_t * buf + cdef bam_pileup1_t * plp + cdef int tid + cdef int pos cdef int n_pu - + def __cinit__(self ): pass @@ -1598,7 +1849,7 @@ cdef class PileupProxy: property tid: '''the chromosome ID as is defined in the header''' - def __get__(self): return pysam_get_tid( self.buf ) + def __get__(self): return self.tid property n: '''number of reads mapping to this column.''' @@ -1606,18 +1857,17 @@ cdef class PileupProxy: def __set__(self, n): self.n_pu = n property pos: - def __get__(self): return pysam_get_pos( self.buf ) + def __get__(self): return self.pos property pileups: '''list of reads (:class:`pysam.PileupRead`) aligned to this column''' def __get__(self): - cdef bam_pileup1_t * pl - pl = pysam_get_pileup( self.buf ) + cdef int x pileups = [] # warning: there could be problems if self.n and self.buf are # out of sync. 
for x from 0 <= x < self.n_pu: - pileups.append( makePileupRead( &pl[x]) ) + pileups.append( makePileupRead( &(self.plp[x])) ) return pileups cdef class PileupRead: diff --git a/pysam/ctabix.pxd b/pysam/ctabix.pxd new file mode 100644 index 0000000..ef735b6 --- /dev/null +++ b/pysam/ctabix.pxd @@ -0,0 +1,171 @@ + +cdef extern from "string.h": + ctypedef int size_t + void *memcpy(void *dst,void *src,size_t len) + void *memmove(void *dst,void *src,size_t len) + void *memset(void *b,int c,size_t len) + char *strtok_r(char *str, char *delim, char **saveptr) + char *strncpy(char *dest, char *src, size_t n) + void *memchr(void *s, int c, size_t n) + +cdef extern from "stdlib.h": + void free(void *) + void *malloc(size_t) + void *calloc(size_t,size_t) + void *realloc(void *,size_t) + void qsort(void *base, size_t nmemb, size_t size, + int (*compar)(void *,void *)) + int c_abs "abs" (int) + int atoi( char *nptr) + long atol( char *nptr) + double atof( char *nptr) + +cdef extern from "stdio.h": + ctypedef struct FILE: + pass + FILE *fopen(char *,char *) + FILE *freopen(char *path, char *mode, FILE *stream) + int fileno(FILE *stream) + int dup2(int oldfd, int newfd) + int fflush(FILE *stream) + + FILE * stderr + FILE * stdout + int fclose(FILE *) + int sscanf(char *str,char *fmt,...) + int printf(char *str,char *fmt,...) + int sprintf(char *str,char *fmt,...) + int fprintf(FILE *ifile,char *fmt,...) 
+ char *fgets(char *str,int size,FILE *ifile) + +cdef extern from "ctype.h": + int toupper(int c) + int tolower(int c) + +cdef extern from "sys/types.h": + pass + +cdef extern from "sys/stat.h": + pass + +cdef extern from "fcntl.h": + int open(char *pathname, int flags) + +cdef extern from "unistd.h": + ctypedef int ssize_t + char *ttyname(int fd) + int isatty(int fd) + ssize_t read(int fd, void *buf, size_t count) + +cdef extern from "string.h": + int strcmp(char *s1, char *s2) + int strncmp(char *s1,char *s2,size_t len) + char *strcpy(char *dest,char *src) + char *strncpy(char *dest,char *src, size_t len) + char *strdup(char *) + char *strcat(char *,char *) + size_t strlen(char *s) + int memcmp( void * s1, void *s2, size_t len ) + +cdef extern from "stdint.h": + ctypedef int int64_t + ctypedef int int32_t + ctypedef int uint32_t + ctypedef int uint8_t + ctypedef int uint64_t + +cdef extern from "Python.h": + ctypedef struct FILE + FILE* PyFile_AsFile(object) + char *fgets(char *str, int size, FILE *ifile) + int feof(FILE *stream) + size_t strlen(char *s) + size_t getline(char **lineptr, size_t *n, FILE *stream) + char *strstr(char *, char *) + char *strchr(char *string, int c) + int fileno(FILE *stream) + +cdef extern from "bgzf.h": + + ctypedef struct BGZF: + pass + + int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) + + BGZF * bgzf_open(char * path, char * mode) + + int bgzf_write(BGZF * fp, void* data, int length) + + int bgzf_close(BGZF* fp) + +# tabix support +cdef extern from "tabix.h": + + ctypedef struct ti_index_t: + pass + + ctypedef struct tabix_t: + BGZF *fp + ti_index_t *idx + char *fn + char *fnidx + + ctypedef struct ti_iter_t: + pass + + ctypedef struct ti_conf_t: + int32_t preset + int32_t sc, bc, ec + int32_t meta_char, line_skip + + tabix_t *ti_open(char *fn, char *fnidx) + + int ti_lazy_index_load(tabix_t *t) + + void ti_close(tabix_t *t) + + ti_iter_t ti_query(tabix_t *t, char *name, int beg, int end) + ti_iter_t ti_queryi(tabix_t *t, int 
tid, int beg, int end) + ti_iter_t ti_querys(tabix_t *t, char *reg) + char * ti_read(tabix_t *t, ti_iter_t iter, int *len) + + # Get the list of sequence names. Each "char*" pointer points to a + # internal member of the index, so DO NOT modify the returned + # pointer; otherwise the index will be corrupted. The returned + # pointer should be freed by a single free() call by the routine + # calling this function. The number of sequences is returned at *n + char **ti_seqname(ti_index_t *idx, int *n) + + + # Destroy the iterator + void ti_iter_destroy(ti_iter_t iter) + + # Build the index for file . File .tbi will be generated + # and overwrite the file of the same name. Return -1 on failure. */ + int ti_index_build(char *fn, ti_conf_t *conf) + + #/* Load the index from file .tbi. If is a URL and the index + # * file is not in the working directory, .tbi will be + # * downloaded. Return NULL on failure. */ + ti_index_t *ti_index_load( char *fn) + + ti_index_t *ti_index_load_local(char *fnidx) + + #/* Destroy the index */ + void ti_index_destroy(ti_index_t *idx) + + #/* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. */ + int ti_parse_region( ti_index_t *idx, char *str, int *tid, int *begin, int *end) + + int ti_get_tid( ti_index_t *idx, char *name) + + # /* Get the iterator pointing to the first record at the current file + # * position. If the file is just openned, the iterator points to the + # * first record in the file. */ + ti_iter_t ti_iter_first() + + # /* Get the iterator pointing to the first record in region tid:beg-end */ + ti_iter_t ti_iter_query( ti_index_t *idx, int tid, int beg, int end) + + # /* Get the data line pointed by the iterator and iterate to the next record. 
*/ + # char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len) diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx new file mode 100644 index 0000000..8715e5d --- /dev/null +++ b/pysam/ctabix.pyx @@ -0,0 +1,881 @@ +# cython: embedsignature=True +# adds doc-strings for sphinx + +import tempfile, os, sys, types, itertools, struct, ctypes + +cdef class Tabixfile: + '''*(filename, mode='r')* + + opens a :term:`tabix file` for reading. A missing + index (*filename* + ".tbi") will raise an exception. + ''' + + cdef char * filename + + # pointer to tabixfile + cdef tabix_t * tabixfile + + def __cinit__(self, *args, **kwargs ): + self.tabixfile = NULL + self._open( *args, **kwargs ) + + def _isOpen( self ): + '''return true if samfile has been opened.''' + return self.tabixfile != NULL + + def _open( self, + char * filename, + mode ='r', + ): + '''open a :term:`tabix file` for reading. + ''' + + assert mode in ( "r",), "invalid file opening mode `%s`" % mode + + # close a previously opened file + if self.tabixfile != NULL: self.close() + self.tabixfile = NULL + + self.filename = filename + filename_index = filename + ".tbi" + + if mode[0] == 'w': + # open file for writing + pass + + elif mode[0] == "r": + # open file for reading + if not os.path.exists( self.filename ): + raise IOError( "file `%s` not found" % self.filename) + + if not os.path.exists( filename_index ): + raise IOError( "index `%s` not found" % filename_index) + + # open file and load index + self.tabixfile = ti_open( self.filename, filename_index ) + + if self.tabixfile == NULL: + raise IOError("could not open file `%s`" % filename ) + + def _parseRegion( self, + reference = None, + start = None, + end = None, + region = None ): + '''parse region information. + + raise ValueError for for invalid regions. + + returns a tuple of region, tid, start and end. Region + is a valid samtools :term:`region` or None if the region + extends over the whole file. 
+ + Note that regions are 1-based, while start,end are python coordinates. + ''' + ti_lazy_index_load( self.tabixfile ) + + cdef int rtid + cdef int rstart + cdef int rend + cdef int max_pos + max_pos = 2 << 29 + + rtid = rstart = rend = 0 + + # translate to a region + if reference: + if start != None and end != None: + region = "%s:%i-%i" % (reference, start+1, end) + elif start == None and end != None: + region = "%s:%i-%i" % (reference, 1, end) + elif end == None and start != None: + region = "%s:%i-%i" % (reference, start+1, max_pos-1) + else: + region = reference + + if region: + ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend) + if rtid < 0: raise ValueError( "invalid region `%s`" % region ) + if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) ) + if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart ) + if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend ) + + return region, rtid, rstart, rend + + def fetch( self, + reference = None, + start = None, + end = None, + region = None, + parser = None ): + ''' + + fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by + :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. + + Without *reference* or *region* all entries will be fetched. + + If only *reference* is set, all reads matching on *reference* will be fetched. + + If *parser* is None, the results are returned as an unparsed string. + Otherwise, *parser* is assumed to be a functor that will return parsed + data (see for example :meth:`asTuple` and :meth:`asGTF`). 
+ ''' + ti_lazy_index_load( self.tabixfile ) + + if not self._isOpen(): + raise ValueError( "I/O operation on closed file" ) + + region, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) + + if parser == None: + if region: + return TabixIterator( self, rtid, rstart, rend ) + else: + return TabixIterator( self, -1, 0, 0 ) + else: + if region: + return TabixIteratorParsed( self, rtid, rstart, rend, parser ) + else: + return TabixIteratorParsed( self, -1, 0, 0, parser ) + + property contigs: + '''chromosome names''' + def __get__(self): + cdef char ** sequences + cdef int nsequences + + ti_lazy_index_load( self.tabixfile ) + sequences = ti_seqname( self.tabixfile.idx, &nsequences ) + cdef int x + result = [] + for x from 0 <= x < nsequences: + result.append( sequences[x] ) + return result + +cdef class TabixIterator: + """iterates over rows in *tabixfile* in region + given by *tid*, *start* and *end*. + """ + + cdef ti_iter_t iterator + cdef tabix_t * tabixfile + + def __cinit__(self, Tabixfile tabixfile, + int tid, int start, int end ): + + assert tabixfile._isOpen() + + # makes sure that samfile stays alive as long as the + # iterator is alive. + self.tabixfile = tabixfile.tabixfile + + if tid < 0: + # seek to start of file to ensure iteration is over + # all entries. + bgzf_seek( self.tabixfile.fp, 0, 0) + self.iterator = ti_iter_first() + else: + self.iterator = ti_queryi(self.tabixfile, tid, start, end) + + if self.iterator == NULL: + raise ValueError("malformatted query or wrong sequence name.\n") + + def __iter__(self): + return self + + def __next__(self): + """python version of next(). + + pyrex uses this non-standard name instead of next() + """ + + cdef char * s + cdef int len + s = ti_read(self.tabixfile, self.iterator, &len) + if s == NULL: raise StopIteration + return s + + def __dealloc__(self): + if self.iterator != NULL: + ti_iter_destroy(self.iterator) + +def toDot( v ): + '''convert value to '.' 
if None''' + if v == None: return "." + else: return str(v) + +def quote( v ): + '''return a quoted attribute.''' + if type(v) in types.StringTypes: + return '"%s"' % v + else: + return str(v) + +cdef class TupleProxy: + '''Proxy class for access to parsed row as a tuple. + + This class represents a table row for fast read-access. + ''' + + cdef: + char * data + char ** fields + int nfields + int index + + def __cinit__(self ): + + self.data = NULL + self.fields = NULL + self.index = 0 + + cdef take( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take ownership of the pointer. + ''' + self.data = buffer + self.update( buffer, nbytes ) + + cdef present( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Do not take ownership of the pointer. + ''' + self.update( buffer, nbytes ) + + cdef copy( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take a copy of buffer. + ''' + cdef int s + # +1 for '\0' + s = sizeof(char) * (nbytes + 1) + self.data = malloc( s ) + memcpy( self.data, buffer, s ) + self.update( self.data, nbytes ) + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data.''' + cdef char * pos + cdef char * old_pos + cdef int field + cdef int max_fields + field = 0 + + if buffer[nbytes] != 0: + raise ValueError( "incomplete line at %s" % buffer ) + + if self.fields != NULL: + free(self.fields) + + max_fields = nbytes / 4 + self.fields = calloc( max_fields, sizeof(char *) ) + + pos = buffer + self.fields[0] = pos + field += 1 + old_pos = pos + + while 1: + + pos = memchr( pos, '\t', nbytes ) + if pos == NULL: break + pos[0] = '\0' + pos += 1 + self.fields[field] = pos + field += 1 + if field >= max_fields: + raise ValueError("row too large - more than %i fields" % max_fields ) + nbytes -= pos - old_pos + if nbytes < 0: break + old_pos = pos + + self.nfields = field + + def __getitem__( self, key ): + + cdef int i + i = key + if i < 0: i += self.nfields + if i >= 
self.nfields or i < 0: + raise IndexError( "list index out of range" ) + return self.fields[i] + + def __len__(self): + return self.nfields + + def __dealloc__(self): + if self.data != NULL: + free(self.data) + + def __iter__(self): + self.index = 0 + return self + + def __next__(self): + """python version of next(). + """ + if self.index >= self.nfields: + raise StopIteration + self.index += 1 + return self.fields[self.index-1] + +cdef class GTFProxy: + '''Proxy class for access to GTF fields. + + This class represents a GTF entry for fast read-access. + Write-access has been added as well, though some care must + be taken. If any of the string fields (contig, source, ...) + are set, the new value is tied to the lifetime of the + argument that was supplied. + + The only exception is the attributes field when set from + a dictionary - this field will manage its own memory. + + ''' + + cdef: + char * contig + char * source + char * feature + uint32_t start + uint32_t end + char * score + char * strand + char * frame + char * attributes + int nbytes + char * data + cdef bint isModified + cdef bint hasOwnAttributes + + def __cinit__(self ): + self.data = NULL + self.isModified = False + self.hasOwnAttributes = False + + cdef take( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take ownership of the pointer. + ''' + self.data = buffer + self.update( buffer, nbytes ) + self.isModified = False + + cdef present( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Do not take ownership of the pointer. + ''' + self.update( buffer, nbytes ) + self.isModified = False + + cdef copy( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take a copy of buffer. 
+ ''' + cdef int s + # +1 for '\0' + s = sizeof(char) * (nbytes + 1) + self.data = malloc( s ) + memcpy( self.data, buffer, s ) + self.update( self.data, nbytes ) + self.isModified = False + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data. + + nbytes does not include the terminal '\0'. + ''' + cdef int end + cdef char * cstart, * cend, * cscore + self.contig = buffer + self.nbytes = nbytes + cdef char * pos + + if buffer[nbytes] != 0: + raise ValueError( "incomplete line at %s" % buffer ) + + pos = strchr( buffer, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.source = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.feature = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + cstart = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + cend = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.score = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.strand = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.frame = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.attributes = pos + self.start = atoi( cstart ) - 1 + self.end = atoi( cend ) + + property contig: + '''contig of feature.''' + def __get__( self ): return self.contig + def __set__( self, value ): + self.isModified = True + self.contig = value + + property feature: + '''feature name.''' + 
def __get__( self ): return self.feature + def __set__( self, value ): + self.isModified = True + self.feature = value + + property source: + '''feature source.''' + def __get__( self ): return self.source + def __set__( self, value ): + self.isModified = True + self.source = value + + property start: + '''feature start (in 0-based open/closed coordinates).''' + def __get__( self ): return self.start + def __set__( self, value ): + self.isModified = True + self.start = value + + property end: + '''feature end (in 0-based open/closed coordinates).''' + def __get__( self ): return self.end + def __set__( self, value ): + self.isModified = True + self.end = value + + property score: + '''feature score.''' + def __get__( self ): + if self.score[0] == '.' and self.score[1] == '\0' : + return None + else: + return atof(self.score) + def __set__( self, value ): + self.isModified = True + self.score = value + + property strand: + '''feature strand.''' + def __get__( self ): return self.strand + def __set__( self, value ): + self.isModified = True + self.strand = value + + property frame: + '''feature frame.''' + def __get__( self ): return self.frame + def __set__( self, value ): + self.isModified = True + self.frame = value + + property attributes: + '''feature attributes (as a string).''' + def __get__( self ): return self.attributes + def __set__( self, value ): + self.isModified = True + self.attributes = value + + def asDict( self ): + """parse attributes - return as dict + """ + + # remove comments + attributes = self.attributes + + # separate into fields + fields = [ x.strip() for x in attributes.split(";")[:-1]] + + result = {} + + for f in fields: + + d = [ x.strip() for x in f.split(" ")] + + n,v = d[0], d[1] + if len(d) > 2: v = d[1:] + + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + else: + ## try to convert to a value + try: + v = float( v ) + v = int( v ) + except ValueError: + pass + except TypeError: + pass + + result[n] = v + + return result + + def 
fromDict( self, d ): + '''set attributes from a dictionary.''' + cdef char * p + cdef int l + + # clean up if this field is set twice + if self.hasOwnAttributes: + free(self.attributes) + + aa = [] + for k,v in d.items(): + if type(v) == types.StringType: + aa.append( '%s "%s"' % (k,v) ) + else: + aa.append( '%s %s' % (k,str(v)) ) + + a = "; ".join( aa ) + ";" + p = a + l = len(a) + self.attributes = calloc( l + 1, sizeof(char) ) + memcpy( self.attributes, p, l ) + + self.hasOwnAttributes = True + self.isModified = True + + def __str__(self): + cdef char * cpy + cdef int x + + if self.isModified: + return "\t".join( + (self.contig, + self.source, + self.feature, + str(self.start+1), + str(self.end), + toDot(self.score), + self.strand, + self.frame, + self.attributes ) ) + else: + cpy = calloc( sizeof(char), self.nbytes+1 ) + memcpy( cpy, self.data, self.nbytes+1) + for x from 0 <= x < self.nbytes: + if cpy[x] == '\0': cpy[x] = '\t' + result = cpy + free(cpy) + return result + + def invert( self, int lcontig ): + '''invert coordinates to negative strand coordinates + + This method will only act if the feature is on the + negative strand.''' + + if self.strand[0] == '-': + start = min(self.start, self.end) + end = max(self.start, self.end) + self.start, self.end = lcontig - end, lcontig - start + + def keys( self ): + '''return a list of attributes defined in this entry.''' + r = self.attributes + return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ] + + def __getitem__(self, item): + return self.__getattr__( item ) + + def __dealloc__(self): + if self.data != NULL: + free(self.data) + if self.hasOwnAttributes: + free(self.attributes) + + def __getattr__(self, item ): + """Generic lookup of attribute from GFF/GTF attributes + Only called if there *isn't* an attribute with this name + """ + cdef char * start + cdef char * query + cdef char * cpy + cdef char * end + cdef int l + query = item + + start = strstr( self.attributes, query) + if start 
== NULL: + raise AttributeError("'GTFProxy' has no attribute '%s'" % item ) + + start += strlen(query) + 1 + # skip gaps before + while start[0] == " ": start += 1 + if start[0] == '"': + start += 1 + end = start + while end[0] != '\0' and end[0] != '"': end += 1 + l = end - start + 1 + cpy = calloc( l, sizeof(char ) ) + memcpy( cpy, start, l ) + cpy[l-1] = '\0' + result = cpy + free(cpy) + return result + else: + return start + + def setAttribute( self, name, value ): + '''convenience method to set an attribute.''' + r = self.asDict() + r[name] = value + self.fromDict( r ) + +cdef class Parser: + pass + +cdef class asTuple(Parser): + '''converts a :term:`tabix row` into a python tuple.''' + def __call__(self, char * buffer, int len): + cdef TupleProxy r + r = TupleProxy() + # need to copy - there were some + # persistence issues with "present" + r.copy( buffer, len ) + return r + +cdef class asGTF(Parser): + '''converts a :term:`tabix row` into a GTF record.''' + def __call__(self, char * buffer, int len): + cdef GTFProxy r + r = GTFProxy() + r.copy( buffer, len ) + return r + +cdef class TabixIteratorParsed: + """iterates over mapped reads in a region. + """ + + cdef ti_iter_t iterator + cdef tabix_t * tabixfile + cdef Parser parser + + def __cinit__(self, + Tabixfile tabixfile, + int tid, + int start, + int end, + Parser parser ): + + assert tabixfile._isOpen() + self.parser = parser + + # makes sure that samfile stays alive as long as the + # iterator is alive. + self.tabixfile = tabixfile.tabixfile + + if tid < 0: + # seek to start of file to ensure iteration is over + # all entries. + bgzf_seek( self.tabixfile.fp, 0, 0) + self.iterator = ti_iter_first() + else: + self.iterator = ti_queryi(self.tabixfile, tid, start, end) + + if self.iterator == NULL: + raise ValueError("malformatted query or wrong sequence name.\n") + + def __iter__(self): + return self + + def __next__(self): + """python version of next(). 
+ + pyrex uses this non-standard name instead of next() + """ + + cdef char * s + cdef int len + s = ti_read(self.tabixfile, self.iterator, &len) + if s == NULL: raise StopIteration + return self.parser(s, len) + + def __dealloc__(self): + if self.iterator != NULL: + ti_iter_destroy(self.iterator) + +def tabix_compress( filename_in, + filename_out, + force = False ): + + ''' + compress *filename_in* writing the output to *filename_out*. + + Raise an IOError if *filename_out* already exists, unless *force* is set. + ''' + + if not force and os.path.exists(filename_out ): + raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out) + + cdef int WINDOW_SIZE + cdef int c, r + cdef void * buffer + cdef BGZF * fp + cdef int fd_src + + cdef int O_RDONLY + O_RDONLY = os.O_RDONLY + + WINDOW_SIZE = 64 * 1024 + + fp = bgzf_open( filename_out, "w") + if fp == NULL: + raise IOError( "could not open '%s' for writing" ) + + fd_src = open(filename_in, O_RDONLY) + if fd_src == 0: + raise IOError( "could not open '%s' for reading" ) + + buffer = malloc(WINDOW_SIZE) + + while c > 0: + c = read(fd_src, buffer, WINDOW_SIZE) + r = bgzf_write(fp, buffer, c) + if r < 0: + free( buffer ) + raise OSError("writing failed") + + free( buffer ) + r = bgzf_close(fp) + if r < 0: raise OSError("writing failed") + +def tabix_index( filename, + force = False, + seq_col = None, + start_col = None, + end_col = None, + preset = None, + meta_char = "#", + zerobased = False, + ): + ''' + index tab-separated *filename* using tabix. + + An existing index will not be overwritten unless + *force* is set. + + The index will be built from coordinates + in columns *seq_col*, *start_col* and *end_col*. + + The contents of *filename* have to be sorted by + contig and position - the method does not check + if the file is sorted. + + Column indices are 0-based. Coordinates in the file + are assumed to be 1-based. 
+ + If *preset* is provided, the column coordinates + are taken from a preset. Valid values for preset + are "gff", "bed", "sam", "vcf", psltbl", "pileup". + + Lines beginning with *meta_char* and the first + *line_skip* lines will be skipped. + + If *filename* does not end in ".gz", it will be automatically + compressed. The original file will be removed and only the + compressed file will be retained. + + If *filename* ends in *gz*, the file is assumed to be already + compressed with bgzf. + + returns the filename of the compressed data + ''' + + if not os.path.exists(filename): raise IOError("No such file '%s'" % filename) + + if not filename.endswith(".gz"): + + tabix_compress( filename, filename + ".gz", force = force ) + os.unlink( filename ) + filename += ".gz" + + if not force and os.path.exists(filename + ".tbi" ): + raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" ) + + # columns (1-based) + # preset-code, contig, start, end, metachar for commends, lines to ignore at beginning + # 0 is a missing column + preset2conf = { + 'gff' : ( 0, 1, 4, 5, ord('#'), 0 ), + 'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ), + 'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ), + 'sam' : ( 1, 3, 4, 0, ord('#'), 0 ), + 'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ), + 'pileup': (3, 1, 2, 0, ord('#'), 0 ), + } + + if preset: + try: + conf_data = preset2conf[preset] + except KeyError: + raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() ))) + else: + if end_col == None: end_col = -1 + preset = 0 + + # note that tabix internally works with 0-based coordinates and open/closed intervals. + # When using a preset, conversion is automatically taken care of. + # Otherwise, the coordinates are assumed to be 1-based closed intervals and + # -1 is subtracted from the start coordinate. 
To avoid doing this, set + # the TI_FLAG_UCSC=0x10000 flag: + if zerobased: preset = preset | 0x10000 + + conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0) + + cdef ti_conf_t conf + conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data + + ti_index_build( filename, &conf) + + return filename + +__all__ = ["tabix_index", + "tabix_compress", + "Tabixfile", + "asTuple", + "asGTF", + ] diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c index 5360626..91b6fa7 100644 --- a/pysam/pysam_util.c +++ b/pysam/pysam_util.c @@ -141,11 +141,8 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) } assert(x > pos); // otherwise a bug return ret; -} - - - +} // the following code has been taken from bam_plbuf_push // and modified such that instead of a function call // the function returns and will continue (if cont is true). @@ -155,98 +152,16 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) // 1: if buf is full and can be emitted // 0: if b has been added // -1: if there was an error -int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont) +int pysam_pileup_next(const bam1_t *b, + bam_plbuf_t *buf, + bam_pileup1_t ** plp, + int * tid, + int * pos, + int * n_plp ) { - if (!cont) - { - if (b) { // fill buffer - if (b->core.tid < 0) return 0; - if (b->core.flag & buf->flag_mask) return 0; - bam_copy1(&buf->tail->b, b); - buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); - if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) { - fprintf(stderr, "[bam_pileup_core] the input is not sorted. 
Abort!\n"); - abort(); - } - buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg; - if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { - buf->tail->next = mp_alloc(buf->mp); - buf->tail = buf->tail->next; - } - } else buf->is_eof = 1; - } - else - // continue end of loop - { - // update tid and pos - if (buf->head->next) { - if (buf->tid > buf->head->b.core.tid) { - fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n"); - return -1; - } - } - if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence - buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference - } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid - buf->pos = buf->head->beg; // jump to the next position - } else ++buf->pos; // scan contiguously - if (buf->is_eof && buf->head->next == 0) return 0; - } - - // enter yield loop - while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) - { - int n_pu = 0; - lbnode_t *p, *q; - buf->dummy->next = buf->head; - for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) { - if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list - q->next = p->next; mp_free(buf->mp, p); p = q; - } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup - if (n_pu == buf->max_pu) { // then double the capacity - buf->max_pu = buf->max_pu? 
buf->max_pu<<1 : 256; - buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu); - } - buf->pu[n_pu].b = &p->b; - if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP - } - } - buf->head = buf->dummy->next; // dummy->next may be changed - - // exit if alignments need to be emitted - if (n_pu) { return n_pu; } - - // update tid and pos - if (buf->head->next) { - if (buf->tid > buf->head->b.core.tid) { - fprintf(stderr, "[bam_plbuf_push] unsorted input. Pileup aborts.\n"); - return -2; - } - } - if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence - buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference - } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid - buf->pos = buf->head->beg; // jump to the next position - } else ++buf->pos; // scan contiguously - if (buf->is_eof && buf->head->next == 0) break; - } - return 0; -} - -int pysam_get_pos( const bam_plbuf_t *buf) -{ - return buf->pos; -} - - -int pysam_get_tid( const bam_plbuf_t *buf) -{ - return buf->tid; -} - -bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf) -{ - return buf->pu; + *plp = bam_plp_next(buf->iter, tid, pos, n_plp); + if (plp == NULL) return 0; + return 1; } // pysam dispatch function to emulate the samtools @@ -309,15 +224,6 @@ int pysam_dispatch(int argc, char *argv[] ) return 0; } -// standin for bam_destroy1 in bam.h -// deletes all variable length data -void pysam_bam_destroy1( bam1_t * b ) -{ - if (b == NULL) return; - if (b->data != NULL) free(b->data); - free(b); -} - // taken from samtools/bam_import.c static inline uint8_t *alloc_data(bam1_t *b, size_t size) { @@ -379,121 +285,6 @@ unsigned char pysam_translate_sequence( const unsigned char s ) return bam_nt16_table[s]; } -// stand-ins for samtools macros in bam.h -char * pysam_bam1_qname( const bam1_t * b) -{ - return (char*)b->data; -} - -uint32_t * pysam_bam1_cigar( const 
bam1_t * b) -{ - return (uint32_t*)(b->data + b->core.l_qname); -} - -uint8_t * pysam_bam1_seq( const bam1_t * b) -{ - return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname); -} - -uint8_t * pysam_bam1_qual( const bam1_t * b) -{ - return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + (b->core.l_qseq + 1)/2); -} - -uint8_t * pysam_bam1_aux( const bam1_t * b) -{ - return (uint8_t*)(b->data + b->core.n_cigar*4 + b->core.l_qname + b->core.l_qseq + (b->core.l_qseq + 1)/2); -} - -// ####################################################### -// Iterator implementation -// ####################################################### - -// functions defined in bam_index.c -extern pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off); - -static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) -{ - uint32_t rbeg = b->core.pos; - uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; - return (rend > beg && rbeg < end); -} - -struct __bam_fetch_iterator_t -{ - bam1_t * b; - pair64_t * off; - int n_off; - uint64_t curr_off; - int curr_chunk; - bamFile fp; - int tid; - int beg; - int end; - int n_seeks; -}; - -bam_fetch_iterator_t* bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end) -{ - // iterator contains current alignment position - // and will contain actual alignment during iterations - bam_fetch_iterator_t* iter = (bam_fetch_iterator_t*)calloc(1, sizeof(bam_fetch_iterator_t)); - iter->b = (bam1_t*)calloc(1, sizeof(bam1_t)); - - // list of chunks containing our alignments - iter->off = get_chunk_coordinates(idx, tid, beg, end, &iter->n_off); - - // initialise other state variables in iterator - iter->fp = fp; - iter->curr_chunk = -1; - iter->curr_off = 0; - iter->n_seeks = 0; - iter->tid = tid; - iter->beg = beg; - iter->end = end; - return iter; -} - -bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter) -{ - if (!iter->off) { - 
return 0; - } - - int ret; - // iterate through all alignments in chunks - for (;;) { - if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->curr_chunk].v) { // then jump to the next chunk - if (iter->curr_chunk == iter->n_off - 1) break; // no more chunks - if (iter->curr_chunk >= 0) assert(iter->curr_off == iter->off[iter->curr_chunk].v); // otherwise bug - if (iter->curr_chunk < 0 || iter->off[iter->curr_chunk].v != iter->off[iter->curr_chunk+1].u) { // not adjacent chunks; then seek - bam_seek(iter->fp, iter->off[iter->curr_chunk+1].u, SEEK_SET); - iter->curr_off = bam_tell(iter->fp); - ++iter->n_seeks; - } - ++iter->curr_chunk; - } - if ((ret = bam_read1(iter->fp, iter->b)) > 0) { - iter->curr_off = bam_tell(iter->fp); - if (iter->b->core.tid != iter->tid || iter->b->core.pos >= iter->end) break; // no need to proceed - else if (is_overlap(iter->beg, iter->end, iter->b)) - // - //func(iter->b, data); - // - return iter->b; - } else - return 0; // end of file - } - return 0; -} - -void bam_cleanup_fetch_iterator(bam_fetch_iterator_t *iter) -{ - // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", iter->n_seeks); - bam_destroy1(iter->b); - free(iter->off); -} - diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h index ff5d569..bfbd6dd 100644 --- a/pysam/pysam_util.h +++ b/pysam/pysam_util.h @@ -1,75 +1,22 @@ #ifndef PYSAM_UTIL_H #define PYSAM_UTIL_H -////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////// -// code for iterator - -/*! @typedef - @Structure for holding current state (current alignment etc.) for iterating through - alignments overlapping a specified region. 
- @field b pointer to the current alignment - @field off pointer to an array of chunk loci (each with beg/end positions) - @field n_off The number of chunks - @field curr_off The current file positon - @field curr_chunk The item in a list of chunk - @discussion See also bam_fetch_iterate -*/ -struct __bam_fetch_iterator_t; -typedef struct __bam_fetch_iterator_t bam_fetch_iterator_t; - -/*! - @abstract Retrieve the alignments that are overlapped with the - specified region. - - @discussion Returns iterator object to retrieve successive alignments ordered by - start position. - @param fp BAM file handler - @param idx pointer to the alignment index - @param tid chromosome ID as is defined in the header - @param beg start coordinate, 0-based - @param end end coordinate, 0-based -*/ -bam_fetch_iterator_t * bam_init_fetch_iterator(bamFile fp, const bam_index_t *idx, int tid, int beg, int end); - - -/*! - @abstract Iterates through alignments overlapped the specified region. - @discussion Returns pointer to successive alignments ordered by start position. - Returns null pointer to signal the end of the iteration. - The alignment data is nested within the iterator to avoid unnecessary allocations. -*/ -bam1_t * bam_fetch_iterate(bam_fetch_iterator_t *iter); - -bam_fetch_iterator_t* bam_init_fetchall_iterator(bamFile fp, const bam_index_t *idx); -bam1_t * bam_fetchall_iterate(bam_fetch_iterator_t *iter); - ////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////// // various helper functions +// +// fill pileup buffer for next position. 
-int pysam_bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf, int cont); - -// accessor functions - necessary as bam_plbuf_t is hidden -// among the implementation -int pysam_get_pos( const bam_plbuf_t *buf); -int pysam_get_tid( const bam_plbuf_t *buf); -bam_pileup1_t * pysam_get_pileup( const bam_plbuf_t *buf); +int pysam_pileup_next(const bam1_t *b, + bam_plbuf_t *buf, + bam_pileup1_t ** plp, + int * tid, + int * pos, + int * n_plp); int pysam_dispatch(int argc, char *argv[] ); -// stand-in for macro - not wrappable in pyrex -void pysam_bam_destroy1( bam1_t * b ); - -// stand-in for other samtools macros -uint32_t * pysam_bam1_cigar( const bam1_t * b); -char * pysam_bam1_qname( const bam1_t * b); -uint8_t * pysam_bam1_seq( const bam1_t * b); -uint8_t * pysam_bam1_qual( const bam1_t * b); -uint8_t * pysam_bam1_aux( const bam1_t * b); - /*! @abstract Update the variable length data within a bam1_t entry diff --git a/pysam/version.py b/pysam/version.py new file mode 100644 index 0000000..5965c7c --- /dev/null +++ b/pysam/version.py @@ -0,0 +1,7 @@ +# pysam versioning information + +__version__ = "0.3" + +__samtools_version__ = "0.1.8" + +__tabix_version__ = "0.2.1" diff --git a/samtools/bam.c b/samtools/bam.c index ee7642b..94b0aa8 100644 --- a/samtools/bam.c +++ b/samtools/bam.c @@ -70,6 +70,7 @@ bam_header_t *bam_header_read(bamFile fp) { bam_header_t *header; char buf[4]; + int magic_len; int32_t i = 1, name_len; // check EOF i = bgzf_check_EOF(fp); @@ -80,9 +81,9 @@ bam_header_t *bam_header_read(bamFile fp) } else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n"); // read "BAM1" - if (bam_read(fp, buf, 4) != 4) return 0; - if (strncmp(buf, "BAM\001", 4)) { - fprintf(stderr, "[bam_header_read] wrong header\n"); + magic_len = bam_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { + fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); return 0; } header = bam_header_init(); @@ 
-140,6 +141,7 @@ int bam_header_write(bamFile fp, const bam_header_t *header) bam_write(fp, &x, 4); } else bam_write(fp, &header->target_len[i], 4); } + bgzf_flush(fp); return 0; } @@ -207,6 +209,7 @@ inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8 x[5] = c->mtid; x[6] = c->mpos; x[7] = c->isize; + bgzf_flush_try(fp, 4 + block_len); if (bam_is_be) { for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); y = block_len; @@ -232,8 +235,8 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kstring_t str; str.l = str.m = 0; str.s = 0; - ksprintf(&str, "%s\t", bam1_qname(b)); - if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag); + kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str); + if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); } else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); else { // BAM_OFSTR for (i = 0; i < 16; ++i) @@ -241,41 +244,43 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kputc(bam_flag2char_table[i], &str); kputc('\t', &str); } - if (c->tid < 0) kputs("*\t", &str); - else ksprintf(&str, "%s\t", header->target_name[c->tid]); - ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); + if (c->tid < 0) kputsn("*\t", 2, &str); + else { kputs(header->target_name[c->tid], &str); kputc('\t', &str); } + kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { - for (i = 0; i < c->n_cigar; ++i) - ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); + for (i = 0; i < c->n_cigar; ++i) { + kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); + kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + } } kputc('\t', &str); - if (c->mtid < 0) kputs("*\t", &str); - else if (c->mtid == c->tid) kputs("=\t", &str); - else ksprintf(&str, "%s\t", header->target_name[c->mtid]); - ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); + if 
(c->mtid < 0) kputsn("*\t", 2, &str); + else if (c->mtid == c->tid) kputsn("=\t", 2, &str); + else { kputs(header->target_name[c->mtid], &str); kputc('\t', &str); } + kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); kputc('\t', &str); if (t[0] == 0xff) kputc('*', &str); else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); - } else ksprintf(&str, "*\t*"); + } else kputsn("*\t*", 3, &str); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; - ksprintf(&str, "\t%c%c:", key[0], key[1]); - if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } - else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } - else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; } - else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } - else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } - else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } - else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } + kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); + if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } + else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } + else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } + else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; } + else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } + else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } + else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } - else if (type == 'Z' || type == 
'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } } return str.s; } @@ -288,7 +293,7 @@ char *bam_format1(const bam_header_t *header, const bam1_t *b) void bam_view1(const bam_header_t *header, const bam1_t *b) { char *s = bam_format1(header, b); - printf("%s\n", s); + puts(s); free(s); } diff --git a/samtools/bam.h b/samtools/bam.h index 291b303..8e26ea6 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -87,7 +87,7 @@ typedef struct { char **target_name; uint32_t *target_len; void *dict, *hash, *rg2lib; - int l_text; + size_t l_text, n_text; char *text; } bam_header_t; @@ -190,6 +190,8 @@ typedef struct { uint8_t *data; } bam1_t; +typedef struct __bam_iter_t *bam_iter_t; + #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) @@ -272,6 +274,10 @@ extern char bam_nt16_nt4_table[]; extern "C" { #endif + /********************* + * Low-level SAM I/O * + *********************/ + /*! @abstract TAM file handler */ typedef struct __tamFile_t *tamFile; @@ -323,6 +329,7 @@ extern "C" { be destroyed in the first place. */ int sam_header_parse(bam_header_t *h); + int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); /*! @abstract Parse @RG lines a update a header struct @@ -336,12 +343,22 @@ extern "C" { #define sam_write1(header, b) bam_view1(header, b) + + /******************************** + * APIs for string dictionaries * + ********************************/ + int bam_strmap_put(void *strmap, const char *rg, const char *lib); const char *bam_strmap_get(const void *strmap, const char *rg); void *bam_strmap_dup(const void*); void *bam_strmap_init(); void bam_strmap_destroy(void *strmap); + + /********************* + * Low-level BAM I/O * + *********************/ + /*! @abstract Initialize a header structure. 
@return the pointer to the header structure @@ -440,6 +457,11 @@ extern "C" { const char *bam_get_library(bam_header_t *header, const bam1_t *b); + + /*************** + * pileup APIs * + ***************/ + /*! @typedef @abstract Structure for one alignment covering the pileup position. @field b pointer to the alignment @@ -461,11 +483,25 @@ extern "C" { uint32_t is_del:1, is_head:1, is_tail:1; } bam_pileup1_t; - struct __bam_plbuf_t; - /*! @abstract pileup buffer */ - typedef struct __bam_plbuf_t bam_plbuf_t; + typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); - void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + struct __bam_plp_t; + typedef struct __bam_plp_t *bam_plp_t; + + bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); + int bam_plp_push(bam_plp_t iter, const bam1_t *b); + const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + void bam_plp_set_mask(bam_plp_t iter, int mask); + void bam_plp_reset(bam_plp_t iter); + void bam_plp_destroy(bam_plp_t iter); + + struct __bam_mplp_t; + typedef struct __bam_mplp_t *bam_mplp_t; + + bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); + void bam_mplp_destroy(bam_mplp_t iter); + int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); /*! @typedef @abstract Type of function to be called by bam_plbuf_push(). @@ -478,44 +514,16 @@ extern "C" { */ typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); - /*! - @abstract Reset a pileup buffer for another pileup process - @param buf the pileup buffer to be reset - */ - void bam_plbuf_reset(bam_plbuf_t *buf); + typedef struct { + bam_plp_t iter; + bam_pileup_f func; + void *data; + } bam_plbuf_t; - /*! - @abstract Initialize a buffer for pileup. 
- @param func fucntion to be called by bam_pileup_core() - @param data user provided data - @return pointer to the pileup buffer - */ + void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + void bam_plbuf_reset(bam_plbuf_t *buf); bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); - - /*! - @abstract Destroy a pileup buffer. - @param buf pointer to the pileup buffer - */ void bam_plbuf_destroy(bam_plbuf_t *buf); - - /*! - @abstract Push an alignment to the pileup buffer. - @param b alignment to be pushed - @param buf pileup buffer - @see bam_plbuf_init() - @return always 0 currently - - @discussion If all the alignments covering a particular site have - been collected, this function will call the user defined function - as is provided to bam_plbuf_init(). The coordinate of the site and - all the alignments will be transferred to the user defined - function as function parameters. - - When all the alignments are pushed to the buffer, this function - needs to be called with b equal to NULL. This will flush the - buffer. A pileup buffer can only be reused when bam_plbuf_reset() - is called. - */ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); @@ -534,6 +542,11 @@ extern "C" { /*! @abstract bam_plbuf_push() equivalent with level calculated. */ int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); + + /********************* + * BAM indexing APIs * + *********************/ + struct __bam_index_t; typedef struct __bam_index_t bam_index_t; @@ -582,6 +595,10 @@ extern "C" { */ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); + bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end); + int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b); + void bam_iter_destroy(bam_iter_t iter); + /*! @abstract Parse a region in the format: "chr2:100,000-200,000". 
@discussion bam_header_t::hash will be initialized if empty. @@ -594,6 +611,11 @@ extern "C" { */ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + + /************************** + * APIs for optional tags * + **************************/ + /*! @abstract Retrieve data of a tag @param b pointer to an alignment struct @@ -617,6 +639,11 @@ extern "C" { void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() + + /***************** + * Miscellaneous * + *****************/ + /*! @abstract Calculate the rightmost coordinate of an alignment on the reference genome. diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c index 89e99f2..fbcd982 100644 --- a/samtools/bam_aux.c +++ b/samtools/bam_aux.c @@ -115,7 +115,7 @@ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *be *ref_id = kh_value(h, iter); if (i == k) { /* dump the whole sequence */ *begin = 0; *end = 1<<29; free(s); - return -1; + return 0; } for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; *begin = atoi(p); diff --git a/samtools/bam_import.c b/samtools/bam_import.c index 9d463d1..9d84328 100644 --- a/samtools/bam_import.c +++ b/samtools/bam_import.c @@ -116,7 +116,7 @@ static bam_header_t *hash2header(const kh_ref_t *hash) bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; - int c, dret, ret; + int c, dret, ret, error = 0; gzFile fp; kstream_t *ks; kstring_t *str; @@ -135,6 +135,10 @@ bam_header_t *sam_header_read2(const char *fn) ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); + if (ret == 0) { + fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s); + error = 1; + } kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); @@ -143,6 +147,7 @@ bam_header_t *sam_header_read2(const char *fn) 
gzclose(fp); free(str->s); free(str); fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); + if (error) return 0; header = hash2header(hash); kh_destroy(ref, hash); return header; @@ -163,9 +168,24 @@ static inline void parse_error(int64_t n_lines, const char * __restrict msg) } static inline void append_text(bam_header_t *header, kstring_t *str) { - int x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null + size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null kroundup32(x); kroundup32(y); - if (x < y) header->text = (char*)realloc(header->text, y); + if (x < y) + { + header->n_text = y; + header->text = (char*)realloc(header->text, y); + if ( !header->text ) + { + fprintf(stderr,"realloc failed to alloc %ld bytes\n", y); + abort(); + } + } + // Sanity check + if ( header->l_text+str->l+1 >= header->n_text ) + { + fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,header->n_text,x,y); + abort(); + } strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. header->l_text += str->l + 1; header->text[header->l_text] = 0; diff --git a/samtools/bam_index.c b/samtools/bam_index.c index a627884..4152f20 100644 --- a/samtools/bam_index.c +++ b/samtools/bam_index.c @@ -42,6 +42,8 @@ // 1<<14 is the size of minimum bin. 
#define BAM_LIDX_SHIFT 14 +#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1 + typedef struct { uint64_t u, v; } pair64_t; @@ -63,6 +65,7 @@ KHASH_MAP_INIT_INT(i, bam_binlist_t) struct __bam_index_t { int32_t n; + uint64_t n_no_coor; // unmapped reads without coordinate khash_t(i) **index; bam_lidx_t *index2; }; @@ -98,8 +101,12 @@ static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); } - for (i = beg + 1; i <= end; ++i) - if (index2->offset[i] == 0) index2->offset[i] = offset; + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } index2->n = end + 1; } @@ -113,7 +120,7 @@ static void merge_chunks(bam_index_t *idx) index = idx->index[i]; for (k = kh_begin(index); k != kh_end(index); ++k) { bam_binlist_t *p; - if (!kh_exist(index, k)) continue; + if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue; p = &kh_value(index, k); m = 0; for (l = 1; l < p->n; ++l) { @@ -130,6 +137,17 @@ static void merge_chunks(bam_index_t *idx) #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) } +static void fill_missing(bam_index_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + bam_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + bam_index_t *bam_index_core(bamFile fp) { bam1_t *b; @@ -139,7 +157,7 @@ bam_index_t *bam_index_core(bamFile fp) uint32_t last_bin, save_bin; int32_t last_coor, last_tid, save_tid; bam1_core_t *c; - uint64_t save_off, last_off; + uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor; idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); b = (bam1_t*)calloc(1, sizeof(bam1_t)); @@ -154,7 +172,10 @@ bam_index_t *bam_index_core(bamFile fp) save_bin = 
save_tid = last_tid = last_bin = 0xffffffffu; save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; + n_mapped = n_unmapped = n_no_coor = off_end = 0; + off_beg = off_end = bam_tell(fp); while ((ret = bam_read1(fp, b)) >= 0) { + if (c->tid < 0) ++n_no_coor; if (last_tid != c->tid) { // change of chromosomes last_tid = c->tid; last_bin = 0xffffffffu; @@ -163,10 +184,17 @@ bam_index_t *bam_index_core(bamFile fp) bam1_qname(b), last_coor, c->pos, c->tid+1); exit(1); } - if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element + off_end = last_off; + insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); + n_mapped = n_unmapped = 0; + off_beg = off_end; + } save_off = last_off; save_bin = last_bin = c->bin; save_tid = c->tid; @@ -177,13 +205,23 @@ bam_index_t *bam_index_core(bamFile fp) (unsigned long long)bam_tell(fp), (unsigned long long)last_off); exit(1); } + if (c->flag & BAM_FUNMAP) ++n_unmapped; + else ++n_mapped; last_off = bam_tell(fp); last_coor = b->core.pos; } - if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + if (save_tid >= 0) { + insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); + } merge_chunks(idx); + fill_missing(idx); + if (ret >= 0) + while ((ret = bam_read1(fp, b)) >= 0) ++n_no_coor; if (ret < -1) fprintf(stderr, "[bam_index_core] 
truncated file? Continue anyway. (%d)\n", ret); free(b->data); free(b); + idx->n_no_coor = n_no_coor; return idx; } @@ -261,6 +299,11 @@ void bam_index_save(const bam_index_t *idx, FILE *fp) bam_swap_endian_8p(&index2->offset[x]); } else fwrite(index2->offset, 8, index2->n, fp); } + { // write the number of reads coor-less records. + uint64_t x = idx->n_no_coor; + if (bam_is_be) bam_swap_endian_8p(&x); + fwrite(&x, 8, 1, fp); + } fflush(fp); } @@ -322,6 +365,8 @@ static bam_index_t *bam_index_load_core(FILE *fp) if (bam_is_be) for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); } + if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0; + if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor); return idx; } @@ -339,13 +384,13 @@ bam_index_t *bam_index_load_local(const char *_fn) } else fn = strdup(_fn); fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); - fp = fopen(fnidx, "r"); + fp = fopen(fnidx, "rb"); if (fp == 0) { // try "{base}.bai" char *s = strstr(fn, "bam"); if (s == fn + strlen(fn) - 3) { strcpy(fnidx, fn); fnidx[strlen(fn)-1] = 'i'; - fp = fopen(fnidx, "r"); + fp = fopen(fnidx, "rb"); } } free(fnidx); free(fn); @@ -375,7 +420,7 @@ static void download_from_remote(const char *url) fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); return; } - if ((fp = fopen(fn, "w")) == 0) { + if ((fp = fopen(fn, "wb")) == 0) { fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); knet_close(fp_remote); return; @@ -425,7 +470,7 @@ int bam_index_build2(const char *fn, const char *_fnidx) fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); } else fnidx = strdup(_fnidx); - fpidx = fopen(fnidx, "w"); + fpidx = fopen(fnidx, "wb"); if (fpidx == 0) { fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); free(fnidx); @@ -446,7 +491,7 @@ int bam_index_build(const char *fn) int bam_index(int argc, char *argv[]) { if 
(argc < 2) { - fprintf(stderr, "Usage: samtools index []\n"); + fprintf(stderr, "Usage: samtools index [out.index]\n"); return 1; } if (argc >= 3) bam_index_build2(argv[1], argv[2]); @@ -454,11 +499,43 @@ int bam_index(int argc, char *argv[]) return 0; } -#define MAX_BIN 37450 // =(8^6-1)/7+1 +int bam_idxstats(int argc, char *argv[]) +{ + bam_index_t *idx; + bam_header_t *header; + bamFile fp; + int i; + if (argc < 2) { + fprintf(stderr, "Usage: samtools idxstats \n"); + return 1; + } + fp = bam_open(argv[1], "r"); + if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; } + header = bam_header_read(fp); + bam_close(fp); + idx = bam_index_load(argv[1]); + if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; } + for (i = 0; i < idx->n; ++i) { + khint_t k; + khash_t(i) *h = idx->index[i]; + printf("%s\t%d", header->target_name[i], header->target_len[i]); + k = kh_get(i, h, BAM_MAX_BIN); + if (k != kh_end(h)) + printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v); + else printf("\t0\t0"); + putchar('\n'); + } + printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor); + bam_header_destroy(header); + bam_index_destroy(idx); + return 0; +} -static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN]) { int i = 0, k; + if (beg >= end) return 0; + if (end >= 1u<<29) end = 1u<<29; --end; list[i++] = 0; for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; @@ -476,8 +553,15 @@ static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) return (rend > beg && rbeg < end); } +struct __bam_iter_t { + int from_first; // read from the first record; no random access + int tid, beg, end, n_off, i, finished; + uint64_t curr_off; + pair64_t *off; +}; + // bam_fetch helper function retrieves -pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, 
int end, int* cnt_off) +bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) { uint16_t *bins; int i, n_bins, n_off; @@ -485,17 +569,34 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e khint_t k; khash_t(i) *index; uint64_t min_off; - - bins = (uint16_t*)calloc(MAX_BIN, 2); + bam_iter_t iter = 0; + + if (beg < 0) beg = 0; + if (end < beg) return 0; + // initialize iter + iter = calloc(1, sizeof(struct __bam_iter_t)); + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; + // + bins = (uint16_t*)calloc(BAM_MAX_BIN, 2); n_bins = reg2bins(beg, end, bins); index = idx->index[tid]; - min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + if (idx->index2[tid].n > 0) { + min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4 + int n = beg>>BAM_LIDX_SHIFT; + if (n > idx->index2[tid].n) n = idx->index2[tid].n; + for (i = n - 1; i >= 0; --i) + if (idx->index2[tid].offset[i] != 0) break; + if (i >= 0) min_off = idx->index2[tid].offset[i]; + } + } else min_off = 0; // tabix 0.1.2 may produce such index files for (i = n_off = 0; i < n_bins; ++i) { if ((k = kh_get(i, index, bins[i])) != kh_end(index)) n_off += kh_value(index, k).n; } if (n_off == 0) { - free(bins); return 0; + free(bins); return iter; } off = (pair64_t*)calloc(n_off, 16); for (i = n_off = 0; i < n_bins; ++i) { @@ -534,41 +635,62 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e } bam_destroy1(b); } - *cnt_off = n_off; + iter->n_off = n_off; iter->off = off; + return iter; +} + +pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off) +{ // for pysam compatibility + bam_iter_t iter; + pair64_t *off; + iter = bam_iter_query(idx, tid, beg, end); 
+ off = iter->off; *cnt_off = iter->n_off; + free(iter); return off; } -int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +void bam_iter_destroy(bam_iter_t iter) { - int n_off; - pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off); - if (off == 0) return 0; - { - // retrive alignments - uint64_t curr_off; - int i, ret, n_seeks; - n_seeks = 0; i = -1; curr_off = 0; - bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); - for (;;) { - if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk - if (i == n_off - 1) break; // no more chunks - if (i >= 0) assert(curr_off == off[i].v); // otherwise bug - if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek - bam_seek(fp, off[i+1].u, SEEK_SET); - curr_off = bam_tell(fp); - ++n_seeks; - } - ++i; + if (iter) { free(iter->off); free(iter); } +} + +int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) +{ + if (iter->finished) return -1; + if (iter->from_first) { + int ret = bam_read1(fp, b); + if (ret < 0) iter->finished = 1; + return ret; + } + if (iter->off == 0) return -1; + for (;;) { + int ret; + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk + if (iter->i == iter->n_off - 1) break; // no more chunks + if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug + if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek + bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET); + iter->curr_off = bam_tell(fp); } - if ((ret = bam_read1(fp, b)) > 0) { - curr_off = bam_tell(fp); - if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed - else if (is_overlap(beg, end, b)) func(b, data); - } else break; // end of file + ++iter->i; } -// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); - bam_destroy1(b); + if ((ret = bam_read1(fp, b)) > 0) { + iter->curr_off = bam_tell(fp); + if 
(b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed + else if (is_overlap(iter->beg, iter->end, b)) return ret; + } else break; // end of file } - free(off); + iter->finished = 1; + return -1; +} + +int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +{ + bam_iter_t iter; + bam1_t *b; + b = bam_init1(); + iter = bam_iter_query(idx, tid, beg, end); + while (bam_iter_read(fp, iter, b) >= 0) func(b, data); + bam_destroy1(b); return 0; } diff --git a/samtools/bam_maqcns.c b/samtools/bam_maqcns.c index 71c2185..cad63d7 100644 --- a/samtools/bam_maqcns.c +++ b/samtools/bam_maqcns.c @@ -310,6 +310,7 @@ bam_maqindel_opt_t *bam_maqindel_opt_init() bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); mi->q_indel = 40; mi->r_indel = 0.00015; + mi->r_snp = 0.001; // mi->mm_penalty = 3; mi->indel_err = 4; @@ -406,7 +407,8 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } { // the core part char *ref2, *rs, *inscns = 0; - int k, l, *score, *pscore, max_ins = types[n_types-1]; + int qr_snp, k, l, *score, *pscore, max_ins = types[n_types-1]; + qr_snp = (int)(-4.343 * log(mi->r_snp) + .499); if (max_ins > 0) { // get the consensus of inserted sequences int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); // count occurrences @@ -446,12 +448,18 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c for (i = 0; i < n_types; ++i) { ka_param_t ap = ka_param_blast; ap.band_width = 2 * types[n_types - 1] + 2; + ap.gap_end = 0; // write ref2 for (k = 0, j = left; j <= pos; ++j) ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; if (types[i] <= 0) j += -types[i]; else for (l = 0; l < types[i]; ++l) ref2[k++] = bam_nt16_nt4_table[(int)inscns[i*max_ins + l]]; + if (types[0] < 0) { // mask deleted sequences + int jj, tmp = types[i] >= 0? 
-types[0] : -types[0] + types[i]; + for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j) + ref2[k++] = 4; + } for (; j < right && ref[j]; ++j) ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; if (j < right) right = j; @@ -482,22 +490,27 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c if (op == BAM_CMATCH) { int k; for (k = 0; k < len; ++k) - if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k]; + if (ref2[x+k] != rs[y+k] && ref2[x+k] < 4) + ps += bam1_qual(p->b)[y+k] < qr_snp? bam1_qual(p->b)[y+k] : qr_snp; x += len; y += len; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - if (op == BAM_CINS) ps += mi->q_indel * len; + if (op == BAM_CINS && l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; y += len; } else if (op == BAM_CDEL) { - ps += mi->q_indel * len; + if (l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; x += len; } } pscore[i*n+j] = ps; - /*if (pos == 2618517) { // for debugging only - fprintf(stderr, "pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend); - for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); fprintf(stderr, "\n"); - for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l]], stderr); fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); fputc('\n', stderr); + /*if (1) { // for debugging only + fprintf(stderr, "id=%d, pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, %d, ", + j, pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend, mi->q_indel); + for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); + fprintf(stderr, "\n"); + for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l+tbeg-left]], stderr); + fputc('\n', stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); + fputc('\n', stderr); }*/ free(acigar); } @@ 
-560,7 +573,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c ret->gl[0] = ret->gl[1] = 0; for (j = 0; j < n; ++j) { int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; - //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2); + //fprintf(stderr, "id=%d, %d, %d, %d, %d, %d\n", j, pl[j].b->core.pos+1, types[max1_i], types[max2_i], s1, s2); if (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err; else ret->gl[1] += s2 - s1 < seq_err? s2 - s1 : seq_err; } diff --git a/samtools/bam_maqcns.h b/samtools/bam_maqcns.h index fa5489d..6cc5355 100644 --- a/samtools/bam_maqcns.h +++ b/samtools/bam_maqcns.h @@ -16,8 +16,9 @@ typedef struct { } bam_maqcns_t; typedef struct { - int q_indel; - float r_indel; + int q_indel; // indel sequencing error, phred scaled + float r_indel; // indel prior + float r_snp; // snp prior // hidden parameters, unchangeable from command line int mm_penalty, indel_err, ambi_thres; } bam_maqindel_opt_t; diff --git a/samtools/bam_md.c b/samtools/bam_md.c index 3ca7309..17b0a4a 100644 --- a/samtools/bam_md.c +++ b/samtools/bam_md.c @@ -6,7 +6,7 @@ #include "sam.h" #include "kstring.h" -void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); @@ -53,6 +53,26 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal) } } ksprintf(str, "%d", u); + // apply max_nm + if (max_nm > 0 && nm >= max_nm) { + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match + seq[z/2] |= (z&1)? 
0x0f : 0xf0; + bam1_qual(b)[z] = 0; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } // update NM old_nm = bam_aux_get(b, "NM"); if (c->flag & BAM_FUNMAP) return; @@ -83,9 +103,14 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal) free(str->s); free(str); } +void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +{ + bam_fillmd1_core(b, ref, is_equal, 0); +} + int bam_fillmd(int argc, char *argv[]) { - int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed; + int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; @@ -94,12 +119,13 @@ int bam_fillmd(int argc, char *argv[]) is_bam_out = is_sam_in = is_uncompressed = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "eubS")) >= 0) { + while ((c = getopt(argc, argv, "eubSn:")) >= 0) { switch (c) { case 'e': is_equal = 1; break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': is_sam_in = 1; break; + case 'n': max_nm = atoi(optarg); break; default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; } } @@ -136,7 +162,7 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", fp->header->target_name[tid]); } - if (ref) bam_fillmd1(b, ref, is_equal); + if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); } samwrite(fpout, b); } diff --git a/samtools/bam_pileup.c b/samtools/bam_pileup.c index f68f400..3c41a16 100644 --- a/samtools/bam_pileup.c +++ b/samtools/bam_pileup.c @@ -73,18 +73,28 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) p->qpos = y + (pos - x); if (x == pos && is_restart) p->is_head = 1; if (x + l - 1 == pos) { // come to the end of a match - 
if (k < c->n_cigar - 1) { // there are additional operation(s) + int has_next_match = 0; + unsigned i; + for (i = k + 1; i < c->n_cigar; ++i) { + uint32_t cigar = bam1_cigar(b)[i]; + int opi = cigar&BAM_CIGAR_MASK; + if (opi == BAM_CMATCH) { + has_next_match = 1; + break; + } else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break; + } + if (!has_next_match) p->is_tail = 1; + if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s) uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins - if (op_next == BAM_CDEL || op_next == BAM_CINS) { - if (k + 2 < c->n_cigar) op_next = bam1_cigar(b)[k+2]&BAM_CIGAR_MASK; - else p->is_tail = 1; + else if (op_next == BAM_CPAD && k + 2 < c->n_cigar) { // no working for adjacent padding + cigar = bam1_cigar(b)[k+2]; op_next = cigar&BAM_CIGAR_MASK; + if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del + else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins } - if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP) - p->is_tail = 1; // tail - } else p->is_tail = 1; // this is the last operation; set tail + } } } x += l; y += l; @@ -96,7 +106,8 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) x += l; } else if (op == BAM_CREF_SKIP) x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); + if (is_restart) is_restart ^= (op == BAM_CMATCH); + else is_restart ^= (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); if (x > pos) { if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all break; @@ -108,119 +119,167 @@ static inline int resolve_cigar(bam_pileup1_t 
*p, uint32_t pos) /* --- END: Auxiliary functions */ -struct __bam_plbuf_t { +/******************* + * pileup iterator * + *******************/ + +struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail, *dummy; - bam_pileup_f func; - void *func_data; int32_t tid, pos, max_tid, max_pos; - int max_pu, is_eof; - bam_pileup1_t *pu; - int flag_mask; + int is_eof, flag_mask, max_plp, error; + bam_pileup1_t *plp; + // for the "auto" interface only + bam1_t *b; + bam_plp_auto_f func; + void *data; }; -void bam_plbuf_reset(bam_plbuf_t *buf) +bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) { - lbnode_t *p, *q; - buf->max_tid = buf->max_pos = -1; - buf->tid = buf->pos = 0; - buf->is_eof = 0; - for (p = buf->head; p->next;) { - q = p->next; - mp_free(buf->mp, p); - p = q; + bam_plp_t iter; + iter = calloc(1, sizeof(struct __bam_plp_t)); + iter->mp = mp_init(); + iter->head = iter->tail = mp_alloc(iter->mp); + iter->dummy = mp_alloc(iter->mp); + iter->max_tid = iter->max_pos = -1; + iter->flag_mask = BAM_DEF_MASK; + if (func) { + iter->func = func; + iter->data = data; + iter->b = bam_init1(); } - buf->head = buf->tail; + return iter; } -void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) -{ - if (mask < 0) buf->flag_mask = BAM_DEF_MASK; - else buf->flag_mask = BAM_FUNMAP | mask; -} - -bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +void bam_plp_destroy(bam_plp_t iter) { - bam_plbuf_t *buf; - buf = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t)); - buf->func = func; buf->func_data = data; - buf->mp = mp_init(); - buf->head = buf->tail = mp_alloc(buf->mp); - buf->dummy = mp_alloc(buf->mp); - buf->max_tid = buf->max_pos = -1; - buf->flag_mask = BAM_DEF_MASK; - return buf; + mp_free(iter->mp, iter->dummy); + mp_free(iter->mp, iter->head); + if (iter->mp->cnt != 0) + fprintf(stderr, "[bam_plp_destroy] memory leak: %d. 
Continue anyway.\n", iter->mp->cnt); + mp_destroy(iter->mp); + if (iter->b) bam_destroy1(iter->b); + free(iter->plp); + free(iter); } -void bam_plbuf_destroy(bam_plbuf_t *buf) +const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) { - mp_free(buf->mp, buf->dummy); - mp_free(buf->mp, buf->head); - if (buf->mp->cnt != 0) - fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", buf->mp->cnt); - mp_destroy(buf->mp); - free(buf->pu); - free(buf); + if (iter->error) { *_n_plp = -1; return 0; } + *_n_plp = 0; + if (iter->is_eof && iter->head->next == 0) return 0; + while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { + int n_plp = 0; + lbnode_t *p, *q; + // write iter->plp at iter->pos + iter->dummy->next = iter->head; + for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove + q->next = p->next; mp_free(iter->mp, p); p = q; + } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup + if (n_plp == iter->max_plp) { // then double the capacity + iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; + iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); + } + iter->plp[n_plp].b = &p->b; + if (resolve_cigar(iter->plp + n_plp, iter->pos)) ++n_plp; // skip the read if we are looking at ref-skip + } + } + iter->head = iter->dummy->next; // dummy->next may be changed + *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; + // update iter->tid and iter->pos + if (iter->head->next) { + if (iter->tid > iter->head->b.core.tid) { + fprintf(stderr, "[%s] unsorted input. 
Pileup aborts.\n", __func__); + iter->error = 1; + *_n_plp = -1; + return 0; + } + } + if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence + iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference + } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid + iter->pos = iter->head->beg; // jump to the next position + } else ++iter->pos; // scan contiguously + // return + if (n_plp) return iter->plp; + if (iter->is_eof && iter->head->next == 0) break; + } + return 0; } -int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +int bam_plp_push(bam_plp_t iter, const bam1_t *b) { - if (b) { // fill buffer + if (iter->error) return -1; + if (b) { if (b->core.tid < 0) return 0; - if (b->core.flag & buf->flag_mask) return 0; - bam_copy1(&buf->tail->b, b); - buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); - if (b->core.tid < buf->max_tid) { + if (b->core.flag & iter->flag_mask) return 0; + bam_copy1(&iter->tail->b, b); + iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b)); + if (b->core.tid < iter->max_tid) { fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); + iter->error = 1; return -1; } - if ((b->core.tid == buf->max_tid) && (buf->tail->beg < buf->max_pos)) { + if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n"); + iter->error = 1; return -1; } - buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg; - if (buf->tail->end > buf->pos || buf->tail->b.core.tid > buf->tid) { - buf->tail->next = mp_alloc(buf->mp); - buf->tail = buf->tail->next; - } - } else buf->is_eof = 1; - while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) { - int n_pu = 0; - lbnode_t *p, *q; - buf->dummy->next = buf->head; - for (p = buf->head, q = 
buf->dummy; p->next; q = p, p = p->next) { - if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list - q->next = p->next; mp_free(buf->mp, p); p = q; - } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup - if (n_pu == buf->max_pu) { // then double the capacity - buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256; - buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu); - } - buf->pu[n_pu].b = &p->b; - if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP - } + iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; + if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { + iter->tail->next = mp_alloc(iter->mp); + iter->tail = iter->tail->next; } - buf->head = buf->dummy->next; // dummy->next may be changed - if (n_pu) { // then call user defined function - buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data); - } - // update tid and pos - if (buf->head->next) { - if (buf->tid > buf->head->b.core.tid) { - fprintf(stderr, "[bam_plbuf_push] unsorted input. 
Pileup aborts.\n"); - return 1; + } else iter->is_eof = 1; + return 0; +} + +const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + const bam_pileup1_t *plp; + if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + else { + *_n_plp = 0; + if (iter->is_eof) return 0; + while (iter->func(iter->data, iter->b) >= 0) { + if (bam_plp_push(iter, iter->b) < 0) { + *_n_plp = -1; + return 0; } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; } - if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence - buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference - } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid - buf->pos = buf->head->beg; // jump to the next position - } else ++buf->pos; // scan contiguously - if (buf->is_eof && buf->head->next == 0) break; + bam_plp_push(iter, 0); + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + return 0; } - return 0; } +void bam_plp_reset(bam_plp_t iter) +{ + lbnode_t *p, *q; + iter->max_tid = iter->max_pos = -1; + iter->tid = iter->pos = 0; + iter->is_eof = 0; + for (p = iter->head; p->next;) { + q = p->next; + mp_free(iter->mp, p); + p = q; + } + iter->head = iter->tail; +} + +void bam_plp_set_mask(bam_plp_t iter, int mask) +{ + iter->flag_mask = mask < 0? 
BAM_DEF_MASK : (BAM_FUNMAP | mask); +} + +/***************** + * callback APIs * + *****************/ + int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) { bam_plbuf_t *buf; @@ -236,3 +295,102 @@ int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) bam_destroy1(b); return 0; } + +void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) +{ + bam_plp_set_mask(buf->iter, mask); +} + +void bam_plbuf_reset(bam_plbuf_t *buf) +{ + bam_plp_reset(buf->iter); +} + +bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +{ + bam_plbuf_t *buf; + buf = calloc(1, sizeof(bam_plbuf_t)); + buf->iter = bam_plp_init(0, 0); + buf->func = func; + buf->data = data; + return buf; +} + +void bam_plbuf_destroy(bam_plbuf_t *buf) +{ + bam_plp_destroy(buf->iter); + free(buf); +} + +int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +{ + int ret, n_plp, tid, pos; + const bam_pileup1_t *plp; + ret = bam_plp_push(buf->iter, b); + if (ret < 0) return ret; + while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) + buf->func(tid, pos, n_plp, plp, buf->data); + return 0; +} + +/*********** + * mpileup * + ***********/ + +struct __bam_mplp_t { + int n; + uint64_t min, *pos; + bam_plp_t *iter; + int *n_plp; + const bam_pileup1_t **plp; +}; + +bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) +{ + int i; + bam_mplp_t iter; + iter = calloc(1, sizeof(struct __bam_mplp_t)); + iter->pos = calloc(n, 8); + iter->n_plp = calloc(n, sizeof(int)); + iter->plp = calloc(n, sizeof(void*)); + iter->iter = calloc(n, sizeof(void*)); + iter->n = n; + iter->min = (uint64_t)-1; + for (i = 0; i < n; ++i) { + iter->iter[i] = bam_plp_init(func, data[i]); + iter->pos[i] = iter->min; + } + return iter; +} + +void bam_mplp_destroy(bam_mplp_t iter) +{ + int i; + for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); + free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); + free(iter); +} + +int bam_mplp_auto(bam_mplp_t iter, 
int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +{ + int i, ret = 0; + uint64_t new_min = (uint64_t)-1; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { + int tid, pos; + iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); + iter->pos[i] = (uint64_t)tid<<32 | pos; + } + if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; + } + iter->min = new_min; + if (new_min == (uint64_t)-1) return 0; + *_tid = new_min>>32; *_pos = (uint32_t)new_min; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { + n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; + ++ret; + } else n_plp[i] = 0, plp[i] = 0; + } + return ret; +} diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index ba787a9..6804795 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -18,6 +18,10 @@ KHASH_MAP_INIT_INT64(64, indel_list_t) #define BAM_PLF_GLF 0x08 #define BAM_PLF_VAR_ONLY 0x10 #define BAM_PLF_2ND 0x20 +#define BAM_PLF_RANBASE 0x40 +#define BAM_PLF_1STBASE 0x80 +#define BAM_PLF_ALLBASE 0x100 +#define BAM_PLF_READPOS 0x200 typedef struct { bam_header_t *h; @@ -28,6 +32,7 @@ typedef struct { uint32_t format; int tid, len, last_pos; int mask; + int max_depth; // for indel calling, ignore reads with the depth too high. 0 for unlimited char *ref; glfFile fp_glf; // for glf output only } pu_data_t; @@ -121,10 +126,11 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, g3->offset = pos - d->last_pos; d->last_pos = pos; glf3_write1(d->fp_glf, g3); - if (pos < d->len) { + if (pos < d->len) { + int m = (!d->max_depth || d->max_depth>n) ? 
n : d->max_depth; if (proposed_indels) - r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); - else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); } if (r) { // then write indel line int het = 3 * n, min; @@ -152,11 +158,37 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, return 0; } +static void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) +{ + if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); + if (!p->is_del) { + int j, rb, c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + rb = (ref && pos < ref_len)? ref[pos] : 'N'; + if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + putchar(c); + if (p->indel > 0) { + printf("+%d", p->indel); + for (j = 1; j <= p->indel; ++j) { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } else if (p->indel < 0) { + printf("%d", p->indel); + for (j = 1; j <= -p->indel; ++j) { + c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; + putchar(bam1_strand(p->b)? 
tolower(c) : toupper(c)); + } + } + } else putchar('*'); + if (p->is_tail) putchar('$'); +} + static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) { pu_data_t *d = (pu_data_t*)data; bam_maqindel_ret_t *r = 0; - int i, j, rb, rms_mapq = -1, *proposed_indels = 0; + int i, rb, rms_mapq = -1, *proposed_indels = 0; uint64_t rms_aux; uint32_t cns = 0; @@ -171,7 +203,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p // update d->ref if necessary if (d->fai && (int)tid != d->tid) { free(d->ref); - d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->ref = faidx_fetch_seq(d->fai, d->h->target_name[tid], 0, 0x7fffffff, &d->len); d->tid = tid; } rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; @@ -182,12 +214,31 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p if (i == n) return 0; } // call the consensus and indel - if (d->format & BAM_PLF_CNS) // call consensus - cns = bam_maqcns_call(n, pu, d->c); - if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels - if (proposed_indels) // the first element gives the size of the array - r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); - else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); + if (d->format & BAM_PLF_CNS) { // call consensus + if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE)) { // use a random base or the 1st base as the consensus call + const bam_pileup1_t *p = (d->format & BAM_PLF_1STBASE)? pu : pu + (int)(drand48() * n); + int q = bam1_qual(p->b)[p->qpos]; + int mapQ = p->b->core.qual < d->c->cap_mapQ? 
p->b->core.qual : d->c->cap_mapQ; + uint32_t b = bam1_seqi(bam1_seq(p->b), p->qpos); + cns = b<<28 | 0xf<<24 | mapQ<<16 | q<<8; + } else if (d->format & BAM_PLF_ALLBASE) { // collapse all bases + uint64_t rmsQ = 0; + uint32_t b = 0; + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + int q = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; + b |= bam1_seqi(bam1_seq(p->b), p->qpos); + rmsQ += q * q; + } + rmsQ = (uint64_t)(sqrt((double)rmsQ / n) + .499); + cns = b<<28 | 0xf<<24 | rmsQ<<16 | 60<<8; + } else cns = bam_maqcns_call(n, pu, d->c); + } + if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels + int m = (!d->max_depth || d->max_depth>n) ? n : d->max_depth; + if (proposed_indels) // the first element gives the size of the array + r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); } // when only variant sites are asked for, test if the site is a variant if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) { @@ -218,27 +269,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p const bam_pileup1_t *p = pu + i; int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; rms_aux += tmp * tmp; - if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); - if (!p->is_del) { - int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; - if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; - else c = bam1_strand(p->b)? tolower(c) : toupper(c); - putchar(c); - if (p->indel > 0) { - printf("+%d", p->indel); - for (j = 1; j <= p->indel; ++j) { - c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } - } else if (p->indel < 0) { - printf("%d", p->indel); - for (j = 1; j <= -p->indel; ++j) { - c = (d->ref && (int)pos+j < d->len)? 
d->ref[pos+j] : 'N'; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } - } - } else putchar('*'); - if (p->is_tail) putchar('$'); + pileup_seq(p, pos, d->len, d->ref); } // finalize rms_mapq rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); @@ -275,6 +306,15 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p putchar(c); } } + // print read position + if (d->format & BAM_PLF_READPOS) { + putchar('\t'); + for (i = 0; i < n; ++i) { + int x = pu[i].qpos; + int l = pu[i].b->core.l_qseq; + printf("%d,", x < l/2? x+1 : -((l-1)-x+1)); + } + } putchar('\n'); // print the indel line if r has been calculated. This only happens if: // a) -c or -i are flagged, AND b) the reference sequence is available @@ -298,29 +338,40 @@ int bam_pileup(int argc, char *argv[]) int c, is_SAM = 0; char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); + d->max_depth = 0; d->tid = -1; d->mask = BAM_DEF_MASK; d->c = bam_maqcns_init(); + d->c->is_soap = 1; // change the default model d->ido = bam_maqindel_opt_init(); - while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:vM:S2a")) >= 0) { + while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PA")) >= 0) { switch (c) { case 'a': d->c->is_soap = 1; break; + case 'A': d->c->is_soap = 0; break; case 's': d->format |= BAM_PLF_SIMPLE; break; case 't': fn_list = strdup(optarg); break; case 'l': fn_pos = strdup(optarg); break; case 'f': fn_fa = strdup(optarg); break; case 'T': d->c->theta = atof(optarg); break; case 'N': d->c->n_hap = atoi(optarg); break; - case 'r': d->c->het_rate = atof(optarg); break; + case 'r': d->c->het_rate = atof(optarg); d->ido->r_snp = d->c->het_rate; break; case 'M': d->c->cap_mapQ = atoi(optarg); break; + case 'd': d->max_depth = atoi(optarg); break; case 'c': d->format |= BAM_PLF_CNS; break; case 'i': d->format |= BAM_PLF_INDEL_ONLY; break; case 'v': d->format |= BAM_PLF_VAR_ONLY; break; case 'm': d->mask = 
strtol(optarg, 0, 0); break; case 'g': d->format |= BAM_PLF_GLF; break; case '2': d->format |= BAM_PLF_2ND; break; + case 'P': d->format |= BAM_PLF_READPOS; break; case 'I': d->ido->q_indel = atoi(optarg); break; case 'G': d->ido->r_indel = atof(optarg); break; case 'S': is_SAM = 1; break; + case 'R': + if (strcmp(optarg, "random") == 0) d->format |= BAM_PLF_RANBASE; + else if (strcmp(optarg, "first") == 0) d->format |= BAM_PLF_1STBASE; + else if (strcmp(optarg, "all") == 0) d->format |= BAM_PLF_ALLBASE; + else fprintf(stderr, "[bam_pileup] unrecognized -R\n"); + break; default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; } } @@ -330,15 +381,16 @@ int bam_pileup(int argc, char *argv[]) fprintf(stderr, "Usage: samtools pileup [options] |\n\n"); fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); fprintf(stderr, " -S the input is in SAM\n"); - fprintf(stderr, " -a use the SOAPsnp model for SNP calling\n"); + fprintf(stderr, " -A use the MAQ model for SNP calling\n"); fprintf(stderr, " -2 output the 2nd best call and quality\n"); fprintf(stderr, " -i only show lines/consensus with indels\n"); fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask); fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ); + fprintf(stderr, " -d INT limit maximum depth for indels [unlimited]\n"); fprintf(stderr, " -t FILE list of reference sequences (force -S)\n"); fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); - fprintf(stderr, " -c output the maq consensus sequence\n"); + fprintf(stderr, " -c output the SOAPsnp consensus sequence\n"); fprintf(stderr, " -v print variants only (for -c)\n"); fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n"); fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta); @@ -350,6 +402,7 @@ int bam_pileup(int argc, char 
*argv[]) free(fn_list); free(fn_fa); free(d); return 1; } + if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE|BAM_PLF_ALLBASE)) d->format |= BAM_PLF_CNS; if (fn_fa) d->fai = fai_load(fn_fa); if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling if (d->format & BAM_PLF_GLF) { // for glf output @@ -390,3 +443,128 @@ int bam_pileup(int argc, char *argv[]) free(d->ido); free(d->ref); free(d); return 0; } + +/*********** + * mpileup * + ***********/ + +typedef struct { + char *reg; + faidx_t *fai; +} mplp_conf_t; + +typedef struct { + bamFile fp; + bam_iter_t iter; +} mplp_aux_t; + +static int mplp_func(void *data, bam1_t *b) +{ + mplp_aux_t *ma = (mplp_aux_t*)data; + if (ma->iter) return bam_iter_read(ma->fp, ma->iter, b); + return bam_read1(ma->fp, b); +} + +static int mpileup(mplp_conf_t *conf, int n, char **fn) +{ + mplp_aux_t **data; + int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid; + const bam_pileup1_t **plp; + bam_mplp_t iter; + bam_header_t *h = 0; + char *ref; + // allocate + data = calloc(n, sizeof(void*)); + plp = calloc(n, sizeof(void*)); + n_plp = calloc(n, sizeof(int*)); + // read the header and initialize data + for (i = 0; i < n; ++i) { + bam_header_t *h_tmp; + data[i] = calloc(1, sizeof(mplp_aux_t)); + data[i]->fp = bam_open(fn[i], "r"); + h_tmp = bam_header_read(data[i]->fp); + if (conf->reg) { + int beg, end; + bam_index_t *idx; + idx = bam_index_load(fn[i]); + if (idx == 0) { + fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); + exit(1); + } + if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { + fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); + exit(1); + } + if (i == 0) beg0 = beg, end0 = end; + data[i]->iter = bam_iter_query(idx, tid, beg, end); + bam_index_destroy(idx); + } + if (i == 0) h = h_tmp; + else { + // FIXME: to check consistency + bam_header_destroy(h_tmp); + } + } + // mpileup + ref_tid = -1; 
ref = 0; + iter = bam_mplp_init(n, mplp_func, (void**)data); + while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { + if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested + if (tid != ref_tid) { + free(ref); + if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); + ref_tid = tid; + } + printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + int j; + printf("\t%d\t", n_plp[i]); + if (n_plp[i] == 0) printf("*\t*"); + else { + for (j = 0; j < n_plp[i]; ++j) + pileup_seq(plp[i] + j, pos, ref_len, ref); + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = bam1_qual(p->b)[p->qpos] + 33; + if (c > 126) c = 126; + putchar(c); + } + } + } + putchar('\n'); + } + bam_mplp_destroy(iter); + bam_header_destroy(h); + for (i = 0; i < n; ++i) { + bam_close(data[i]->fp); + if (data[i]->iter) bam_iter_destroy(data[i]->iter); + free(data[i]); + } + free(data); free(plp); free(ref); free(n_plp); + return 0; +} + +int bam_mpileup(int argc, char *argv[]) +{ + int c; + mplp_conf_t mplp; + memset(&mplp, 0, sizeof(mplp_conf_t)); + while ((c = getopt(argc, argv, "f:r:")) >= 0) { + switch (c) { + case 'f': + mplp.fai = fai_load(optarg); + if (mplp.fai == 0) return 1; + break; + case 'r': mplp.reg = strdup(optarg); + } + } + if (argc == 1) { + fprintf(stderr, "Usage: samtools mpileup [-r reg] [-f in.fa] in1.bam [in2.bam [...]]\n"); + return 1; + } + mpileup(&mplp, argc - optind, argv + optind); + free(mplp.reg); + if (mplp.fai) fai_destroy(mplp.fai); + return 0; +} diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c new file mode 100644 index 0000000..bae97c7 --- /dev/null +++ b/samtools/bam_reheader.c @@ -0,0 +1,60 @@ +#include +#include +#include "bgzf.h" +#include "bam.h" + +#define BUF_SIZE 0x10000 + +int bam_reheader(BGZF *in, const bam_header_t *h, int fd) +{ + BGZF *fp; + bam_header_t *old; + int len; + 
uint8_t *buf; + if (in->open_mode != 'r') return -1; + buf = malloc(BUF_SIZE); + old = bam_header_read(in); + fp = bgzf_fdopen(fd, "w"); + bam_header_write(fp, h); + if (in->block_offset < in->block_length) { + bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); + bgzf_flush(fp); + } +#ifdef _USE_KNETFILE + while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) +#else + while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) +#endif + fwrite(buf, 1, len, fp->x.fpw); + free(buf); + fp->block_offset = in->block_offset = 0; + bgzf_close(fp); + return 0; +} + +int main_reheader(int argc, char *argv[]) +{ + bam_header_t *h; + BGZF *in; + if (argc != 3) { + fprintf(stderr, "Usage: samtools reheader \n"); + return 1; + } + { // read the header + tamFile fph = sam_open(argv[1]); + if (fph == 0) { + fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); + return 1; + } + h = sam_header_read(fph); + sam_close(fph); + } + in = strcmp(argv[2], "-")? 
bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); + return 1; + } + bam_reheader(in, h, fileno(stdout)); + bgzf_close(in); + return 0; +} diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 9884f3d..12b1b54 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -294,7 +294,7 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size mem += ret; ++k; if (mem >= max_mem) { - sort_blocks(n++, k, buf, prefix, header, is_stdout); + sort_blocks(n++, k, buf, prefix, header, 0); mem = 0; k = 0; } } @@ -304,7 +304,7 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size else { // then merge char **fns, *fnout; fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); - sort_blocks(n++, k, buf, prefix, header, is_stdout); + sort_blocks(n++, k, buf, prefix, header, 0); fnout = (char*)calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); diff --git a/samtools/bam_tview.c b/samtools/bam_tview.c index 4c121e7..7b326fc 100644 --- a/samtools/bam_tview.c +++ b/samtools/bam_tview.c @@ -280,7 +280,7 @@ int tv_draw_aln(tview_t *tv, int tid, int pos) static void tv_win_goto(tview_t *tv, int *tid, int *pos) { - char str[256]; + char str[256], *p; int i, l = 0; wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); mvwprintw(tv->wgoto, 1, 2, "Goto: "); @@ -291,10 +291,18 @@ static void tv_win_goto(tview_t *tv, int *tid, int *pos) --l; } else if (c == KEY_ENTER || c == '\012' || c == '\015') { int _tid = -1, _beg, _end; - bam_parse_region(tv->header, str, &_tid, &_beg, &_end); - if (_tid >= 0) { - *tid = _tid; *pos = _beg; - return; + if (str[0] == '=') { + _beg = strtol(str+1, &p, 10); + if (_beg > 0) { + *pos = _beg; + return; + } + } else { + bam_parse_region(tv->header, str, &_tid, &_beg, &_end); + if (_tid >= 0) { + *tid = _tid; *pos = _beg; + return; + } } 
} else if (isgraph(c)) { if (l < TV_MAX_GOTO) str[l++] = c; @@ -351,6 +359,7 @@ void tv_loop(tview_t *tv) case '?': tv_win_help(tv); break; case '\033': case 'q': goto end_loop; + case '/': case 'g': tv_win_goto(tv, &tid, &pos); break; case 'm': tv->color_for = TV_COLOR_MAPQ; break; case 'b': tv->color_for = TV_COLOR_BASEQ; break; diff --git a/samtools/bgzf.c b/samtools/bgzf.c index 59f902f..a6923da 100644 --- a/samtools/bgzf.c +++ b/samtools/bgzf.c @@ -203,9 +203,7 @@ bgzf_open(const char* __restrict path, const char* __restrict mode) if (fd == -1) return 0; fp = open_write(fd, strstr(mode, "u")? 1 : 0); } - if (fp != NULL) { - fp->owned_file = 1; - } + if (fp != NULL) fp->owned_file = 1; return fp; } @@ -429,20 +427,19 @@ static void cache_block(BGZF *fp, int size) memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); } -static int -read_block(BGZF* fp) +bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; - int size = 0; + int count, size = 0; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); if (load_block_from_cache(fp, block_address)) return 0; - int count = knet_read(fp->x.fpr, header, sizeof(header)); + count = knet_read(fp->x.fpr, header, sizeof(header)); #else int64_t block_address = ftello(fp->file); if (load_block_from_cache(fp, block_address)) return 0; - int count = fread(header, 1, sizeof(header), fp->file); + count = fread(header, 1, sizeof(header), fp->file); #endif if (count == 0) { fp->block_length = 0; @@ -472,9 +469,7 @@ read_block(BGZF* fp) } size += count; count = inflate_block(fp, block_length); - if (count < 0) { - return -1; - } + if (count < 0) return -1; if (fp->block_length != 0) { // Do not reset offset if this read follows a seek. 
fp->block_offset = 0; @@ -501,7 +496,7 @@ bgzf_read(BGZF* fp, void* data, int length) while (bytes_read < length) { int available = fp->block_length - fp->block_offset; if (available <= 0) { - if (read_block(fp) != 0) { + if (bgzf_read_block(fp) != 0) { return -1; } available = fp->block_length - fp->block_offset; @@ -528,19 +523,16 @@ bgzf_read(BGZF* fp, void* data, int length) return bytes_read; } -static -int -flush_block(BGZF* fp) +int bgzf_flush(BGZF* fp) { while (fp->block_offset > 0) { - int block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) { - return -1; - } + int count, block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; #ifdef _USE_KNETFILE - int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); #else - int count = fwrite(fp->compressed_block, 1, block_length, fp->file); + count = fwrite(fp->compressed_block, 1, block_length, fp->file); #endif if (count != block_length) { report_error(fp, "write failed"); @@ -551,17 +543,22 @@ flush_block(BGZF* fp) return 0; } -int -bgzf_write(BGZF* fp, const void* data, int length) +int bgzf_flush_try(BGZF *fp, int size) +{ + if (fp->block_offset + size > fp->uncompressed_block_size) + return bgzf_flush(fp); + return -1; +} + +int bgzf_write(BGZF* fp, const void* data, int length) { if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; } - if (fp->uncompressed_block == NULL) { + if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); - } const bgzf_byte_t* input = data; int block_length = fp->uncompressed_block_size; @@ -574,7 +571,7 @@ bgzf_write(BGZF* fp, const void* data, int length) input += copy_length; bytes_written += copy_length; if (fp->block_offset == block_length) { - if (flush_block(fp) != 0) { + if (bgzf_flush(fp) != 0) { break; } } @@ -582,13 +579,10 @@ bgzf_write(BGZF* fp, 
const void* data, int length) return bytes_written; } -int -bgzf_close(BGZF* fp) +int bgzf_close(BGZF* fp) { if (fp->open_mode == 'w') { - if (flush_block(fp) != 0) { - return -1; - } + if (bgzf_flush(fp) != 0) return -1; { // add an empty block int count, block_length = deflate_block(fp, 0); #ifdef _USE_KNETFILE @@ -613,9 +607,7 @@ bgzf_close(BGZF* fp) else ret = knet_close(fp->x.fpr); if (ret != 0) return -1; #else - if (fclose(fp->file) != 0) { - return -1; - } + if (fclose(fp->file) != 0) return -1; #endif } free(fp->uncompressed_block); @@ -625,12 +617,6 @@ bgzf_close(BGZF* fp) return 0; } -int64_t -bgzf_tell(BGZF* fp) -{ - return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); -} - void bgzf_set_cache_size(BGZF *fp, int cache_size) { if (fp) fp->cache_size = cache_size; @@ -655,9 +641,11 @@ int bgzf_check_EOF(BGZF *fp) return (memcmp(magic, buf, 28) == 0)? 1 : 0; } -int64_t -bgzf_seek(BGZF* fp, int64_t pos, int where) +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { + int block_offset; + int64_t block_address; + if (fp->open_mode != 'r') { report_error(fp, "file not open for read"); return -1; @@ -666,8 +654,8 @@ bgzf_seek(BGZF* fp, int64_t pos, int where) report_error(fp, "unimplemented seek option"); return -1; } - int block_offset = pos & 0xFFFF; - int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; + block_offset = pos & 0xFFFF; + block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; #ifdef _USE_KNETFILE if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { #else diff --git a/samtools/bgzf.h b/samtools/bgzf.h index 91b3317..099ae9a 100644 --- a/samtools/bgzf.h +++ b/samtools/bgzf.h @@ -106,7 +106,7 @@ int bgzf_write(BGZF* fp, const void* data, int length); * Return value is non-negative on success. * Returns -1 on error. 
*/ -int64_t bgzf_tell(BGZF* fp); +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) /* * Set the file to read from the location specified by pos, which must @@ -126,9 +126,32 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); void bgzf_set_cache_size(BGZF *fp, int cache_size); int bgzf_check_EOF(BGZF *fp); +int bgzf_read_block(BGZF* fp); +int bgzf_flush(BGZF* fp); +int bgzf_flush_try(BGZF *fp, int size); #ifdef __cplusplus } #endif +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + #endif diff --git a/samtools/faidx.c b/samtools/faidx.c index 811bdf8..dbd8b3e 100644 --- a/samtools/faidx.c +++ b/samtools/faidx.c @@ -197,7 +197,7 @@ int fai_build(const char *fn) sprintf(str, "%s.fai", fn); rz = razf_open(fn, "r"); if (rz == 0) { - fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",str); + fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); free(str); return -1; } diff --git a/samtools/knetfile.c b/samtools/knetfile.c index 994babb..e1be4d6 100644 --- a/samtools/knetfile.c +++ b/samtools/knetfile.c @@ -38,9 +38,7 @@ #include #include -#ifdef _WIN32 -#include -#else +#ifndef _WIN32 #include #include #include @@ -566,7 +564,7 @@ off_t knet_seek(knetFile *fp, int64_t off, int whence) else if (whence==SEEK_SET) fp->offset = off; fp->is_ready = 0; - return fp->offset; + return 0; } errno = EINVAL; fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); diff --git a/samtools/kstring.h b/samtools/kstring.h index f4e5a99..925117a 100644 --- 
a/samtools/kstring.h +++ b/samtools/kstring.h @@ -58,6 +58,40 @@ static inline int kputc(int c, kstring_t *s) return c; } +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + static inline int *ksplit(kstring_t *s, int delimiter, int *n) { int max = 0, *offsets = 0; diff --git a/samtools/sam.c b/samtools/sam.c index ad4325b..ecdee02 100644 --- a/samtools/sam.c +++ b/samtools/sam.c @@ -55,6 +55,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) if (aux) { // check if aux is present bam_header_t *textheader = fp->header; fp->header = sam_header_read2((const char*)aux); + if (fp->header == 0) goto open_err_ret; append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } diff --git a/samtools/sam_header.c b/samtools/sam_header.c index a119c02..05d75de 100644 --- a/samtools/sam_header.c +++ b/samtools/sam_header.c @@ -10,6 +10,7 @@ KHASH_MAP_INIT_STR(str, const char *) struct _HeaderList { + struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. struct _HeaderList *next; void *data; }; @@ -58,6 +59,34 @@ static void debug(const char *format, ...) 
va_end(ap); } +#if 0 +// Replaced by list_append_to_end +static list_t *list_prepend(list_t *root, void *data) +{ + list_t *l = malloc(sizeof(list_t)); + l->next = root; + l->data = data; + return l; +} +#endif + +// Relies on the root->last being correct. Do not use with the other list_* +// routines unless they are fixed to modify root->last as well. +static list_t *list_append_to_end(list_t *root, void *data) +{ + list_t *l = malloc(sizeof(list_t)); + l->last = l; + l->next = NULL; + l->data = data; + + if ( !root ) + return l; + + root->last->next = l; + root->last = l; + return root; +} + static list_t *list_append(list_t *root, void *data) { list_t *l = root; @@ -322,7 +351,7 @@ static HeaderLine *sam_header_line_parse(const char *headerLine) while (*to && *to!='\t') to++; if ( to-from != 2 ) { - debug("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine); + debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); return 0; } @@ -345,7 +374,11 @@ static HeaderLine *sam_header_line_parse(const char *headerLine) while (*to && *to!='\t') to++; if ( !required_tags[itype] && !optional_tags[itype] ) + { + // CO is a special case, it can contain anything, including tabs + if ( *to ) { to++; continue; } tag = new_tag(" ",from,to-1); + } else tag = new_tag(from,from+3,to-1); @@ -539,7 +572,8 @@ void *sam_header_parse2(const char *headerText) { hline = sam_header_line_parse(buf); if ( hline && sam_header_line_validate(hline) ) - hlines = list_append(hlines, hline); + // With too many (~250,000) reference sequences the header parsing was too slow with list_append. 
+ hlines = list_append_to_end(hlines, hline); else { if (hline) sam_header_line_free(hline); diff --git a/samtools/sam_view.c b/samtools/sam_view.c index 06dd01a..3b10e2e 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -6,7 +6,12 @@ #include "sam_header.h" #include "sam.h" #include "faidx.h" +#include "khash.h" +KHASH_SET_INIT_STR(rg) +typedef khash_t(rg) *rghash_t; + +rghash_t g_rghash = 0; static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; static char *g_library, *g_rg; static int g_sol2sanger_tbl[128]; @@ -32,9 +37,15 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) { if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; - if (g_rg) { + if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); - if (s && strcmp(g_rg, (char*)(s + 1)) == 0) return 0; + if (s) { + if (g_rg) return (strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1; + if (g_rghash) { + khint_t k = kh_get(rg, g_rghash, (char*)(s + 1)); + return (k != kh_end(g_rghash))? 
0 : 1; + } + } } if (g_library) { const char *p = bam_get_library((bam_header_t*)h, b); @@ -58,11 +69,11 @@ int main_samview(int argc, char *argv[]) int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0; int of_type = BAM_OFDEC, is_long_help = 0; samfile_t *in = 0, *out = 0; - char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:C")) >= 0) { + while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { switch (c) { case 'C': slx2sngr = 1; break; case 'S': is_bamin = 0; break; @@ -77,6 +88,7 @@ int main_samview(int argc, char *argv[]) case 'u': is_uncompressed = 1; break; case 'l': g_library = strdup(optarg); break; case 'r': g_rg = strdup(optarg); break; + case 'R': fn_rg = strdup(optarg); break; case 'x': of_type = BAM_OFHEX; break; case 'X': of_type = BAM_OFSTR; break; case '?': is_long_help = 1; break; @@ -94,7 +106,19 @@ int main_samview(int argc, char *argv[]) if (is_bamin) strcat(in_mode, "b"); if (is_header) strcat(out_mode, "h"); if (is_uncompressed) strcat(out_mode, "u"); - if (argc == optind) return usage(is_long_help); + if (argc == optind) return usage(is_long_help); // potential memory leak... + + // read the list of read groups + if (fn_rg) { + FILE *fp_rg; + char buf[1024]; + int ret; + g_rghash = kh_init(rg); + fp_rg = fopen(fn_rg, "r"); + while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but bear me... + kh_put(rg, g_rghash, strdup(buf), &ret); // we'd better check duplicates... 
+ fclose(fp_rg); + } // generate the fn_list if necessary if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref); @@ -147,7 +171,13 @@ int main_samview(int argc, char *argv[]) view_end: // close files, free and return - free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); + free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); + if (g_rghash) { + khint_t k; + for (k = 0; k < kh_end(g_rghash); ++k) + if (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k)); + kh_destroy(rg, g_rghash); + } samclose(in); samclose(out); return ret; @@ -167,6 +197,7 @@ static int usage(int is_long_help) fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -R FILE list of read groups to be outputted [null]\n"); fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); fprintf(stderr, " -q INT minimum mapping quality [0]\n"); diff --git a/setup.py b/setup.py index 098cb7f..925f016 100644 --- a/setup.py +++ b/setup.py @@ -6,34 +6,54 @@ pysam ''' -import os, sys, glob, shutil +import os, sys, glob, shutil, hashlib name = "pysam" -version = "0.2" + +# collect pysam version +sys.path.insert( 0, "pysam") +import version + +version = version.__version__ samtools_exclude = ( "bamtk.c", "razip.c", "bgzip.c" ) samtools_dest = os.path.abspath( "samtools" ) +tabix_exclude = ( "main.c", ) +tabix_dest = os.path.abspath( "tabix" ) # copy samtools source if len(sys.argv) >= 2 and sys.argv[1] == "import": if len(sys.argv) < 3: raise ValueError("missing PATH to samtools source directory") - samtools_src = os.path.abspath( sys.argv[2] ) - if not os.path.exists( samtools_src ): raise IOError( "samtools src dir `%s` does not exist." 
% samtools_src ) - - cfiles = glob.glob( os.path.join( samtools_src, "*.c" ) ) - hfiles = glob.glob( os.path.join( samtools_src, "*.h" ) ) - ncopied = 0 - for p in cfiles + hfiles: - f = os.path.basename(p) - if f in samtools_exclude: continue - if os.path.exists( os.path.join( samtools_dest, f )): continue - shutil.copy( p, samtools_dest ) - ncopied += 1 - print "installed latest source code from %s: %i files copied" % (samtools_src, ncopied) + if len(sys.argv) < 4: raise ValueError("missing PATH to tabix source directory") + + for destdir, srcdir, exclude in zip( + (samtools_dest, tabix_dest), + sys.argv[2:4], + (samtools_exclude, tabix_exclude)): + + srcdir = os.path.abspath( srcdir ) + if not os.path.exists( srcdir ): raise IOError( "samtools src dir `%s` does not exist." % srcdir ) + + cfiles = glob.glob( os.path.join( srcdir, "*.c" ) ) + hfiles = glob.glob( os.path.join( srcdir, "*.h" ) ) + ncopied = 0 + for new_file in cfiles + hfiles: + f = os.path.basename(new_file) + if f in exclude: continue + old_file = os.path.join( destdir, f ) + if os.path.exists( old_file ): + md5_old = hashlib.md5("".join(open(old_file,"r").readlines())).digest() + md5_new = hashlib.md5("".join(open(new_file,"r").readlines())).digest() + if md5_old == md5_new: continue + raise ValueError( "incompatible files for %s and %s" % (old_file, new_file )) + + shutil.copy( new_file, destdir ) + ncopied += 1 + print "installed latest source code from %s: %i files copied" % (srcdir, ncopied) sys.exit(0) from distutils.core import setup, Extension -from Pyrex.Distutils import build_ext +from Cython.Distutils import build_ext classifiers = """ Development Status :: 2 - Alpha @@ -48,14 +68,27 @@ Topic :: Scientific/Engineering Topic :: Scientific/Engineering :: Bioinformatics """ -pysam = Extension( - "pysam/csamtools", # name of extension +samtools = Extension( + "csamtools", # name of extension [ "pysam/csamtools.pyx" ] +\ [ "pysam/%s" % x for x in ( "pysam_util.c", )] +\ glob.glob( 
os.path.join( "samtools", "*.c" ) ), library_dirs=[], - include_dirs=[ "samtools", ], + include_dirs=[ "samtools", "pysam" ], + libraries=[ "z", ], + language="c", + define_macros = [('FILE_OFFSET_BITS','64'), + ('_USE_KNETFILE','')], + ) + +tabix = Extension( + "ctabix", # name of extension + [ "pysam/ctabix.pyx" ] +\ + [ "pysam/%s" % x for x in ()] +\ + glob.glob( os.path.join( "tabix", "*.c" ) ), + library_dirs=[], + include_dirs=[ "tabix", "pysam" ], libraries=[ "z", ], language="c", ) @@ -71,8 +104,11 @@ metadata = { 'platforms': "ALL", 'url': "http://code.google.com/p/pysam/", 'py_modules': [ - "pysam/__init__", "pysam/Pileup", "pysam/namedtuple" ], - 'ext_modules': [pysam,], + "pysam/__init__", + "pysam/Pileup", + "pysam/namedtuple", + "pysam/version" ], + 'ext_modules': [samtools, tabix], 'cmdclass' : {'build_ext': build_ext} } if __name__=='__main__': diff --git a/tabix/bam_endian.h b/tabix/bam_endian.h new file mode 100644 index 0000000..0fc74a8 --- /dev/null +++ b/tabix/bam_endian.h @@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static 
inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif diff --git a/tabix/bgzf.c b/tabix/bgzf.c new file mode 100644 index 0000000..7a936a8 --- /dev/null +++ b/tabix/bgzf.c @@ -0,0 +1,676 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. + 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. 
+ 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + +#if defined(_WIN32) || defined(_MSC_VER) +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif + +typedef int8_t bgzf_byte_t; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +static inline +int +bgzf_min(int x, int y) +{ + return (x < y) ? 
x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); + return fp; +} + +static +BGZF* +open_read(int fd) +{ +#ifdef _USE_KNETFILE + knetFile *file = knet_dopen(fd, "r"); +#else + FILE* file = fdopen(fd, "r"); +#endif + BGZF* fp; + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = fd; + fp->open_mode = 'r'; +#ifdef _USE_KNETFILE + fp->x.fpr = file; +#else + fp->file = file; +#endif + return fp; +} + +static +BGZF* +open_write(int fd, bool is_uncompressed) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp; + if (file == 0) return 0; + fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; +#ifdef _USE_KNETFILE + fp->x.fpw = file; +#else + fp->file = file; +#endif + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. 
*/ +#ifdef _USE_KNETFILE + knetFile *file = knet_open(path, mode); + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = -1; + fp->open_mode = 'r'; + fp->x.fpr = file; +#else + int fd, oflag = O_RDONLY; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag); + if (fd == -1) return 0; + fp = open_read(fd); +#endif + } else if (mode[0] == 'w' || mode[0] == 'W') { + int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag, 0666); + if (fd == -1) return 0; + fp = open_write(fd, strstr(mode, "u")? 1 : 0); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (fd == -1) return 0; + if (mode[0] == 'r' || mode[0] == 'R') { + return open_read(fd); + } else if (mode[0] == 'w' || mode[0] == 'W') { + return open_write(fd, strstr(mode, "u")? 1 : 0); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + bgzf_byte_t* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + int compress_level = fp->is_uncompressed? 
0 : Z_DEFAULT_COMPRESSION; + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* 
fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const bgzf_byte_t* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (bgzf_byte_t) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + 
cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + +int +bgzf_read_block(BGZF* fp) +{ + bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + int size = 0; +#ifdef _USE_KNETFILE + int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; + int count = knet_read(fp->x.fpr, header, sizeof(header)); +#else + int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; + int count = fread(header, 1, sizeof(header), fp->file); +#endif + if (count == 0) { + fp->block_length = 0; + return 0; + } + size = count; + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; +#ifdef _USE_KNETFILE + count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); +#else + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); +#endif + if (count != remaining) { + 
report_error(fp, "read failed"); + return -1; + } + size += count; + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + bgzf_byte_t* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = bgzf_min(length-bytes_read, available); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } +#ifdef _USE_KNETFILE + int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + 
return -1; + } + + if (fp->uncompressed_block == NULL) { + fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const bgzf_byte_t* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } + { // add an empty block + int count, block_length = deflate_block(fp, 0); +#ifdef _USE_KNETFILE + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + } +#ifdef _USE_KNETFILE + if (fflush(fp->x.fpw) != 0) { +#else + if (fflush(fp->file) != 0) { +#endif + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { +#ifdef _USE_KNETFILE + int ret; + if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); + else ret = knet_close(fp->x.fpr); + if (ret != 0) return -1; +#else + if (fclose(fp->file) != 0) { + return -1; + } +#endif + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + uint8_t buf[28]; + off_t offset; +#ifdef _USE_KNETFILE + offset = knet_tell(fp->x.fpr); + if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; + knet_read(fp->x.fpr, buf, 28); + knet_seek(fp->x.fpr, offset, 
SEEK_SET); +#else + offset = ftello(fp->file); + if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; + fread(buf, 1, 28, fp->file); + fseeko(fp->file, offset, SEEK_SET); +#endif + return (memcmp(magic, buf, 28) == 0)? 1 : 0; +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { +#else + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { +#endif + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} diff --git a/tabix/bgzf.h b/tabix/bgzf.h new file mode 100644 index 0000000..f544a67 --- /dev/null +++ b/tabix/bgzf.h @@ -0,0 +1,156 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include +#include +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. 
+ */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. 
+ */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +int bgzf_check_EOF(BGZF *fp); + +int bgzf_read_block(BGZF* fp); + +#ifdef __cplusplus +} +#endif + +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + +#endif diff --git a/tabix/bgzip.c b/tabix/bgzip.c new file mode 100644 index 0000000..d144632 --- /dev/null +++ b/tabix/bgzip.c @@ -0,0 +1,201 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static int bgzip_main_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); + fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); + fprintf(stderr, " -d decompress\n"); + fprintf(stderr, " -f overwrite files without asking\n"); + fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); + fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); + fprintf(stderr, " -h give this help\n"); + fprintf(stderr, "\n"); + return 1; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { + fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + fprintf(stderr, "[bgzip] not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { + fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +static void fail(BGZF* fp) +{ + fprintf(stderr, "Error: %s\n", fp->error); + exit(1); +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + BGZF *fp; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ + switch(c){ + case 'h': return bgzip_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if (end >= 0 && end < start) { + fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, 
end); + return 1; + } + if (compress == 1) { + struct stat sbuf; + int f_src = fileno(stdin); + int f_dst = fileno(stdout); + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if (pstdout) + f_dst = fileno(stdout); + else + { + char *name = malloc(strlen(argv[optind]) + 5); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(); + + fp = bgzf_fdopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) fail(fp); + // f_dst will be closed here + if (bgzf_close(fp) < 0) fail(fp); + if (argc > optind) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + struct stat sbuf; + int f_dst; + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + char *name; + int len = strlen(argv[optind]); + if ( strcmp(argv[optind]+len-3,".gz") ) + { + fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(); + else + { + f_dst = fileno(stdout); + fp = bgzf_fdopen(fileno(stdin), "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } + } + 
buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) fail(fp); + start += c; + write(f_dst, buffer, c); + if (end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(fp) < 0) fail(fp); + if (!pstdout) unlink(argv[optind]); + return 0; + } +} diff --git a/tabix/index.c b/tabix/index.c new file mode 100644 index 0000000..e5b227c --- /dev/null +++ b/tabix/index.c @@ -0,0 +1,954 @@ +#include +#include +#include +#include "khash.h" +#include "ksort.h" +#include "kstring.h" +#include "bam_endian.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif +#include "tabix.h" + +#define TAD_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. +#define TAD_LIDX_SHIFT 14 + +typedef struct { + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) +KSORT_INIT(off, pair64_t, pair64_lt) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} ti_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} ti_lidx_t; + +KHASH_MAP_INIT_INT(i, ti_binlist_t) +KHASH_MAP_INIT_STR(s, int) + +struct __ti_index_t { + ti_conf_t conf; + int32_t n, max; + khash_t(s) *tname; + khash_t(i) **index; + ti_lidx_t *index2; +}; + +struct __ti_iter_t { + int from_first; // read from the first record; no random access + int tid, beg, end, n_off, i, finished; + uint64_t curr_off; + kstring_t str; + const ti_index_t *idx; + pair64_t *off; +}; + +typedef struct { + int tid, beg, end, bin; +} ti_intv_t; + +ti_conf_t ti_conf_gff = { 0, 1, 4, 5, '#', 0 }; +ti_conf_t ti_conf_bed = { TI_FLAG_UCSC, 1, 2, 3, '#', 0 }; +ti_conf_t ti_conf_psltbl = { TI_FLAG_UCSC, 15, 17, 18, '#', 0 }; +ti_conf_t ti_conf_sam = { TI_PRESET_SAM, 3, 4, 0, '@', 0 }; +ti_conf_t ti_conf_vcf = { TI_PRESET_VCF, 1, 2, 0, '#', 0 }; + +/*************** + * read a line * + 
***************/ + +/* +int ti_readline(BGZF *fp, kstring_t *str) +{ + int c, l = 0; + str->l = 0; + while ((c = bgzf_getc(fp)) >= 0 && c != '\n') { + ++l; + if (c != '\r') kputc(c, str); + } + if (c < 0 && l == 0) return -1; // end of file + return str->l; +} +*/ + +/* Below is a faster implementation largely equivalent to the one + * commented out above. */ +int ti_readline(BGZF *fp, kstring_t *str) +{ + int l, state = 0; + unsigned char *buf = (unsigned char*)fp->uncompressed_block; + str->l = 0; + do { + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) { state = -2; break; } + if (fp->block_length == 0) { state = -1; break; } + } + for (l = fp->block_offset; l < fp->block_length && buf[l] != '\n'; ++l); + if (l < fp->block_length) state = 1; + l -= fp->block_offset; + if (str->l + l + 1 >= str->m) { + str->m = str->l + l + 2; + kroundup32(str->m); + str->s = (char*)realloc(str->s, str->m); + } + memcpy(str->s + str->l, buf + fp->block_offset, l); + str->l += l; + fp->block_offset += l + 1; + if (fp->block_offset >= fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + } while (state == 0); + if (str->l == 0 && state < 0) return state; + str->s[str->l] = 0; + return str->l; +} + +/************************************* + * get the interval from a data line * + *************************************/ + +static inline int ti_reg2bin(uint32_t beg, uint32_t end) +{ + --end; + if (beg>>14 == end>>14) return 4681 + (beg>>14); + if (beg>>17 == end>>17) return 585 + (beg>>17); + if (beg>>20 == end>>20) return 73 + (beg>>20); + if (beg>>23 == end>>23) return 9 + (beg>>23); + if (beg>>26 == end>>26) return 1 + (beg>>26); + return 0; +} + +static int get_tid(ti_index_t *idx, const char *ss) +{ + khint_t k; + int tid; + k = kh_get(s, idx->tname, ss); + if (k == kh_end(idx->tname)) { // a new target sequence + 
int ret, size; + // update idx->n, ->max, ->index and ->index2 + if (idx->n == idx->max) { + idx->max = idx->max? idx->max<<1 : 8; + idx->index = realloc(idx->index, idx->max * sizeof(void*)); + idx->index2 = realloc(idx->index2, idx->max * sizeof(ti_lidx_t)); + } + memset(&idx->index2[idx->n], 0, sizeof(ti_lidx_t)); + idx->index[idx->n++] = kh_init(i); + // update ->tname + tid = size = kh_size(idx->tname); + k = kh_put(s, idx->tname, strdup(ss), &ret); + kh_value(idx->tname, k) = size; + assert(idx->n == kh_size(idx->tname)); + } else tid = kh_value(idx->tname, k); + return tid; +} + +static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) +{ + int i, b = 0, id = 1; + char *s; + intv->tid = intv->beg = intv->end = intv->bin = -1; + for (i = 0; i <= str->l; ++i) { + if (str->s[i] == '\t' || str->s[i] == 0) { + if (id == idx->conf.sc) { + str->s[i] = 0; + intv->tid = get_tid(idx, str->s + b); + if (i != str->l) str->s[i] = '\t'; + } else if (id == idx->conf.bc) { + // here ->beg is 0-based. 
+ intv->beg = intv->end = strtol(str->s + b, &s, 0); + if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg; + else ++intv->end; + if (intv->beg < 0) intv->beg = 0; + if (intv->end < 1) intv->end = 1; + } else { + if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) { + if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0); + } else if ((idx->conf.preset&0xffff) == TI_PRESET_SAM) { + if (id == 6) { // CIGAR + int l = 0, op; + char *t; + for (s = str->s + b; s < str->s + i;) { + long x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M' || op == 'D' || op == 'N') l += x; + s = t + 1; + } + if (l == 0) l = 1; + intv->end = intv->beg + l; + } + } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) { + // FIXME: the following is NOT tested and is likely to be buggy + if (id == 5) { // ALT + char *t; + int max = 1; + for (s = str->s + b; s < str->s + i;) { + if (s[i] == 'D') { + long x = strtol(s + 1, &t, 10); + if (x > max) max = x; + s = t + 1; + } else ++s; + } + intv->end = intv->beg + max; + } + } + } + b = i + 1; + ++id; + } + } + if (intv->tid < 0 || intv->beg < 0 || intv->end < 0) return -1; + intv->bin = ti_reg2bin(intv->beg, intv->end); + return 0; +} + +/************ + * indexing * + ************/ + +// requirement: len <= LEN_MASK +static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + ti_binlist_t *l; + int ret; + k = kh_put(i, h, bin, &ret); + l = &kh_value(h, k); + if (ret) { // not present + l->m = 1; l->n = 0; + l->list = (pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; l->list[l->n++].v = end; +} + +static inline void insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_t offset) +{ + int i, beg, end; + beg = _beg >> TAD_LIDX_SHIFT; + end = (_end - 1) >> TAD_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = 
(uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } + if (index2->n < end + 1) index2->n = end + 1; +} + +static void merge_chunks(ti_index_t *idx) +{ + khash_t(i) *index; + int i, l, m; + khint_t k; + for (i = 0; i < idx->n; ++i) { + index = idx->index[i]; + for (k = kh_begin(index); k != kh_end(index); ++k) { + ti_binlist_t *p; + if (!kh_exist(index, k)) continue; + p = &kh_value(index, k); + m = 0; + for (l = 1; l < p->n; ++l) { + if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; + else p->list[++m] = p->list[l]; + } // ~for(l) + p->n = m + 1; + } // ~for(k) + } // ~for(i) +} + +static void fill_missing(ti_index_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + ti_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + +ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) +{ + int ret; + ti_index_t *idx; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; + uint64_t save_off, last_off, lineno = 0; + kstring_t *str; + + str = calloc(1, sizeof(kstring_t)); + + idx = (ti_index_t*)calloc(1, sizeof(ti_index_t)); + idx->conf = *conf; + idx->n = idx->max = 0; + idx->tname = kh_init(s); + idx->index = 0; + idx->index2 = 0; + + save_bin = save_tid = last_tid = last_bin = 0xffffffffu; + save_off = last_off = bgzf_tell(fp); last_coor = 0xffffffffu; + while ((ret = ti_readline(fp, str)) >= 0) { + ti_intv_t intv; + ++lineno; + if (lineno <= idx->conf.line_skip || str->s[0] == idx->conf.meta_char) { + last_off = bgzf_tell(fp); + continue; + } + get_intv(idx, str, &intv); + if (last_tid != intv.tid) { // change of chromosomes + last_tid = intv.tid; + last_bin = 0xffffffffu; + } else if 
(last_coor > intv.beg) { + fprintf(stderr, "[ti_index_core] the file out of order at line %llu\n", (unsigned long long)lineno); + exit(1); + } + insert_offset2(&idx->index2[intv.tid], intv.beg, intv.end, last_off); + if (intv.bin != last_bin) { // then possibly write the binning index + if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + save_off = last_off; + save_bin = last_bin = intv.bin; + save_tid = intv.tid; + if (save_tid < 0) break; + } + if (bgzf_tell(fp) <= last_off) { + fprintf(stderr, "[ti_index_core] bug in BGZF: %llx < %llx\n", + (unsigned long long)bgzf_tell(fp), (unsigned long long)last_off); + exit(1); + } + last_off = bgzf_tell(fp); + last_coor = intv.beg; + } + if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp)); + merge_chunks(idx); + fill_missing(idx); + + free(str->s); free(str); + return idx; +} + +void ti_index_destroy(ti_index_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + // destroy the name hash table + for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) { + if (kh_exist(idx->tname, k)) + free((char*)kh_key(idx->tname, k)); + } + kh_destroy(s, idx->tname); + // destroy the binning index + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + ti_lidx_t *index2 = idx->index2 + i; + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) + free(kh_value(index, k).list); + } + kh_destroy(i, index); + free(index2->offset); + } + free(idx->index); + // destroy the linear index + free(idx->index2); + free(idx); +} + +/****************** + * index file I/O * + ******************/ + +void ti_index_save(const ti_index_t *idx, BGZF *fp) +{ + int32_t i, size, ti_is_be; + khint_t k; + ti_is_be = bam_is_big_endian(); + bgzf_write(fp, "TBI\1", 4); + if (ti_is_be) { + uint32_t x = idx->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else 
bgzf_write(fp, &idx->n, 4); + assert(sizeof(ti_conf_t) == 24); + if (ti_is_be) { // write ti_conf_t; + uint32_t x[6]; + memcpy(x, &idx->conf, 24); + for (i = 0; i < 6; ++i) bgzf_write(fp, bam_swap_endian_4p(&x[i]), 4); + } else bgzf_write(fp, &idx->conf, sizeof(ti_conf_t)); + { // write target names + char **name; + int32_t l = 0; + name = calloc(kh_size(idx->tname), sizeof(void*)); + for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) + if (kh_exist(idx->tname, k)) + name[kh_value(idx->tname, k)] = (char*)kh_key(idx->tname, k); + for (i = 0; i < kh_size(idx->tname); ++i) + l += strlen(name[i]) + 1; + if (ti_is_be) bgzf_write(fp, bam_swap_endian_4p(&l), 4); + else bgzf_write(fp, &l, 4); + for (i = 0; i < kh_size(idx->tname); ++i) + bgzf_write(fp, name[i], strlen(name[i]) + 1); + free(name); + } + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + ti_lidx_t *index2 = idx->index2 + i; + // write binning index + size = kh_size(index); + if (ti_is_be) { // big endian + uint32_t x = size; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &size, 4); + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) { + ti_binlist_t *p = &kh_value(index, k); + if (ti_is_be) { // big endian + uint32_t x; + x = kh_key(index, k); bgzf_write(fp, bam_swap_endian_4p(&x), 4); + x = p->n; bgzf_write(fp, bam_swap_endian_4p(&x), 4); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + bgzf_write(fp, p->list, 16 * p->n); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } else { + bgzf_write(fp, &kh_key(index, k), 4); + bgzf_write(fp, &p->n, 4); + bgzf_write(fp, p->list, 16 * p->n); + } + } + } + // write linear index (index2) + if (ti_is_be) { + int x = index2->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &index2->n, 4); + if (ti_is_be) { // big endian + int x; + for 
(x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + bgzf_write(fp, index2->offset, 8 * index2->n); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else bgzf_write(fp, index2->offset, 8 * index2->n); + } +} + +static ti_index_t *ti_index_load_core(BGZF *fp) +{ + int i, ti_is_be; + char magic[4]; + ti_index_t *idx; + ti_is_be = bam_is_big_endian(); + if (fp == 0) { + fprintf(stderr, "[ti_index_load_core] fail to load index.\n"); + return 0; + } + bgzf_read(fp, magic, 4); + if (strncmp(magic, "TBI\1", 4)) { + fprintf(stderr, "[ti_index_load] wrong magic number.\n"); + return 0; + } + idx = (ti_index_t*)calloc(1, sizeof(ti_index_t)); + bgzf_read(fp, &idx->n, 4); + if (ti_is_be) bam_swap_endian_4p(&idx->n); + idx->tname = kh_init(s); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + idx->index2 = (ti_lidx_t*)calloc(idx->n, sizeof(ti_lidx_t)); + // read idx->conf + bgzf_read(fp, &idx->conf, sizeof(ti_conf_t)); + if (ti_is_be) { + bam_swap_endian_4p(&idx->conf.preset); + bam_swap_endian_4p(&idx->conf.sc); + bam_swap_endian_4p(&idx->conf.bc); + bam_swap_endian_4p(&idx->conf.ec); + bam_swap_endian_4p(&idx->conf.meta_char); + bam_swap_endian_4p(&idx->conf.line_skip); + } + { // read target names + int j, ret; + kstring_t *str; + int32_t l; + uint8_t *buf; + bgzf_read(fp, &l, 4); + if (ti_is_be) bam_swap_endian_4p(&l); + buf = calloc(l, 1); + bgzf_read(fp, buf, l); + str = calloc(1, sizeof(kstring_t)); + for (i = j = 0; i < l; ++i) { + if (buf[i] == 0) { + khint_t k = kh_put(s, idx->tname, strdup(str->s), &ret); + kh_value(idx->tname, k) = j++; + str->l = 0; + } else kputc(buf[i], str); + } + free(str->s); free(str); free(buf); + } + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index; + ti_lidx_t *index2 = idx->index2 + i; + uint32_t key, size; + khint_t k; + int j, ret; + ti_binlist_t *p; + index = idx->index[i] = kh_init(i); + // load binning index + bgzf_read(fp, &size, 4); + if (ti_is_be) 
bam_swap_endian_4p(&size); + for (j = 0; j < (int)size; ++j) { + bgzf_read(fp, &key, 4); + if (ti_is_be) bam_swap_endian_4p(&key); + k = kh_put(i, index, key, &ret); + p = &kh_value(index, k); + bgzf_read(fp, &p->n, 4); + if (ti_is_be) bam_swap_endian_4p(&p->n); + p->m = p->n; + p->list = (pair64_t*)malloc(p->m * 16); + bgzf_read(fp, p->list, 16 * p->n); + if (ti_is_be) { + int x; + for (x = 0; x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } + } + // load linear index + bgzf_read(fp, &index2->n, 4); + if (ti_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + bgzf_read(fp, index2->offset, index2->n * 8); + if (ti_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +ti_index_t *ti_index_load_local(const char *fnidx) +{ + BGZF *fp; + fp = bgzf_open(fnidx, "r"); + if (fp) { + ti_index_t *idx = ti_index_load_core(fp); + bgzf_close(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) 
+{ + return; +} +#endif + +static char *get_local_version(const char *fn) +{ + struct stat sbuf; + char *fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".tbi"); + if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { + char *p, *url; + int l = strlen(fnidx); + for (p = fnidx + l - 1; p >= fnidx; --p) + if (*p == '/') break; + url = fnidx; fnidx = strdup(p + 1); + if (stat(fnidx, &sbuf) == 0) { + free(url); + return fnidx; + } + fprintf(stderr, "[%s] downloading the index file...\n", __func__); + download_from_remote(url); + free(url); + } + if (stat(fnidx, &sbuf) == 0) return fnidx; + free(fnidx); return 0; +} + +const char **ti_seqname(const ti_index_t *idx, int *n) +{ + const char **names; + khint_t k; + *n = idx->n; + names = calloc(idx->n, sizeof(void*)); + for (k = kh_begin(idx->tname); k < kh_end(idx->tname); ++k) + if (kh_exist(idx->tname, k)) + names[kh_val(idx->tname, k)] = kh_key(idx->tname, k); + return names; +} + +ti_index_t *ti_index_load(const char *fn) +{ + ti_index_t *idx; + char *fname = get_local_version(fn); + if (fname == 0) return 0; + idx = ti_index_load_local(fname); + free(fname); + if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n"); + return idx; +} + +int ti_index_build2(const char *fn, const ti_conf_t *conf, const char *_fnidx) +{ + char *fnidx; + BGZF *fp, *fpidx; + ti_index_t *idx; + if ((fp = bgzf_open(fn, "r")) == 0) { + fprintf(stderr, "[ti_index_build2] fail to open the BAM file.\n"); + return -1; + } + idx = ti_index_core(fp, conf); + bgzf_close(fp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".tbi"); + } else fnidx = strdup(_fnidx); + fpidx = bgzf_open(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[ti_index_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + ti_index_save(idx, fpidx); + ti_index_destroy(idx); + bgzf_close(fpidx); + free(fnidx); + return 0; +} + +int 
ti_index_build(const char *fn, const ti_conf_t *conf) +{ + return ti_index_build2(fn, conf, 0); +} + +/******************************************** + * parse a region in the format chr:beg-end * + ********************************************/ + +int ti_get_tid(const ti_index_t *idx, const char *name) +{ + khiter_t iter; + const khash_t(s) *h = idx->tname; + iter = kh_get(s, h, name); /* get the tid */ + if (iter == kh_end(h)) return -1; + return kh_value(h, iter); +} + +int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + if ((*tid = ti_get_tid(idx, s)) < 0) { + free(s); + return -1; + } + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) return -1; + return 0; +} + +/******************************* + * retrieve a specified region * + *******************************/ + +#define MAX_BIN 37450 // =(8^6-1)/7+1 + +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +{ + int i = 0, k; + if (beg >= end) return 0; + if (end >= 1u<<29) end = 1u<<29; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; +} + +ti_iter_t ti_iter_first() +{ + ti_iter_t iter; + iter = 
calloc(1, sizeof(struct __ti_iter_t)); + iter->from_first = 1; + return iter; +} + +ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end) +{ + uint16_t *bins; + int i, n_bins, n_off; + pair64_t *off; + khint_t k; + khash_t(i) *index; + uint64_t min_off; + ti_iter_t iter = 0; + + if (beg < 0) beg = 0; + if (end < beg) return 0; + // initialize the iterator + iter = calloc(1, sizeof(struct __ti_iter_t)); + iter->idx = idx; iter->tid = tid; iter->beg = beg; iter->end = end; iter->i = -1; + // random access + bins = (uint16_t*)calloc(MAX_BIN, 2); + n_bins = reg2bins(beg, end, bins); + index = idx->index[tid]; + if (idx->index2[tid].n > 0) { + min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT]; + if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4 + int n = beg>>TAD_LIDX_SHIFT; + if (n > idx->index2[tid].n) n = idx->index2[tid].n; + for (i = n - 1; i >= 0; --i) + if (idx->index2[tid].offset[i] != 0) break; + if (i >= 0) min_off = idx->index2[tid].offset[i]; + } + } else min_off = 0; // tabix 0.1.2 may produce such index files + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) + n_off += kh_value(index, k).n; + } + if (n_off == 0) { + free(bins); return iter; + } + off = (pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { + int j; + ti_binlist_t *p = &kh_value(index, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + free(bins); + { + int l; + ks_introsort(off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if 
(off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + { // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; + else off[++l] = off[i]; + } + n_off = l + 1; + } + } + iter->n_off = n_off; iter->off = off; + return iter; +} + +const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len) +{ + if (iter->finished) return 0; + if (iter->from_first) { + int ret; + if ((ret = ti_readline(fp, &iter->str)) < 0) { + iter->finished = 1; + return 0; + } else { + if (len) *len = iter->str.l; + return iter->str.s; + } + } + if (iter->n_off == 0) return 0; + while (1) { + int ret; + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk + if (iter->i == iter->n_off - 1) break; // no more chunks + if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug + if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek + bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET); + iter->curr_off = bgzf_tell(fp); + } + ++iter->i; + } + if ((ret = ti_readline(fp, &iter->str)) >= 0) { + ti_intv_t intv; + iter->curr_off = bgzf_tell(fp); + if (iter->str.s[0] == iter->idx->conf.meta_char) continue; + get_intv((ti_index_t*)iter->idx, &iter->str, &intv); + if (intv.tid != iter->tid || intv.beg >= iter->end) break; // no need to proceed + else if (intv.end > iter->beg && iter->end > intv.beg) { + if (len) *len = iter->str.l; + return iter->str.s; + } + } else break; // end of file + } + iter->finished = 1; + return 0; +} + +void ti_iter_destroy(ti_iter_t iter) +{ + if (iter) { + free(iter->str.s); free(iter->off); + free(iter); + } +} + +int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func) +{ + ti_iter_t iter; + const char *s; + int len; + iter = ti_iter_query(idx, tid, beg, end); + while ((s = ti_iter_read(fp, iter, &len)) != 0) + func(len, s, data); + ti_iter_destroy(iter); + 
return 0; +} + +/******************* + * High-level APIs * + *******************/ + +tabix_t *ti_open(const char *fn, const char *fnidx) +{ + tabix_t *t; + BGZF *fp; + if ((fp = bgzf_open(fn, "r")) == 0) return 0; + t = calloc(1, sizeof(tabix_t)); + t->fn = strdup(fn); + if (fnidx) t->fnidx = strdup(fnidx); + t->fp = fp; + return t; +} + +void ti_close(tabix_t *t) +{ + if (t) { + bgzf_close(t->fp); + if (t->idx) ti_index_destroy(t->idx); + free(t->fn); free(t->fnidx); + free(t); + } +} + +int ti_lazy_index_load(tabix_t *t) +{ + if (t->idx == 0) { // load index + if (t->fnidx) t->idx = ti_index_load_local(t->fnidx); + else t->idx = ti_index_load(t->fn); + if (t->idx == 0) return -1; // fail to load index + } + return 0; +} + +ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end) +{ + if (tid < 0) return ti_iter_first(); + if (ti_lazy_index_load(t) != 0) return 0; + return ti_iter_query(t->idx, tid, beg, end); +} + +ti_iter_t ti_querys(tabix_t *t, const char *reg) +{ + int tid, beg, end; + if (reg == 0) return ti_iter_first(); + if (ti_lazy_index_load(t) != 0) return 0; + if (ti_parse_region(t->idx, reg, &tid, &beg, &end) < 0) return 0; + return ti_iter_query(t->idx, tid, beg, end); +} + +ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end) +{ + int tid; + if (name == 0) return ti_iter_first(); + // then need to load the index + if (ti_lazy_index_load(t) != 0) return 0; + if ((tid = ti_get_tid(t->idx, name)) < 0) return 0; + return ti_iter_query(t->idx, tid, beg, end); +} + +const char *ti_read(tabix_t *t, ti_iter_t iter, int *len) +{ + return ti_iter_read(t->fp, iter, len); +} diff --git a/tabix/khash.h b/tabix/khash.h new file mode 100644 index 0000000..1d583ef --- /dev/null +++ b/tabix/khash.h @@ -0,0 +1,486 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. 
+ + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.2" + +#include +#include +#include + +typedef uint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + uint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static 
inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i = k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + uint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = 
tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH 
FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [uint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (uint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [uint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. 
+ @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*!
@function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More convenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*!
@function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/tabix/knetfile.c b/tabix/knetfile.c new file mode 100644 index 0000000..7c96a3e --- /dev/null +++ b/tabix/knetfile.c @@ -0,0 +1,632 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* Probably I will not do socket programming in the next few years and + therefore I decide to heavily annotate this file, for Linux and + Windows as well. -lh3 */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#endif + +#include "knetfile.h" + +/* In winsock.h, the type of a socket is SOCKET, which is: "typedef + * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed + * integer -1. In knetfile.c, I use "int" for socket type + * throughout. This should be improved to avoid confusion. + * + * In Linux/Mac, recv() and read() do almost the same thing. You can see + * in the header file that netread() is simply an alias of read(). In + * Windows, however, they are different and using recv() is mandatory. + */ + +/* This function tests if the file handler is ready for reading (or + * writing if is_read==0). 
*/ +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); +#ifndef _WIN32 + if (ret == -1) perror("select"); +#else + if (ret == 0) + fprintf(stderr, "select time-out\n"); + else if (ret == SOCKET_ERROR) + fprintf(stderr, "select: %d\n", WSAGetLastError()); +#endif + return ret; +} + +#ifndef _WIN32 +/* This function does not work with Windows due to the lack of + * getaddrinfo() in winsock. It is addapted from an example in "Beej's + * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + /* In Unix/Mac, getaddrinfo() is the most convenient way to get + * server information. */ + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + /* The following two setsockopt() are used by ftplib + * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they + * necessary. 
*/ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +} +#else +/* MinGW's printf has problem with "%lld" */ +char *int64tostr(char *buf, int64_t x) +{ + int cnt; + int i = 0; + do { + buf[i++] = '0' + x % 10; + x /= 10; + } while (x); + buf[i] = 0; + for (cnt = i, i = 0; i < cnt/2; ++i) { + int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; + } + return buf; +} + +int64_t strtoint64(const char *buf) +{ + int64_t x; + for (x = 0; *buf != '\0'; ++buf) + x = x * 10 + ((int64_t) *buf - 48); + return x; +} +/* In windows, the first thing is to establish the TCP connection. */ +int knet_win32_init() +{ + WSADATA wsaData; + return WSAStartup(MAKEWORD(2, 2), &wsaData); +} +void knet_win32_destroy() +{ + WSACleanup(); +} +/* A slightly modfied version of the following function also works on + * Mac (and presummably Linux). However, this function is not stable on + * my Mac. It sometimes works fine but sometimes does not. Therefore for + * non-Windows OS, I do not use this one. 
*/ +static SOCKET socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) \ + do { \ + fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ + return -1; \ + } while (0) + + int on = 1; + SOCKET fd; + struct linger lng = { 0, 0 }; + struct sockaddr_in server; + struct hostent *hp = 0; + // open socket + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + // get host info + if (isalpha(host[0])) hp = gethostbyname(host); + else { + struct in_addr addr; + addr.s_addr = inet_addr(host); + hp = gethostbyaddr((char*)&addr, 4, AF_INET); + } + if (hp == 0) __err_connect("gethost"); + // connect + server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); + server.sin_family= AF_INET; + server.sin_port = htons(atoi(port)); + if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); + // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) + return fd; +} +#endif + +static off_t my_netread(int fd, void *buf, off_t len) +{ + off_t rest = len, curr, l = 0; + /* recv() and read() may not read the required length of data with + * one call. They have to be called repeatedly. */ + while (rest) { + if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading + curr = netread(fd, buf + l, rest); + /* According to the glibc manual, section 13.2, a zero returned + * value indicates end-of-file (EOF), which should mean that + * read() will not return zero if EOF has not been met but data + * are not immediately available. 
*/ + if (curr == 0) break; + l += curr; rest -= curr; + } + return l; +} + +/************************* + * FTP specific routines * + *************************/ + +static int kftp_get_response(knetFile *ftp) +{ +#ifndef _WIN32 + unsigned char c; +#else + char c; +#endif + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if (n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); + return is_get? 
kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + + +static int kftp_pasv_connect(knetFile *ftp) +{ + char host[80], port[10]; + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + ftp->fd = socket_connect(host, port); + if (ftp->fd == -1) return -1; + return 0; +} + +int kftp_connect(knetFile *ftp) +{ + ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); + if (ftp->ctrl_fd == -1) return -1; + kftp_get_response(ftp); + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd != -1) { + netclose(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + netclose(ftp->fd); + ftp->fd = -1; + return kftp_connect(ftp); +} + +// initialize ->type, ->host, ->retr and ->size +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + /* the Linux/Mac version of socket_connect() also recognizes a port + * like "ftp", but the Windows version does not. 
*/ + fp->port = strdup("21"); + fp->host = calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->size_cmd = calloc(strlen(p) + 8, 1); + sprintf(fp->size_cmd, "SIZE %s\r\n", p); + fp->seek_offset = 0; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int ret; + long long file_size; + if (fp->fd != -1) { + netclose(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + kftp_send_cmd(fp, fp->size_cmd, 1); +#ifndef _WIN32 + if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) + { + fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); + return -1; + } +#else + const char *p = fp->response; + while (*p != ' ') ++p; + while (*p < '0' || *p > '9') ++p; + file_size = strtoint64(p); +#endif + fp->file_size = file_size; + if (fp->offset>=0) { + char tmp[32]; +#ifndef _WIN32 + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); +#else + strcpy(tmp, "REST "); + int64tostr(tmp + 5, fp->offset); + strcat(tmp, "\r\n"); +#endif + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + + +/************************** + * HTTP specific routines * + **************************/ + +knetFile *khttp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p, *proxy, *q; + int l; + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + fp = calloc(1, sizeof(knetFile)); + fp->http_host = calloc(l + 1, 1); + strncpy(fp->http_host, fn + 7, l); + fp->http_host[l] = 0; + for (q = fp->http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = 
getenv("http_proxy"); + // set ->host, ->port and ->path + if (proxy == 0) { + fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. + fp->port = strdup(*q? q : "80"); + fp->path = strdup(*p? p : "/"); + } else { + fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = fp->host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + fp->port = strdup(*q? q : "80"); + fp->path = strdup(fn); + } + fp->type = KNF_TYPE_HTTP; + fp->ctrl_fd = fp->fd = -1; + fp->seek_offset = 0; + return fp; +} + +int khttp_connect_file(knetFile *fp) +{ + int ret, l = 0; + char *buf, *p; + if (fp->fd != -1) netclose(fp->fd); + fp->fd = socket_connect(fp->host, fp->port); + buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. + l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); + l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); + l += sprintf(buf + l, "\r\n"); + netwrite(fp->fd, buf, l); + l = 0; + while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + netclose(fp->fd); + fp->fd = -1; + return -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file + off_t rest = fp->offset; + while (rest) { + off_t l = rest < 0x10000? 
rest : 0x10000; + rest -= my_netread(fp->fd, buf, l); + } + } else if (ret != 206 && ret != 200) { + free(buf); + fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + free(buf); + fp->is_ready = 1; + return 0; +} + +/******************** + * Generic routines * + ********************/ + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + } else if (strstr(fn, "http://") == fn) { + fp = khttp_parse_url(fn, mode); + if (fp == 0) return 0; + khttp_connect_file(fp); + } else { // local file +#ifdef _WIN32 + /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may + * be undefined on some systems, although it is defined on my + * Mac and the Linux I have tested on. 
*/ + int fd = open(fn, O_RDONLY | O_BINARY); +#else + int fd = open(fn, O_RDONLY); +#endif + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + fp->ctrl_fd = -1; + } + if (fp && fp->fd == -1) { + knet_close(fp); + return 0; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +off_t knet_read(knetFile *fp, void *buf, off_t len) +{ + off_t l = 0; + if (fp->fd == -1) return 0; + if (fp->type == KNF_TYPE_FTP) { + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + } + } else if (fp->type == KNF_TYPE_HTTP) { + if (fp->is_ready == 0) + khttp_connect_file(fp); + } + if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX + off_t rest = len, curr; + while (rest) { + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; + l += curr; rest -= curr; + } + } else l = my_netread(fp->fd, buf, len); + fp->offset += l; + return l; +} + +off_t knet_seek(knetFile *fp, int64_t off, int whence) +{ + if (whence == SEEK_SET && off == fp->offset) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + /* Be aware that lseek() returns the offset after seeking, + * while fseek() returns zero on success. 
*/ + off_t offset = lseek(fp->fd, off, whence); + if (offset == -1) { + // Be silent, it is OK for knet_seek to fail when the file is streamed + // fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; + } + fp->offset = offset; + return 0; + } + else if (fp->type == KNF_TYPE_FTP) + { + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + else if ( whence==SEEK_END) + fp->offset = fp->file_size+off; + fp->is_ready = 0; + return 0; + } + else if (fp->type == KNF_TYPE_HTTP) + { + if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? + fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); + errno = ESPIPE; + return -1; + } + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + fp->is_ready = 0; + return fp->offset; + } + errno = EINVAL; + fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific + if (fp->fd != -1) { + /* On Linux/Mac, netclose() is an alias of close(), but on + * Windows, it is an alias of closesocket(). 
*/ + if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); + else netclose(fp->fd); + } + free(fp->host); free(fp->port); + free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific + free(fp->path); free(fp->http_host); // HTTP specific + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char *buf; + knetFile *fp; + int type = 4, l; +#ifdef _WIN32 + knet_win32_init(); +#endif + buf = calloc(0x100000, 1); + if (type == 0) { + fp = knet_open("knetfile.c", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 1) { // NCBI FTP, large file + fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); + knet_seek(fp, 2500000000ll, SEEK_SET); + l = knet_read(fp, buf, 255); + } else if (type == 2) { + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 3) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 4) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); + knet_read(fp, buf, 10000); + knet_seek(fp, 20000, SEEK_SET); + knet_seek(fp, 10000, SEEK_SET); + l = knet_read(fp, buf+10000, 10000000) + 10000; + } + if (type != 4 && type != 1) { + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + } else write(fileno(stdout), buf, l); + knet_close(fp); + free(buf); + return 0; +} +#endif diff --git a/tabix/knetfile.h b/tabix/knetfile.h new file mode 100644 index 0000000..0a0e66f --- /dev/null +++ b/tabix/knetfile.h @@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include +#include + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) 
closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + off_t knet_seek(knetFile *fp, int64_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tabix/ksort.h b/tabix/ksort.h new file mode 100644 index 0000000..16a03fd --- /dev/null +++ b/tabix/ksort.h @@ -0,0 +1,271 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. 
+ + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 
+ return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) 
ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/tabix/kstring.c b/tabix/kstring.c new file mode 100644 index 0000000..e0203fa --- /dev/null +++ b/tabix/kstring.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? 
max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +int *ksBM_prep(const uint8_t *pat, int m) // builds the shift tables for a pattern of length m; the returned array is heap-allocated and freed by the caller +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = calloc(m + 256, sizeof(int)); // m bmGs entries followed by 256 bmBc entries, all ints (was sized in bytes, overflowing the heap) + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; + } + suff = calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +{ + int i, j, *prep, *bmGs, *bmBc; + int *matches = 0, mm = 0, nm = 0; + prep = _prep?
_prep : ksBM_prep(pat, m); + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i < 0) { + if (nm == mm) { + mm = mm? mm<<1 : 1; + matches = realloc(matches, mm * sizeof(int)); + } + matches[nm++] = j; + j += bmGs[0]; + } else { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } + } + *n_matches = nm; + if (_prep == 0) free(prep); + return matches; +} + +#ifdef KSTRING_MAIN +#include <stdio.h> +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(fields); free(s->s); free(s); // also release the offsets array and the string buffer, not only the struct + + { + static char *str = "abcdefgcdg"; + static char *pat = "cd"; + int n, *matches; + matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); + printf("%d: \n", n); + for (i = 0; i < n; ++i) + printf("- %d\n", matches[i]); + free(matches); + } + return 0; +} +#endif diff --git a/tabix/kstring.h b/tabix/kstring.h new file mode 100644 index 0000000..f4e5a99 --- /dev/null +++ b/tabix/kstring.h @@ -0,0 +1,68 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include <stdlib.h> +#include <string.h> +#include <stdint.h> + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +// calculate the auxiliary array, allocated by calloc() +int *ksBM_prep(const uint8_t *pat, int m); + +/* Search pat in str and return the list of matches. The size of the + * list is returned as n_matches. _prep is the array returned by + * ksBM_prep().
If it is a NULL pointer, ksBM_prep() will be called. */ +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/tabix/tabix.h b/tabix/tabix.h new file mode 100644 index 0000000..4390c09 --- /dev/null +++ b/tabix/tabix.h @@ -0,0 +1,137 @@ +/* The MIT License + + Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef __TABIDX_H +#define __TABIDX_H + +#include <stdint.h> +#include "kstring.h" +#include "bgzf.h" + +#define TI_PRESET_GENERIC 0 +#define TI_PRESET_SAM 1 +#define TI_PRESET_VCF 2 + +#define TI_FLAG_UCSC 0x10000 + +typedef int (*ti_fetch_f)(int l, const char *s, void *data); + +struct __ti_index_t; +typedef struct __ti_index_t ti_index_t; + +struct __ti_iter_t; +typedef struct __ti_iter_t *ti_iter_t; + +typedef struct { + BGZF *fp; + ti_index_t *idx; + char *fn, *fnidx; +} tabix_t; + +typedef struct { + int32_t preset; + int32_t sc, bc, ec; // seq col., beg col. and end col. + int32_t meta_char, line_skip; +} ti_conf_t; + +extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset + +#ifdef __cplusplus +extern "C" { +#endif + + /******************* + * High-level APIs * + *******************/ + + tabix_t *ti_open(const char *fn, const char *fnidx); + int ti_lazy_index_load(tabix_t *t); + void ti_close(tabix_t *t); + ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end); + ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end); + ti_iter_t ti_querys(tabix_t *t, const char *reg); + const char *ti_read(tabix_t *t, ti_iter_t iter, int *len); + + /* Destroy the iterator */ + void ti_iter_destroy(ti_iter_t iter); + + /* Get the list of sequence names. Each "char*" pointer points to an + * internal member of the index, so DO NOT modify the returned + * pointer; otherwise the index will be corrupted. The returned + * pointer should be freed by a single free() call by the routine + * calling this function. The number of sequences is returned at *n.
*/ + const char **ti_seqname(const ti_index_t *idx, int *n); + + /****************** + * Low-level APIs * + ******************/ + + /* Build the index for file <fn>. File <fn>.tbi will be generated + * and overwrite the file of the same name. Return -1 on failure. */ + int ti_index_build(const char *fn, const ti_conf_t *conf); + + /* Load the index from file <fn>.tbi. If <fn> is a URL and the index + * file is not in the working directory, <fn>.tbi will be + * downloaded. Return NULL on failure. */ + ti_index_t *ti_index_load(const char *fn); + + ti_index_t *ti_index_load_local(const char *fnidx); + + /* Destroy the index */ + void ti_index_destroy(ti_index_t *idx); + + /* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. */ + int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end); + + int ti_get_tid(const ti_index_t *idx, const char *name); + + /* Get the iterator pointing to the first record at the current file + * position. If the file is just opened, the iterator points to the + * first record in the file. */ + ti_iter_t ti_iter_first(void); + + /* Get the iterator pointing to the first record in region tid:beg-end */ + ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end); + + /* Get the data line pointed to by the iterator and iterate to the next record. */ + const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len); + + /******************* + * Deprecated APIs * + *******************/ + + /* The callback version for random access */ + int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func); + + /* Read one line.
*/ + int ti_readline(BGZF *fp, kstring_t *str); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tests/ex3.sam b/tests/ex3.sam index bae2a22..495d4fe 100644 --- a/tests/ex3.sam +++ b/tests/ex3.sam @@ -10,4 +10,4 @@ read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R read_28701_28881_323c 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< - +test_clipped1 99 chr2 997 20 4S6M1D20M5S = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U diff --git a/tests/example.gtf.gz b/tests/example.gtf.gz new file mode 100644 index 0000000000000000000000000000000000000000..693db0c6372a12d4299eb20a49cfb54ebba94882 GIT binary patch literal 3778 zcmV;z4n6T7iwFb&00000{{{d;LjnMM3+f&>0tNYFDc2fB#RU&N}fbM1uCNn!_<#oo)7Yny#4wJi??T=-hTY$ z!{YPR^je?EG8yjiVpAMd|EZ5mt^ z4Y-huYTKar*{A<~UmhKQe|&ga-mk82AMWl~UpC)cKixe2SRWn#=j{B?v&|P*H{a|J zTwVa2Fn$5-4K#n3r^8x{cDJ=}?RI`biTeeAE&pErb;N!?`g-&C&Fbdb{a22T&#$iD zfBsm$Z~Mnc+w@I2hUoW#zR)`*oMyWtgq{eUE4iRTBJ_!&*Q3=6y5<-? 
z#7C-f_*`KGM-iwRTX_ehtPAr;}(0)$R&kH#MhtJ?(NTyas~WBdEELyxK%% zfeQ>R!VwhKet6AQU-iYB3KSm(s@34df{Ko4cSW^+dicJHN~s|3K{3Kgh_DJ~n0h9p zo}5ax;7sxi+od&dHQA6HCFy*&VLWMg#w47~1QO2OWxxXXU?Ygc`-pehQX$MX^Xj&x zhw7gjS-lO}wsA5=PCicB4qk2uASW-U!XGy{VIJX6P!ZJrK+SdJHKIB#buK52F?wSR z=M03j!G%y&0FA+gckDJn;n?+g&61M_g3cgDPzb8e8H~XRok5&4P&T=LMva+)f9y6u zp)>HMnvyHBkdkT?RM2TQIiWA`4m^cXv)fEmvKPM@&IUPi(-vEU5Uoa`C zxm&35K+V)dHKL0VIAL@V64Jm=M7<(Z#^Dv>j&1V!;b02;qqMG~o8xz$`2Vmnitx-r}@|bj9 zyNfnB#vDIzjU{$*-T_giiV6?X-V3R@MjYM44@I>fRC85TBaSu9;Gx75 z#<5MDP9*rys?BR7*YNP8 z>9c*uLCqFySPxVwvi7oWL|xYGP-CA=N+(_At}jkZDUcJ}QkS%K`?r;%iu7RT}xG@hy!3a8eU) z=YDQ92C2rN!4#u`5qy@=&Swb|*g%O$6J=U5MVmUJO>L4z129fHu&oYtCP1Mf7z=GN zB_$*wsMc(x^^H@U8ldi3tqXL^l#!xP#mkg};k$FGX-uCdIXjm}vaoKuZk2O&-R#-| z(k-&X08&h_RY15Z3|TUeR)ZH0#0#UAUFw(EY2_aJAmnGf2;lz zf;=Eb!~}Va7LjdF@o(YPGf}r1On^EUZxUMHY%U6NAA&3{8`ti#F8dVBOB$52ZZ$xf zQ3*PC8Q0SIU34lk#|jp`wh2God^tM))kaYEi?_FTYxic{KN%6l28Qq_z#~6_&Y7Qz zw=oKk_q4oAdz;6*)!>Elj<+5>U~^HK`%-3ozS5aqUOc5Y7Deo%V0S5Vh1--zU?J{F zRsNiCIo`%SE8Bw9C!<~jbro(j<@*WhlmVd^8f?Z9SRm1am{7VRC1+%ZcUEDv@t_~A zFxn)PpIgc@l5!w5ELj5(Wde7DAnf6=i@3SMOc@Uy*6PRHYVfKsCn4^#2F2}m#z3fo z-xwa;CSin7GM(Jn`5$jTc*yZQa+|LGuag=FqU%Fu>abclHlaV*ZL=l$pzyg)bFxlcUnOE?YI9?Sijj?Ias{ ztHD&rM*y7!*=q_=qI;BRWfhpEjn;R1>nm&?;$B?IP-u6r4({%CtBJXrVJKMNO z=G_jHTXMhs6$w@Ml&h#S;1yvVRri6C+h_QyJPRJ-PYu6ve-(Dab@-4)5B64riLno_ zlWTPzkGtcB7OH^D@+GvC-6vHh%8L?S1+#=pNV>Sy3Sf8f`Oc+4e2pEwTX4b4coAIp zj`?SNLrvWQ;w$bocHB%|cD_Rpky|sIRGd+aX!a1%H05dlOnsQ6dx>-}ZE$EG`@@>r zU91iqpzzY|q>+NT7X_gu6Jv-BaWhPr-NW^=Dnomwmhz`JQf8K@Rac~~Y;&$sy^=E{ zH0Y1KoLdcE73UGO?sPw~#%PHbMn_oS$^7!3!eYU*}T%hIqzm6E)!MJt$}Ui;Am1(+ zVusk?e_E{xWmyTN+q-od%C2}zX^{6syenl(>gKYTWGWC@AnEZ)qqUygT1!C$T90Wo z-JBxhN^<316c1DgaH4cDF_F8d*Q?93)9Ya+FuIF!M~8#7tox5=&c@Pe@G7u)k<=YK z>ZNBGiGXjQRMPM<`lKwPYGa9D@wxM!%*{`yyz$R6dT>eT4}Y=NZZM$Qvu5XQ?mXeg6+CNcltn0096WiwFb&00000{{{d;LjnLP z1MS+;ZrU&u2H@-LDNt^wR`#)dY^zs@b*r?NN-OOyQ3O$o)CB}twY~l9kkAkaBu#>b zWYjh)J$1s@(;wSP{Cm~&zYp(!4*jcMybPk%WI0>J<0$wQ_Wb@Wf@DNrQn&y@P04Hc 
z7QXi2s`nDS1yL}54kJ@{KN{UU-hCtM2M7@Urs93jtZtOnNs8LP`ZRf2Qtzh!)F1Xe zJDf)aG3PV~%9Qdhz0c;;{$MhHsnB}+s`nQpzZ?Hum!BXNyfsq-^DO}B=B=8+|)Qab5V zNOM4C4=A`%zSu9#G(&Yr8c_&YUwU}|bbVVQEt^4SLzhF#I?h~?O3sAI{rS6gMkYai z#JNy#R?fAI%hR`^pzG9Qoth?sklbdkP((QAPMrlbch*!RoS}2&IayU_YMsi{Bh^;O zh|}&uJMC(zky&Q~%B4q#>Cuj}>*r(rthi(@H%{QJsm*8e)y?iY%nbw%#2$p8&bdQp zh7{durG00}oF7L&JWz^qk5Q!M;Y@9U(*wv1B;xL-W*p9<@nk-oMb8yBh`#pkC?$gu z+wHOHYzw91UXX3lIC$>)gTa>v^WOqDMfoqTKZ$;Mq~uH=Ujrcm%7n^y|#%yTyt7&U-RQ-P!T;GH@( z-KKxXz=58xX0@ zXqTPcD7QI5<$Y%BEM+BUzT=#117*${Fu7Z!wsWE2tek5Zmn{Pu3%X7{)~V}6FhPLa sIk)Wm50iKxG}SKv03VA81ONa4009360763o02=@U000000000001%faGynhq literal 0 HcmV?d00001 diff --git a/tests/example.gtf.gz.tbi b/tests/example.gtf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..eb3f24c6ba9cd8cf5c4551ced2edfd08518ce9e7 GIT binary patch literal 260 zcmb2|=3rp}f&Xj_PR>jW&l$K2Kc%E3H88j-q%kIiDTK9k6*3kUxlL5;Fi3j9_Q__Z zf|k9PW5^o|wuQE>3Xcpv?2>S*c08K+XGW#KEom*j$NOX07S=9!WU%j<8OK|0(e{tw z@7gca1r$o`|02-xt-Z_Miszxc1@}T`pqhVIw+mPa05xs?*K)xTsK!2OOPjQqky-%% z;X_P!^$oUv%XeS;|Bb^|=`=HqFNqIdt1xa2&yz__6kvj}sjjX2{KmopQ;*n&H}%16RVB7#QTy{3*@A4E8pN004_)Vut_# literal 0 HcmV?d00001 diff --git a/tests/pysam_test.py b/tests/pysam_test.py index c2ae6fa..5be5ca4 100755 --- a/tests/pysam_test.py +++ b/tests/pysam_test.py @@ -7,7 +7,7 @@ and data files located there. import pysam import unittest -import os +import os, re import itertools import subprocess import shutil @@ -47,7 +47,13 @@ def runSamtools( cmd ): except OSError, e: print >>sys.stderr, "Execution failed:", e - +def getSamtoolsVersion(): + '''return samtools version''' + + pipe = subprocess.Popen("samtools", shell=True, stderr=subprocess.PIPE).stderr + lines = "".join(pipe.readlines()) + return re.search( "Version:\s+(\S+)", lines).groups()[0] + class BinaryTest(unittest.TestCase): '''test samtools command line commands and compare against pysam commands. 
@@ -125,12 +131,19 @@ class BinaryTest(unittest.TestCase): BinaryTest.first_time = False + samtools_version = getSamtoolsVersion() + if samtools_version != pysam.__samtools_version__: + raise ValueError("versions of pysam/samtools and samtools differ: %s != %s" % \ + (pysam.__samtools_version__, + samtools_version )) + def checkCommand( self, command ): + if command: samtools_target, pysam_target = self.mCommands[command][0][0], self.mCommands[command][1][0] self.assertTrue( checkBinaryEqual( samtools_target, pysam_target ), "%s failed: files %s and %s are not the same" % (command, samtools_target, pysam_target) ) - + def testImport( self ): self.checkCommand( "import" ) @@ -153,7 +166,7 @@ class BinaryTest(unittest.TestCase): self.assertRaises( pysam.SamtoolsError, pysam.index, "exdoesntexist.bam" ) def __del__(self): - + return for label, command in self.mCommands.iteritems(): samtools_target, samtools_command = command[0] pysam_target, pysam_command = command[1] @@ -254,6 +267,12 @@ class IOTest(unittest.TestCase): self.assertRaises( ValueError, samfile.fetch ) self.assertEqual( len(list( samfile.fetch(until_eof = True) )), 3270 ) + def testReadingFromFileWithWrongMode( self ): + + assert not os.path.exists( "ex2.bam.bai" ) + samfile = pysam.Samfile( "ex2.bam", "r" ) + self.assertRaises( ValueError, samfile.fetch ) + class TestIteratorRow(unittest.TestCase): def setUp(self): @@ -283,6 +302,7 @@ class TestIteratorRow(unittest.TestCase): def tearDown(self): self.samfile.close() + class TestIteratorRowAll(unittest.TestCase): def setUp(self): @@ -348,7 +368,8 @@ class TestIteratorColumn(unittest.TestCase): self.assertEqual( len(columns), refcov, "wrong number of pileup columns returned for position %s:%i, %i should be %i" %(contig,pos,len(columns), refcov) ) elif refcov == 1: # one read, all columns of the read are returned - self.assertEqual( len(columns), refcolumns, "pileup incomplete - %i should be %i " % (len(columns), refcolumns)) + self.assertEqual( 
len(columns), refcolumns, "pileup incomplete at position %i: got %i, expected %i " %\ + (pos, len(columns), refcolumns)) def tearDown(self): self.samfile.close() @@ -398,10 +419,22 @@ class TestAlignedReadFromBam(unittest.TestCase): def testARseq(self): self.assertEqual( self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 1: %s != %s" % (self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG") ) self.assertEqual( self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "sequence size mismatch in read 2: %s != %s" % (self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA") ) + self.assertEqual( self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 4: %s != %s" % (self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG") ) def testARqual(self): self.assertEqual( self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", "quality string mismatch in read 1: %s != %s" % (self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") ) self.assertEqual( self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "quality string mismatch in read 2: %s != %s" % (self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<") ) + self.assertEqual( self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", "quality string mismatch in read 3: %s != %s" % (self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") ) + + def testARquery(self): + self.assertEqual( self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "query mismatch in read 1: %s != %s" % (self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG") ) + self.assertEqual( self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "query size mismatch in read 2: %s != %s" % (self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA") ) + self.assertEqual( self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT", "query mismatch in read 4: %s != %s" % (self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT") ) + + def testARqqual(self): + self.assertEqual( 
self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", "qquality string mismatch in read 1: %s != %s" % (self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") ) + self.assertEqual( self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "qquality string mismatch in read 2: %s != %s" % (self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<") ) + self.assertEqual( self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22", "qquality string mismatch in read 3: %s != %s" % (self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22") ) def testPresentOptionalFields(self): self.assertEqual( self.reads[0].opt('NM'), 1, "optional field mismatch in read 1, NM: %s != %s" % (self.reads[0].opt('NM'), 1) ) @@ -518,7 +551,14 @@ class TestPileupObjects(unittest.TestCase): def tearDown(self): self.samfile.close() - + +class TestContextManager(unittest.TestCase): + + def testManager( self ): + with pysam.Samfile('ex1.bam', 'rb') as samfile: + samfile.fetch() + self.assertEqual( samfile._isOpen(), False ) + class TestExceptions(unittest.TestCase): def setUp(self): @@ -581,20 +621,26 @@ class TestFastaFile(unittest.TestCase): self.assertEqual( seq, self.file.fetch( id ) ) for x in range( 0, len(seq), 10): self.assertEqual( seq[x:x+10], self.file.fetch( id, x, x+10) ) + # test x:end + self.assertEqual( seq[x:], self.file.fetch( id, x) ) + # test 0:x + self.assertEqual( seq[:x], self.file.fetch( id, None, x) ) + + + # unknown sequence returns "" + self.assertEqual( "", self.file.fetch("chr12") ) def testFetchErrors( self ): self.assertRaises( ValueError, self.file.fetch ) - self.assertRaises( ValueError, self.file.fetch, "chr1", 0 ) self.assertRaises( ValueError, self.file.fetch, "chr1", -1, 10 ) self.assertRaises( ValueError, self.file.fetch, "chr1", 20, 10 ) - # the following segfaults: - # self.assertRaises( IndexError, self.file.fetch, "chr12", ) - pass + def testLength( self ): + self.assertEqual( len(self.file), 2 ) + def tearDown(self): self.file.close() - class 
TestAlignedRead(unittest.TestCase): '''tests to check if aligned read can be constructed and manipulated. @@ -803,7 +849,7 @@ class TestDeNovoConstruction(unittest.TestCase): others = list(infile) for denovo, other in zip( others, self.reads): self.checkFieldEqual( other, denovo ) - self.assertEqual( other, denovo) + self.assertEqual( other.compare( denovo ), 0 ) def testSAMPerRead( self ): '''check if individual reads are binary equal.''' @@ -812,7 +858,7 @@ class TestDeNovoConstruction(unittest.TestCase): others = list(infile) for denovo, other in zip( others, self.reads): self.checkFieldEqual( other, denovo ) - self.assertEqual( other, denovo) + self.assertEqual( other.compare( denovo), 0 ) def testBAMWholeFile( self ): @@ -828,6 +874,82 @@ class TestDeNovoConstruction(unittest.TestCase): os.unlink( tmpfilename ) +class TestDoubleFetch(unittest.TestCase): + '''check if two iterators on the same bamfile are independent.''' + + def testDoubleFetch( self ): + + samfile1 = pysam.Samfile('ex1.bam', 'rb') + + for a,b in zip(samfile1.fetch(), samfile1.fetch()): + self.assertEqual( a.compare( b ), 0 ) + + def testDoubleFetchWithRegion( self ): + + samfile1 = pysam.Samfile('ex1.bam', 'rb') + chr, start, stop = 'chr1', 200, 3000000 + self.assertTrue(len(list(samfile1.fetch ( chr, start, stop))) > 0) #just making sure the test has something to catch + + for a,b in zip(samfile1.fetch( chr, start, stop), samfile1.fetch( chr, start, stop)): + self.assertEqual( a.compare( b ), 0 ) + + def testDoubleFetchUntilEOF( self ): + + samfile1 = pysam.Samfile('ex1.bam', 'rb') + + for a,b in zip(samfile1.fetch( until_eof = True), + samfile1.fetch( until_eof = True )): + self.assertEqual( a.compare( b), 0 ) + +class TestRemoteFileFTP(unittest.TestCase): + '''test remote access. + + ''' + + # Need to find an ftp server without password on standard + # port. 
+ + url = "ftp://ftp.sanger.ac.uk/pub/rd/humanSequences/CV.bam" + region = "1:1-1000" + + def testFTPView( self ): + result = pysam.view( self.url, self.region ) + self.assertEqual( len(result), 36 ) + + def testFTPFetch( self ): + samfile = pysam.Samfile(self.url, "rb") + result = list(samfile.fetch( region = self.region )) + self.assertEqual( len(result), 36 ) + +class TestRemoteFileHTTP( unittest.TestCase): + + url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/ex1.bam" + region = "chr1:1-1000" + local = "ex1.bam" + + def testView( self ): + self.assertRaises( pysam.SamtoolsError, pysam.view, self.url, self.region ) + + def testFetch( self ): + samfile = pysam.Samfile(self.url, "rb") + result = list(samfile.fetch( region = self.region )) + samfile_local = pysam.Samfile(self.local, "rb") + ref = list(samfile_local.fetch( region = self.region )) + + self.assertEqual( len(ref), len(result) ) + for x, y in zip(result, ref): + self.assertEqual( x.compare( y ), 0 ) + + def testFetchAll( self ): + samfile = pysam.Samfile(self.url, "rb") + result = list(samfile.fetch()) + samfile_local = pysam.Samfile(self.local, "rb") + ref = list(samfile_local.fetch() ) + + self.assertEqual( len(ref), len(result) ) + for x, y in zip(result, ref): + self.assertEqual( x.compare( y ), 0 ) + # TODOS # 1. finish testing all properties within pileup objects diff --git a/tests/tabix_test.py b/tests/tabix_test.py new file mode 100644 index 0000000..8eb8a60 --- /dev/null +++ b/tests/tabix_test.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python +'''unit testing code for pysam. + +Execute in the :file:`tests` directory as it requires the Makefile +and data files located there. 
+''' + +import sys, os, shutil, gzip +import pysam +import unittest +import itertools +import subprocess + +def checkBinaryEqual( filename1, filename2 ): + '''return true if the two files are binary equal.''' + if os.path.getsize( filename1 ) != os.path.getsize( filename2 ): + return False + + infile1 = open(filename1, "rb") + infile2 = open(filename2, "rb") + + def chariter( infile ): + while 1: + c = infile.read(1) + if c == "": break + yield c + + found = False + for c1,c2 in itertools.izip( chariter( infile1), chariter( infile2) ): + if c1 != c2: break + else: + found = True + + infile1.close() + infile2.close() + return found + +class TestIndexing(unittest.TestCase): + filename = "example.gtf.gz" + filename_idx = "example.gtf.gz.tbi" + + def setUp( self ): + + self.tmpfilename = "tmp_%i.gtf.gz" % id(self) + shutil.copyfile( self.filename, self.tmpfilename ) + + def testIndexPreset( self ): + '''test indexing via preset.''' + + pysam.tabix_index( self.tmpfilename, preset = "gff" ) + checkBinaryEqual( self.tmpfilename + ".tbi", self.filename_idx ) + + def tearDown( self ): + os.unlink( self.tmpfilename ) + os.unlink( self.tmpfilename + ".tbi" ) + +class TestCompression(unittest.TestCase): + filename = "example.gtf.gz" + filename_idx = "example.gtf.gz.tbi" + + def setUp( self ): + + self.tmpfilename = "tmp_%i.gtf" % id(self) + infile = gzip.open( self.filename, "r") + outfile = open( self.tmpfilename, "w" ) + outfile.write( "".join(infile.readlines()) ) + outfile.close() + infile.close() + + def testIndexPreset( self ): + '''test indexing via preset.''' + + pysam.tabix_index( self.tmpfilename, preset = "gff" ) + checkBinaryEqual( self.tmpfilename + ".gz", self.filename ) + checkBinaryEqual( self.tmpfilename + ".gz.tbi", self.filename_idx ) + + def tearDown( self ): + os.unlink( self.tmpfilename + ".gz" ) + os.unlink( self.tmpfilename + ".gz.tbi" ) + +class TestIteration( unittest.TestCase ): + + filename = "example.gtf.gz" + + def setUp( self ): + + self.tabix = 
pysam.Tabixfile( self.filename ) + lines = gzip.open(self.filename).readlines() + # creates index of contig, start, end, adds content without newline. + self.compare = [ + (x[0][0], int(x[0][3]), int(x[0][4]), x[1]) + for x in [ (y.split("\t"), y[:-1]) for y in lines ] ] + + def getSubset( self, contig = None, start = None, end = None): + + if contig == None: + # all lines + subset = [ x[3] for x in self.compare ] + else: + if start != None and end == None: + # until end of contig + subset = [ x[3] for x in self.compare if x[0] == contig and x[2] > start ] + elif start == None and end != None: + # from start of contig + subset = [ x[3] for x in self.compare if x[0] == contig and x[1] <= end ] + elif start == None and end == None: + subset = [ x[3] for x in self.compare if x[0] == contig ] + else: + # all within interval + subset = [ x[3] for x in self.compare if x[0] == contig and \ + min( x[2], end) - max(x[1], start) > 0 ] + + return subset + + def checkPairwise( self, result, ref ): + + result.sort() + ref.sort() + + a = set(result) + b = set(ref) + + self.assertEqual( len(result), len(ref), + "unexpected number of results: %i, expected %i, differences are %s: %s" \ + % (len(result), len(ref), + a.difference(b), + b.difference(a) )) + + for x, d in enumerate( zip( result, ref )): + + self.assertEqual( d[0], d[1], + "unexpected results in pair %i: '%s', expected '%s'" % \ + (x, + d[0], + d[1]) ) + + + def testAll( self ): + result = list(self.tabix.fetch()) + ref = self.getSubset( ) + self.checkPairwise( result, ref ) + + def testPerContig( self ): + for contig in ("chr1", "chr2", "chr1", "chr2" ): + result = list(self.tabix.fetch( contig )) + ref = self.getSubset( contig ) + self.checkPairwise( result, ref ) + + def testPerContigToEnd( self ): + + end = None + for contig in ("chr1", "chr2", "chr1", "chr2" ): + for start in range( 0, 200000, 1000): + result = list(self.tabix.fetch( contig, start, end )) + ref = self.getSubset( contig, start, end ) + 
self.checkPairwise( result, ref ) + + def testPerContigFromStart( self ): + + start = None + for contig in ("chr1", "chr2", "chr1", "chr2" ): + for end in range( 0, 200000, 1000): + result = list(self.tabix.fetch( contig, start, end )) + ref = self.getSubset( contig, start, end ) + self.checkPairwise( result, ref ) + + def testPerContig( self ): + + start, end = None, None + for contig in ("chr1", "chr2", "chr1", "chr2" ): + result = list(self.tabix.fetch( contig, start, end )) + ref = self.getSubset( contig, start, end ) + self.checkPairwise( result, ref ) + + def testPerInterval( self ): + + start, end = None, None + for contig in ("chr1", "chr2", "chr1", "chr2" ): + for start in range( 0, 200000, 2000): + for end in range( start, start + 2000, 500): + result = list(self.tabix.fetch( contig, start, end )) + ref = self.getSubset( contig, start, end ) + self.checkPairwise( result, ref ) + + + def testInvalidIntervals( self ): + + self.assertRaises( ValueError, self.tabix.fetch, "chr1", 0, -10) + self.assertRaises( ValueError, self.tabix.fetch, "chr1", -10, 200) + self.assertRaises( ValueError, self.tabix.fetch, "chr1", 200, 0) + self.assertRaises( ValueError, self.tabix.fetch, "chr1", -10, -20) + self.assertRaises( ValueError, self.tabix.fetch, "chrUn" ) + + def testGetContigs( self ): + self.assertEqual( sorted(self.tabix.contigs), ["chr1", "chr2"] ) + # check that contigs is read-only + self.assertRaises( AttributeError, setattr, self.tabix, "contigs", ["chr1", "chr2"] ) + +class TestParser( unittest.TestCase ): + + filename = "example.gtf.gz" + + def setUp( self ): + + self.tabix = pysam.Tabixfile( self.filename ) + self.compare = [ x[:-1].split("\t") for x in gzip.open( self.filename, "r") ] + + def testGTF( self ): + + for x, r in enumerate(self.tabix.fetch( parser = pysam.asGTF() )): + self.assertEqual( "\t".join( self.compare[x]), str(r) ) + + def testTuple( self ): + + for x, r in enumerate(self.tabix.fetch( parser = pysam.asTuple() )): + self.assertEqual( 
self.compare[x], list(r) ) + + self.assertEqual( len(self.compare[x]), len(r) ) + for c in range(0,len(r)): + self.assertEqual( self.compare[x][c], r[c] ) + +if __name__ == "__main__": + unittest.main() + + -- 2.30.2