From 2112b1dba17e822d8028151605b4cdc01f0a4c3d Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 12 Apr 2011 14:57:33 +0900 Subject: [PATCH] Imported Upstream version 0.2.4 --- ChangeLog | 103 +++++++++++ Makefile | 3 +- NEWS | 23 +++ bedidx.c | 152 ++++++++++++++++ bgzf.c | 122 +++++++------ bgzf.h | 14 +- example.gtf.gz.tbi | Bin 260 -> 259 bytes index.c | 80 ++++++--- kseq.h | 227 ++++++++++++++++++++++++ main.c | 206 ++++++++++++++++++---- python/setup.py | 55 ++++++ python/tabixmodule.c | 408 +++++++++++++++++++++++++++++++++++++++++++ python/test.py | 91 ++++++++++ tabix.1 | 10 +- tabix.h | 8 + 15 files changed, 1379 insertions(+), 123 deletions(-) create mode 100644 bedidx.c create mode 100644 kseq.h create mode 100644 python/setup.py create mode 100644 python/tabixmodule.c create mode 100755 python/test.py diff --git a/ChangeLog b/ChangeLog index 3594496..fd335b8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,106 @@ +------------------------------------------------------------------------ +r942 | lh3lh3 | 2011-03-31 16:39:50 -0400 (Thu, 31 Mar 2011) | 2 lines +Changed paths: + M /trunk/tabix/main.c + +update version number + +------------------------------------------------------------------------ +r940 | lh3lh3 | 2011-03-31 16:38:03 -0400 (Thu, 31 Mar 2011) | 2 lines +Changed paths: + M /trunk/tabix/bedidx.c + M /trunk/tabix/main.c + +fixed two bugs due to recent changes + +------------------------------------------------------------------------ +r939 | lh3lh3 | 2011-03-31 16:12:21 -0400 (Thu, 31 Mar 2011) | 2 lines +Changed paths: + M /trunk/tabix/bgzf.c + M /trunk/tabix/bgzf.h + M /trunk/tabix/main.c + +update to the latest bgzf.* + +------------------------------------------------------------------------ +r938 | lh3lh3 | 2011-03-31 16:02:21 -0400 (Thu, 31 Mar 2011) | 2 lines +Changed paths: + M /trunk/tabix/index.c + M /trunk/tabix/main.c + M /trunk/tabix/tabix.h + +BED support + +------------------------------------------------------------------------ +r937 | lh3lh3 | 2011-03-31 15:03:49 -0400 (Thu, 31 Mar 2011) | 2 lines +Changed paths: + M /trunk/tabix/Makefile + A /trunk/tabix/bedidx.c + M /trunk/tabix/example.gtf.gz.tbi + M /trunk/tabix/index.c + A /trunk/tabix/kseq.h + M /trunk/tabix/tabix.h + +restructure get_intv() for BED support + +------------------------------------------------------------------------ +r919 | petulda | 2011-02-24 10:14:14 -0500 (Thu, 24 Feb 2011) | 1 line +Changed paths: + M /trunk/tabix/bgzf.c + M /trunk/tabix/bgzf.h + M /trunk/tabix/index.c + M /trunk/tabix/main.c + +New -r (reheader) option for efficient header replacement. +------------------------------------------------------------------------ +r915 | lh3lh3 | 2011-02-22 09:50:57 -0500 (Tue, 22 Feb 2011) | 2 lines +Changed paths: + A /trunk/tabix/python + A /trunk/tabix/python/setup.py (from /trunk/tabix/setup.py:914) + A /trunk/tabix/python/tabixmodule.c (from /trunk/tabix/tabixmodule.c:914) + A /trunk/tabix/python/test.py (from /trunk/tabix/test.py:914) + D /trunk/tabix/setup.py + D /trunk/tabix/tabixmodule.c + D /trunk/tabix/test.py + +move to a new python/ directory + +------------------------------------------------------------------------ +r914 | lh3lh3 | 2011-02-22 09:49:35 -0500 (Tue, 22 Feb 2011) | 2 lines +Changed paths: + A /trunk/tabix/setup.py + A /trunk/tabix/tabixmodule.c + A /trunk/tabix/test.py + +CPython C-API by Hyeshik Chang + +------------------------------------------------------------------------ +r904 | petulda | 2011-01-28 08:06:27 -0500 (Fri, 28 Jan 2011) | 1 line +Changed paths: + M /trunk/tabix/index.c + +Check the number of fields on each line and exit nicely without segfault +------------------------------------------------------------------------ +r901 | petulda | 2011-01-21 06:45:37 -0500 (Fri, 21 Jan 2011) | 1 line +Changed paths: + M /trunk/tabix/main.c + +Fix: Complain only when VCF is newer, not newer or same mtime +------------------------------------------------------------------------ +r900 | petulda | 2011-01-21 04:23:04 -0500 (Fri, 21 Jan 2011) | 1 line +Changed paths: + M /trunk/tabix/main.c + +Prevent the common user mistake and check the timestamps of the vcf and index file +------------------------------------------------------------------------ +r876 | lh3lh3 | 2010-12-08 12:38:45 -0500 (Wed, 08 Dec 2010) | 2 lines +Changed paths: + M /trunk/tabix/ChangeLog + M /trunk/tabix/NEWS + M /trunk/tabix/main.c + +Release tabix-0.2.3 + ------------------------------------------------------------------------ r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines Changed paths: diff --git a/Makefile b/Makefile index 9013642..3ba7d5d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC= gcc CFLAGS= -g -Wall -O2 -fPIC #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -LOBJS= bgzf.o kstring.o knetfile.o index.o +LOBJS= bgzf.o kstring.o knetfile.o index.o bedidx.o AOBJS= main.o PROG= tabix bgzip INCLUDES= @@ -52,6 +52,7 @@ bgzf.o:bgzf.h knetfile.h index.o:bgzf.h tabix.h khash.h ksort.h kstring.h main.o:tabix.h kstring.h bgzf.h bgzip.o:bgzf.h +bedidx.o:kseq.h khash.h tabix.pdf:tabix.tex pdflatex tabix.tex diff --git a/NEWS b/NEWS index ff28ded..d230541 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,26 @@ +Release 0.2.4 (10 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Give an error if the index file is older than the data file. + + * Avoid a segfault given flawed input. + + * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to + the dynamic library and are reported to be faster. Pysam also comes with a + tabix binding. + + * Added option "-r" for efficient header replacement. + + * Added BED support. + + * Synchronized the BGZF library between tabix and samtools. + +(0.2.4: 10 April 2011, r949) + + + Beta Release 0.2.3 (8 December, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bedidx.c b/bedidx.c new file mode 100644 index 0000000..9297831 --- /dev/null +++ b/bedidx.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 8192) + +typedef struct { + int n, m; + uint64_t *a; + int *idx; +} bed_reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, bed_reglist_t) + +#define LIDX_SHIFT 13 + +typedef kh_reg_t reghash_t; + +int *bed_index_core(int n, uint64_t *a, int *n_idx) +{ + int i, j, m, *idx; + m = *n_idx = 0; idx = 0; + for (i = 0; i < n; ++i) { + int beg, end; + beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; + if (m < end + 1) { + int oldm = m; + m = end + 1; + kroundup32(m); + idx = realloc(idx, m * sizeof(int)); + for (j = oldm; j < m; ++j) idx[j] = -1; + } + if (beg == end) { + if (idx[beg] < 0) idx[beg] = i; + } else { + for (j = beg; j <= end; ++j) + if (idx[j] < 0) idx[j] = i; + } + *n_idx = end + 1; + } + return idx; +} + +void bed_index(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + bed_reglist_t *p = &kh_val(h, k); + if (p->idx) free(p->idx); + p->idx = bed_index_core(p->n, p->a, &p->m); + } + } +} + +int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +{ + int i, min_off; + if (p->n == 0) return 0; + min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; + if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here + int n = beg>>LIDX_SHIFT; + if (n > p->n) n = p->n; + for (i = n - 1; i >= 0; --i) + if (p->idx[i] >= 0) break; + min_off = i >= 0? p->idx[i] : 0; + } + for (i = min_off; i < p->n; ++i) { + if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed + if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) + return 1; // find the overlap; return + } + return 0; +} + +int bed_overlap(const void *_h, const char *chr, int beg, int end) +{ + const reghash_t *h = (const reghash_t*)_h; + khint_t k; + if (!h) return 0; + k = kh_get(reg, h, chr); + if (k == kh_end(h)) return 0; + return bed_overlap_core(&kh_val(h, k), beg, end); +} + +void *bed_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; + str = calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name + int beg = -1, end = -1; + bed_reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { // absent from the hash table + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { // if the lines has other characters + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); // begin + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) + end = atoi(str->s); // end + } + } + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg >= 0 && end > beg) { + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + bed_index(h); + return h; +} + +void bed_destroy(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free(kh_val(h, k).idx); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} diff --git a/bgzf.c b/bgzf.c index 94e6194..216cd04 100644 --- a/bgzf.c +++ b/bgzf.c @@ -111,7 +111,7 @@ report_error(BGZF* fp, const char* message) { fp->error = message; } -int is_bgzipped(const char *fn) +int bgzf_check_bgzf(const char *fn) { BGZF *fp; uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; @@ -119,7 +119,7 @@ int is_bgzipped(const char *fn) if ((fp = bgzf_open(fn, "r")) == 0) { - fprintf(stderr, "[is_bgzipped] failed to open the file: %s\n",fn); + fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); return -1; } @@ -174,7 +174,7 @@ open_read(int fd) static BGZF* -open_write(int fd, bool is_uncompressed) +open_write(int fd, int compress_level) // compress_level==-1 for the default level { FILE* file = fdopen(fd, "w"); BGZF* fp; @@ -182,7 +182,9 @@ open_write(int fd, bool is_uncompressed) fp = malloc(sizeof(BGZF)); fp->file_descriptor = fd; fp->open_mode = 'w'; - fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; + fp->owned_file = 0; + fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; #ifdef _USE_KNETFILE fp->x.fpw = file; #else @@ -203,7 +205,7 @@ BGZF* bgzf_open(const char* __restrict path, const char* __restrict mode) { BGZF* fp = NULL; - if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */ + if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */ #ifdef _USE_KNETFILE knetFile *file = knet_open(path, mode); if (file == 0) return 0; @@ -220,18 +222,23 @@ bgzf_open(const char* __restrict path, const char* __restrict mode) if (fd == -1) return 0; fp = open_read(fd); #endif - } else if (mode[0] == 'w' || mode[0] == 'W') { - int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; #ifdef _WIN32 oflag |= O_BINARY; #endif fd = open(path, oflag, 0666); if (fd == -1) return 0; - fp = open_write(fd, strstr(mode, "u")? 1 : 0); - } - if (fp != NULL) { - fp->owned_file = 1; + { // set compress_level + int i; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + } + fp = open_write(fd, compress_level); } + if (fp != NULL) fp->owned_file = 1; return fp; } @@ -242,7 +249,12 @@ bgzf_fdopen(int fd, const char * __restrict mode) if (mode[0] == 'r' || mode[0] == 'R') { return open_read(fd); } else if (mode[0] == 'w' || mode[0] == 'W') { - return open_write(fd, strstr(mode, "u")? 1 : 0); + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return open_write(fd, compress_level); } else { return NULL; } @@ -282,7 +294,6 @@ deflate_block(BGZF* fp, int block_length) int input_length = block_length; int compressed_length = 0; while (1) { - int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION; z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; @@ -291,7 +302,7 @@ deflate_block(BGZF* fp, int block_length) zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); if (status != Z_OK) { report_error(fp, "deflate init failed"); @@ -358,6 +369,7 @@ inflate_block(BGZF* fp, int block_length) // Inflate the block in fp->compressed_block into fp->uncompressed_block z_stream zs; + int status; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = fp->compressed_block + 18; @@ -365,7 +377,7 @@ inflate_block(BGZF* fp, int block_length) zs.next_out = fp->uncompressed_block; zs.avail_out = fp->uncompressed_block_size; - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + status = inflateInit2(&zs, GZIP_WINDOW_BITS); if (status != Z_OK) { report_error(fp, "inflate init failed"); return -1; @@ -459,15 +471,15 @@ int bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; - int size = 0; + int count, size = 0, block_length, remaining; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); if (load_block_from_cache(fp, block_address)) return 0; - int count = knet_read(fp->x.fpr, header, sizeof(header)); + count = knet_read(fp->x.fpr, header, sizeof(header)); #else int64_t block_address = ftello(fp->file); if (load_block_from_cache(fp, block_address)) return 0; - int count = fread(header, 1, sizeof(header), fp->file); + count = fread(header, 1, sizeof(header), fp->file); #endif if (count == 0) { fp->block_length = 0; @@ -482,10 +494,10 @@ bgzf_read_block(BGZF* fp) report_error(fp, "invalid block header"); return -1; } - int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + block_length = unpackInt16((uint8_t*)&header[16]) + 1; bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - int remaining = block_length - BLOCK_HEADER_LENGTH; + remaining = block_length - BLOCK_HEADER_LENGTH; #ifdef _USE_KNETFILE count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); #else @@ -497,9 +509,7 @@ bgzf_read_block(BGZF* fp) } size += count; count = inflate_block(fp, block_length); - if (count < 0) { - return -1; - } + if (count < 0) return -1; if (fp->block_length != 0) { // Do not reset offset if this read follows a seek. fp->block_offset = 0; @@ -524,7 +534,8 @@ bgzf_read(BGZF* fp, void* data, int length) int bytes_read = 0; bgzf_byte_t* output = data; while (bytes_read < length) { - int available = fp->block_length - fp->block_offset; + int copy_length, available = fp->block_length - fp->block_offset; + bgzf_byte_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) { return -1; @@ -534,8 +545,8 @@ bgzf_read(BGZF* fp, void* data, int length) break; } } - int copy_length = bgzf_min(length-bytes_read, available); - bgzf_byte_t* buffer = fp->uncompressed_block; + copy_length = bgzf_min(length-bytes_read, available); + buffer = fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; @@ -553,19 +564,16 @@ bgzf_read(BGZF* fp, void* data, int length) return bytes_read; } -static -int -flush_block(BGZF* fp) +int bgzf_flush(BGZF* fp) { while (fp->block_offset > 0) { - int block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) { - return -1; - } + int count, block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; #ifdef _USE_KNETFILE - int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); #else - int count = fwrite(fp->compressed_block, 1, block_length, fp->file); + count = fwrite(fp->compressed_block, 1, block_length, fp->file); #endif if (count != block_length) { report_error(fp, "write failed"); @@ -576,21 +584,28 @@ flush_block(BGZF* fp) return 0; } -int -bgzf_write(BGZF* fp, const void* data, int length) +int bgzf_flush_try(BGZF *fp, int size) { + if (fp->block_offset + size > fp->uncompressed_block_size) + return bgzf_flush(fp); + return -1; +} + +int bgzf_write(BGZF* fp, const void* data, int length) +{ + const bgzf_byte_t *input = data; + int block_length, bytes_written; if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; } - if (fp->uncompressed_block == NULL) { + if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); - } - const bgzf_byte_t* input = data; - int block_length = fp->uncompressed_block_size; - int bytes_written = 0; + input = data; + block_length = fp->uncompressed_block_size; + bytes_written = 0; while (bytes_written < length) { int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); bgzf_byte_t* buffer = fp->uncompressed_block; @@ -599,7 +614,7 @@ bgzf_write(BGZF* fp, const void* data, int length) input += copy_length; bytes_written += copy_length; if (fp->block_offset == block_length) { - if (flush_block(fp) != 0) { + if (bgzf_flush(fp) != 0) { break; } } @@ -607,13 +622,10 @@ bgzf_write(BGZF* fp, const void* data, int length) return bytes_written; } -int -bgzf_close(BGZF* fp) +int bgzf_close(BGZF* fp) { if (fp->open_mode == 'w') { - if (flush_block(fp) != 0) { - return -1; - } + if (bgzf_flush(fp) != 0) return -1; { // add an empty block int count, block_length = deflate_block(fp, 0); #ifdef _USE_KNETFILE @@ -638,9 +650,7 @@ bgzf_close(BGZF* fp) else ret = knet_close(fp->x.fpr); if (ret != 0) return -1; #else - if (fclose(fp->file) != 0) { - return -1; - } + if (fclose(fp->file) != 0) return -1; #endif } free(fp->uncompressed_block); @@ -674,9 +684,11 @@ int bgzf_check_EOF(BGZF *fp) return (memcmp(magic, buf, 28) == 0)? 1 : 0; } -int64_t -bgzf_seek(BGZF* fp, int64_t pos, int where) +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { + int block_offset; + int64_t block_address; + if (fp->open_mode != 'r') { report_error(fp, "file not open for read"); return -1; @@ -685,8 +697,8 @@ bgzf_seek(BGZF* fp, int64_t pos, int where) report_error(fp, "unimplemented seek option"); return -1; } - int block_offset = pos & 0xFFFF; - int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; + block_offset = pos & 0xFFFF; + block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; #ifdef _USE_KNETFILE if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { #else diff --git a/bgzf.h b/bgzf.h index 70f497e..7295f37 100644 --- a/bgzf.h +++ b/bgzf.h @@ -26,7 +26,6 @@ #include #include -#include #include #ifdef _USE_KNETFILE #include "knetfile.h" @@ -37,7 +36,7 @@ typedef struct { int file_descriptor; char open_mode; // 'r' or 'w' - bool owned_file, is_uncompressed; + int16_t owned_file, compress_level; #ifdef _USE_KNETFILE union { knetFile *fpr; @@ -62,13 +61,6 @@ typedef struct { extern "C" { #endif -/* - * Checks the magic string of the file. Returns 1 - * for bgzipped files, -1 on errors and 0 for files - * without the bgzip magic string. - */ -int is_bgzipped(const char *path); - /* * Open an existing file descriptor for reading or writing. * Mode must be either "r" or "w". @@ -133,8 +125,10 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); void bgzf_set_cache_size(BGZF *fp, int cache_size); int bgzf_check_EOF(BGZF *fp); - int bgzf_read_block(BGZF* fp); +int bgzf_flush(BGZF* fp); +int bgzf_flush_try(BGZF *fp, int size); +int bgzf_check_bgzf(const char *fn); #ifdef __cplusplus } diff --git a/example.gtf.gz.tbi b/example.gtf.gz.tbi index eb3f24c6ba9cd8cf5c4551ced2edfd08518ce9e7..84a3c841cb94ec6287a948bc8ef238afcd653736 100644 GIT binary patch delta 105 zcmV-v0G9uR0)qk(ABzYC000000RIL6LPG)o=8+MGOfUco3v|E0+}gh#mHx8>)oz&C zpwNefuagACo(Kg99RhVnv=)TVz_5%DM8d)s<}O!d2n`Eg7$2sO2g%(qJutf{rD1je L01K{~J(1)fkbx&9 delta 135 zcmZo>YGD$T@8)1(0D=E(3{K8W49_PDw&kh?@E<tid = intv->beg = intv->end = intv->bin = -1; - for (i = 0; i <= str->l; ++i) { - if (str->s[i] == '\t' || str->s[i] == 0) { - if (id == idx->conf.sc) { - str->s[i] = 0; - intv->tid = get_tid(idx, str->s + b); - if (i != str->l) str->s[i] = '\t'; - } else if (id == idx->conf.bc) { + intv->ss = intv->se = 0; intv->beg = intv->end = -1; + for (i = 0; i <= len; ++i) { + if (line[i] == '\t' || line[i] == 0) { + ++ncols; + if (id == conf->sc) { + intv->ss = line + b; intv->se = line + i; + } else if (id == conf->bc) { // here ->beg is 0-based. - intv->beg = intv->end = strtol(str->s + b, &s, 0); - if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg; + intv->beg = intv->end = strtol(line + b, &s, 0); + if (!(conf->preset&TI_FLAG_UCSC)) --intv->beg; else ++intv->end; if (intv->beg < 0) intv->beg = 0; if (intv->end < 1) intv->end = 1; } else { - if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) { - if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0); - } else if ((idx->conf.preset&0xffff) == TI_PRESET_SAM) { + if ((conf->preset&0xffff) == TI_PRESET_GENERIC) { + if (id == conf->ec) intv->end = strtol(line + b, &s, 0); + } else if ((conf->preset&0xffff) == TI_PRESET_SAM) { if (id == 6) { // CIGAR int l = 0, op; char *t; - for (s = str->s + b; s < str->s + i;) { + for (s = line + b; s < line + i;) { long x = strtol(s, &t, 10); op = toupper(*t); if (op == 'M' || op == 'D' || op == 'N') l += x; @@ -190,21 +189,21 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) if (l == 0) l = 1; intv->end = intv->beg + l; } - } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) { + } else if ((conf->preset&0xffff) == TI_PRESET_VCF) { // FIXME: the following is NOT tested and is likely to be buggy if (id == 4) { if (b < i) intv->end = intv->beg + (i - b); } else if (id == 8) { // look for "END=" - int c = str->s[i]; - str->s[i] = 0; - s = strstr(str->s + b, "END="); - if (s == str->s + b) s += 4; + int c = line[i]; + line[i] = 0; + s = strstr(line + b, "END="); + if (s == line + b) s += 4; else if (s) { - s = strstr(str->s + b, ";END="); + s = strstr(line + b, ";END="); if (s) s += 5; } if (s) intv->end = strtol(s, &s, 0); - str->s[i] = c; + line[i] = c; } } } @@ -212,11 +211,33 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) ++id; } } - if (intv->tid < 0 || intv->beg < 0 || intv->end < 0) return -1; - intv->bin = ti_reg2bin(intv->beg, intv->end); +/* + if (ncols < conf->sc || ncols < conf->bc || ncols < conf->ec) { + if (ncols == 1) fprintf(stderr,"[get_intv] Is the file tab-delimited? The line has %d field only: %s\n", ncols, line); + else fprintf(stderr,"[get_intv] The line has %d field(s) only: %s\n", ncols, line); + exit(1); + } +*/ + if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1; return 0; } +static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) +{ + ti_interval_t x; + intv->tid = intv->beg = intv->end = intv->bin = -1; + if (ti_get_intv(&idx->conf, str->l, str->s, &x) == 0) { + int c = *x.se; + *x.se = '\0'; intv->tid = get_tid(idx, x.ss); *x.se = c; + intv->beg = x.beg; intv->end = x.end; + intv->bin = ti_reg2bin(intv->beg, intv->end); + return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1; + } else { + fprintf(stderr, "[%s] the following line cannot be parsed and skipped: %s\n", __func__, str->s); + return -1; + } +} + /************ * indexing * ************/ @@ -322,10 +343,15 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) continue; } get_intv(idx, str, &intv); + if ( intv.beg<0 || intv.end<0 ) + { + fprintf(stderr,"[ti_index_core] the indexes overlap or are out of bounds\n"); + exit(1); + } if (last_tid != intv.tid) { // change of chromosomes if (last_tid>intv.tid ) { - fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted?\n",(unsigned long long)lineno); + fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted? [pos %d]\n",(unsigned long long)lineno,intv.beg+1); exit(1); } last_tid = intv.tid; @@ -902,6 +928,8 @@ int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *d return 0; } +const ti_conf_t *ti_get_conf(ti_index_t *idx) { return idx? &idx->conf : 0; } + /******************* * High-level APIs * *******************/ diff --git a/kseq.h b/kseq.h new file mode 100644 index 0000000..82face0 --- /dev/null +++ b/kseq.h @@ -0,0 +1,227 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" + */ + +/* Last Modified: 12APR2009 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_MAX 1 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->l == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff --git a/main.c b/main.c index 8ae6b2e..4664251 100644 --- a/main.c +++ b/main.c @@ -3,17 +3,109 @@ #include #include #include +#include #include "bgzf.h" #include "tabix.h" -#define PACKAGE_VERSION "0.2.3 (r876)" +#define PACKAGE_VERSION "0.2.4 (r949)" + +#define error(...) { fprintf(stderr,__VA_ARGS__); return -1; } + +int reheader_file(const char *header, const char *file, int meta) +{ + BGZF *fp = bgzf_open(file,"r"); + if (bgzf_read_block(fp) != 0 || !fp->block_length) + return -1; + + char *buffer = fp->uncompressed_block; + int skip_until = 0; + + if ( buffer[0]==meta ) + { + skip_until = 1; + + // Skip the header + while (1) + { + if ( buffer[skip_until]=='\n' ) + { + skip_until++; + if ( skip_until>=fp->block_length ) + { + if (bgzf_read_block(fp) != 0 || !fp->block_length) + error("no body?\n"); + skip_until = 0; + } + // The header has finished + if ( buffer[skip_until]!=meta ) break; + } + skip_until++; + if ( skip_until>=fp->block_length ) + { + if (bgzf_read_block(fp) != 0 || !fp->block_length) + error("no body?\n"); + skip_until = 0; + } + } + } + + FILE *fh = fopen(header,"r"); + if ( !fh ) + error("%s: %s", header,strerror(errno)); + int page_size = getpagesize(); + char *buf = valloc(page_size); + BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w"); + ssize_t nread; + while ( (nread=fread(buf,1,page_size-1,fh))>0 ) + { + if ( nreaderror); + } + fclose(fh); + + if ( fp->block_length - skip_until > 0 ) + { + if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) + error("Error: %s\n",fp->error); + } + if (bgzf_flush(bgzf_out) < 0) + error("Error: %s\n",bgzf_out->error); + + while (1) + { +#ifdef _USE_KNETFILE + nread = knet_read(fp->x.fpr, buf, page_size); +#else + nread = fread(buf, 1, page_size, fp->file); +#endif + if ( nread<=0 ) + break; + +#ifdef _USE_KNETFILE + int count = fwrite(buf, 1, nread, bgzf_out->x.fpw); +#else + int count = fwrite(buf, 1, nread, bgzf_out->file); +#endif + if (count != nread) + error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); + } + + if (bgzf_close(bgzf_out) < 0) + error("Error: %s\n",bgzf_out->error); + + return 0; +} + int main(int argc, char *argv[]) { - int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0; + int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0; ti_conf_t conf = ti_conf_gff; - while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhf")) >= 0) { + const char *reheader = NULL; + while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) { switch (c) { + case 'B': bed_reg = 1; break; case '0': conf.preset |= TI_FLAG_UCSC; break; case 'S': skip = atoi(optarg); break; case 'c': meta = optarg[0]; break; @@ -34,6 +126,7 @@ int main(int argc, char *argv[]) case 'l': list_chrms = 1; break; case 'h': print_header = 1; break; case 'f': force = 1; break; + case 'r': reheader = optarg; break; } } if (skip >= 0) conf.line_skip = skip; @@ -46,11 +139,13 @@ int main(int argc, char *argv[]) fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n"); fprintf(stderr, " -s INT sequence name column [1]\n"); fprintf(stderr, " -b INT start column [4]\n"); - fprintf(stderr, " -e INT end column [5]\n"); + fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); + fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n"); + fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n"); fprintf(stderr, " -0 zero-based coordinate\n"); - fprintf(stderr, " -h print the VCF header\n"); + fprintf(stderr, " -h print the header lines\n"); fprintf(stderr, " -l list chromosome names\n"); fprintf(stderr, " -f force to overwrite the index\n"); fprintf(stderr, "\n"); @@ -71,27 +166,49 @@ int main(int argc, char *argv[]) ti_index_destroy(idx); return 0; } + if (reheader) + return reheader_file(reheader,argv[optind],conf.meta_char); + + struct stat stat_tbi,stat_vcf; + char *fnidx = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(fnidx, argv[optind]), ".tbi"); + if (optind + 1 == argc) { if (force == 0) { - struct stat buf; - char *fnidx = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(fnidx, argv[optind]), ".tbi"); - if (stat(fnidx, &buf) == 0) { - fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n"); - free(fnidx); - return 1; + if (stat(fnidx, &stat_tbi) == 0) + { + // Before complaining, check if the VCF file isn't newer. This is a common source of errors, + // people tend not to notice that tabix failed + stat(argv[optind], &stat_vcf); + if ( stat_vcf.st_mtime <= stat_tbi.st_mtime ) + { + fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n"); + free(fnidx); + return 1; + } } - free(fnidx); } - if ( is_bgzipped(argv[optind])!=1 ) + if ( bgzf_check_bgzf(argv[optind])!=1 ) { fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]); + free(fnidx); return 1; } return ti_index_build(argv[optind], &conf); } { // retrieve tabix_t *t; + // Common source of errors: new VCF is used with an old index + stat(fnidx, &stat_tbi); + stat(argv[optind], &stat_vcf); + if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime ) + { + fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n"); + free(fnidx); + return 1; + } + free(fnidx); + if ((t = ti_open(argv[optind], 0)) == 0) { fprintf(stderr, "[main] fail to open the data file.\n"); return 1; @@ -106,36 +223,65 @@ int main(int argc, char *argv[]) } ti_iter_destroy(iter); } else { // retrieve from specified regions - int i; - if ( ti_lazy_index_load(t) ) - { + int i, len; + ti_iter_t iter; + const char *s; + const ti_conf_t *idxconf; + + if (ti_lazy_index_load(t) < 0 && bed_reg == 0) { fprintf(stderr,"[tabix] failed to load the index file.\n"); return 1; } + idxconf = ti_get_conf(t->idx); - ti_iter_t iter; - const char *s; - int len; if ( print_header ) { // If requested, print the header lines here iter = ti_query(t, 0, 0, 0); while ((s = ti_read(t, iter, &len)) != 0) { - if ( *s != '#' ) break; + if ((int)(*s) != idxconf->meta_char) break; fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); } - for (i = optind + 1; i < argc; ++i) { - int tid, beg, end; - if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { - iter = ti_queryi(t, tid, beg, end); - while ((s = ti_read(t, iter, &len)) != 0) { - fputs(s, stdout); fputc('\n', stdout); + if (bed_reg) { + extern int bed_overlap(const void *_h, const char *chr, int beg, int end); + extern void *bed_read(const char *fn); + extern void bed_destroy(void *_h); + + const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available + void *bed = bed_read(argv[optind+1]); // load the BED file + ti_interval_t intv; + + if (bed == 0) { + fprintf(stderr, "[main] fail to read the BED file.\n"); + return 1; + } + iter = ti_query(t, 0, 0, 0); + while ((s = ti_read(t, iter, &len)) != 0) { + int c; + ti_get_intv(conf_, len, (char*)s, &intv); + c = *intv.se; *intv.se = '\0'; + if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) { + *intv.se = c; + puts(s); } - ti_iter_destroy(iter); - } - // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); + *intv.se = c; + } + ti_iter_destroy(iter); + bed_destroy(bed); + } else { + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { + iter = ti_queryi(t, tid, beg, end); + while ((s = ti_read(t, iter, &len)) != 0) { + fputs(s, stdout); fputc('\n', stdout); + } + ti_iter_destroy(iter); + } + // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); + } } } ti_close(t); diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000..2771eda --- /dev/null +++ b/python/setup.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# +# The MIT License +# +# Copyright (c) 2011 Seoul National University. +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Contact: Hyeshik Chang + +from distutils.core import setup, Extension + +# Change this to True when you need the knetfile support. +USE_KNETFILE = False + +TABIX_SOURCE_FILES = [ + '../bgzf.c', '../bgzip.c', '../index.c', '../knetfile.c', '../kstring.c' +] + +define_options = [('_FILE_OFFSET_BITS', 64)] +if USE_KNETFILE: + define_options.append(('_USE_KNETFILE', 1)) + +ext_modules = [Extension("tabix", ["tabixmodule.c"] + TABIX_SOURCE_FILES, + include_dirs=['..'], + libraries=['z'], + define_macros=define_options)] + +setup (name = 'tabix', + version = '1.0', + description = 'Python interface to tabix, a generic indexer ' + 'for TAB-delimited genome position files', + author = 'Hyeshik Chang', + author_email = 'hyeshik@snu.ac.kr', + license = 'MIT', + ext_modules = ext_modules +) diff --git a/python/tabixmodule.c b/python/tabixmodule.c new file mode 100644 index 0000000..d04d097 --- /dev/null +++ b/python/tabixmodule.c @@ -0,0 +1,408 @@ +/*- + * The MIT License + * + * Copyright (c) 2011 Seoul National University. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Contact: Hyeshik Chang + */ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#include "tabix.h" + +static PyObject *TabixError; + +typedef struct { + PyObject_HEAD + tabix_t *tb; + char *fn; +} TabixObject; + +typedef struct { + PyObject_HEAD + TabixObject *tbobj; + ti_iter_t iter; +} TabixIteratorObject; + +static PyTypeObject Tabix_Type, TabixIterator_Type; + +/* --- TabixIterator --------------------------------------------------- */ + +static PyObject * +tabixiter_create(TabixObject *parentidx, ti_iter_t iter) +{ + TabixIteratorObject *self; + + self = PyObject_New(TabixIteratorObject, &TabixIterator_Type); + if (self == NULL) + return NULL; + + Py_INCREF(parentidx); + self->tbobj = parentidx; + self->iter = iter; + + return (PyObject *)self; +} + +static void +tabixiter_dealloc(TabixIteratorObject *self) +{ + ti_iter_destroy(self->iter); + Py_DECREF(self->tbobj); + PyObject_Del(self); +} + +static PyObject * +tabixiter_iter(PyObject *self) +{ + Py_INCREF(self); + return self; +} + +#if PY_MAJOR_VERSION < 3 +# define PYOBJECT_FROM_STRING_AND_SIZE PyString_FromStringAndSize +#else +# define PYOBJECT_FROM_STRING_AND_SIZE PyUnicode_FromStringAndSize +#endif + +static PyObject * +tabixiter_iternext(TabixIteratorObject *self) +{ + const char *chunk; + int len, i; + + chunk = ti_read(self->tbobj->tb, self->iter, &len); + if (chunk != NULL) { + PyObject *ret, *column; + Py_ssize_t colidx; + const char *ptr, *begin; + + ret = PyList_New(0); + if (ret == NULL) + return NULL; + + colidx = 0; + ptr = begin = chunk; + for (i = len; i > 0; i--, ptr++) + if (*ptr == '\t') { + column = PYOBJECT_FROM_STRING_AND_SIZE(begin, + (Py_ssize_t)(ptr - begin)); + if (column == NULL || PyList_Append(ret, column) == -1) { + Py_DECREF(ret); + return NULL; + } + + Py_DECREF(column); + begin = ptr + 1; + colidx++; + } + + column = PYOBJECT_FROM_STRING_AND_SIZE(begin, (Py_ssize_t)(ptr - begin)); + if (column == NULL || PyList_Append(ret, column) == -1) { + Py_DECREF(ret); + return NULL; + } + Py_DECREF(column); + + return ret; + } + else + return NULL; +} + +static PyMethodDef tabixiter_methods[] = { + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject TabixIterator_Type = { + PyVarObject_HEAD_INIT(NULL, 0) + "tabix.TabixIterator", /*tp_name*/ + sizeof(TabixIteratorObject), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + /* methods */ + (destructor)tabixiter_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + tabixiter_iter, /*tp_iter*/ + (iternextfunc)tabixiter_iternext, /*tp_iternext*/ + tabixiter_methods, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + 0, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ +}; + + +/* --- Tabix ----------------------------------------------------------- */ + +static PyObject * +tabix_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + TabixObject *self; + const char *fn, *fnidx=NULL; + static char *kwnames[]={"fn", "fnidx", NULL}; + tabix_t *tb; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|z:Tabix", + kwnames, &fn, &fnidx)) + return NULL; + + tb = ti_open(fn, fnidx); + if (tb == NULL) { + PyErr_SetString(TabixError, "Can't open the index file."); + return NULL; + } + + self = (TabixObject *)type->tp_alloc(type, 0); + if (self == NULL) + return NULL; + + self->tb = tb; + self->fn = strdup(fn); + + return (PyObject *)self; +} + +static void +tabix_dealloc(TabixObject *self) +{ + free(self->fn); + ti_close(self->tb); + PyObject_Del(self); +} + +static PyObject * +tabix_query(TabixObject *self, PyObject *args) +{ + char *name; + int begin, end; + ti_iter_t result; + + if (!PyArg_ParseTuple(args, "sii:query", &name, &begin, &end)) + return NULL; + + result = ti_query(self->tb, name, begin, end); + if (result == NULL) { + PyErr_SetString(TabixError, "query failed"); + return NULL; + } + + return tabixiter_create(self, result); +} + +static PyObject * +tabix_queryi(TabixObject *self, PyObject *args) +{ + int tid, begin, end; + ti_iter_t result; + + if (!PyArg_ParseTuple(args, "iii:queryi", &tid, &begin, &end)) + return NULL; + + result = ti_queryi(self->tb, tid, begin, end); + if (result == NULL) { + PyErr_SetString(TabixError, "query failed"); + return NULL; + } + + return tabixiter_create(self, result); +} + +static PyObject * +tabix_querys(TabixObject *self, PyObject *args) +{ + const char *reg; + ti_iter_t result; + + if (!PyArg_ParseTuple(args, "s:querys", ®)) + return NULL; + + result = ti_querys(self->tb, reg); + if (result == NULL) { + PyErr_SetString(TabixError, "query failed"); + return NULL; + } + + return tabixiter_create(self, result); +} + +static PyObject * +tabix_repr(TabixObject *self) +{ +#if PY_MAJOR_VERSION < 3 + return PyString_FromFormat("", self->fn); +#else + return PyUnicode_FromFormat("", self->fn); +#endif +} + +static PyMethodDef tabix_methods[] = { + {"query", (PyCFunction)tabix_query, METH_VARARGS, + PyDoc_STR("T.query(name, begin, end) -> iterator")}, + {"queryi", (PyCFunction)tabix_queryi, METH_VARARGS, + PyDoc_STR("T.queryi(tid, begin, id) -> iterator")}, + {"querys", (PyCFunction)tabix_querys, METH_VARARGS, + PyDoc_STR("T.querys(region) -> iterator")}, + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject Tabix_Type = { + /* The ob_type field must be initialized in the module init function + * to be portable to Windows without using C++. */ + PyVarObject_HEAD_INIT(NULL, 0) + "tabix.Tabix", /*tp_name*/ + sizeof(TabixObject), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + /* methods */ + (destructor)tabix_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + (reprfunc)tabix_repr, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + tabix_methods, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + (newfunc)tabix_new, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ +}; +/* --------------------------------------------------------------------- */ + +static PyMethodDef tabix_functions[] = { + {NULL, NULL} /* sentinel */ +}; + +PyDoc_STRVAR(module_doc, +"Python interface to tabix, Heng Li's generic indexer for TAB-delimited " +"genome position filesThis is a template module just for instruction."); + +#if PY_MAJOR_VERSION >= 3 +static struct PyModuleDef tabixmodule = { + PyModuleDef_HEAD_INIT, + "tabix", + module_doc, + -1, + tabix_functions, + NULL, + NULL, + NULL, + NULL +}; +#endif + +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC inittabix(void) +#else +PyMODINIT_FUNC PyInit_tabix(void) +#endif +{ + PyObject *m; + + if (PyType_Ready(&Tabix_Type) < 0) + goto fail; + if (PyType_Ready(&TabixIterator_Type) < 0) + goto fail; + +#if PY_MAJOR_VERSION < 3 + m = Py_InitModule3("tabix", tabix_functions, module_doc); +#else + m = PyModule_Create(&tabixmodule); +#endif + if (m == NULL) + goto fail; + + if (TabixError == NULL) { + TabixError = PyErr_NewException("tabix.error", NULL, NULL); + if (TabixError == NULL) + goto fail; + } + Py_INCREF(TabixError); + PyModule_AddObject(m, "error", TabixError); + + PyModule_AddObject(m, "Tabix", (PyObject *)&Tabix_Type); + PyModule_AddObject(m, "TabixIterator", (PyObject *)&TabixIterator_Type); + +#if PY_MAJOR_VERSION >= 3 + return m; +#endif + + fail: +#if PY_MAJOR_VERSION < 3 + return; +#else + return NULL; +#endif +} diff --git a/python/test.py b/python/test.py new file mode 100755 index 0000000..804a565 --- /dev/null +++ b/python/test.py @@ -0,0 +1,91 @@ +# +# The MIT License +# +# Copyright (c) 2011 Seoul National University. +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Contact: Hyeshik Chang + +import unittest +import random +import gzip +import tabix + +EXAMPLEFILE = '../example.gtf.gz' + +def load_example_regions(path): + alldata = [] + for line in gzip.GzipFile(EXAMPLEFILE): + fields = line.decode('ascii')[:-1].split('\t') + seqid = fields[0] + begin = int(fields[3]) + end = int(fields[4]) + alldata.append((seqid, begin, end, fields[:7])) + + return alldata + +def does_overlap(A, B, C, D): + return (A <= D <= B) or (C <= B <= D) + +def sample_test_dataset(regions, ntests): + seqids = [seqid for seqid, _, _, _ in regions] + lowerbound = max(0, min(begin for _, begin, _, _ in regions) - 1000) + upperbound = max(end for _, _, end, _ in regions) + 1000 + + tests = [] + for i in range(ntests): + seqid = random.choice(seqids) + low = random.randrange(lowerbound, upperbound) + high = random.randrange(low, upperbound) + + # for 1-based both-end inclusive intervals + matches = [info for seq, begin, end, info in regions + if seqid == seq and does_overlap(begin, end, low, high)] + + tests.append((seqid, low, high, matches)) + + return tests + +def tbresult2excerpt(tbmatches): + return [fields[:7] for fields in tbmatches] + +class TabixTest(unittest.TestCase): + regions = load_example_regions(EXAMPLEFILE) + testset = sample_test_dataset(regions, 500) + + def setUp(self): + self.tb = tabix.Tabix(EXAMPLEFILE) + + def testQuery(self): + for seqid, low, high, matches in self.testset: + tbresult = tbresult2excerpt(self.tb.query(seqid, low, high)) + self.assertEqual(tbresult, matches) + + def testQueryS(self): + for seqid, low, high, matches in self.testset: + tbresult = tbresult2excerpt(self.tb.querys('%s:%d-%d' % + (seqid, low, high))) + self.assertEqual(tbresult, matches) + + +if __name__ == '__main__': + unittest.main() diff --git a/tabix.1 b/tabix.1 index 3b82c76..1bd9533 100644 --- a/tabix.1 +++ b/tabix.1 @@ -7,7 +7,7 @@ tabix - Generic indexer for TAB-delimited genome position files .SH SYNOPSIS .PP .B bgzip -.RB [ \-cdh ] +.RB [ \-cdhB ] .RB [ \-b .IR virtualOffset ] .RB [ \-s @@ -82,6 +82,14 @@ Skip lines started with character CHAR. [#] Specify that the position in the data file is 0-based (e.g. UCSC files) rather than 1-based. .TP +.B -h +Print the header/meta lines. +.TP +.B -B +The second argument is a BED file. When this option is in use, the input +file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless, +with this option, the format of the input must be specificed correctly on the command line. +.TP .B -f Force to overwrite the index file if it is present. .TP diff --git a/tabix.h b/tabix.h index 4390c09..7b4497a 100644 --- a/tabix.h +++ b/tabix.h @@ -58,6 +58,11 @@ typedef struct { int32_t meta_char, line_skip; } ti_conf_t; +typedef struct { + int beg, end; + char *ss, *se; +} ti_interval_t; + extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset #ifdef __cplusplus @@ -120,6 +125,9 @@ extern "C" { /* Get the data line pointed by the iterator and iterate to the next record. */ const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len); + const ti_conf_t *ti_get_conf(ti_index_t *idx); + int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv); + /******************* * Deprecated APIs * *******************/ -- 2.30.2