Imported Upstream version 0.2.4
authorCharles Plessy <plessy@debian.org>
Tue, 12 Apr 2011 05:57:33 +0000 (14:57 +0900)
committerCharles Plessy <plessy@debian.org>
Tue, 12 Apr 2011 05:57:33 +0000 (14:57 +0900)
15 files changed:
ChangeLog
Makefile
NEWS
bedidx.c [new file with mode: 0644]
bgzf.c
bgzf.h
example.gtf.gz.tbi
index.c
kseq.h [new file with mode: 0644]
main.c
python/setup.py [new file with mode: 0644]
python/tabixmodule.c [new file with mode: 0644]
python/test.py [new file with mode: 0755]
tabix.1
tabix.h

index 3594496ca8de7d1d15535d07a4d329f2504847fe..fd335b82c292066e8787aa53017db4123daf7798 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,106 @@
+------------------------------------------------------------------------
+r942 | lh3lh3 | 2011-03-31 16:39:50 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+   M /trunk/tabix/main.c
+
+update version number
+
+------------------------------------------------------------------------
+r940 | lh3lh3 | 2011-03-31 16:38:03 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+   M /trunk/tabix/bedidx.c
+   M /trunk/tabix/main.c
+
+fixed two bugs due to recent changes
+
+------------------------------------------------------------------------
+r939 | lh3lh3 | 2011-03-31 16:12:21 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+   M /trunk/tabix/bgzf.c
+   M /trunk/tabix/bgzf.h
+   M /trunk/tabix/main.c
+
+update to the latest bgzf.*
+
+------------------------------------------------------------------------
+r938 | lh3lh3 | 2011-03-31 16:02:21 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+   M /trunk/tabix/index.c
+   M /trunk/tabix/main.c
+   M /trunk/tabix/tabix.h
+
+BED support
+
+------------------------------------------------------------------------
+r937 | lh3lh3 | 2011-03-31 15:03:49 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+   M /trunk/tabix/Makefile
+   A /trunk/tabix/bedidx.c
+   M /trunk/tabix/example.gtf.gz.tbi
+   M /trunk/tabix/index.c
+   A /trunk/tabix/kseq.h
+   M /trunk/tabix/tabix.h
+
+restructure get_intv() for BED support
+
+------------------------------------------------------------------------
+r919 | petulda | 2011-02-24 10:14:14 -0500 (Thu, 24 Feb 2011) | 1 line
+Changed paths:
+   M /trunk/tabix/bgzf.c
+   M /trunk/tabix/bgzf.h
+   M /trunk/tabix/index.c
+   M /trunk/tabix/main.c
+
+New -r (reheader) option for efficient header replacement.
+------------------------------------------------------------------------
+r915 | lh3lh3 | 2011-02-22 09:50:57 -0500 (Tue, 22 Feb 2011) | 2 lines
+Changed paths:
+   A /trunk/tabix/python
+   A /trunk/tabix/python/setup.py (from /trunk/tabix/setup.py:914)
+   A /trunk/tabix/python/tabixmodule.c (from /trunk/tabix/tabixmodule.c:914)
+   A /trunk/tabix/python/test.py (from /trunk/tabix/test.py:914)
+   D /trunk/tabix/setup.py
+   D /trunk/tabix/tabixmodule.c
+   D /trunk/tabix/test.py
+
+move to a new python/ directory
+
+------------------------------------------------------------------------
+r914 | lh3lh3 | 2011-02-22 09:49:35 -0500 (Tue, 22 Feb 2011) | 2 lines
+Changed paths:
+   A /trunk/tabix/setup.py
+   A /trunk/tabix/tabixmodule.c
+   A /trunk/tabix/test.py
+
+CPython C-API by Hyeshik Chang
+
+------------------------------------------------------------------------
+r904 | petulda | 2011-01-28 08:06:27 -0500 (Fri, 28 Jan 2011) | 1 line
+Changed paths:
+   M /trunk/tabix/index.c
+
+Check the number of fields on each line and exit nicely without segfault
+------------------------------------------------------------------------
+r901 | petulda | 2011-01-21 06:45:37 -0500 (Fri, 21 Jan 2011) | 1 line
+Changed paths:
+   M /trunk/tabix/main.c
+
+Fix: Complain only when VCF is newer, not newer or same mtime
+------------------------------------------------------------------------
+r900 | petulda | 2011-01-21 04:23:04 -0500 (Fri, 21 Jan 2011) | 1 line
+Changed paths:
+   M /trunk/tabix/main.c
+
+Prevent the common user mistake and check the timestamps of the vcf and index file
+------------------------------------------------------------------------
+r876 | lh3lh3 | 2010-12-08 12:38:45 -0500 (Wed, 08 Dec 2010) | 2 lines
+Changed paths:
+   M /trunk/tabix/ChangeLog
+   M /trunk/tabix/NEWS
+   M /trunk/tabix/main.c
+
+Release tabix-0.2.3
+
 ------------------------------------------------------------------------
 r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines
 Changed paths:
index 9013642bacc8a1ec2c93ff03a9e519f5ef202abd..3ba7d5d1d65ee3a8e3f6d05d1ae5df0bebc0386b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 CC=                    gcc
 CFLAGS=                -g -Wall -O2 -fPIC #-m64 #-arch ppc
 DFLAGS=                -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
-LOBJS=         bgzf.o kstring.o knetfile.o index.o
+LOBJS=         bgzf.o kstring.o knetfile.o index.o bedidx.o
 AOBJS=         main.o
 PROG=          tabix bgzip
 INCLUDES=
@@ -52,6 +52,7 @@ bgzf.o:bgzf.h knetfile.h
 index.o:bgzf.h tabix.h khash.h ksort.h kstring.h
 main.o:tabix.h kstring.h bgzf.h
 bgzip.o:bgzf.h
+bedidx.o:kseq.h khash.h
 
 tabix.pdf:tabix.tex
                pdflatex tabix.tex
diff --git a/NEWS b/NEWS
index ff28dedd5237ac852ccdc41fec0ab2521275a5e0..d23054184048429abbf18fe8dbe3b45446979dc8 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,26 @@
+Release 0.2.4 (10 April, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Give an error if the index file is older than the data file.
+
+ * Avoid a segfault given flawed input.
+
+ * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to
+   the dynamic library and are reported to be faster. Pysam also comes with a
+   tabix binding.
+
+ * Added option "-r" for efficient header replacement.
+
+ * Added BED support.
+
+ * Synchronized the BGZF library between tabix and samtools.
+
+(0.2.4: 10 April 2011, r949)
+
+
+
 Beta Release 0.2.3 (8 December, 2010)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/bedidx.c b/bedidx.c
new file mode 100644 (file)
index 0000000..9297831
--- /dev/null
+++ b/bedidx.c
@@ -0,0 +1,152 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct {
+       int n, m;
+       uint64_t *a;
+       int *idx;
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+int *bed_index_core(int n, uint64_t *a, int *n_idx)
+{
+       int i, j, m, *idx;
+       m = *n_idx = 0; idx = 0;
+       for (i = 0; i < n; ++i) {
+               int beg, end;
+               beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
+               if (m < end + 1) {
+                       int oldm = m;
+                       m = end + 1;
+                       kroundup32(m);
+                       idx = realloc(idx, m * sizeof(int));
+                       for (j = oldm; j < m; ++j) idx[j] = -1;
+               }
+               if (beg == end) {
+                       if (idx[beg] < 0) idx[beg] = i;
+               } else {
+                       for (j = beg; j <= end; ++j)
+                               if (idx[j] < 0) idx[j] = i;
+               }
+               *n_idx = end + 1;
+       }
+       return idx;
+}
+
+void bed_index(void *_h)
+{
+       reghash_t *h = (reghash_t*)_h;
+       khint_t k;
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       bed_reglist_t *p = &kh_val(h, k);
+                       if (p->idx) free(p->idx);
+                       p->idx = bed_index_core(p->n, p->a, &p->m);
+               }
+       }
+}
+
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
+{
+       int i, min_off;
+       if (p->n == 0) return 0;
+       min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
+       if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
+               int n = beg>>LIDX_SHIFT;
+               if (n > p->n) n = p->n;
+               for (i = n - 1; i >= 0; --i)
+                       if (p->idx[i] >= 0) break;
+               min_off = i >= 0? p->idx[i] : 0;
+       }
+       for (i = min_off; i < p->n; ++i) {
+               if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+               if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+                       return 1; // find the overlap; return
+       }
+       return 0;
+}
+
+int bed_overlap(const void *_h, const char *chr, int beg, int end)
+{
+       const reghash_t *h = (const reghash_t*)_h;
+       khint_t k;
+       if (!h) return 0;
+       k = kh_get(reg, h, chr);
+       if (k == kh_end(h)) return 0;
+       return bed_overlap_core(&kh_val(h, k), beg, end);
+}
+
+void *bed_read(const char *fn)
+{
+       reghash_t *h = kh_init(reg);
+       gzFile fp;
+       kstream_t *ks;
+       int dret;
+       kstring_t *str;
+       // read the list
+       fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+       if (fp == 0) return 0;
+       str = calloc(1, sizeof(kstring_t));
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+               int beg = -1, end = -1;
+               bed_reglist_t *p;
+               khint_t k = kh_get(reg, h, str->s);
+               if (k == kh_end(h)) { // absent from the hash table
+                       int ret;
+                       char *s = strdup(str->s);
+                       k = kh_put(reg, h, s, &ret);
+                       memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+               }
+               p = &kh_val(h, k);
+               if (dret != '\n') { // if the lines has other characters
+                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+                               beg = atoi(str->s); // begin
+                               if (dret != '\n') {
+                                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
+                                               end = atoi(str->s); // end
+                               }
+                       }
+               }
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+               if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+               if (beg >= 0 && end > beg) {
+                       if (p->n == p->m) {
+                               p->m = p->m? p->m<<1 : 4;
+                               p->a = realloc(p->a, p->m * 8);
+                       }
+                       p->a[p->n++] = (uint64_t)beg<<32 | end;
+               }
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       bed_index(h);
+       return h;
+}
+
+void bed_destroy(void *_h)
+{
+       reghash_t *h = (reghash_t*)_h;
+       khint_t k;
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       free(kh_val(h, k).a);
+                       free(kh_val(h, k).idx);
+                       free((char*)kh_key(h, k));
+               }
+       }
+       kh_destroy(reg, h);
+}
diff --git a/bgzf.c b/bgzf.c
index 94e6194700f6d9f633939804a7bade4bf58c9df2..216cd04527f1ad187420cf1fab781e1d72e485b8 100644 (file)
--- a/bgzf.c
+++ b/bgzf.c
@@ -111,7 +111,7 @@ report_error(BGZF* fp, const char* message) {
     fp->error = message;
 }
 
-int is_bgzipped(const char *fn)
+int bgzf_check_bgzf(const char *fn)
 {
     BGZF *fp;
     uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
@@ -119,7 +119,7 @@ int is_bgzipped(const char *fn)
 
     if ((fp = bgzf_open(fn, "r")) == 0) 
     {
-        fprintf(stderr, "[is_bgzipped] failed to open the file: %s\n",fn);
+        fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
         return -1;
     }
 
@@ -174,7 +174,7 @@ open_read(int fd)
 
 static
 BGZF*
-open_write(int fd, bool is_uncompressed)
+open_write(int fd, int compress_level) // compress_level==-1 for the default level
 {
     FILE* file = fdopen(fd, "w");
     BGZF* fp;
@@ -182,7 +182,9 @@ open_write(int fd, bool is_uncompressed)
        fp = malloc(sizeof(BGZF));
     fp->file_descriptor = fd;
     fp->open_mode = 'w';
-    fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
+    fp->owned_file = 0;
+       fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+       if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
 #ifdef _USE_KNETFILE
     fp->x.fpw = file;
 #else
@@ -203,7 +205,7 @@ BGZF*
 bgzf_open(const char* __restrict path, const char* __restrict mode)
 {
     BGZF* fp = NULL;
-    if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
+    if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
 #ifdef _USE_KNETFILE
                knetFile *file = knet_open(path, mode);
                if (file == 0) return 0;
@@ -220,18 +222,23 @@ bgzf_open(const char* __restrict path, const char* __restrict mode)
                if (fd == -1) return 0;
         fp = open_read(fd);
 #endif
-    } else if (mode[0] == 'w' || mode[0] == 'W') {
-               int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
+    } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+               int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
 #ifdef _WIN32
                oflag |= O_BINARY;
 #endif
                fd = open(path, oflag, 0666);
                if (fd == -1) return 0;
-        fp = open_write(fd, strstr(mode, "u")? 1 : 0);
-    }
-    if (fp != NULL) {
-        fp->owned_file = 1;
+               { // set compress_level
+                       int i;
+                       for (i = 0; mode[i]; ++i)
+                               if (mode[i] >= '0' && mode[i] <= '9') break;
+                       if (mode[i]) compress_level = (int)mode[i] - '0';
+                       if (strchr(mode, 'u')) compress_level = 0;
+               }
+        fp = open_write(fd, compress_level);
     }
+    if (fp != NULL) fp->owned_file = 1;
     return fp;
 }
 
@@ -242,7 +249,12 @@ bgzf_fdopen(int fd, const char * __restrict mode)
     if (mode[0] == 'r' || mode[0] == 'R') {
         return open_read(fd);
     } else if (mode[0] == 'w' || mode[0] == 'W') {
-        return open_write(fd, strstr(mode, "u")? 1 : 0);
+               int i, compress_level = -1;
+               for (i = 0; mode[i]; ++i)
+                       if (mode[i] >= '0' && mode[i] <= '9') break;
+               if (mode[i]) compress_level = (int)mode[i] - '0';
+               if (strchr(mode, 'u')) compress_level = 0;
+        return open_write(fd, compress_level);
     } else {
         return NULL;
     }
@@ -282,7 +294,6 @@ deflate_block(BGZF* fp, int block_length)
     int input_length = block_length;
     int compressed_length = 0;
     while (1) {
-               int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
         z_stream zs;
         zs.zalloc = NULL;
         zs.zfree = NULL;
@@ -291,7 +302,7 @@ deflate_block(BGZF* fp, int block_length)
         zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
         zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
 
-        int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
+        int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
                                   GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
         if (status != Z_OK) {
             report_error(fp, "deflate init failed");
@@ -358,6 +369,7 @@ inflate_block(BGZF* fp, int block_length)
     // Inflate the block in fp->compressed_block into fp->uncompressed_block
 
     z_stream zs;
+       int status;
     zs.zalloc = NULL;
     zs.zfree = NULL;
     zs.next_in = fp->compressed_block + 18;
@@ -365,7 +377,7 @@ inflate_block(BGZF* fp, int block_length)
     zs.next_out = fp->uncompressed_block;
     zs.avail_out = fp->uncompressed_block_size;
 
-    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    status = inflateInit2(&zs, GZIP_WINDOW_BITS);
     if (status != Z_OK) {
         report_error(fp, "inflate init failed");
         return -1;
@@ -459,15 +471,15 @@ int
 bgzf_read_block(BGZF* fp)
 {
     bgzf_byte_t header[BLOCK_HEADER_LENGTH];
-       int size = 0;
+       int count, size = 0, block_length, remaining;
 #ifdef _USE_KNETFILE
     int64_t block_address = knet_tell(fp->x.fpr);
        if (load_block_from_cache(fp, block_address)) return 0;
-    int count = knet_read(fp->x.fpr, header, sizeof(header));
+    count = knet_read(fp->x.fpr, header, sizeof(header));
 #else
     int64_t block_address = ftello(fp->file);
        if (load_block_from_cache(fp, block_address)) return 0;
-    int count = fread(header, 1, sizeof(header), fp->file);
+    count = fread(header, 1, sizeof(header), fp->file);
 #endif
     if (count == 0) {
         fp->block_length = 0;
@@ -482,10 +494,10 @@ bgzf_read_block(BGZF* fp)
         report_error(fp, "invalid block header");
         return -1;
     }
-    int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+    block_length = unpackInt16((uint8_t*)&header[16]) + 1;
     bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
     memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
-    int remaining = block_length - BLOCK_HEADER_LENGTH;
+    remaining = block_length - BLOCK_HEADER_LENGTH;
 #ifdef _USE_KNETFILE
     count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
 #else
@@ -497,9 +509,7 @@ bgzf_read_block(BGZF* fp)
     }
        size += count;
     count = inflate_block(fp, block_length);
-    if (count < 0) {
-        return -1;
-    }
+    if (count < 0) return -1;
     if (fp->block_length != 0) {
         // Do not reset offset if this read follows a seek.
         fp->block_offset = 0;
@@ -524,7 +534,8 @@ bgzf_read(BGZF* fp, void* data, int length)
     int bytes_read = 0;
     bgzf_byte_t* output = data;
     while (bytes_read < length) {
-        int available = fp->block_length - fp->block_offset;
+        int copy_length, available = fp->block_length - fp->block_offset;
+               bgzf_byte_t *buffer;
         if (available <= 0) {
             if (bgzf_read_block(fp) != 0) {
                 return -1;
@@ -534,8 +545,8 @@ bgzf_read(BGZF* fp, void* data, int length)
                 break;
             }
         }
-        int copy_length = bgzf_min(length-bytes_read, available);
-        bgzf_byte_t* buffer = fp->uncompressed_block;
+        copy_length = bgzf_min(length-bytes_read, available);
+        buffer = fp->uncompressed_block;
         memcpy(output, buffer + fp->block_offset, copy_length);
         fp->block_offset += copy_length;
         output += copy_length;
@@ -553,19 +564,16 @@ bgzf_read(BGZF* fp, void* data, int length)
     return bytes_read;
 }
 
-static
-int
-flush_block(BGZF* fp)
+int bgzf_flush(BGZF* fp)
 {
     while (fp->block_offset > 0) {
-        int block_length = deflate_block(fp, fp->block_offset);
-        if (block_length < 0) {
-            return -1;
-        }
+        int count, block_length;
+               block_length = deflate_block(fp, fp->block_offset);
+        if (block_length < 0) return -1;
 #ifdef _USE_KNETFILE
-        int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
+        count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
 #else
-        int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+        count = fwrite(fp->compressed_block, 1, block_length, fp->file);
 #endif
         if (count != block_length) {
             report_error(fp, "write failed");
@@ -576,21 +584,28 @@ flush_block(BGZF* fp)
     return 0;
 }
 
-int
-bgzf_write(BGZF* fp, const void* data, int length)
+int bgzf_flush_try(BGZF *fp, int size)
 {
+       if (fp->block_offset + size > fp->uncompressed_block_size)
+               return bgzf_flush(fp);
+       return -1;
+}
+
+int bgzf_write(BGZF* fp, const void* data, int length)
+{
+       const bgzf_byte_t *input = data;
+       int block_length, bytes_written;
     if (fp->open_mode != 'w') {
         report_error(fp, "file not open for writing");
         return -1;
     }
 
-    if (fp->uncompressed_block == NULL) {
+    if (fp->uncompressed_block == NULL)
         fp->uncompressed_block = malloc(fp->uncompressed_block_size);
-    }
 
-    const bgzf_byte_t* input = data;
-    int block_length = fp->uncompressed_block_size;
-    int bytes_written = 0;
+    input = data;
+    block_length = fp->uncompressed_block_size;
+    bytes_written = 0;
     while (bytes_written < length) {
         int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
         bgzf_byte_t* buffer = fp->uncompressed_block;
@@ -599,7 +614,7 @@ bgzf_write(BGZF* fp, const void* data, int length)
         input += copy_length;
         bytes_written += copy_length;
         if (fp->block_offset == block_length) {
-            if (flush_block(fp) != 0) {
+            if (bgzf_flush(fp) != 0) {
                 break;
             }
         }
@@ -607,13 +622,10 @@ bgzf_write(BGZF* fp, const void* data, int length)
     return bytes_written;
 }
 
-int
-bgzf_close(BGZF* fp)
+int bgzf_close(BGZF* fp)
 {
     if (fp->open_mode == 'w') {
-        if (flush_block(fp) != 0) {
-            return -1;
-        }
+        if (bgzf_flush(fp) != 0) return -1;
                { // add an empty block
                        int count, block_length = deflate_block(fp, 0);
 #ifdef _USE_KNETFILE
@@ -638,9 +650,7 @@ bgzf_close(BGZF* fp)
                else ret = knet_close(fp->x.fpr);
         if (ret != 0) return -1;
 #else
-        if (fclose(fp->file) != 0) {
-            return -1;
-        }
+        if (fclose(fp->file) != 0) return -1;
 #endif
     }
     free(fp->uncompressed_block);
@@ -674,9 +684,11 @@ int bgzf_check_EOF(BGZF *fp)
        return (memcmp(magic, buf, 28) == 0)? 1 : 0;
 }
 
-int64_t
-bgzf_seek(BGZF* fp, int64_t pos, int where)
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
 {
+       int block_offset;
+       int64_t block_address;
+
     if (fp->open_mode != 'r') {
         report_error(fp, "file not open for read");
         return -1;
@@ -685,8 +697,8 @@ bgzf_seek(BGZF* fp, int64_t pos, int where)
         report_error(fp, "unimplemented seek option");
         return -1;
     }
-    int block_offset = pos & 0xFFFF;
-    int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+    block_offset = pos & 0xFFFF;
+    block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
 #ifdef _USE_KNETFILE
     if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
 #else
diff --git a/bgzf.h b/bgzf.h
index 70f497e819442adc843f66c4023012fade6005e8..7295f37425df320199aef87bb7bb80e482cc38bd 100644 (file)
--- a/bgzf.h
+++ b/bgzf.h
@@ -26,7 +26,6 @@
 
 #include <stdint.h>
 #include <stdio.h>
-#include <stdbool.h>
 #include <zlib.h>
 #ifdef _USE_KNETFILE
 #include "knetfile.h"
@@ -37,7 +36,7 @@
 typedef struct {
     int file_descriptor;
     char open_mode;  // 'r' or 'w'
-    bool owned_file, is_uncompressed;
+    int16_t owned_file, compress_level;
 #ifdef _USE_KNETFILE
        union {
                knetFile *fpr;
@@ -62,13 +61,6 @@ typedef struct {
 extern "C" {
 #endif
 
-/* 
- * Checks the magic string of the file. Returns 1
- *  for bgzipped files, -1 on errors and 0 for files
- *  without the bgzip magic string.
- */
-int is_bgzipped(const char *path);
-
 /*
  * Open an existing file descriptor for reading or writing.
  * Mode must be either "r" or "w".
@@ -133,8 +125,10 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
 void bgzf_set_cache_size(BGZF *fp, int cache_size);
 
 int bgzf_check_EOF(BGZF *fp);
-
 int bgzf_read_block(BGZF* fp);
+int bgzf_flush(BGZF* fp);
+int bgzf_flush_try(BGZF *fp, int size);
+int bgzf_check_bgzf(const char *fn);
 
 #ifdef __cplusplus
 }
index eb3f24c6ba9cd8cf5c4551ced2edfd08518ce9e7..84a3c841cb94ec6287a948bc8ef238afcd653736 100644 (file)
Binary files a/example.gtf.gz.tbi and b/example.gtf.gz.tbi differ
diff --git a/index.c b/index.c
index 4ed32c2c2f4c83f89c7882f028a084276d340c75..8f254e39e76a737b5d88a2eb108c35a32cb76254 100644 (file)
--- a/index.c
+++ b/index.c
@@ -156,32 +156,31 @@ static int get_tid(ti_index_t *idx, const char *ss)
        return tid;
 }
 
-static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
+int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv)
 {
-       int i, b = 0, id = 1;
+       int i, b = 0, id = 1, ncols = 0;
        char *s;
-       intv->tid = intv->beg = intv->end = intv->bin = -1;
-       for (i = 0; i <= str->l; ++i) {
-               if (str->s[i] == '\t' || str->s[i] == 0) {
-                       if (id == idx->conf.sc) {
-                               str->s[i] = 0;
-                               intv->tid = get_tid(idx, str->s + b);
-                               if (i != str->l) str->s[i] = '\t';
-                       } else if (id == idx->conf.bc) {
+       intv->ss = intv->se = 0; intv->beg = intv->end = -1;
+       for (i = 0; i <= len; ++i) {
+               if (line[i] == '\t' || line[i] == 0) {
+            ++ncols;
+                       if (id == conf->sc) {
+                               intv->ss = line + b; intv->se = line + i;
+                       } else if (id == conf->bc) {
                                // here ->beg is 0-based.
-                               intv->beg = intv->end = strtol(str->s + b, &s, 0);
-                               if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg;
+                               intv->beg = intv->end = strtol(line + b, &s, 0);
+                               if (!(conf->preset&TI_FLAG_UCSC)) --intv->beg;
                                else ++intv->end;
                                if (intv->beg < 0) intv->beg = 0;
                                if (intv->end < 1) intv->end = 1;
                        } else {
-                               if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) {
-                                       if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0);
-                               } else if ((idx->conf.preset&0xffff) == TI_PRESET_SAM) {
+                               if ((conf->preset&0xffff) == TI_PRESET_GENERIC) {
+                                       if (id == conf->ec) intv->end = strtol(line + b, &s, 0);
+                               } else if ((conf->preset&0xffff) == TI_PRESET_SAM) {
                                        if (id == 6) { // CIGAR
                                                int l = 0, op;
                                                char *t;
-                                               for (s = str->s + b; s < str->s + i;) {
+                                               for (s = line + b; s < line + i;) {
                                                        long x = strtol(s, &t, 10);
                                                        op = toupper(*t);
                                                        if (op == 'M' || op == 'D' || op == 'N') l += x;
@@ -190,21 +189,21 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
                                                if (l == 0) l = 1;
                                                intv->end = intv->beg + l;
                                        }
-                               } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) {
+                               } else if ((conf->preset&0xffff) == TI_PRESET_VCF) {
                                        // FIXME: the following is NOT tested and is likely to be buggy
                                        if (id == 4) {
                                                if (b < i) intv->end = intv->beg + (i - b);
                                        } else if (id == 8) { // look for "END="
-                                               int c = str->s[i];
-                                               str->s[i] = 0;
-                                               s = strstr(str->s + b, "END=");
-                                               if (s == str->s + b) s += 4;
+                                               int c = line[i];
+                                               line[i] = 0;
+                                               s = strstr(line + b, "END=");
+                                               if (s == line + b) s += 4;
                                                else if (s) {
-                                                       s = strstr(str->s + b, ";END=");
+                                                       s = strstr(line + b, ";END=");
                                                        if (s) s += 5;
                                                }
                                                if (s) intv->end = strtol(s, &s, 0);
-                                               str->s[i] = c;
+                                               line[i] = c;
                                        }
                                }
                        }
@@ -212,11 +211,33 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
                        ++id;
                }
        }
-       if (intv->tid < 0 || intv->beg < 0 || intv->end < 0) return -1;
-       intv->bin = ti_reg2bin(intv->beg, intv->end);
+/*
+       if (ncols < conf->sc || ncols < conf->bc || ncols < conf->ec) {
+               if (ncols == 1) fprintf(stderr,"[get_intv] Is the file tab-delimited? The line has %d field only: %s\n", ncols, line);
+               else fprintf(stderr,"[get_intv] The line has %d field(s) only: %s\n", ncols, line);
+               exit(1);
+       }
+*/
+       if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
        return 0;
 }
 
+static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
+{
+       ti_interval_t x;
+       intv->tid = intv->beg = intv->end = intv->bin = -1;
+       if (ti_get_intv(&idx->conf, str->l, str->s, &x) == 0) {
+               int c = *x.se;
+               *x.se = '\0'; intv->tid = get_tid(idx, x.ss); *x.se = c;
+               intv->beg = x.beg; intv->end = x.end;
+               intv->bin = ti_reg2bin(intv->beg, intv->end);
+               return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
+       } else {
+               fprintf(stderr, "[%s] the following line cannot be parsed and skipped: %s\n", __func__, str->s);
+               return -1;
+       }
+}
+
 /************
  * indexing *
  ************/
@@ -322,10 +343,15 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
                        continue;
                }
                get_intv(idx, str, &intv);
+        if ( intv.beg<0 || intv.end<0 )
+        {
+            fprintf(stderr,"[ti_index_core] the indexes overlap or are out of bounds\n");
+            exit(1);
+        }
                if (last_tid != intv.tid) { // change of chromosomes
             if (last_tid>intv.tid )
             {
-                fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted?\n",(unsigned long long)lineno);
+                fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted? [pos %d]\n",(unsigned long long)lineno,intv.beg+1);
                 exit(1);
             }
                        last_tid = intv.tid;
@@ -902,6 +928,8 @@ int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *d
        return 0;
 }
 
+const ti_conf_t *ti_get_conf(ti_index_t *idx) { return idx? &idx->conf : 0; }
+
 /*******************
  * High-level APIs *
  *******************/
diff --git a/kseq.h b/kseq.h
new file mode 100644 (file)
index 0000000..82face0
--- /dev/null
+++ b/kseq.h
@@ -0,0 +1,227 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+  2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
+ */
+
+/* Last Modified: 12APR2009 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_MAX   1
+
+#define __KS_TYPE(type_t)                                              \
+       typedef struct __kstream_t {                            \
+               unsigned char *buf;                                             \
+               int begin, end, is_eof;                                 \
+               type_t f;                                                               \
+       } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize)                                                          \
+       static inline kstream_t *ks_init(type_t f)                                              \
+       {                                                                                                                               \
+               kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));       \
+               ks->f = f;                                                                                                      \
+               ks->buf = malloc(__bufsize);                                                            \
+               return ks;                                                                                                      \
+       }                                                                                                                               \
+       static inline void ks_destroy(kstream_t *ks)                                    \
+       {                                                                                                                               \
+               if (ks) {                                                                                                       \
+                       free(ks->buf);                                                                                  \
+                       free(ks);                                                                                               \
+               }                                                                                                                       \
+       }
+
+#define __KS_GETC(__read, __bufsize)                                           \
+       static inline int ks_getc(kstream_t *ks)                                \
+       {                                                                                                               \
+               if (ks->is_eof && ks->begin >= ks->end) return -1;      \
+               if (ks->begin >= ks->end) {                                                     \
+                       ks->begin = 0;                                                                  \
+                       ks->end = __read(ks->f, ks->buf, __bufsize);    \
+                       if (ks->end < __bufsize) ks->is_eof = 1;                \
+                       if (ks->end == 0) return -1;                                    \
+               }                                                                                                       \
+               return (int)ks->buf[ks->begin++];                                       \
+       }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+       size_t l, m;
+       char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize)                                                               \
+       static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       {                                                                                                                                       \
+               if (dret) *dret = 0;                                                                                    \
+               str->l = 0;                                                                                                             \
+               if (ks->begin >= ks->end && ks->is_eof) return -1;                              \
+               for (;;) {                                                                                                              \
+                       int i;                                                                                                          \
+                       if (ks->begin >= ks->end) {                                                                     \
+                               if (!ks->is_eof) {                                                                              \
+                                       ks->begin = 0;                                                                          \
+                                       ks->end = __read(ks->f, ks->buf, __bufsize);            \
+                                       if (ks->end < __bufsize) ks->is_eof = 1;                        \
+                                       if (ks->end == 0) break;                                                        \
+                               } else break;                                                                                   \
+                       }                                                                                                                       \
+                       if (delimiter > KS_SEP_MAX) {                                                           \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (ks->buf[i] == delimiter) break;                                     \
+                       } else if (delimiter == KS_SEP_SPACE) {                                         \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (isspace(ks->buf[i])) break;                                         \
+                       } else if (delimiter == KS_SEP_TAB) {                                           \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+                       } else i = 0; /* never come to here! */                                         \
+                       if (str->m - str->l < i - ks->begin + 1) {                                      \
+                               str->m = str->l + (i - ks->begin) + 1;                                  \
+                               kroundup32(str->m);                                                                             \
+                               str->s = (char*)realloc(str->s, str->m);                                \
+                       }                                                                                                                       \
+                       memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+                       str->l = str->l + (i - ks->begin);                                                      \
+                       ks->begin = i + 1;                                                                                      \
+                       if (i < ks->end) {                                                                                      \
+                               if (dret) *dret = ks->buf[i];                                                   \
+                               break;                                                                                                  \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (str->l == 0) {                                                                                              \
+                       str->m = 1;                                                                                                     \
+                       str->s = (char*)calloc(1, 1);                                                           \
+               }                                                                                                                               \
+               str->s[str->l] = '\0';                                                                                  \
+               return str->l;                                                                                                  \
+       }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+       __KS_TYPE(type_t)                                                       \
+       __KS_BASIC(type_t, __bufsize)                           \
+       __KS_GETC(__read, __bufsize)                            \
+       __KS_GETUNTIL(__read, __bufsize)
+
+#define __KSEQ_BASIC(type_t)                                                                                   \
+       static inline kseq_t *kseq_init(type_t fd)                                                      \
+       {                                                                                                                                       \
+               kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                                 \
+               s->f = ks_init(fd);                                                                                             \
+               return s;                                                                                                               \
+       }                                                                                                                                       \
+       static inline void kseq_rewind(kseq_t *ks)                                                      \
+       {                                                                                                                                       \
+               ks->last_char = 0;                                                                                              \
+               ks->f->is_eof = ks->f->begin = ks->f->end = 0;                                  \
+       }                                                                                                                                       \
+       static inline void kseq_destroy(kseq_t *ks)                                                     \
+       {                                                                                                                                       \
+               if (!ks) return;                                                                                                \
+               free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+               ks_destroy(ks->f);                                                                                              \
+               free(ks);                                                                                                               \
+       }
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ                                                                                                            \
+       static int kseq_read(kseq_t *seq)                                                                       \
+       {                                                                                                                                       \
+               int c;                                                                                                                  \
+               kstream_t *ks = seq->f;                                                                                 \
+               if (seq->last_char == 0) { /* then jump to the next header line */ \
+                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');        \
+                       if (c == -1) return -1; /* end of file */                                       \
+                       seq->last_char = c;                                                                                     \
+               } /* the first header char has been read */                                             \
+               seq->comment.l = seq->seq.l = seq->qual.l = 0;                                  \
+               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;                  \
+               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);                 \
+               while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+                       if (isgraph(c)) { /* printable non-space character */           \
+                               if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
+                                       seq->seq.m = seq->seq.l + 2;                                            \
+                                       kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
+                                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+                               }                                                                                                               \
+                               seq->seq.s[seq->seq.l++] = (char)c;                                             \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */            \
+               if (c != '+') return seq->seq.l; /* FASTA */                                    \
+               if (seq->qual.m < seq->seq.m) { /* allocate enough memory */    \
+                       seq->qual.m = seq->seq.m;                                                                       \
+                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);         \
+               }                                                                                                                               \
+               while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+               if (c == -1) return -2; /* we should not stop here */                   \
+               while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)             \
+                       if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
+               seq->qual.s[seq->qual.l] = 0; /* null terminated string */              \
+               seq->last_char = 0;     /* we have not come to the next header line */ \
+               if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
+               return seq->seq.l;                                                                                              \
+       }
+
+#define __KSEQ_TYPE(type_t)                                            \
+       typedef struct {                                                        \
+               kstring_t name, comment, seq, qual;             \
+               int last_char;                                                  \
+               kstream_t *f;                                                   \
+       } kseq_t;
+
+#define KSEQ_INIT(type_t, __read)                              \
+       KSTREAM_INIT(type_t, __read, 4096)                      \
+       __KSEQ_TYPE(type_t)                                                     \
+       __KSEQ_BASIC(type_t)                                            \
+       __KSEQ_READ
+
+#endif
diff --git a/main.c b/main.c
index 8ae6b2e69d78c0b7251635041344e90e02782c85..46642510d6dfdccf0ab5a628826f1fe9844ae310 100644 (file)
--- a/main.c
+++ b/main.c
 #include <stdlib.h>
 #include <stdio.h>
 #include <sys/stat.h>
+#include <errno.h>
 #include "bgzf.h"
 #include "tabix.h"
 
-#define PACKAGE_VERSION "0.2.3 (r876)"
+#define PACKAGE_VERSION "0.2.4 (r949)"
+
+#define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
+
+int reheader_file(const char *header, const char *file, int meta)
+{
+    BGZF *fp = bgzf_open(file,"r");
+    if (bgzf_read_block(fp) != 0 || !fp->block_length)
+        return -1;
+    
+    char *buffer = fp->uncompressed_block;
+    int skip_until = 0;
+
+    if ( buffer[0]==meta )
+    {
+        skip_until = 1;
+
+        // Skip the header
+        while (1)
+        {
+            if ( buffer[skip_until]=='\n' )
+            {
+                skip_until++;
+                if ( skip_until>=fp->block_length )
+                {
+                    if (bgzf_read_block(fp) != 0 || !fp->block_length)
+                        error("no body?\n");
+                    skip_until = 0;
+                }
+                // The header has finished
+                if ( buffer[skip_until]!=meta ) break;
+            }
+            skip_until++;
+            if ( skip_until>=fp->block_length )
+            {
+                if (bgzf_read_block(fp) != 0 || !fp->block_length)
+                    error("no body?\n");
+                skip_until = 0;
+            }
+        }
+    }
+
+    FILE *fh = fopen(header,"r");
+    if ( !fh )
+        error("%s: %s", header,strerror(errno));
+    int page_size = getpagesize();
+    char *buf = valloc(page_size);
+    BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w");
+    ssize_t nread;
+    while ( (nread=fread(buf,1,page_size-1,fh))>0 )
+    {
+        if ( nread<page_size-1 && buf[nread-1]!='\n' )
+            buf[nread++] = '\n';
+        if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error);
+    }
+    fclose(fh);
+
+    if ( fp->block_length - skip_until > 0 )
+    {
+        if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) 
+            error("Error: %s\n",fp->error);
+    }
+    if (bgzf_flush(bgzf_out) < 0) 
+        error("Error: %s\n",bgzf_out->error);
+
+    while (1)
+    {
+#ifdef _USE_KNETFILE
+        nread = knet_read(fp->x.fpr, buf, page_size);
+#else
+        nread = fread(buf, 1, page_size, fp->file);
+#endif
+        if ( nread<=0 ) 
+            break;
+
+#ifdef _USE_KNETFILE
+        int count = fwrite(buf, 1, nread, bgzf_out->x.fpw);
+#else
+        int count = fwrite(buf, 1, nread, bgzf_out->file);
+#endif
+        if (count != nread)
+            error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
+    }
+
+    if (bgzf_close(bgzf_out) < 0) 
+        error("Error: %s\n",bgzf_out->error);
+   
+    return 0;
+}
+
 
 int main(int argc, char *argv[])
 {
-       int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0;
+       int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0;
        ti_conf_t conf = ti_conf_gff;
-       while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhf")) >= 0) {
+    const char *reheader = NULL;
+       while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) {
                switch (c) {
+               case 'B': bed_reg = 1; break;
                case '0': conf.preset |= TI_FLAG_UCSC; break;
                case 'S': skip = atoi(optarg); break;
                case 'c': meta = optarg[0]; break;
@@ -34,6 +126,7 @@ int main(int argc, char *argv[])
         case 'l': list_chrms = 1; break;
         case 'h': print_header = 1; break;
                case 'f': force = 1; break;
+        case 'r': reheader = optarg; break;
                }
        }
        if (skip >= 0) conf.line_skip = skip;
@@ -46,11 +139,13 @@ int main(int argc, char *argv[])
                fprintf(stderr, "Options: -p STR     preset: gff, bed, sam, vcf, psltbl [gff]\n");
                fprintf(stderr, "         -s INT     sequence name column [1]\n");
                fprintf(stderr, "         -b INT     start column [4]\n");
-               fprintf(stderr, "         -e INT     end column [5]\n");
+               fprintf(stderr, "         -e INT     end column; can be identical to '-b' [5]\n");
                fprintf(stderr, "         -S INT     skip first INT lines [0]\n");
                fprintf(stderr, "         -c CHAR    symbol for comment/meta lines [#]\n");
+           fprintf(stderr, "         -r FILE    replace the header with the content of FILE [null]\n");
+               fprintf(stderr, "         -B         region1 is a BED file (entire file will be read)\n");
                fprintf(stderr, "         -0         zero-based coordinate\n");
-               fprintf(stderr, "         -h         print the VCF header\n");
+               fprintf(stderr, "         -h         print the header lines\n");
                fprintf(stderr, "         -l         list chromosome names\n");
                fprintf(stderr, "         -f         force to overwrite the index\n");
                fprintf(stderr, "\n");
@@ -71,27 +166,49 @@ int main(int argc, char *argv[])
                ti_index_destroy(idx);
                return 0;
        }
+    if (reheader)
+        return reheader_file(reheader,argv[optind],conf.meta_char);
+
+       struct stat stat_tbi,stat_vcf;
+    char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
+       strcat(strcpy(fnidx, argv[optind]), ".tbi");
+
        if (optind + 1 == argc) {
                if (force == 0) {
-                       struct stat buf;
-                       char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
-                       strcat(strcpy(fnidx, argv[optind]), ".tbi");
-                       if (stat(fnidx, &buf) == 0) {
-                               fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
-                               free(fnidx);
-                               return 1;
+                       if (stat(fnidx, &stat_tbi) == 0) 
+            {
+                // Before complaining, check if the VCF file isn't newer. This is a common source of errors,
+                //  people tend not to notice that tabix failed
+                stat(argv[optind], &stat_vcf);
+                if ( stat_vcf.st_mtime <= stat_tbi.st_mtime )
+                {
+                    fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
+                    free(fnidx);
+                    return 1;
+                }
                        }
-                       free(fnidx);
                }
-        if ( is_bgzipped(argv[optind])!=1 )
+        if ( bgzf_check_bgzf(argv[optind])!=1 )
         {
             fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
+            free(fnidx);
             return 1;
         }
                return ti_index_build(argv[optind], &conf);
        }
        { // retrieve
                tabix_t *t;
+        // Common source of errors: new VCF is used with an old index
+        stat(fnidx, &stat_tbi);
+        stat(argv[optind], &stat_vcf);
+        if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
+        {
+            fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n");
+            free(fnidx);
+            return 1;
+        }
+        free(fnidx);
+
                if ((t = ti_open(argv[optind], 0)) == 0) {
                        fprintf(stderr, "[main] fail to open the data file.\n");
                        return 1;
@@ -106,36 +223,65 @@ int main(int argc, char *argv[])
                        }
                        ti_iter_destroy(iter);
                } else { // retrieve from specified regions
-                       int i;
-                       if ( ti_lazy_index_load(t) ) 
-            {
+                       int i, len;
+            ti_iter_t iter;
+            const char *s;
+                       const ti_conf_t *idxconf;
+
+                       if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
                 fprintf(stderr,"[tabix] failed to load the index file.\n");
                 return 1;
             }
+                       idxconf = ti_get_conf(t->idx);
 
-            ti_iter_t iter;
-            const char *s;
-            int len;
             if ( print_header )
             {
                 // If requested, print the header lines here
                 iter = ti_query(t, 0, 0, 0);
                 while ((s = ti_read(t, iter, &len)) != 0) {
-                    if ( *s != '#' ) break;
+                    if ((int)(*s) != idxconf->meta_char) break;
                     fputs(s, stdout); fputc('\n', stdout);
                 }
                 ti_iter_destroy(iter);
             }
-                       for (i = optind + 1; i < argc; ++i) {
-                               int tid, beg, end;
-                               if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
-                                       iter = ti_queryi(t, tid, beg, end);
-                                       while ((s = ti_read(t, iter, &len)) != 0) {
-                                               fputs(s, stdout); fputc('\n', stdout);
+                       if (bed_reg) {
+                               extern int bed_overlap(const void *_h, const char *chr, int beg, int end);
+                               extern void *bed_read(const char *fn);
+                               extern void bed_destroy(void *_h);
+
+                               const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available
+                               void *bed = bed_read(argv[optind+1]); // load the BED file
+                               ti_interval_t intv;
+
+                               if (bed == 0) {
+                                       fprintf(stderr, "[main] fail to read the BED file.\n");
+                                       return 1;
+                               }
+                               iter = ti_query(t, 0, 0, 0);
+                               while ((s = ti_read(t, iter, &len)) != 0) {
+                                       int c;
+                                       ti_get_intv(conf_, len, (char*)s, &intv);
+                                       c = *intv.se; *intv.se = '\0';
+                                       if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) {
+                                               *intv.se = c;
+                                               puts(s);
                                        }
-                                       ti_iter_destroy(iter);
-                               } 
-                // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
+                                       *intv.se = c;
+                               }
+                ti_iter_destroy(iter);
+                               bed_destroy(bed);
+                       } else {
+                               for (i = optind + 1; i < argc; ++i) {
+                                       int tid, beg, end;
+                                       if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
+                                               iter = ti_queryi(t, tid, beg, end);
+                                                       while ((s = ti_read(t, iter, &len)) != 0) {
+                                                       fputs(s, stdout); fputc('\n', stdout);
+                                               }
+                                               ti_iter_destroy(iter);
+                                       } 
+                   // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
+                               }
                        }
                }
                ti_close(t);
diff --git a/python/setup.py b/python/setup.py
new file mode 100644 (file)
index 0000000..2771eda
--- /dev/null
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+#
+# The MIT License
+#
+# Copyright (c) 2011 Seoul National University.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
+
+from distutils.core import setup, Extension
+
+# Change this to True when you need the knetfile support.
+USE_KNETFILE = False
+
+TABIX_SOURCE_FILES = [
+    '../bgzf.c', '../bgzip.c', '../index.c', '../knetfile.c', '../kstring.c'
+]
+
+define_options = [('_FILE_OFFSET_BITS', 64)]
+if USE_KNETFILE:
+    define_options.append(('_USE_KNETFILE', 1))
+
+ext_modules = [Extension("tabix", ["tabixmodule.c"] + TABIX_SOURCE_FILES,
+                         include_dirs=['..'],
+                         libraries=['z'],
+                         define_macros=define_options)]
+
+setup (name = 'tabix',
+       version = '1.0',
+       description = 'Python interface to tabix, a generic indexer '
+                     'for TAB-delimited genome position files',
+       author = 'Hyeshik Chang',
+       author_email = 'hyeshik@snu.ac.kr',
+       license = 'MIT',
+       ext_modules = ext_modules
+)
diff --git a/python/tabixmodule.c b/python/tabixmodule.c
new file mode 100644 (file)
index 0000000..d04d097
--- /dev/null
@@ -0,0 +1,408 @@
+/*-
+ * The MIT License
+ *
+ * Copyright (c) 2011 Seoul National University.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "tabix.h"
+
+static PyObject *TabixError;
+
+typedef struct {
+    PyObject_HEAD
+    tabix_t *tb;
+    char *fn;
+} TabixObject;
+
+typedef struct {
+    PyObject_HEAD
+    TabixObject *tbobj;
+    ti_iter_t iter;
+} TabixIteratorObject;
+
+static PyTypeObject Tabix_Type, TabixIterator_Type;
+
+/* --- TabixIterator --------------------------------------------------- */
+
+static PyObject *
+tabixiter_create(TabixObject *parentidx, ti_iter_t iter)
+{
+    TabixIteratorObject *self;
+
+    self = PyObject_New(TabixIteratorObject, &TabixIterator_Type);
+    if (self == NULL)
+        return NULL;
+
+    Py_INCREF(parentidx);
+    self->tbobj = parentidx;
+    self->iter = iter;
+
+    return (PyObject *)self;
+}
+
+static void
+tabixiter_dealloc(TabixIteratorObject *self)
+{
+    ti_iter_destroy(self->iter);
+    Py_DECREF(self->tbobj);
+    PyObject_Del(self);
+}
+
+static PyObject *
+tabixiter_iter(PyObject *self)
+{
+    Py_INCREF(self);
+    return self;
+}
+
+#if PY_MAJOR_VERSION < 3
+# define PYOBJECT_FROM_STRING_AND_SIZE PyString_FromStringAndSize
+#else
+# define PYOBJECT_FROM_STRING_AND_SIZE PyUnicode_FromStringAndSize
+#endif
+
+static PyObject *
+tabixiter_iternext(TabixIteratorObject *self)
+{
+    const char *chunk;
+    int len, i;
+
+    chunk = ti_read(self->tbobj->tb, self->iter, &len);
+    if (chunk != NULL) {
+        PyObject *ret, *column;
+        Py_ssize_t colidx;
+        const char *ptr, *begin;
+
+        ret = PyList_New(0);
+        if (ret == NULL)
+            return NULL;
+
+        colidx = 0;
+        ptr = begin = chunk;
+        for (i = len; i > 0; i--, ptr++)
+            if (*ptr == '\t') {
+                column = PYOBJECT_FROM_STRING_AND_SIZE(begin,
+                                                       (Py_ssize_t)(ptr - begin));
+                if (column == NULL || PyList_Append(ret, column) == -1) {
+                    Py_DECREF(ret);
+                    return NULL;
+                }
+
+                Py_DECREF(column);
+                begin = ptr + 1;
+                colidx++;
+            }
+
+        column = PYOBJECT_FROM_STRING_AND_SIZE(begin, (Py_ssize_t)(ptr - begin));
+        if (column == NULL || PyList_Append(ret, column) == -1) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        Py_DECREF(column);
+
+        return ret;
+    }
+    else
+        return NULL;
+}
+
+static PyMethodDef tabixiter_methods[] = {
+    {NULL, NULL} /* sentinel */
+};
+
+static PyTypeObject TabixIterator_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    "tabix.TabixIterator",      /*tp_name*/
+    sizeof(TabixIteratorObject), /*tp_basicsize*/
+    0,                          /*tp_itemsize*/
+    /* methods */
+    (destructor)tabixiter_dealloc,  /*tp_dealloc*/
+    0,                          /*tp_print*/
+    0,                          /*tp_getattr*/
+    0,                          /*tp_setattr*/
+    0,                          /*tp_compare*/
+    0,                          /*tp_repr*/
+    0,                          /*tp_as_number*/
+    0,                          /*tp_as_sequence*/
+    0,                          /*tp_as_mapping*/
+    0,                          /*tp_hash*/
+    0,                          /*tp_call*/
+    0,                          /*tp_str*/
+    0,                          /*tp_getattro*/
+    0,                          /*tp_setattro*/
+    0,                          /*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
+    0,                          /*tp_doc*/
+    0,                          /*tp_traverse*/
+    0,                          /*tp_clear*/
+    0,                          /*tp_richcompare*/
+    0,                          /*tp_weaklistoffset*/
+    tabixiter_iter,             /*tp_iter*/
+    (iternextfunc)tabixiter_iternext, /*tp_iternext*/
+    tabixiter_methods,          /*tp_methods*/
+    0,                          /*tp_members*/
+    0,                          /*tp_getset*/
+    0,                          /*tp_base*/
+    0,                          /*tp_dict*/
+    0,                          /*tp_descr_get*/
+    0,                          /*tp_descr_set*/
+    0,                          /*tp_dictoffset*/
+    0,                          /*tp_init*/
+    0,                          /*tp_alloc*/
+    0,                          /*tp_new*/
+    0,                          /*tp_free*/
+    0,                          /*tp_is_gc*/
+};
+
+
+/* --- Tabix ----------------------------------------------------------- */
+
+static PyObject *
+tabix_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    TabixObject *self;
+    const char *fn, *fnidx=NULL;
+    static char *kwnames[]={"fn", "fnidx", NULL};
+    tabix_t *tb;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|z:Tabix",
+                                     kwnames, &fn, &fnidx))
+        return NULL;
+
+    tb = ti_open(fn, fnidx);
+    if (tb == NULL) {
+        PyErr_SetString(TabixError, "Can't open the index file.");
+        return NULL;
+    }
+
+    self = (TabixObject *)type->tp_alloc(type, 0);
+    if (self == NULL)
+        return NULL;
+
+    self->tb = tb;
+    self->fn = strdup(fn);
+
+    return (PyObject *)self;
+}
+
+static void
+tabix_dealloc(TabixObject *self)
+{
+    free(self->fn);
+    ti_close(self->tb);
+    PyObject_Del(self);
+}
+
+static PyObject *
+tabix_query(TabixObject *self, PyObject *args)
+{
+    char *name;
+    int begin, end;
+    ti_iter_t result;
+
+    if (!PyArg_ParseTuple(args, "sii:query", &name, &begin, &end))
+        return NULL;
+
+    result = ti_query(self->tb, name, begin, end);
+    if (result == NULL) {
+        PyErr_SetString(TabixError, "query failed");
+        return NULL;
+    }
+
+    return tabixiter_create(self, result);
+}
+
+static PyObject *
+tabix_queryi(TabixObject *self, PyObject *args)
+{
+    int tid, begin, end;
+    ti_iter_t result;
+
+    if (!PyArg_ParseTuple(args, "iii:queryi", &tid, &begin, &end))
+        return NULL;
+
+    result = ti_queryi(self->tb, tid, begin, end);
+    if (result == NULL) {
+        PyErr_SetString(TabixError, "query failed");
+        return NULL;
+    }
+
+    return tabixiter_create(self, result);
+}
+
+static PyObject *
+tabix_querys(TabixObject *self, PyObject *args)
+{
+    const char *reg;
+    ti_iter_t result;
+
+    if (!PyArg_ParseTuple(args, "s:querys", &reg))
+        return NULL;
+
+    result = ti_querys(self->tb, reg);
+    if (result == NULL) {
+        PyErr_SetString(TabixError, "query failed");
+        return NULL;
+    }
+
+    return tabixiter_create(self, result);
+}
+
+static PyObject *
+tabix_repr(TabixObject *self)
+{
+#if PY_MAJOR_VERSION < 3
+    return PyString_FromFormat("<Tabix fn=\"%s\">", self->fn);
+#else
+    return PyUnicode_FromFormat("<Tabix fn=\"%s\">", self->fn);
+#endif
+}
+
+static PyMethodDef tabix_methods[] = {
+    {"query",           (PyCFunction)tabix_query, METH_VARARGS,
+        PyDoc_STR("T.query(name, begin, end) -> iterator")},
+    {"queryi",          (PyCFunction)tabix_queryi, METH_VARARGS,
+        PyDoc_STR("T.queryi(tid, begin, id) -> iterator")},
+    {"querys",          (PyCFunction)tabix_querys, METH_VARARGS,
+        PyDoc_STR("T.querys(region) -> iterator")},
+    {NULL,              NULL}           /* sentinel */
+};
+
+static PyTypeObject Tabix_Type = {
+    /* The ob_type field must be initialized in the module init function
+     * to be portable to Windows without using C++. */
+    PyVarObject_HEAD_INIT(NULL, 0)
+    "tabix.Tabix",              /*tp_name*/
+    sizeof(TabixObject),        /*tp_basicsize*/
+    0,                          /*tp_itemsize*/
+    /* methods */
+    (destructor)tabix_dealloc,  /*tp_dealloc*/
+    0,                          /*tp_print*/
+    0,                          /*tp_getattr*/
+    0,                          /*tp_setattr*/
+    0,                          /*tp_compare*/
+    (reprfunc)tabix_repr,       /*tp_repr*/
+    0,                          /*tp_as_number*/
+    0,                          /*tp_as_sequence*/
+    0,                          /*tp_as_mapping*/
+    0,                          /*tp_hash*/
+    0,                          /*tp_call*/
+    0,                          /*tp_str*/
+    0,                          /*tp_getattro*/
+    0,                          /*tp_setattro*/
+    0,                          /*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
+    0,                          /*tp_doc*/
+    0,                          /*tp_traverse*/
+    0,                          /*tp_clear*/
+    0,                          /*tp_richcompare*/
+    0,                          /*tp_weaklistoffset*/
+    0,                          /*tp_iter*/
+    0,                          /*tp_iternext*/
+    tabix_methods,              /*tp_methods*/
+    0,                          /*tp_members*/
+    0,                          /*tp_getset*/
+    0,                          /*tp_base*/
+    0,                          /*tp_dict*/
+    0,                          /*tp_descr_get*/
+    0,                          /*tp_descr_set*/
+    0,                          /*tp_dictoffset*/
+    0,                          /*tp_init*/
+    0,                          /*tp_alloc*/
+    (newfunc)tabix_new,         /*tp_new*/
+    0,                          /*tp_free*/
+    0,                          /*tp_is_gc*/
+};
+/* --------------------------------------------------------------------- */
+
+static PyMethodDef tabix_functions[] = {
+    {NULL, NULL} /* sentinel */
+};
+
+PyDoc_STRVAR(module_doc,
+"Python interface to tabix, Heng Li's generic indexer for TAB-delimited "
+"genome position filesThis is a template module just for instruction.");
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef tabixmodule = { 
+    PyModuleDef_HEAD_INIT,
+    "tabix",
+    module_doc,
+    -1, 
+    tabix_functions,
+    NULL,
+    NULL,
+    NULL,
+    NULL
+};
+#endif
+
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC inittabix(void)
+#else
+PyMODINIT_FUNC PyInit_tabix(void)
+#endif
+{
+    PyObject *m;
+
+    if (PyType_Ready(&Tabix_Type) < 0)
+        goto fail;
+    if (PyType_Ready(&TabixIterator_Type) < 0)
+        goto fail;
+
+#if PY_MAJOR_VERSION < 3
+    m = Py_InitModule3("tabix", tabix_functions, module_doc);
+#else
+    m = PyModule_Create(&tabixmodule);
+#endif
+    if (m == NULL)
+        goto fail;
+
+    if (TabixError == NULL) {
+        TabixError = PyErr_NewException("tabix.error", NULL, NULL);
+        if (TabixError == NULL)
+            goto fail;
+    }
+    Py_INCREF(TabixError);
+    PyModule_AddObject(m, "error", TabixError);
+
+    PyModule_AddObject(m, "Tabix", (PyObject *)&Tabix_Type);
+    PyModule_AddObject(m, "TabixIterator", (PyObject *)&TabixIterator_Type);
+
+#if PY_MAJOR_VERSION >= 3
+    return m;
+#endif
+
+ fail:
+#if PY_MAJOR_VERSION < 3
+    return;
+#else
+    return NULL;
+#endif
+}
diff --git a/python/test.py b/python/test.py
new file mode 100755 (executable)
index 0000000..804a565
--- /dev/null
@@ -0,0 +1,91 @@
+#
+# The MIT License
+#
+# Copyright (c) 2011 Seoul National University.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
+
+import unittest
+import random
+import gzip
+import tabix
+
+EXAMPLEFILE = '../example.gtf.gz'
+
+def load_example_regions(path):
+    alldata = []
+    for line in gzip.GzipFile(EXAMPLEFILE):
+        fields = line.decode('ascii')[:-1].split('\t')
+        seqid = fields[0]
+        begin = int(fields[3])
+        end = int(fields[4])
+        alldata.append((seqid, begin, end, fields[:7]))
+
+    return alldata
+
+def does_overlap(A, B, C, D):
+    return (A <= D <= B) or (C <= B <= D)
+
+def sample_test_dataset(regions, ntests):
+    seqids = [seqid for seqid, _, _, _ in regions]
+    lowerbound = max(0, min(begin for _, begin, _, _ in regions) - 1000)
+    upperbound = max(end for _, _, end, _ in regions) + 1000
+
+    tests = []
+    for i in range(ntests):
+        seqid = random.choice(seqids)
+        low = random.randrange(lowerbound, upperbound)
+        high = random.randrange(low, upperbound)
+
+        # for 1-based both-end inclusive intervals
+        matches = [info for seq, begin, end, info in regions
+                   if seqid == seq and does_overlap(begin, end, low, high)]
+
+        tests.append((seqid, low, high, matches))
+
+    return tests
+
+def tbresult2excerpt(tbmatches):
+    return [fields[:7] for fields in tbmatches]
+
+class TabixTest(unittest.TestCase):
+    regions = load_example_regions(EXAMPLEFILE)
+    testset = sample_test_dataset(regions, 500)
+
+    def setUp(self):
+        self.tb = tabix.Tabix(EXAMPLEFILE)
+
+    def testQuery(self):
+        for seqid, low, high, matches in self.testset:
+            tbresult = tbresult2excerpt(self.tb.query(seqid, low, high))
+            self.assertEqual(tbresult, matches)
+
+    def testQueryS(self):
+        for seqid, low, high, matches in self.testset:
+            tbresult = tbresult2excerpt(self.tb.querys('%s:%d-%d' %
+                                                       (seqid, low, high)))
+            self.assertEqual(tbresult, matches)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tabix.1 b/tabix.1
index 3b82c766d8059d3f41adf66e2e2b95a040d7b3ef..1bd9533d93577db59b6d97e8fcf912637c56b27d 100644 (file)
--- a/tabix.1
+++ b/tabix.1
@@ -7,7 +7,7 @@ tabix - Generic indexer for TAB-delimited genome position files
 .SH SYNOPSIS
 .PP
 .B bgzip
-.RB [ \-cdh ]
+.RB [ \-cdhB ]
 .RB [ \-b
 .IR virtualOffset ]
 .RB [ \-s
@@ -82,6 +82,14 @@ Skip lines started with character CHAR. [#]
 Specify that the position in the data file is 0-based (e.g. UCSC files)
 rather than 1-based.
 .TP
+.B -h
+Print the header/meta lines.
+.TP
+.B -B
+The second argument is a BED file. When this option is in use, the input
+file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless,
+with this option, the format of the input must be specificed correctly on the command line.
+.TP
 .B -f
 Force to overwrite the index file if it is present.
 .TP
diff --git a/tabix.h b/tabix.h
index 4390c0918023936c5e8f17016d6f5532ac84b313..7b4497a35676b494d8bf1c600601b2945545b8b9 100644 (file)
--- a/tabix.h
+++ b/tabix.h
@@ -58,6 +58,11 @@ typedef struct {
        int32_t meta_char, line_skip;
 } ti_conf_t;
 
+typedef struct {
+       int beg, end;
+       char *ss, *se;
+} ti_interval_t;
+
 extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset
 
 #ifdef __cplusplus
@@ -120,6 +125,9 @@ extern "C" {
        /* Get the data line pointed by the iterator and iterate to the next record. */
        const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len);
 
+       const ti_conf_t *ti_get_conf(ti_index_t *idx);
+       int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv);
+
        /*******************
         * Deprecated APIs *
         *******************/