+------------------------------------------------------------------------
+r942 | lh3lh3 | 2011-03-31 16:39:50 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/main.c
+
+update version number
+
+------------------------------------------------------------------------
+r940 | lh3lh3 | 2011-03-31 16:38:03 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/bedidx.c
+ M /trunk/tabix/main.c
+
+fixed two bugs due to recent changes
+
+------------------------------------------------------------------------
+r939 | lh3lh3 | 2011-03-31 16:12:21 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/bgzf.c
+ M /trunk/tabix/bgzf.h
+ M /trunk/tabix/main.c
+
+update to the latest bgzf.*
+
+------------------------------------------------------------------------
+r938 | lh3lh3 | 2011-03-31 16:02:21 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.h
+
+BED support
+
+------------------------------------------------------------------------
+r937 | lh3lh3 | 2011-03-31 15:03:49 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ A /trunk/tabix/bedidx.c
+ M /trunk/tabix/example.gtf.gz.tbi
+ M /trunk/tabix/index.c
+ A /trunk/tabix/kseq.h
+ M /trunk/tabix/tabix.h
+
+restructure get_intv() for BED support
+
+------------------------------------------------------------------------
+r919 | petulda | 2011-02-24 10:14:14 -0500 (Thu, 24 Feb 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/bgzf.c
+ M /trunk/tabix/bgzf.h
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+New -r (reheader) option for efficient header replacement.
+------------------------------------------------------------------------
+r915 | lh3lh3 | 2011-02-22 09:50:57 -0500 (Tue, 22 Feb 2011) | 2 lines
+Changed paths:
+ A /trunk/tabix/python
+ A /trunk/tabix/python/setup.py (from /trunk/tabix/setup.py:914)
+ A /trunk/tabix/python/tabixmodule.c (from /trunk/tabix/tabixmodule.c:914)
+ A /trunk/tabix/python/test.py (from /trunk/tabix/test.py:914)
+ D /trunk/tabix/setup.py
+ D /trunk/tabix/tabixmodule.c
+ D /trunk/tabix/test.py
+
+move to a new python/ directory
+
+------------------------------------------------------------------------
+r914 | lh3lh3 | 2011-02-22 09:49:35 -0500 (Tue, 22 Feb 2011) | 2 lines
+Changed paths:
+ A /trunk/tabix/setup.py
+ A /trunk/tabix/tabixmodule.c
+ A /trunk/tabix/test.py
+
+CPython C-API by Hyeshik Chang
+
+------------------------------------------------------------------------
+r904 | petulda | 2011-01-28 08:06:27 -0500 (Fri, 28 Jan 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/index.c
+
+Check the number of fields on each line and exit nicely without segfault
+------------------------------------------------------------------------
+r901 | petulda | 2011-01-21 06:45:37 -0500 (Fri, 21 Jan 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/main.c
+
+Fix: Complain only when VCF is newer, not newer or same mtime
+------------------------------------------------------------------------
+r900 | petulda | 2011-01-21 04:23:04 -0500 (Fri, 21 Jan 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/main.c
+
+Prevent the common user mistake and check the timestamps of the vcf and index file
+------------------------------------------------------------------------
+r876 | lh3lh3 | 2010-12-08 12:38:45 -0500 (Wed, 08 Dec 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/ChangeLog
+ M /trunk/tabix/NEWS
+ M /trunk/tabix/main.c
+
+Release tabix-0.2.3
+
------------------------------------------------------------------------
r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines
Changed paths:
CC= gcc
CFLAGS= -g -Wall -O2 -fPIC #-m64 #-arch ppc
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
-LOBJS= bgzf.o kstring.o knetfile.o index.o
+LOBJS= bgzf.o kstring.o knetfile.o index.o bedidx.o
AOBJS= main.o
PROG= tabix bgzip
INCLUDES=
index.o:bgzf.h tabix.h khash.h ksort.h kstring.h
main.o:tabix.h kstring.h bgzf.h
bgzip.o:bgzf.h
+bedidx.o:kseq.h khash.h
tabix.pdf:tabix.tex
pdflatex tabix.tex
+Release 0.2.4 (10 April, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Give an error if the index file is older than the data file.
+
+ * Avoid a segfault given flawed input.
+
+ * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to
+ the dynamic library and are reported to be faster. Pysam also comes with a
+ tabix binding.
+
+ * Added option "-r" for efficient header replacement.
+
+ * Added BED support.
+
+ * Synchronized the BGZF library between tabix and samtools.
+
+(0.2.4: 10 April 2011, r949)
+
+
+
Beta Release 0.2.3 (8 December, 2010)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- /dev/null
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct {
+ int n, m;
+ uint64_t *a;
+ int *idx;
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+int *bed_index_core(int n, uint64_t *a, int *n_idx)
+{
+ int i, j, m, *idx;
+ m = *n_idx = 0; idx = 0;
+ for (i = 0; i < n; ++i) {
+ int beg, end;
+ beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
+ if (m < end + 1) {
+ int oldm = m;
+ m = end + 1;
+ kroundup32(m);
+ idx = realloc(idx, m * sizeof(int));
+ for (j = oldm; j < m; ++j) idx[j] = -1;
+ }
+ if (beg == end) {
+ if (idx[beg] < 0) idx[beg] = i;
+ } else {
+ for (j = beg; j <= end; ++j)
+ if (idx[j] < 0) idx[j] = i;
+ }
+ *n_idx = end + 1;
+ }
+ return idx;
+}
+
+void bed_index(void *_h)
+{
+ reghash_t *h = (reghash_t*)_h;
+ khint_t k;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ bed_reglist_t *p = &kh_val(h, k);
+ if (p->idx) free(p->idx);
+ p->idx = bed_index_core(p->n, p->a, &p->m);
+ }
+ }
+}
+
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
+{
+ int i, min_off;
+ if (p->n == 0) return 0;
+ min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
+ if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
+ int n = beg>>LIDX_SHIFT;
+ if (n > p->n) n = p->n;
+ for (i = n - 1; i >= 0; --i)
+ if (p->idx[i] >= 0) break;
+ min_off = i >= 0? p->idx[i] : 0;
+ }
+ for (i = min_off; i < p->n; ++i) {
+ if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+ if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+ return 1; // find the overlap; return
+ }
+ return 0;
+}
+
+int bed_overlap(const void *_h, const char *chr, int beg, int end)
+{
+ const reghash_t *h = (const reghash_t*)_h;
+ khint_t k;
+ if (!h) return 0;
+ k = kh_get(reg, h, chr);
+ if (k == kh_end(h)) return 0;
+ return bed_overlap_core(&kh_val(h, k), beg, end);
+}
+
+void *bed_read(const char *fn)
+{
+ reghash_t *h = kh_init(reg);
+ gzFile fp;
+ kstream_t *ks;
+ int dret;
+ kstring_t *str;
+ // read the list
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) return 0;
+ str = calloc(1, sizeof(kstring_t));
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+ int beg = -1, end = -1;
+ bed_reglist_t *p;
+ khint_t k = kh_get(reg, h, str->s);
+ if (k == kh_end(h)) { // absent from the hash table
+ int ret;
+ char *s = strdup(str->s);
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+ }
+ p = &kh_val(h, k);
+ if (dret != '\n') { // if the lines has other characters
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+ beg = atoi(str->s); // begin
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
+ end = atoi(str->s); // end
+ }
+ }
+ }
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+ if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+ if (beg >= 0 && end > beg) {
+ if (p->n == p->m) {
+ p->m = p->m? p->m<<1 : 4;
+ p->a = realloc(p->a, p->m * 8);
+ }
+ p->a[p->n++] = (uint64_t)beg<<32 | end;
+ }
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ bed_index(h);
+ return h;
+}
+
+void bed_destroy(void *_h)
+{
+ reghash_t *h = (reghash_t*)_h;
+ khint_t k;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ free(kh_val(h, k).a);
+ free(kh_val(h, k).idx);
+ free((char*)kh_key(h, k));
+ }
+ }
+ kh_destroy(reg, h);
+}
fp->error = message;
}
-int is_bgzipped(const char *fn)
+int bgzf_check_bgzf(const char *fn)
{
BGZF *fp;
uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
if ((fp = bgzf_open(fn, "r")) == 0)
{
- fprintf(stderr, "[is_bgzipped] failed to open the file: %s\n",fn);
+ fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
return -1;
}
static
BGZF*
-open_write(int fd, bool is_uncompressed)
+open_write(int fd, int compress_level) // compress_level==-1 for the default level
{
FILE* file = fdopen(fd, "w");
BGZF* fp;
fp = malloc(sizeof(BGZF));
fp->file_descriptor = fd;
fp->open_mode = 'w';
- fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
+ fp->owned_file = 0;
+ fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+ if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
#ifdef _USE_KNETFILE
fp->x.fpw = file;
#else
bgzf_open(const char* __restrict path, const char* __restrict mode)
{
BGZF* fp = NULL;
- if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
+ if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
#ifdef _USE_KNETFILE
knetFile *file = knet_open(path, mode);
if (file == 0) return 0;
if (fd == -1) return 0;
fp = open_read(fd);
#endif
- } else if (mode[0] == 'w' || mode[0] == 'W') {
- int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
+ } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+ int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
#ifdef _WIN32
oflag |= O_BINARY;
#endif
fd = open(path, oflag, 0666);
if (fd == -1) return 0;
- fp = open_write(fd, strstr(mode, "u")? 1 : 0);
- }
- if (fp != NULL) {
- fp->owned_file = 1;
+ { // set compress_level
+ int i;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ }
+ fp = open_write(fd, compress_level);
}
+ if (fp != NULL) fp->owned_file = 1;
return fp;
}
if (mode[0] == 'r' || mode[0] == 'R') {
return open_read(fd);
} else if (mode[0] == 'w' || mode[0] == 'W') {
- return open_write(fd, strstr(mode, "u")? 1 : 0);
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ return open_write(fd, compress_level);
} else {
return NULL;
}
int input_length = block_length;
int compressed_length = 0;
while (1) {
- int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
- int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
+ int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
if (status != Z_OK) {
report_error(fp, "deflate init failed");
// Inflate the block in fp->compressed_block into fp->uncompressed_block
z_stream zs;
+ int status;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_in = fp->compressed_block + 18;
zs.next_out = fp->uncompressed_block;
zs.avail_out = fp->uncompressed_block_size;
- int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+ status = inflateInit2(&zs, GZIP_WINDOW_BITS);
if (status != Z_OK) {
report_error(fp, "inflate init failed");
return -1;
bgzf_read_block(BGZF* fp)
{
bgzf_byte_t header[BLOCK_HEADER_LENGTH];
- int size = 0;
+ int count, size = 0, block_length, remaining;
#ifdef _USE_KNETFILE
int64_t block_address = knet_tell(fp->x.fpr);
if (load_block_from_cache(fp, block_address)) return 0;
- int count = knet_read(fp->x.fpr, header, sizeof(header));
+ count = knet_read(fp->x.fpr, header, sizeof(header));
#else
int64_t block_address = ftello(fp->file);
if (load_block_from_cache(fp, block_address)) return 0;
- int count = fread(header, 1, sizeof(header), fp->file);
+ count = fread(header, 1, sizeof(header), fp->file);
#endif
if (count == 0) {
fp->block_length = 0;
report_error(fp, "invalid block header");
return -1;
}
- int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+ block_length = unpackInt16((uint8_t*)&header[16]) + 1;
bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
- int remaining = block_length - BLOCK_HEADER_LENGTH;
+ remaining = block_length - BLOCK_HEADER_LENGTH;
#ifdef _USE_KNETFILE
count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
#else
}
size += count;
count = inflate_block(fp, block_length);
- if (count < 0) {
- return -1;
- }
+ if (count < 0) return -1;
if (fp->block_length != 0) {
// Do not reset offset if this read follows a seek.
fp->block_offset = 0;
int bytes_read = 0;
bgzf_byte_t* output = data;
while (bytes_read < length) {
- int available = fp->block_length - fp->block_offset;
+ int copy_length, available = fp->block_length - fp->block_offset;
+ bgzf_byte_t *buffer;
if (available <= 0) {
if (bgzf_read_block(fp) != 0) {
return -1;
break;
}
}
- int copy_length = bgzf_min(length-bytes_read, available);
- bgzf_byte_t* buffer = fp->uncompressed_block;
+ copy_length = bgzf_min(length-bytes_read, available);
+ buffer = fp->uncompressed_block;
memcpy(output, buffer + fp->block_offset, copy_length);
fp->block_offset += copy_length;
output += copy_length;
return bytes_read;
}
-static
-int
-flush_block(BGZF* fp)
+int bgzf_flush(BGZF* fp)
{
while (fp->block_offset > 0) {
- int block_length = deflate_block(fp, fp->block_offset);
- if (block_length < 0) {
- return -1;
- }
+ int count, block_length;
+ block_length = deflate_block(fp, fp->block_offset);
+ if (block_length < 0) return -1;
#ifdef _USE_KNETFILE
- int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
+ count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
#else
- int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+ count = fwrite(fp->compressed_block, 1, block_length, fp->file);
#endif
if (count != block_length) {
report_error(fp, "write failed");
return 0;
}
-int
-bgzf_write(BGZF* fp, const void* data, int length)
+int bgzf_flush_try(BGZF *fp, int size)
{
+ if (fp->block_offset + size > fp->uncompressed_block_size)
+ return bgzf_flush(fp);
+ return -1;
+}
+
+int bgzf_write(BGZF* fp, const void* data, int length)
+{
+ const bgzf_byte_t *input = data;
+ int block_length, bytes_written;
if (fp->open_mode != 'w') {
report_error(fp, "file not open for writing");
return -1;
}
- if (fp->uncompressed_block == NULL) {
+ if (fp->uncompressed_block == NULL)
fp->uncompressed_block = malloc(fp->uncompressed_block_size);
- }
- const bgzf_byte_t* input = data;
- int block_length = fp->uncompressed_block_size;
- int bytes_written = 0;
+ input = data;
+ block_length = fp->uncompressed_block_size;
+ bytes_written = 0;
while (bytes_written < length) {
int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
bgzf_byte_t* buffer = fp->uncompressed_block;
input += copy_length;
bytes_written += copy_length;
if (fp->block_offset == block_length) {
- if (flush_block(fp) != 0) {
+ if (bgzf_flush(fp) != 0) {
break;
}
}
return bytes_written;
}
-int
-bgzf_close(BGZF* fp)
+int bgzf_close(BGZF* fp)
{
if (fp->open_mode == 'w') {
- if (flush_block(fp) != 0) {
- return -1;
- }
+ if (bgzf_flush(fp) != 0) return -1;
{ // add an empty block
int count, block_length = deflate_block(fp, 0);
#ifdef _USE_KNETFILE
else ret = knet_close(fp->x.fpr);
if (ret != 0) return -1;
#else
- if (fclose(fp->file) != 0) {
- return -1;
- }
+ if (fclose(fp->file) != 0) return -1;
#endif
}
free(fp->uncompressed_block);
return (memcmp(magic, buf, 28) == 0)? 1 : 0;
}
-int64_t
-bgzf_seek(BGZF* fp, int64_t pos, int where)
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
{
+ int block_offset;
+ int64_t block_address;
+
if (fp->open_mode != 'r') {
report_error(fp, "file not open for read");
return -1;
report_error(fp, "unimplemented seek option");
return -1;
}
- int block_offset = pos & 0xFFFF;
- int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+ block_offset = pos & 0xFFFF;
+ block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
#ifdef _USE_KNETFILE
if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
#else
#include <stdint.h>
#include <stdio.h>
-#include <stdbool.h>
#include <zlib.h>
#ifdef _USE_KNETFILE
#include "knetfile.h"
typedef struct {
int file_descriptor;
char open_mode; // 'r' or 'w'
- bool owned_file, is_uncompressed;
+ int16_t owned_file, compress_level;
#ifdef _USE_KNETFILE
union {
knetFile *fpr;
extern "C" {
#endif
-/*
- * Checks the magic string of the file. Returns 1
- * for bgzipped files, -1 on errors and 0 for files
- * without the bgzip magic string.
- */
-int is_bgzipped(const char *path);
-
/*
* Open an existing file descriptor for reading or writing.
* Mode must be either "r" or "w".
void bgzf_set_cache_size(BGZF *fp, int cache_size);
int bgzf_check_EOF(BGZF *fp);
-
int bgzf_read_block(BGZF* fp);
+int bgzf_flush(BGZF* fp);
+int bgzf_flush_try(BGZF *fp, int size);
+int bgzf_check_bgzf(const char *fn);
#ifdef __cplusplus
}
return tid;
}
-static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
+int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv)
{
- int i, b = 0, id = 1;
+ int i, b = 0, id = 1, ncols = 0;
char *s;
- intv->tid = intv->beg = intv->end = intv->bin = -1;
- for (i = 0; i <= str->l; ++i) {
- if (str->s[i] == '\t' || str->s[i] == 0) {
- if (id == idx->conf.sc) {
- str->s[i] = 0;
- intv->tid = get_tid(idx, str->s + b);
- if (i != str->l) str->s[i] = '\t';
- } else if (id == idx->conf.bc) {
+ intv->ss = intv->se = 0; intv->beg = intv->end = -1;
+ for (i = 0; i <= len; ++i) {
+ if (line[i] == '\t' || line[i] == 0) {
+ ++ncols;
+ if (id == conf->sc) {
+ intv->ss = line + b; intv->se = line + i;
+ } else if (id == conf->bc) {
// here ->beg is 0-based.
- intv->beg = intv->end = strtol(str->s + b, &s, 0);
- if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg;
+ intv->beg = intv->end = strtol(line + b, &s, 0);
+ if (!(conf->preset&TI_FLAG_UCSC)) --intv->beg;
else ++intv->end;
if (intv->beg < 0) intv->beg = 0;
if (intv->end < 1) intv->end = 1;
} else {
- if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) {
- if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0);
- } else if ((idx->conf.preset&0xffff) == TI_PRESET_SAM) {
+ if ((conf->preset&0xffff) == TI_PRESET_GENERIC) {
+ if (id == conf->ec) intv->end = strtol(line + b, &s, 0);
+ } else if ((conf->preset&0xffff) == TI_PRESET_SAM) {
if (id == 6) { // CIGAR
int l = 0, op;
char *t;
- for (s = str->s + b; s < str->s + i;) {
+ for (s = line + b; s < line + i;) {
long x = strtol(s, &t, 10);
op = toupper(*t);
if (op == 'M' || op == 'D' || op == 'N') l += x;
if (l == 0) l = 1;
intv->end = intv->beg + l;
}
- } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) {
+ } else if ((conf->preset&0xffff) == TI_PRESET_VCF) {
// FIXME: the following is NOT tested and is likely to be buggy
if (id == 4) {
if (b < i) intv->end = intv->beg + (i - b);
} else if (id == 8) { // look for "END="
- int c = str->s[i];
- str->s[i] = 0;
- s = strstr(str->s + b, "END=");
- if (s == str->s + b) s += 4;
+ int c = line[i];
+ line[i] = 0;
+ s = strstr(line + b, "END=");
+ if (s == line + b) s += 4;
else if (s) {
- s = strstr(str->s + b, ";END=");
+ s = strstr(line + b, ";END=");
if (s) s += 5;
}
if (s) intv->end = strtol(s, &s, 0);
- str->s[i] = c;
+ line[i] = c;
}
}
}
++id;
}
}
- if (intv->tid < 0 || intv->beg < 0 || intv->end < 0) return -1;
- intv->bin = ti_reg2bin(intv->beg, intv->end);
+/*
+ if (ncols < conf->sc || ncols < conf->bc || ncols < conf->ec) {
+ if (ncols == 1) fprintf(stderr,"[get_intv] Is the file tab-delimited? The line has %d field only: %s\n", ncols, line);
+ else fprintf(stderr,"[get_intv] The line has %d field(s) only: %s\n", ncols, line);
+ exit(1);
+ }
+*/
+ if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
return 0;
}
+static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
+{
+ ti_interval_t x;
+ intv->tid = intv->beg = intv->end = intv->bin = -1;
+ if (ti_get_intv(&idx->conf, str->l, str->s, &x) == 0) {
+ int c = *x.se;
+ *x.se = '\0'; intv->tid = get_tid(idx, x.ss); *x.se = c;
+ intv->beg = x.beg; intv->end = x.end;
+ intv->bin = ti_reg2bin(intv->beg, intv->end);
+ return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
+ } else {
+ fprintf(stderr, "[%s] the following line cannot be parsed and skipped: %s\n", __func__, str->s);
+ return -1;
+ }
+}
+
/************
* indexing *
************/
continue;
}
get_intv(idx, str, &intv);
+ if ( intv.beg<0 || intv.end<0 )
+ {
+ fprintf(stderr,"[ti_index_core] the indexes overlap or are out of bounds\n");
+ exit(1);
+ }
if (last_tid != intv.tid) { // change of chromosomes
if (last_tid>intv.tid )
{
- fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted?\n",(unsigned long long)lineno);
+ fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted? [pos %d]\n",(unsigned long long)lineno,intv.beg+1);
exit(1);
}
last_tid = intv.tid;
return 0;
}
+const ti_conf_t *ti_get_conf(ti_index_t *idx) { return idx? &idx->conf : 0; }
+
/*******************
* High-level APIs *
*******************/
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+ 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
+ */
+
+/* Last Modified: 12APR2009 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_MAX 1
+
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ unsigned char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize) \
+ static inline kstream_t *ks_init(type_t f) \
+ { \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ ks->f = f; \
+ ks->buf = malloc(__bufsize); \
+ return ks; \
+ } \
+ static inline void ks_destroy(kstream_t *ks) \
+ { \
+ if (ks) { \
+ free(ks->buf); \
+ free(ks); \
+ } \
+ }
+
+#define __KS_GETC(__read, __bufsize) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
+ if (ks->begin >= ks->end) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) return -1; \
+ } \
+ return (int)ks->buf[ks->begin++]; \
+ }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize) \
+ static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { \
+ if (dret) *dret = 0; \
+ str->l = 0; \
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
+ for (;;) { \
+ int i; \
+ if (ks->begin >= ks->end) { \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) break; \
+ } else break; \
+ } \
+ if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < i - ks->begin + 1) { \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (str->l == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+#define __KSEQ_BASIC(type_t) \
+ static inline kseq_t *kseq_init(type_t fd) \
+ { \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ s->f = ks_init(fd); \
+ return s; \
+ } \
+ static inline void kseq_rewind(kseq_t *ks) \
+ { \
+ ks->last_char = 0; \
+ ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
+ } \
+ static inline void kseq_destroy(kseq_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+ ks_destroy(ks->f); \
+ free(ks); \
+ }
+
+/* Return value:
+ >=0 length of the sequence (normal)
+ -1 end-of-file
+ -2 truncated quality string
+ */
+#define __KSEQ_READ \
+ static int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* the first header char has been read */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
+ if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ if (isgraph(c)) { /* printable non-space character */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l++] = (char)c; \
+ } \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* we should not stop here */ \
+ while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
+ if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
+ seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
+ return seq->seq.l; \
+ }
+
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+#define KSEQ_INIT(type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 4096) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(type_t) \
+ __KSEQ_READ
+
+#endif
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
+#include <errno.h>
#include "bgzf.h"
#include "tabix.h"
-#define PACKAGE_VERSION "0.2.3 (r876)"
+#define PACKAGE_VERSION "0.2.4 (r949)"
+
+#define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
+
+int reheader_file(const char *header, const char *file, int meta)
+{
+ BGZF *fp = bgzf_open(file,"r");
+ if (bgzf_read_block(fp) != 0 || !fp->block_length)
+ return -1;
+
+ char *buffer = fp->uncompressed_block;
+ int skip_until = 0;
+
+ if ( buffer[0]==meta )
+ {
+ skip_until = 1;
+
+ // Skip the header
+ while (1)
+ {
+ if ( buffer[skip_until]=='\n' )
+ {
+ skip_until++;
+ if ( skip_until>=fp->block_length )
+ {
+ if (bgzf_read_block(fp) != 0 || !fp->block_length)
+ error("no body?\n");
+ skip_until = 0;
+ }
+ // The header has finished
+ if ( buffer[skip_until]!=meta ) break;
+ }
+ skip_until++;
+ if ( skip_until>=fp->block_length )
+ {
+ if (bgzf_read_block(fp) != 0 || !fp->block_length)
+ error("no body?\n");
+ skip_until = 0;
+ }
+ }
+ }
+
+ FILE *fh = fopen(header,"r");
+ if ( !fh )
+ error("%s: %s", header,strerror(errno));
+ int page_size = getpagesize();
+ char *buf = valloc(page_size);
+ BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w");
+ ssize_t nread;
+ while ( (nread=fread(buf,1,page_size-1,fh))>0 )
+ {
+ if ( nread<page_size-1 && buf[nread-1]!='\n' )
+ buf[nread++] = '\n';
+ if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error);
+ }
+ fclose(fh);
+
+ if ( fp->block_length - skip_until > 0 )
+ {
+ if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
+ error("Error: %s\n",fp->error);
+ }
+ if (bgzf_flush(bgzf_out) < 0)
+ error("Error: %s\n",bgzf_out->error);
+
+ while (1)
+ {
+#ifdef _USE_KNETFILE
+ nread = knet_read(fp->x.fpr, buf, page_size);
+#else
+ nread = fread(buf, 1, page_size, fp->file);
+#endif
+ if ( nread<=0 )
+ break;
+
+#ifdef _USE_KNETFILE
+ int count = fwrite(buf, 1, nread, bgzf_out->x.fpw);
+#else
+ int count = fwrite(buf, 1, nread, bgzf_out->file);
+#endif
+ if (count != nread)
+ error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
+ }
+
+ if (bgzf_close(bgzf_out) < 0)
+ error("Error: %s\n",bgzf_out->error);
+
+ return 0;
+}
+
int main(int argc, char *argv[])
{
- int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0;
+ int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0;
ti_conf_t conf = ti_conf_gff;
- while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhf")) >= 0) {
+ const char *reheader = NULL;
+ while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) {
switch (c) {
+ case 'B': bed_reg = 1; break;
case '0': conf.preset |= TI_FLAG_UCSC; break;
case 'S': skip = atoi(optarg); break;
case 'c': meta = optarg[0]; break;
case 'l': list_chrms = 1; break;
case 'h': print_header = 1; break;
case 'f': force = 1; break;
+ case 'r': reheader = optarg; break;
}
}
if (skip >= 0) conf.line_skip = skip;
fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n");
fprintf(stderr, " -s INT sequence name column [1]\n");
fprintf(stderr, " -b INT start column [4]\n");
- fprintf(stderr, " -e INT end column [5]\n");
+ fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n");
fprintf(stderr, " -S INT skip first INT lines [0]\n");
fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n");
+ fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n");
+ fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n");
fprintf(stderr, " -0 zero-based coordinate\n");
- fprintf(stderr, " -h print the VCF header\n");
+ fprintf(stderr, " -h print the header lines\n");
fprintf(stderr, " -l list chromosome names\n");
fprintf(stderr, " -f force to overwrite the index\n");
fprintf(stderr, "\n");
ti_index_destroy(idx);
return 0;
}
+ if (reheader)
+ return reheader_file(reheader,argv[optind],conf.meta_char);
+
+ struct stat stat_tbi,stat_vcf;
+ char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
+ strcat(strcpy(fnidx, argv[optind]), ".tbi");
+
if (optind + 1 == argc) {
if (force == 0) {
- struct stat buf;
- char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
- strcat(strcpy(fnidx, argv[optind]), ".tbi");
- if (stat(fnidx, &buf) == 0) {
- fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
- free(fnidx);
- return 1;
+ if (stat(fnidx, &stat_tbi) == 0)
+ {
+ // Before complaining, check if the VCF file isn't newer. This is a common source of errors,
+ // people tend not to notice that tabix failed
+ stat(argv[optind], &stat_vcf);
+ if ( stat_vcf.st_mtime <= stat_tbi.st_mtime )
+ {
+ fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
+ free(fnidx);
+ return 1;
+ }
}
- free(fnidx);
}
- if ( is_bgzipped(argv[optind])!=1 )
+ if ( bgzf_check_bgzf(argv[optind])!=1 )
{
fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
+ free(fnidx);
return 1;
}
return ti_index_build(argv[optind], &conf);
}
{ // retrieve
tabix_t *t;
+ // Common source of errors: new VCF is used with an old index
+ stat(fnidx, &stat_tbi);
+ stat(argv[optind], &stat_vcf);
+ if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
+ {
+ fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n");
+ free(fnidx);
+ return 1;
+ }
+ free(fnidx);
+
if ((t = ti_open(argv[optind], 0)) == 0) {
fprintf(stderr, "[main] fail to open the data file.\n");
return 1;
}
ti_iter_destroy(iter);
} else { // retrieve from specified regions
- int i;
- if ( ti_lazy_index_load(t) )
- {
+ int i, len;
+ ti_iter_t iter;
+ const char *s;
+ const ti_conf_t *idxconf;
+
+ if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
fprintf(stderr,"[tabix] failed to load the index file.\n");
return 1;
}
+ idxconf = ti_get_conf(t->idx);
- ti_iter_t iter;
- const char *s;
- int len;
if ( print_header )
{
// If requested, print the header lines here
iter = ti_query(t, 0, 0, 0);
while ((s = ti_read(t, iter, &len)) != 0) {
- if ( *s != '#' ) break;
+ if ((int)(*s) != idxconf->meta_char) break;
fputs(s, stdout); fputc('\n', stdout);
}
ti_iter_destroy(iter);
}
- for (i = optind + 1; i < argc; ++i) {
- int tid, beg, end;
- if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
- iter = ti_queryi(t, tid, beg, end);
- while ((s = ti_read(t, iter, &len)) != 0) {
- fputs(s, stdout); fputc('\n', stdout);
+ if (bed_reg) {
+ extern int bed_overlap(const void *_h, const char *chr, int beg, int end);
+ extern void *bed_read(const char *fn);
+ extern void bed_destroy(void *_h);
+
+ const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available
+ void *bed = bed_read(argv[optind+1]); // load the BED file
+ ti_interval_t intv;
+
+ if (bed == 0) {
+ fprintf(stderr, "[main] fail to read the BED file.\n");
+ return 1;
+ }
+ iter = ti_query(t, 0, 0, 0);
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ int c;
+ ti_get_intv(conf_, len, (char*)s, &intv);
+ c = *intv.se; *intv.se = '\0';
+ if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) {
+ *intv.se = c;
+ puts(s);
}
- ti_iter_destroy(iter);
- }
- // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
+ *intv.se = c;
+ }
+ ti_iter_destroy(iter);
+ bed_destroy(bed);
+ } else {
+ for (i = optind + 1; i < argc; ++i) {
+ int tid, beg, end;
+ if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
+ iter = ti_queryi(t, tid, beg, end);
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ fputs(s, stdout); fputc('\n', stdout);
+ }
+ ti_iter_destroy(iter);
+ }
+ // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
+ }
}
}
ti_close(t);
--- /dev/null
+#!/usr/bin/env python
+#
+# The MIT License
+#
+# Copyright (c) 2011 Seoul National University.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
+
+from distutils.core import setup, Extension
+
+# Change this to True when you need the knetfile support.
+USE_KNETFILE = False
+
+TABIX_SOURCE_FILES = [
+ '../bgzf.c', '../bgzip.c', '../index.c', '../knetfile.c', '../kstring.c'
+]
+
+define_options = [('_FILE_OFFSET_BITS', 64)]
+if USE_KNETFILE:
+ define_options.append(('_USE_KNETFILE', 1))
+
+ext_modules = [Extension("tabix", ["tabixmodule.c"] + TABIX_SOURCE_FILES,
+ include_dirs=['..'],
+ libraries=['z'],
+ define_macros=define_options)]
+
+setup (name = 'tabix',
+ version = '1.0',
+ description = 'Python interface to tabix, a generic indexer '
+ 'for TAB-delimited genome position files',
+ author = 'Hyeshik Chang',
+ author_email = 'hyeshik@snu.ac.kr',
+ license = 'MIT',
+ ext_modules = ext_modules
+)
--- /dev/null
+/*-
+ * The MIT License
+ *
+ * Copyright (c) 2011 Seoul National University.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "tabix.h"
+
+static PyObject *TabixError;
+
+typedef struct {
+ PyObject_HEAD
+ tabix_t *tb;
+ char *fn;
+} TabixObject;
+
+typedef struct {
+ PyObject_HEAD
+ TabixObject *tbobj;
+ ti_iter_t iter;
+} TabixIteratorObject;
+
+static PyTypeObject Tabix_Type, TabixIterator_Type;
+
+/* --- TabixIterator --------------------------------------------------- */
+
+static PyObject *
+tabixiter_create(TabixObject *parentidx, ti_iter_t iter)
+{
+ TabixIteratorObject *self;
+
+ self = PyObject_New(TabixIteratorObject, &TabixIterator_Type);
+ if (self == NULL)
+ return NULL;
+
+ Py_INCREF(parentidx);
+ self->tbobj = parentidx;
+ self->iter = iter;
+
+ return (PyObject *)self;
+}
+
+static void
+tabixiter_dealloc(TabixIteratorObject *self)
+{
+ ti_iter_destroy(self->iter);
+ Py_DECREF(self->tbobj);
+ PyObject_Del(self);
+}
+
+static PyObject *
+tabixiter_iter(PyObject *self)
+{
+ Py_INCREF(self);
+ return self;
+}
+
+#if PY_MAJOR_VERSION < 3
+# define PYOBJECT_FROM_STRING_AND_SIZE PyString_FromStringAndSize
+#else
+# define PYOBJECT_FROM_STRING_AND_SIZE PyUnicode_FromStringAndSize
+#endif
+
+static PyObject *
+tabixiter_iternext(TabixIteratorObject *self)
+{
+ const char *chunk;
+ int len, i;
+
+ chunk = ti_read(self->tbobj->tb, self->iter, &len);
+ if (chunk != NULL) {
+ PyObject *ret, *column;
+ Py_ssize_t colidx;
+ const char *ptr, *begin;
+
+ ret = PyList_New(0);
+ if (ret == NULL)
+ return NULL;
+
+ colidx = 0;
+ ptr = begin = chunk;
+ for (i = len; i > 0; i--, ptr++)
+ if (*ptr == '\t') {
+ column = PYOBJECT_FROM_STRING_AND_SIZE(begin,
+ (Py_ssize_t)(ptr - begin));
+ if (column == NULL || PyList_Append(ret, column) == -1) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+
+ Py_DECREF(column);
+ begin = ptr + 1;
+ colidx++;
+ }
+
+ column = PYOBJECT_FROM_STRING_AND_SIZE(begin, (Py_ssize_t)(ptr - begin));
+ if (column == NULL || PyList_Append(ret, column) == -1) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+ Py_DECREF(column);
+
+ return ret;
+ }
+ else
+ return NULL;
+}
+
+static PyMethodDef tabixiter_methods[] = {
+ {NULL, NULL} /* sentinel */
+};
+
+static PyTypeObject TabixIterator_Type = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "tabix.TabixIterator", /*tp_name*/
+ sizeof(TabixIteratorObject), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ /* methods */
+ (destructor)tabixiter_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash*/
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ 0, /*tp_doc*/
+ 0, /*tp_traverse*/
+ 0, /*tp_clear*/
+ 0, /*tp_richcompare*/
+ 0, /*tp_weaklistoffset*/
+ tabixiter_iter, /*tp_iter*/
+ (iternextfunc)tabixiter_iternext, /*tp_iternext*/
+ tabixiter_methods, /*tp_methods*/
+ 0, /*tp_members*/
+ 0, /*tp_getset*/
+ 0, /*tp_base*/
+ 0, /*tp_dict*/
+ 0, /*tp_descr_get*/
+ 0, /*tp_descr_set*/
+ 0, /*tp_dictoffset*/
+ 0, /*tp_init*/
+ 0, /*tp_alloc*/
+ 0, /*tp_new*/
+ 0, /*tp_free*/
+ 0, /*tp_is_gc*/
+};
+
+
+/* --- Tabix ----------------------------------------------------------- */
+
+static PyObject *
+tabix_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+ TabixObject *self;
+ const char *fn, *fnidx=NULL;
+ static char *kwnames[]={"fn", "fnidx", NULL};
+ tabix_t *tb;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|z:Tabix",
+ kwnames, &fn, &fnidx))
+ return NULL;
+
+ tb = ti_open(fn, fnidx);
+ if (tb == NULL) {
+ PyErr_SetString(TabixError, "Can't open the index file.");
+ return NULL;
+ }
+
+ self = (TabixObject *)type->tp_alloc(type, 0);
+ if (self == NULL)
+ return NULL;
+
+ self->tb = tb;
+ self->fn = strdup(fn);
+
+ return (PyObject *)self;
+}
+
+static void
+tabix_dealloc(TabixObject *self)
+{
+ free(self->fn);
+ ti_close(self->tb);
+ PyObject_Del(self);
+}
+
+static PyObject *
+tabix_query(TabixObject *self, PyObject *args)
+{
+ char *name;
+ int begin, end;
+ ti_iter_t result;
+
+ if (!PyArg_ParseTuple(args, "sii:query", &name, &begin, &end))
+ return NULL;
+
+ result = ti_query(self->tb, name, begin, end);
+ if (result == NULL) {
+ PyErr_SetString(TabixError, "query failed");
+ return NULL;
+ }
+
+ return tabixiter_create(self, result);
+}
+
+static PyObject *
+tabix_queryi(TabixObject *self, PyObject *args)
+{
+ int tid, begin, end;
+ ti_iter_t result;
+
+ if (!PyArg_ParseTuple(args, "iii:queryi", &tid, &begin, &end))
+ return NULL;
+
+ result = ti_queryi(self->tb, tid, begin, end);
+ if (result == NULL) {
+ PyErr_SetString(TabixError, "query failed");
+ return NULL;
+ }
+
+ return tabixiter_create(self, result);
+}
+
+static PyObject *
+tabix_querys(TabixObject *self, PyObject *args)
+{
+ const char *reg;
+ ti_iter_t result;
+
+ if (!PyArg_ParseTuple(args, "s:querys", ®))
+ return NULL;
+
+ result = ti_querys(self->tb, reg);
+ if (result == NULL) {
+ PyErr_SetString(TabixError, "query failed");
+ return NULL;
+ }
+
+ return tabixiter_create(self, result);
+}
+
+static PyObject *
+tabix_repr(TabixObject *self)
+{
+#if PY_MAJOR_VERSION < 3
+ return PyString_FromFormat("<Tabix fn=\"%s\">", self->fn);
+#else
+ return PyUnicode_FromFormat("<Tabix fn=\"%s\">", self->fn);
+#endif
+}
+
+static PyMethodDef tabix_methods[] = {
+ {"query", (PyCFunction)tabix_query, METH_VARARGS,
+ PyDoc_STR("T.query(name, begin, end) -> iterator")},
+ {"queryi", (PyCFunction)tabix_queryi, METH_VARARGS,
+ PyDoc_STR("T.queryi(tid, begin, id) -> iterator")},
+ {"querys", (PyCFunction)tabix_querys, METH_VARARGS,
+ PyDoc_STR("T.querys(region) -> iterator")},
+ {NULL, NULL} /* sentinel */
+};
+
+static PyTypeObject Tabix_Type = {
+ /* The ob_type field must be initialized in the module init function
+ * to be portable to Windows without using C++. */
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "tabix.Tabix", /*tp_name*/
+ sizeof(TabixObject), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ /* methods */
+ (destructor)tabix_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ (reprfunc)tabix_repr, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash*/
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ 0, /*tp_doc*/
+ 0, /*tp_traverse*/
+ 0, /*tp_clear*/
+ 0, /*tp_richcompare*/
+ 0, /*tp_weaklistoffset*/
+ 0, /*tp_iter*/
+ 0, /*tp_iternext*/
+ tabix_methods, /*tp_methods*/
+ 0, /*tp_members*/
+ 0, /*tp_getset*/
+ 0, /*tp_base*/
+ 0, /*tp_dict*/
+ 0, /*tp_descr_get*/
+ 0, /*tp_descr_set*/
+ 0, /*tp_dictoffset*/
+ 0, /*tp_init*/
+ 0, /*tp_alloc*/
+ (newfunc)tabix_new, /*tp_new*/
+ 0, /*tp_free*/
+ 0, /*tp_is_gc*/
+};
+/* --------------------------------------------------------------------- */
+
+static PyMethodDef tabix_functions[] = {
+ {NULL, NULL} /* sentinel */
+};
+
+PyDoc_STRVAR(module_doc,
+"Python interface to tabix, Heng Li's generic indexer for TAB-delimited "
+"genome position filesThis is a template module just for instruction.");
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef tabixmodule = {
+ PyModuleDef_HEAD_INIT,
+ "tabix",
+ module_doc,
+ -1,
+ tabix_functions,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+#endif
+
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC inittabix(void)
+#else
+PyMODINIT_FUNC PyInit_tabix(void)
+#endif
+{
+ PyObject *m;
+
+ if (PyType_Ready(&Tabix_Type) < 0)
+ goto fail;
+ if (PyType_Ready(&TabixIterator_Type) < 0)
+ goto fail;
+
+#if PY_MAJOR_VERSION < 3
+ m = Py_InitModule3("tabix", tabix_functions, module_doc);
+#else
+ m = PyModule_Create(&tabixmodule);
+#endif
+ if (m == NULL)
+ goto fail;
+
+ if (TabixError == NULL) {
+ TabixError = PyErr_NewException("tabix.error", NULL, NULL);
+ if (TabixError == NULL)
+ goto fail;
+ }
+ Py_INCREF(TabixError);
+ PyModule_AddObject(m, "error", TabixError);
+
+ PyModule_AddObject(m, "Tabix", (PyObject *)&Tabix_Type);
+ PyModule_AddObject(m, "TabixIterator", (PyObject *)&TabixIterator_Type);
+
+#if PY_MAJOR_VERSION >= 3
+ return m;
+#endif
+
+ fail:
+#if PY_MAJOR_VERSION < 3
+ return;
+#else
+ return NULL;
+#endif
+}
--- /dev/null
+#
+# The MIT License
+#
+# Copyright (c) 2011 Seoul National University.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
+
+import unittest
+import random
+import gzip
+import tabix
+
+EXAMPLEFILE = '../example.gtf.gz'
+
+def load_example_regions(path):
+ alldata = []
+ for line in gzip.GzipFile(EXAMPLEFILE):
+ fields = line.decode('ascii')[:-1].split('\t')
+ seqid = fields[0]
+ begin = int(fields[3])
+ end = int(fields[4])
+ alldata.append((seqid, begin, end, fields[:7]))
+
+ return alldata
+
+def does_overlap(A, B, C, D):
+ return (A <= D <= B) or (C <= B <= D)
+
+def sample_test_dataset(regions, ntests):
+ seqids = [seqid for seqid, _, _, _ in regions]
+ lowerbound = max(0, min(begin for _, begin, _, _ in regions) - 1000)
+ upperbound = max(end for _, _, end, _ in regions) + 1000
+
+ tests = []
+ for i in range(ntests):
+ seqid = random.choice(seqids)
+ low = random.randrange(lowerbound, upperbound)
+ high = random.randrange(low, upperbound)
+
+ # for 1-based both-end inclusive intervals
+ matches = [info for seq, begin, end, info in regions
+ if seqid == seq and does_overlap(begin, end, low, high)]
+
+ tests.append((seqid, low, high, matches))
+
+ return tests
+
+def tbresult2excerpt(tbmatches):
+ return [fields[:7] for fields in tbmatches]
+
+class TabixTest(unittest.TestCase):
+ regions = load_example_regions(EXAMPLEFILE)
+ testset = sample_test_dataset(regions, 500)
+
+ def setUp(self):
+ self.tb = tabix.Tabix(EXAMPLEFILE)
+
+ def testQuery(self):
+ for seqid, low, high, matches in self.testset:
+ tbresult = tbresult2excerpt(self.tb.query(seqid, low, high))
+ self.assertEqual(tbresult, matches)
+
+ def testQueryS(self):
+ for seqid, low, high, matches in self.testset:
+ tbresult = tbresult2excerpt(self.tb.querys('%s:%d-%d' %
+ (seqid, low, high)))
+ self.assertEqual(tbresult, matches)
+
+
+if __name__ == '__main__':
+ unittest.main()
.SH SYNOPSIS
.PP
.B bgzip
-.RB [ \-cdh ]
+.RB [ \-cdhB ]
.RB [ \-b
.IR virtualOffset ]
.RB [ \-s
Specify that the position in the data file is 0-based (e.g. UCSC files)
rather than 1-based.
.TP
+.B -h
+Print the header/meta lines.
+.TP
+.B -B
+The second argument is a BED file. When this option is in use, the input
+file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless,
+with this option, the format of the input must be specificed correctly on the command line.
+.TP
.B -f
Force to overwrite the index file if it is present.
.TP
int32_t meta_char, line_skip;
} ti_conf_t;
+typedef struct {
+ int beg, end;
+ char *ss, *se;
+} ti_interval_t;
+
extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset
#ifdef __cplusplus
/* Get the data line pointed by the iterator and iterate to the next record. */
const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len);
+ const ti_conf_t *ti_get_conf(ti_index_t *idx);
+ int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv);
+
/*******************
* Deprecated APIs *
*******************/