From c70c92b7a385548d8e670638a93a083401f49e4d Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Thu, 19 Aug 2010 15:49:36 +0900 Subject: [PATCH] Imported Upstream version 0.1.5 --- Makefile | 8 +- NEWS | 14 ++ TabixReader.java | 387 +++++++++++++++++++++++++++++++++++++++++++++++ bgzip.c | 86 +++++++---- index.c | 86 ++++++++--- knetfile.c | 2 +- main.c | 27 +++- tabix.1 | 10 +- tabix.h | 28 ++++ tabix.tex | 121 +++++++++++++++ tabix.txt | 89 ----------- 11 files changed, 707 insertions(+), 151 deletions(-) create mode 100644 NEWS create mode 100644 TabixReader.java create mode 100644 tabix.tex delete mode 100644 tabix.txt diff --git a/Makefile b/Makefile index 3d7153d..a51c175 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,9 @@ tabix:lib $(AOBJS) bgzip:bgzip.o bgzf.o knetfile.o $(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz +TabixReader.class:TabixReader.java + javac -cp .:sam.jar TabixReader.java + kstring.o:kstring.h knetfile.o:knetfile.h bgzf.o:bgzf.h knetfile.h @@ -44,7 +47,10 @@ index.o:bgzf.h tabix.h khash.h ksort.h kstring.h main.o:tabix.h kstring.h bgzf.h bgzip.o:bgzf.h +tabix.pdf:tabix.tex + pdflatex tabix.tex + cleanlocal: - rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a tabix.aux tabix.log tabix.pdf *.class clean:cleanlocal-recur diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..aaa95e0 --- /dev/null +++ b/NEWS @@ -0,0 +1,14 @@ +Beta Release 0.1.5 (5 May, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Clarify that tabix is released under MIT/X11. + + * Improve the robustness of indexing and retrieval. + + * Reduce the number of seek calls when the specified region starts from + a 16kb block with no data. The index format is the same, but the + content is changed a little. + +(0.1.5: 5 May 2010, r560) diff --git a/TabixReader.java b/TabixReader.java new file mode 100644 index 0000000..acaff24 --- /dev/null +++ b/TabixReader.java @@ -0,0 +1,387 @@ +/* The MIT License + + Copyright (c) 2010 Broad Institute. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +import net.sf.samtools.util.BlockCompressedInputStream; + +import java.io.*; +import java.nio.*; +import java.util.HashMap; +import java.util.ArrayList; +import java.util.Arrays; +import java.lang.StringBuffer; + +public class TabixReader +{ + private String mFn; + private BlockCompressedInputStream mFp; + + private int mPreset; + private int mSc; + private int mBc; + private int mEc; + private int mMeta; + private int mSkip; + private String[] mSeq; + + private HashMap mChr2tid; + + private static int MAX_BIN = 37450; + private static int TAD_MIN_CHUNK_GAP = 32768; + private static int TAD_LIDX_SHIFT = 14; + + private class TPair64 implements Comparable { + long u, v; + public TPair64(final long _u, final long _v) { + u = _u; v = _v; + } + public TPair64(final TPair64 p) { + u = p.u; v = p.v; + } + public int compareTo(final TPair64 p) { + return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison + } + }; + + private class TIndex { + HashMap b; // binning index + long[] l; // linear index + }; + private TIndex[] mIndex; + + private class TIntv { + int tid, beg, end; + }; + + private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison + return (u < v) ^ (u < 0) ^ (v < 0); + } + + /** + * The constructor + * + * @param fn File name of the data file + */ + public TabixReader(final String fn) throws IOException { + mFn = fn; + mFp = new BlockCompressedInputStream(new File(fn)); + readIndex(); + } + + private static int reg2bins(final int beg, final int _end, final int[] list) { + int i = 0, k, end = _end; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; + } + + public static int readInt(final InputStream is) throws IOException { + byte[] buf = new byte[4]; + is.read(buf); + return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt(); + } + + public static long readLong(final InputStream is) throws IOException { + byte[] buf = new byte[8]; + is.read(buf); + return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong(); + } + + public static String readLine(final InputStream is) throws IOException { + StringBuffer buf = new StringBuffer(); + int c; + while ((c = is.read()) >= 0 && c != '\n') + buf.append((char)c); + if (c < 0) return null; + return buf.toString(); + } + + /** + * Read the Tabix index from a file + * + * @param fp File pointer + */ + public void readIndex(final File fp) throws IOException { + if (fp == null) return; + BlockCompressedInputStream is = new BlockCompressedInputStream(fp); + byte[] buf = new byte[4]; + + is.read(buf, 0, 4); // read "TBI\1" + mSeq = new String[readInt(is)]; // # sequences + mChr2tid = new HashMap(); + mPreset = readInt(is); + mSc = readInt(is); + mBc = readInt(is); + mEc = readInt(is); + mMeta = readInt(is); + mSkip = readInt(is); + // read sequence dictionary + int i, j, k, l = readInt(is); + buf = new byte[l]; + is.read(buf); + for (i = j = k = 0; i < buf.length; ++i) { + if (buf[i] == 0) { + byte[] b = new byte[i - j]; + System.arraycopy(buf, j, b, 0, b.length); + String s = new String(b); + mChr2tid.put(s, k); + mSeq[k++] = s; + j = i + 1; + } + } + // read the index + mIndex = new TIndex[mSeq.length]; + for (i = 0; i < mSeq.length; ++i) { + // the binning index + int n_bin = readInt(is); + mIndex[i] = new TIndex(); + mIndex[i].b = new HashMap(); + for (j = 0; j < n_bin; ++j) { + int bin = readInt(is); + TPair64[] chunks = new TPair64[readInt(is)]; + for (k = 0; k < chunks.length; ++k) { + long u = readLong(is); + long v = readLong(is); + chunks[k] = new TPair64(u, v); // in C, this is inefficient + } + mIndex[i].b.put(bin, chunks); + } + // the linear index + mIndex[i].l = new long[readInt(is)]; + for (k = 0; k < mIndex[i].l.length; ++k) + mIndex[i].l[k] = readLong(is); + } + // close + is.close(); + } + + /** + * Read the Tabix index from the default file. + */ + public void readIndex() throws IOException { + readIndex(new File(mFn + ".tbi")); + } + + /** + * Read one line from the data file. + */ + public String readLine() throws IOException { + return readLine(mFp); + } + + private int chr2tid(final String chr) { + if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr); + else return -1; + } + + /** + * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000" + * + * @param reg Region string + * @return An array where the three elements are sequence_id, + * region_begin and region_end. On failure, sequence_id==-1. + */ + public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -. + String chr; + int colon, hyphen; + int[] ret = new int[3]; + colon = reg.indexOf(':'); hyphen = reg.indexOf('-'); + chr = colon >= 0? reg.substring(0, colon) : reg; + ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen)) - 1 : 0; + ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff; + ret[0] = chr2tid(chr); + return ret; + } + + private TIntv getIntv(final String s) { + TIntv intv = new TIntv(); + int col = 0, end = 0, beg = 0; + while ((end = s.indexOf('\t', beg)) >= 0) { + ++col; + if (col == mSc) { + intv.tid = chr2tid(s.substring(beg, end)); + } else if (col == mBc) { + intv.beg = intv.end = Integer.parseInt(s.substring(beg, end)); + if ((mPreset&0x10000) != 0) ++intv.end; + else --intv.beg; + if (intv.beg < 0) intv.beg = 0; + if (intv.end < 1) intv.end = 1; + } else { // FIXME: SAM/VCF supports are not tested yet + if ((mPreset&0xffff) == 0) { // generic + if (col == mEc) + intv.end = Integer.parseInt(s.substring(beg, end)); + } else if ((mPreset&0xffff) == 1) { // SAM + if (col == 6) { // CIGAR + int l = 0, i, j; + String cigar = s.substring(beg, end); + for (i = j = 0; i < cigar.length(); ++i) { + if (cigar.charAt(i) > '9') { + int op = cigar.charAt(i); + if (op == 'M' || op == 'D' || op == 'N') + l += Integer.parseInt(cigar.substring(j, i)); + } + } + intv.end = intv.beg + l; + } + } else if ((mPreset&0xffff) == 2) { // VCF + if (col == 5) { + String alt = s.substring(beg, end); + int i, max = 1; + for (i = 0; i < alt.length(); ++i) { + if (alt.charAt(i) == 'D') { // deletion + int j; + for (j = i; j < alt.length() && alt.charAt(j) >= '0' && alt.charAt(j) <= '9'; ++j); + int l = Integer.parseInt(alt.substring(i, j)); + if (max < l) max = l; + i = j - 1; + } + } + intv.end = intv.beg + max; + } + } + } + beg = end + 1; + } + return intv; + } + + public class Iterator { + private int i, n_seeks; + private int tid, beg, end; + private TPair64[] off; + private long curr_off; + private boolean iseof; + + public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) { + i = -1; n_seeks = 0; curr_off = 0; iseof = false; + off = _off; tid = _tid; beg = _beg; end = _end; + } + + public String next() throws IOException { + if (iseof) return null; + for (;;) { + if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk + if (i == off.length - 1) break; // no more chunks + if (i >= 0) assert(curr_off == off[i].v); // otherwise bug + if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek + mFp.seek(off[i+1].u); + curr_off = mFp.getFilePointer(); + ++n_seeks; + } + ++i; + } + String s; + if ((s = readLine(mFp)) != null) { + TIntv intv; + char[] str = s.toCharArray(); + curr_off = mFp.getFilePointer(); + if (str.length == 0 || str[0] == mMeta) continue; + intv = getIntv(s); + if (intv.tid != tid || intv.beg >= end) break; // no need to proceed + else if (intv.end > beg && intv.beg < end) return s; // overlap; return + } else break; // end of file + } + iseof = true; + return null; + } + }; + + public Iterator query(final int tid, final int beg, final int end) { + TPair64[] off, chunks; + long min_off; + TIndex idx = mIndex[tid]; + int[] bins = new int[MAX_BIN]; + int i, l, n_off, n_bins = reg2bins(beg, end, bins); + min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT]; + for (i = n_off = 0; i < n_bins; ++i) { + if ((chunks = idx.b.get(bins[i])) != null) + n_off += chunks.length; + } + if (n_off == 0) return null; + off = new TPair64[n_off]; + for (i = n_off = 0; i < n_bins; ++i) + if ((chunks = idx.b.get(bins[i])) != null) + for (int j = 0; j < chunks.length; ++j) + if (less64(min_off, chunks[j].v)) + off[n_off++] = new TPair64(chunks[j]); + Arrays.sort(off, 0, n_off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (less64(off[l].v, off[i].v)) { + ++l; + off[l].u = off[i].u; off[l].v = off[i].v; + } + } + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u; + // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; + else { + ++l; + off[l].u = off[i].u; + off[l].v = off[i].v; + } + } + n_off = l + 1; + // return + TPair64[] ret = new TPair64[n_off]; + for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient + return new TabixReader.Iterator(tid, beg, end, ret); + } + + public Iterator query(final String reg) { + int[] x = parseReg(reg); + return query(x[0], x[1], x[2]); + } + + public static void main(String[] args) { + if (args.length < 1) { + System.out.println("Usage: java -cp .:sam.jar TabixReader [region]"); + System.exit(1); + } + try { + TabixReader tr = new TabixReader(args[0]); + String s; + if (args.length == 1) { // no region is specified; print the whole file + while ((s = tr.readLine()) != null) + System.out.println(s); + } else { // a region is specified; random access + TabixReader.Iterator iter = tr.query(args[1]); // get the iterator + while ((s = iter.next()) != null) + System.out.println(s); + } + } catch (IOException e) { + } + } +} diff --git a/bgzip.c b/bgzip.c index 381ebc5..d144632 100644 --- a/bgzip.c +++ b/bgzip.c @@ -28,26 +28,18 @@ #include #include #include +#include #include "bgzf.h" static const int WINDOW_SIZE = 64 * 1024; -static int is_ready(int fd) -{ - fd_set fdset; - struct timeval timeout; - FD_ZERO(&fdset); - FD_SET(fd, &fdset); - timeout.tv_sec = 0; timeout.tv_usec = 100000; - return select(1, &fdset, NULL, NULL, &timeout) == 1 ? 1 : 0; -} - static int bgzip_main_usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); fprintf(stderr, " -d decompress\n"); + fprintf(stderr, " -f overwrite files without asking\n"); fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); fprintf(stderr, " -h give this help\n"); @@ -108,27 +100,38 @@ int main(int argc, char **argv) return 1; } if (compress == 1) { - int f_src, f_dst = -1; - if (is_ready(fileno(stdin))) pstdout = 1; - if (argc > optind && !pstdout) { + struct stat sbuf; + int f_src = fileno(stdin); + int f_dst = fileno(stdout); + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { - fprintf(stderr, "[bgzip] Cannot open file: %s\n", argv[optind]); + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } - if (pstdout) { + + if (pstdout) f_dst = fileno(stdout); - } else { - char *name = malloc(sizeof(strlen(argv[optind]) + 5)); + else + { + char *name = malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); f_dst = write_open(name, is_forced); if (f_dst < 0) return 1; free(name); } - } else if (pstdout) { - f_src = fileno(stdin); - f_dst = fileno(stdout); - } else return bgzip_main_usage(); + } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(); + fp = bgzf_fdopen(f_dst, "w"); buffer = malloc(WINDOW_SIZE); while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) @@ -140,25 +143,44 @@ int main(int argc, char **argv) close(f_src); return 0; } else { - int f_dst, is_stdin = 0; - if (argc == optind) pstdout = 1; - if (is_ready(fileno(stdin))) is_stdin = 1; - if (argc <= optind && !is_stdin) return bgzip_main_usage(); - if (argc > optind && !pstdout) { + struct stat sbuf; + int f_dst; + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } char *name; - if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) { + int len = strlen(argv[optind]); + if ( strcmp(argv[optind]+len-3,".gz") ) + { fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); return 1; } + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = write_open(name, is_forced); free(name); - } else f_dst = fileno(stdout); - fp = (argc == optind)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[optind], "r"); - if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); - return 1; + } + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(); + else + { + f_dst = fileno(stdout); + fp = bgzf_fdopen(fileno(stdin), "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } } buffer = malloc(WINDOW_SIZE); if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); diff --git a/index.c b/index.c index 704b037..cfb2938 100644 --- a/index.c +++ b/index.c @@ -1,5 +1,6 @@ #include #include +#include #include "khash.h" #include "ksort.h" #include "kstring.h" @@ -161,6 +162,9 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) // here ->beg is 0-based. intv->beg = intv->end = strtol(str->s + b, &s, 0); if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg; + else ++intv->end; + if (intv->beg < 0) intv->beg = 0; + if (intv->end < 1) intv->end = 1; } else { if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) { if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0); @@ -174,6 +178,7 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) if (op == 'M' || op == 'D' || op == 'N') l += x; s = t + 1; } + if (l == 0) l = 1; intv->end = intv->beg + l; } } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) { @@ -262,6 +267,17 @@ static void merge_chunks(ti_index_t *idx) } // ~for(i) } +static void fill_missing(ti_index_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + ti_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) { int ret; @@ -316,6 +332,7 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) } if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp)); merge_chunks(idx); + fill_missing(idx); free(str->s); free(str); return idx; @@ -523,22 +540,10 @@ static ti_index_t *ti_index_load_core(BGZF *fp) return idx; } -ti_index_t *ti_index_load_local(const char *_fn) +ti_index_t *ti_index_load_local(const char *fnidx) { BGZF *fp; - char *fnidx, *fn; - - if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) { - const char *p; - int l = strlen(_fn); - for (p = _fn + l - 1; p >= _fn; --p) - if (*p == '/') break; - fn = strdup(p + 1); - } else fn = strdup(_fn); - fnidx = (char*)calloc(strlen(fn) + 5, 1); - strcpy(fnidx, fn); strcat(fnidx, ".tbi"); fp = bgzf_open(fnidx, "r"); - free(fnidx); free(fn); if (fp) { ti_index_t *idx = ti_index_load_core(fp); bgzf_close(fp); @@ -584,17 +589,53 @@ static void download_from_remote(const char *url) } #endif +static char *get_local_version(const char *fn) +{ + struct stat sbuf; + char *fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".tbi"); + if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { + char *p, *url; + int l = strlen(fnidx); + for (p = fnidx + l - 1; p >= fnidx; --p) + if (*p == '/') break; + url = fnidx; fnidx = strdup(p + 1); + if (stat(fnidx, &sbuf) == 0) { + free(url); + return fnidx; + } + fprintf(stderr, "[%s] downloading the index file...\n", __func__); + download_from_remote(url); + free(url); + } + if (stat(fnidx, &sbuf) == 0) return fnidx; + free(fnidx); return 0; +} + +int ti_list_chromosomes(const char *fn) +{ + ti_index_t *idx; + char **names; + int i; + khint_t k; + idx = ti_index_load(fn); + names = calloc(idx->n, sizeof(void*)); + for (k = kh_begin(idx->tname); k < kh_end(idx->tname); ++k) + if (kh_exist(idx->tname, k)) + names[kh_val(idx->tname, k)] = (char*)kh_key(idx->tname, k); + for (i = 0; i < idx->n; ++i) printf("%s\n", names[i]); + free(names); + ti_index_destroy(idx); + return 0; +} + ti_index_t *ti_index_load(const char *fn) { ti_index_t *idx; - idx = ti_index_load_local(fn); - if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) { - char *fnidx = calloc(strlen(fn) + 5, 1); - strcat(strcpy(fnidx, fn), ".tbi"); - fprintf(stderr, "[ti_index_load] attempting to download the remote index file.\n"); - download_from_remote(fnidx); - idx = ti_index_load_local(fn); - } + char *fname = get_local_version(fn); + if (fname == 0) return 0; + idx = ti_index_load_local(fname); + free(fname); if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n"); return idx; } @@ -709,7 +750,8 @@ pair64_t *get_chunk_coordinates(const ti_index_t *idx, int tid, int beg, int end bins = (uint16_t*)calloc(MAX_BIN, 2); n_bins = reg2bins(beg, end, bins); index = idx->index[tid]; - min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT]; + min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT]; for (i = n_off = 0; i < n_bins; ++i) { if ((k = kh_get(i, index, bins[i])) != kh_end(index)) n_off += kh_value(index, k).n; diff --git a/knetfile.c b/knetfile.c index 994babb..7c96a3e 100644 --- a/knetfile.c +++ b/knetfile.c @@ -584,7 +584,7 @@ int knet_close(knetFile *fp) else netclose(fp->fd); } free(fp->host); free(fp->port); - free(fp->response); free(fp->retr); // FTP specific + free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific free(fp->path); free(fp->http_host); // HTTP specific free(fp); return 0; diff --git a/main.c b/main.c index e0417b9..520d663 100644 --- a/main.c +++ b/main.c @@ -2,10 +2,11 @@ #include #include #include +#include #include "bgzf.h" #include "tabix.h" -#define PACKAGE_VERSION "0.1.2 (r543)" +#define PACKAGE_VERSION "0.1.5 (r560)" static int fetch_func(int l, const char *s, void *data) { @@ -15,9 +16,9 @@ static int fetch_func(int l, const char *s, void *data) int main(int argc, char *argv[]) { - int c, skip = -1, meta = -1; + int c, skip = -1, meta = -1, list_chrms = 0, force = 0; ti_conf_t conf = ti_conf_gff; - while ((c = getopt(argc, argv, "p:s:b:e:0S:c:")) >= 0) { + while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lf")) >= 0) { switch (c) { case '0': conf.preset |= TI_FLAG_UCSC; break; case 'S': skip = atoi(optarg); break; @@ -36,6 +37,8 @@ int main(int argc, char *argv[]) case 's': conf.sc = atoi(optarg); break; case 'b': conf.bc = atoi(optarg); break; case 'e': conf.ec = atoi(optarg); break; + case 'l': list_chrms = 1; break; + case 'f': force = 1; break; } } if (skip >= 0) conf.line_skip = skip; @@ -52,11 +55,27 @@ int main(int argc, char *argv[]) fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); fprintf(stderr, " -0 zero-based coordinate\n"); + fprintf(stderr, " -l list chromosome names\n"); + fprintf(stderr, " -f force to overwrite the index\n"); fprintf(stderr, "\n"); return 1; } - if (optind + 1 == argc) + if (list_chrms) + return ti_list_chromosomes(argv[optind]); + if (optind + 1 == argc) { + if (force == 0) { + struct stat buf; + char *fnidx = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(fnidx, argv[optind]), ".tbi"); + if (stat(fnidx, &buf) == 0) { + fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n"); + free(fnidx); + return 1; + } + free(fnidx); + } return ti_index_build(argv[optind], &conf); + } { // retrieve BGZF *fp; fp = bgzf_open(argv[optind], "r"); diff --git a/tabix.1 b/tabix.1 index aa6dc75..2657829 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,4 +1,4 @@ -.TH tabix 1 "16 March 2009" "tabix-0.1.1" "Bioinformatics tools" +.TH tabix 1 "5 May 2010" "tabix-0.1.5" "Bioinformatics tools" .SH NAME .PP bgzip - Block compression/decompression utility @@ -15,7 +15,7 @@ tabix - Generic indexer for TAB-delimited genome position files .RI [ file ] .PP .B tabix -.RB [ \-0 ] +.RB [ \-0lf ] .RB [ \-p .R gff|bed|sam|vcf] .RB [ \-s @@ -81,6 +81,12 @@ Skip lines started with character CHAR. [#] .B -0 Specify that the position in the data file is 0-based (e.g. UCSC files) rather than 1-based. +.TP +.B -f +Force to overwrite the index file if it is present. +.TP +.B -l +List the sequence names stored in the index file. .RE .SH EXAMPLE diff --git a/tabix.h b/tabix.h index e608c79..0df29a0 100644 --- a/tabix.h +++ b/tabix.h @@ -1,3 +1,30 @@ +/* The MIT License + + Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + #ifndef __TABIDX_H #define __TABIDX_H @@ -32,6 +59,7 @@ extern "C" { int ti_index_build(const char *fn, const ti_conf_t *conf); ti_index_t *ti_index_load(const char *fn); + int ti_list_chromosomes(const char *fn); void ti_index_destroy(ti_index_t *idx); int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end); int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func); diff --git a/tabix.tex b/tabix.tex new file mode 100644 index 0000000..669d673 --- /dev/null +++ b/tabix.tex @@ -0,0 +1,121 @@ +\documentclass[10pt]{article} +\usepackage{color} +\definecolor{gray}{rgb}{0.7,0.7,0.7} + +\setlength{\topmargin}{0.0cm} +\setlength{\textheight}{21.5cm} +\setlength{\oddsidemargin}{0cm} +\setlength{\textwidth}{16.5cm} +\setlength{\columnsep}{0.6cm} + +\begin{document} + +\title{The Tabix index file format} +\author{Heng Li} +\date{} + +\maketitle + +\begin{center} +\begin{tabular}{|l|l|l|l|l|l|l|} +\hline +\multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\ +\hline\hline +\multicolumn{4}{|l|}{\tt magic} & Magic string & {\tt char[4]} & TBI$\backslash$1 \\ +\hline +\multicolumn{4}{|l|}{\tt n\_ref} & \# sequences & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt format} & Format (0: generic; 1: SAM; 2: VCF) & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt col\_seq} & Column for the sequence name & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt col\_beg} & Column for the start of a region & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt col\_end} & Column for the end of a region & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt meta} & Leading character for comment lines & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt skip} & \# lines to skip at the beginning & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\ +\hline +\multicolumn{4}{|l|}{\tt names} & Concatenated names, each zero terminated & {\tt char[l\_nm]} & \\ +\hline +\multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}}\\ +\cline{2-7} +\hspace{0.1cm} & \multicolumn{3}{l|}{\tt n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\ +\cline{2-7} + & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\ +\cline{3-7} + & \hspace{0.1cm} & \multicolumn{2}{l|}{\tt bin} & Distinct bin number & {\tt uint32\_t} & \\ +\cline{3-7} + & & \multicolumn{2}{l|}{\tt n\_chunk} & \# chunks & {\tt int32\_t} & \\ +\cline{3-7} + & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\ +\cline{4-7} + & & \hspace{0.1cm} & {\tt cnk\_beg} & Virtual file offset of the start of the chunk & {\tt uint64\_t} & \\ +\cline{4-7} + & & & {\tt cnk\_end} & Virtual file offset of the end of the chunk & {\tt uint64\_t} & \\ +\cline{2-7} + & \multicolumn{3}{l|}{\tt n\_intv} & \# 16kb intervals (for the linear index) & {\tt int32\_t} & \\ +\cline{2-7} + & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct intervals (n=n\_intv)}} \\ +\cline{3-7} + & & \multicolumn{2}{l|}{\tt ioff} & File offset of the first record in the interval & {\tt uint64\_t} & \\ +\hline +\end{tabular} +\end{center} + +{\bf Notes:} + +\begin{itemize} +\item The index file is BGZF compressed. +\item All integers are little-endian. +\item When {\tt (format\&0x10000)} is true, the coordinate follows the + {\tt BED} rule (i.e. half-closed-half-open and zero based); otherwise, + the coordinate follows the {\tt GFF} rule (closed and one based). +\item For the SAM format, the end of a region equals {\tt POS} plus the + reference length in the alignment, inferred from {\tt CIGAR}. For the + VCF format, the end of a region equals {\tt POS} plus the size of the + deletion. +\item Field {\tt col\_beg} may equal {\tt col\_end}, and in this case, + the end of a region is {\tt end}={\tt beg+1}. +\item Example. For {\tt GFF}, {\tt format}=0, {\tt col\_seq}=1, {\tt + col\_beg}=4, {\tt col\_end}=5, {\tt meta}=`{\tt \#}' and {\tt + skip}=0. For {\tt BED}, {\tt format}=0x10000, {\tt col\_seq}=1, {\tt + col\_beg}=2, {\tt col\_end}=3, {\tt meta}=`{\tt \#}' and {\tt + skip}=0. +\item Given a zero-based, half-closed and half-open region {\tt + [beg,end)}, the {\tt bin} number is calculated with the following C + function: +\begin{verbatim} +int reg2bin(int beg, int end) { + --end; + if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); + if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); + if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); + if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); + if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); + return 0; +} +\end{verbatim} +\item The list of bins that may overlap a region {\tt [beg,end)} can be + obtained with the following C function. +\begin{verbatim} +#define MAX_BIN (((1<<18)-1)/7) +int reg2bins(int rbeg, int rend, uint16_t list[MAX_BIN]) +{ + int i = 0, k; + --rend; + list[i++] = 0; + for (k = 1 + (rbeg>>26); k <= 1 + (rend>>26); ++k) list[i++] = k; + for (k = 9 + (rbeg>>23); k <= 9 + (rend>>23); ++k) list[i++] = k; + for (k = 73 + (rbeg>>20); k <= 73 + (rend>>20); ++k) list[i++] = k; + for (k = 585 + (rbeg>>17); k <= 585 + (rend>>17); ++k) list[i++] = k; + for (k = 4681 + (rbeg>>14); k <= 4681 + (rend>>14); ++k) list[i++] = k; + return i; // #elements in list[] +} +\end{verbatim} +\end{itemize} + +\end{document} \ No newline at end of file diff --git a/tabix.txt b/tabix.txt deleted file mode 100644 index 5ba263b..0000000 --- a/tabix.txt +++ /dev/null @@ -1,89 +0,0 @@ -tabix(1) Bioinformatics tools tabix(1) - - - -NAME - bgzip - Block compression/decompression utility - - tabix - Generic indexer for TAB-delimited genome position files - -SYNOPSIS - bgzip [-cdh] [-b virtualOffset] [-s size] [file] - - tabix [-0] [-p gff|bed|sam|vcf] [-s seqCol] [-b begCol] [-e endCol] [-S - lineSkip] [-c metaChar] in.tab.bgz [region1 [region2 [...]]] - - -DESCRIPTION - Tabix indexes a TAB-delimited genome position file in.tab.bgz and cre- - ates an index file in.tab.bgz.tbi when region is absent from the com- - mand-line. The input data file must be position sorted and compressed - by bgzip which has a gzip(1) like interface. After indexing, tabix is - able to quickly retrieve data lines overlapping regions specified in - the format "chr:beginPos-endPos". Fast data retrieval also works over - network if URI is given as a file name and in this case the index file - will be downloaded if it is not present locally. - - -OPTIONS OF TABIX - -p STR Input format for indexing. Valid values are: gff, bed, sam, - vcf and psltab. This option should not be applied together - with any of -s, -b, -e, -c and -0; it is not used for data - retrieval because this setting is stored in the index file. - [gff] - - -s INT Column of sequence name. Option -s, -b, -e, -S, -c and -0 are - all stored in the index file and thus not used in data - retrieval. [1] - - -b INT Column of start chromosomal position. [4] - - -e INT Column of end chromosomal position. [5] - - -S INT Skip first INT lines in the data file. [0] - - -c CHAR Skip lines started with character CHAR. [#] - - -0 Specify that the position in the data file is 0-based (e.g. - UCSC files) rather than 1-based. - - -EXAMPLE - grep -v ^"#" unsorted.gff | sort -k1,1 -k4,4n | bgzip -c > - sorted.gff.gz; - - tabix -p gff sorted.gff.gz; - - tabix sorted.gff.gz chr1:10,000,000-20,000,000; - - -NOTES - It is straightforward to achieve overlap queries using the standard B- - tree index (with or without binning) implemented in all SQL databases, - or the R-tree index in PostgreSQL and Oracle. But there are still many - reasons to use tabix. Firstly, tabix directly works with a lot of - widely used TAB-delimited formats such as GFF/GTF and BED. We do not - need to design database schema or specialized binary formats. Data do - not need to be duplicated in different formats, either. Secondly, tabix - works on compressed data files while most SQL databases do not. The - GenCode annotation GTF can be compressed down to 4%. Thirdly, tabix is - fast. The same indexing algorithm is known to work efficiently for an - alignment with a few billion short reads. SQL databases probably cannot - easily handle data at this scale. Last but not the least, tabix sup- - ports remote data retrieval. One can put the data file and the index at - an FTP or HTTP server, and other users or even web services will be - able to get a slice without downloading the entire file. - - -AUTHOR - Tabix was written by Heng Li. The BGZF library was originally imple- - mented by Bob Handsaker and modified by Heng Li for remote file access - and in-memory caching. - - -SEE ALSO - samtools(1) - - - -tabix-0.1.0 2 November 2009 tabix(1) -- 2.30.2