bgzip:bgzip.o bgzf.o knetfile.o
$(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz
+TabixReader.class:TabixReader.java
+ javac -cp .:sam.jar TabixReader.java
+
kstring.o:kstring.h
knetfile.o:knetfile.h
bgzf.o:bgzf.h knetfile.h
main.o:tabix.h kstring.h bgzf.h
bgzip.o:bgzf.h
+tabix.pdf:tabix.tex
+ pdflatex tabix.tex
+
cleanlocal:
- rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+ rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a tabix.aux tabix.log tabix.pdf *.class
clean:cleanlocal-recur
--- /dev/null
+Beta Release 0.1.5 (5 May, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Clarify that tabix is released under MIT/X11.
+
+ * Improve the robustness of indexing and retrieval.
+
+ * Reduce the number of seek calls when the specified region starts from
+ a 16kb block with no data. The index format is the same, but the
+ content is changed a little.
+
+(0.1.5: 5 May 2010, r560)
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2010 Broad Institute.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <hengli@broadinstitute.org> */
+
+import net.sf.samtools.util.BlockCompressedInputStream;
+
+import java.io.*;
+import java.nio.*;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.lang.StringBuffer;
+
+public class TabixReader
+{
+ private String mFn;
+ private BlockCompressedInputStream mFp;
+
+ private int mPreset;
+ private int mSc;
+ private int mBc;
+ private int mEc;
+ private int mMeta;
+ private int mSkip;
+ private String[] mSeq;
+
+ private HashMap<String, Integer> mChr2tid;
+
+ private static int MAX_BIN = 37450;
+ private static int TAD_MIN_CHUNK_GAP = 32768;
+ private static int TAD_LIDX_SHIFT = 14;
+
+ private class TPair64 implements Comparable<TPair64> {
+ long u, v;
+ public TPair64(final long _u, final long _v) {
+ u = _u; v = _v;
+ }
+ public TPair64(final TPair64 p) {
+ u = p.u; v = p.v;
+ }
+ public int compareTo(final TPair64 p) {
+ return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
+ }
+ };
+
+ private class TIndex {
+ HashMap<Integer, TPair64[]> b; // binning index
+ long[] l; // linear index
+ };
+ private TIndex[] mIndex;
+
+ private class TIntv {
+ int tid, beg, end;
+ };
+
+ private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
+ return (u < v) ^ (u < 0) ^ (v < 0);
+ }
+
+ /**
+ * The constructor
+ *
+ * @param fn File name of the data file
+ */
+ public TabixReader(final String fn) throws IOException {
+ mFn = fn;
+ mFp = new BlockCompressedInputStream(new File(fn));
+ readIndex();
+ }
+
+ private static int reg2bins(final int beg, final int _end, final int[] list) {
+ int i = 0, k, end = _end;
+ --end;
+ list[i++] = 0;
+ for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
+ for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
+ for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
+ for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
+ for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+ return i;
+ }
+
+ public static int readInt(final InputStream is) throws IOException {
+ byte[] buf = new byte[4];
+ is.read(buf);
+ return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
+ }
+
+ public static long readLong(final InputStream is) throws IOException {
+ byte[] buf = new byte[8];
+ is.read(buf);
+ return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
+ }
+
+ public static String readLine(final InputStream is) throws IOException {
+ StringBuffer buf = new StringBuffer();
+ int c;
+ while ((c = is.read()) >= 0 && c != '\n')
+ buf.append((char)c);
+ if (c < 0) return null;
+ return buf.toString();
+ }
+
+ /**
+ * Read the Tabix index from a file
+ *
+ * @param fp File pointer
+ */
+ public void readIndex(final File fp) throws IOException {
+ if (fp == null) return;
+ BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
+ byte[] buf = new byte[4];
+
+ is.read(buf, 0, 4); // read "TBI\1"
+ mSeq = new String[readInt(is)]; // # sequences
+ mChr2tid = new HashMap<String, Integer>();
+ mPreset = readInt(is);
+ mSc = readInt(is);
+ mBc = readInt(is);
+ mEc = readInt(is);
+ mMeta = readInt(is);
+ mSkip = readInt(is);
+ // read sequence dictionary
+ int i, j, k, l = readInt(is);
+ buf = new byte[l];
+ is.read(buf);
+ for (i = j = k = 0; i < buf.length; ++i) {
+ if (buf[i] == 0) {
+ byte[] b = new byte[i - j];
+ System.arraycopy(buf, j, b, 0, b.length);
+ String s = new String(b);
+ mChr2tid.put(s, k);
+ mSeq[k++] = s;
+ j = i + 1;
+ }
+ }
+ // read the index
+ mIndex = new TIndex[mSeq.length];
+ for (i = 0; i < mSeq.length; ++i) {
+ // the binning index
+ int n_bin = readInt(is);
+ mIndex[i] = new TIndex();
+ mIndex[i].b = new HashMap<Integer, TPair64[]>();
+ for (j = 0; j < n_bin; ++j) {
+ int bin = readInt(is);
+ TPair64[] chunks = new TPair64[readInt(is)];
+ for (k = 0; k < chunks.length; ++k) {
+ long u = readLong(is);
+ long v = readLong(is);
+ chunks[k] = new TPair64(u, v); // in C, this is inefficient
+ }
+ mIndex[i].b.put(bin, chunks);
+ }
+ // the linear index
+ mIndex[i].l = new long[readInt(is)];
+ for (k = 0; k < mIndex[i].l.length; ++k)
+ mIndex[i].l[k] = readLong(is);
+ }
+ // close
+ is.close();
+ }
+
+ /**
+ * Read the Tabix index from the default file.
+ */
+ public void readIndex() throws IOException {
+ readIndex(new File(mFn + ".tbi"));
+ }
+
+ /**
+ * Read one line from the data file.
+ */
+ public String readLine() throws IOException {
+ return readLine(mFp);
+ }
+
+ private int chr2tid(final String chr) {
+ if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
+ else return -1;
+ }
+
+ /**
+ * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
+ *
+ * @param reg Region string
+ * @return An array where the three elements are sequence_id,
+ * region_begin and region_end. On failure, sequence_id==-1.
+ */
+ public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
+ String chr;
+ int colon, hyphen;
+ int[] ret = new int[3];
+ colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
+ chr = colon >= 0? reg.substring(0, colon) : reg;
+ ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen)) - 1 : 0;
+ ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
+ ret[0] = chr2tid(chr);
+ return ret;
+ }
+
+ private TIntv getIntv(final String s) {
+ TIntv intv = new TIntv();
+ int col = 0, end = 0, beg = 0;
+ while ((end = s.indexOf('\t', beg)) >= 0) {
+ ++col;
+ if (col == mSc) {
+ intv.tid = chr2tid(s.substring(beg, end));
+ } else if (col == mBc) {
+ intv.beg = intv.end = Integer.parseInt(s.substring(beg, end));
+ if ((mPreset&0x10000) != 0) ++intv.end;
+ else --intv.beg;
+ if (intv.beg < 0) intv.beg = 0;
+ if (intv.end < 1) intv.end = 1;
+ } else { // FIXME: SAM/VCF supports are not tested yet
+ if ((mPreset&0xffff) == 0) { // generic
+ if (col == mEc)
+ intv.end = Integer.parseInt(s.substring(beg, end));
+ } else if ((mPreset&0xffff) == 1) { // SAM
+ if (col == 6) { // CIGAR
+ int l = 0, i, j;
+ String cigar = s.substring(beg, end);
+ for (i = j = 0; i < cigar.length(); ++i) {
+ if (cigar.charAt(i) > '9') {
+ int op = cigar.charAt(i);
+ if (op == 'M' || op == 'D' || op == 'N')
+ l += Integer.parseInt(cigar.substring(j, i));
+ }
+ }
+ intv.end = intv.beg + l;
+ }
+ } else if ((mPreset&0xffff) == 2) { // VCF
+ if (col == 5) {
+ String alt = s.substring(beg, end);
+ int i, max = 1;
+ for (i = 0; i < alt.length(); ++i) {
+ if (alt.charAt(i) == 'D') { // deletion
+ int j;
+ for (j = i; j < alt.length() && alt.charAt(j) >= '0' && alt.charAt(j) <= '9'; ++j);
+ int l = Integer.parseInt(alt.substring(i, j));
+ if (max < l) max = l;
+ i = j - 1;
+ }
+ }
+ intv.end = intv.beg + max;
+ }
+ }
+ }
+ beg = end + 1;
+ }
+ return intv;
+ }
+
+ public class Iterator {
+ private int i, n_seeks;
+ private int tid, beg, end;
+ private TPair64[] off;
+ private long curr_off;
+ private boolean iseof;
+
+ public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
+ i = -1; n_seeks = 0; curr_off = 0; iseof = false;
+ off = _off; tid = _tid; beg = _beg; end = _end;
+ }
+
+ public String next() throws IOException {
+ if (iseof) return null;
+ for (;;) {
+ if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
+ if (i == off.length - 1) break; // no more chunks
+ if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+ if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+ mFp.seek(off[i+1].u);
+ curr_off = mFp.getFilePointer();
+ ++n_seeks;
+ }
+ ++i;
+ }
+ String s;
+ if ((s = readLine(mFp)) != null) {
+ TIntv intv;
+ char[] str = s.toCharArray();
+ curr_off = mFp.getFilePointer();
+ if (str.length == 0 || str[0] == mMeta) continue;
+ intv = getIntv(s);
+ if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
+ else if (intv.end > beg && intv.beg < end) return s; // overlap; return
+ } else break; // end of file
+ }
+ iseof = true;
+ return null;
+ }
+ };
+
+ public Iterator query(final int tid, final int beg, final int end) {
+ TPair64[] off, chunks;
+ long min_off;
+ TIndex idx = mIndex[tid];
+ int[] bins = new int[MAX_BIN];
+ int i, l, n_off, n_bins = reg2bins(beg, end, bins);
+ min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
+ for (i = n_off = 0; i < n_bins; ++i) {
+ if ((chunks = idx.b.get(bins[i])) != null)
+ n_off += chunks.length;
+ }
+ if (n_off == 0) return null;
+ off = new TPair64[n_off];
+ for (i = n_off = 0; i < n_bins; ++i)
+ if ((chunks = idx.b.get(bins[i])) != null)
+ for (int j = 0; j < chunks.length; ++j)
+ if (less64(min_off, chunks[j].v))
+ off[n_off++] = new TPair64(chunks[j]);
+ Arrays.sort(off, 0, n_off);
+ // resolve completely contained adjacent blocks
+ for (i = 1, l = 0; i < n_off; ++i) {
+ if (less64(off[l].v, off[i].v)) {
+ ++l;
+ off[l].u = off[i].u; off[l].v = off[i].v;
+ }
+ }
+ n_off = l + 1;
+ // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+ for (i = 1; i < n_off; ++i)
+ if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
+ // merge adjacent blocks
+ for (i = 1, l = 0; i < n_off; ++i) {
+ if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+ else {
+ ++l;
+ off[l].u = off[i].u;
+ off[l].v = off[i].v;
+ }
+ }
+ n_off = l + 1;
+ // return
+ TPair64[] ret = new TPair64[n_off];
+ for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
+ return new TabixReader.Iterator(tid, beg, end, ret);
+ }
+
+ public Iterator query(final String reg) {
+ int[] x = parseReg(reg);
+ return query(x[0], x[1], x[2]);
+ }
+
+ public static void main(String[] args) {
+ if (args.length < 1) {
+ System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
+ System.exit(1);
+ }
+ try {
+ TabixReader tr = new TabixReader(args[0]);
+ String s;
+ if (args.length == 1) { // no region is specified; print the whole file
+ while ((s = tr.readLine()) != null)
+ System.out.println(s);
+ } else { // a region is specified; random access
+ TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
+ while ((s = iter.next()) != null)
+ System.out.println(s);
+ }
+ } catch (IOException e) {
+ }
+ }
+}
#include <unistd.h>
#include <errno.h>
#include <sys/select.h>
+#include <sys/stat.h>
#include "bgzf.h"
static const int WINDOW_SIZE = 64 * 1024;
-static int is_ready(int fd)
-{
- fd_set fdset;
- struct timeval timeout;
- FD_ZERO(&fdset);
- FD_SET(fd, &fdset);
- timeout.tv_sec = 0; timeout.tv_usec = 100000;
- return select(1, &fdset, NULL, NULL, &timeout) == 1 ? 1 : 0;
-}
-
static int bgzip_main_usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n");
fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n");
fprintf(stderr, " -d decompress\n");
+ fprintf(stderr, " -f overwrite files without asking\n");
fprintf(stderr, " -b INT decompress at virtual file pointer INT\n");
fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n");
fprintf(stderr, " -h give this help\n");
return 1;
}
if (compress == 1) {
- int f_src, f_dst = -1;
- if (is_ready(fileno(stdin))) pstdout = 1;
- if (argc > optind && !pstdout) {
+ struct stat sbuf;
+ int f_src = fileno(stdin);
+ int f_dst = fileno(stdout);
+
+ if ( argc>optind )
+ {
+ if ( stat(argv[optind],&sbuf)<0 )
+ {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+ return 1;
+ }
+
if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
- fprintf(stderr, "[bgzip] Cannot open file: %s\n", argv[optind]);
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
return 1;
}
- if (pstdout) {
+
+ if (pstdout)
f_dst = fileno(stdout);
- } else {
- char *name = malloc(sizeof(strlen(argv[optind]) + 5));
+ else
+ {
+ char *name = malloc(strlen(argv[optind]) + 5);
strcpy(name, argv[optind]);
strcat(name, ".gz");
f_dst = write_open(name, is_forced);
if (f_dst < 0) return 1;
free(name);
}
- } else if (pstdout) {
- f_src = fileno(stdin);
- f_dst = fileno(stdout);
- } else return bgzip_main_usage();
+ }
+ else if (!pstdout && isatty(fileno((FILE *)stdout)) )
+ return bgzip_main_usage();
+
fp = bgzf_fdopen(f_dst, "w");
buffer = malloc(WINDOW_SIZE);
while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
close(f_src);
return 0;
} else {
- int f_dst, is_stdin = 0;
- if (argc == optind) pstdout = 1;
- if (is_ready(fileno(stdin))) is_stdin = 1;
- if (argc <= optind && !is_stdin) return bgzip_main_usage();
- if (argc > optind && !pstdout) {
+ struct stat sbuf;
+ int f_dst;
+
+ if ( argc>optind )
+ {
+ if ( stat(argv[optind],&sbuf)<0 )
+ {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+ return 1;
+ }
char *name;
- if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) {
+ int len = strlen(argv[optind]);
+ if ( strcmp(argv[optind]+len-3,".gz") )
+ {
fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
return 1;
}
+ fp = bgzf_open(argv[optind], "r");
+ if (fp == NULL) {
+ fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
+ return 1;
+ }
+
name = strdup(argv[optind]);
name[strlen(name) - 3] = '\0';
f_dst = write_open(name, is_forced);
free(name);
- } else f_dst = fileno(stdout);
- fp = (argc == optind)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[optind], "r");
- if (fp == NULL) {
- fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
- return 1;
+ }
+ else if (!pstdout && isatty(fileno((FILE *)stdin)) )
+ return bgzip_main_usage();
+ else
+ {
+ f_dst = fileno(stdout);
+ fp = bgzf_fdopen(fileno(stdin), "r");
+ if (fp == NULL) {
+ fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
+ return 1;
+ }
}
buffer = malloc(WINDOW_SIZE);
if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
#include <ctype.h>
#include <assert.h>
+#include <sys/stat.h>
#include "khash.h"
#include "ksort.h"
#include "kstring.h"
// here ->beg is 0-based.
intv->beg = intv->end = strtol(str->s + b, &s, 0);
if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg;
+ else ++intv->end;
+ if (intv->beg < 0) intv->beg = 0;
+ if (intv->end < 1) intv->end = 1;
} else {
if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) {
if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0);
if (op == 'M' || op == 'D' || op == 'N') l += x;
s = t + 1;
}
+ if (l == 0) l = 1;
intv->end = intv->beg + l;
}
} else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) {
} // ~for(i)
}
+static void fill_missing(ti_index_t *idx)
+{
+ int i, j;
+ for (i = 0; i < idx->n; ++i) {
+ ti_lidx_t *idx2 = &idx->index2[i];
+ for (j = 1; j < idx2->n; ++j)
+ if (idx2->offset[j] == 0)
+ idx2->offset[j] = idx2->offset[j-1];
+ }
+}
+
ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
{
int ret;
}
if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp));
merge_chunks(idx);
+ fill_missing(idx);
free(str->s); free(str);
return idx;
return idx;
}
-ti_index_t *ti_index_load_local(const char *_fn)
+ti_index_t *ti_index_load_local(const char *fnidx)
{
BGZF *fp;
- char *fnidx, *fn;
-
- if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
- const char *p;
- int l = strlen(_fn);
- for (p = _fn + l - 1; p >= _fn; --p)
- if (*p == '/') break;
- fn = strdup(p + 1);
- } else fn = strdup(_fn);
- fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcpy(fnidx, fn); strcat(fnidx, ".tbi");
fp = bgzf_open(fnidx, "r");
- free(fnidx); free(fn);
if (fp) {
ti_index_t *idx = ti_index_load_core(fp);
bgzf_close(fp);
}
#endif
+static char *get_local_version(const char *fn)
+{
+ struct stat sbuf;
+ char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
+ strcat(strcpy(fnidx, fn), ".tbi");
+ if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
+ char *p, *url;
+ int l = strlen(fnidx);
+ for (p = fnidx + l - 1; p >= fnidx; --p)
+ if (*p == '/') break;
+ url = fnidx; fnidx = strdup(p + 1);
+ if (stat(fnidx, &sbuf) == 0) {
+ free(url);
+ return fnidx;
+ }
+ fprintf(stderr, "[%s] downloading the index file...\n", __func__);
+ download_from_remote(url);
+ free(url);
+ }
+ if (stat(fnidx, &sbuf) == 0) return fnidx;
+ free(fnidx); return 0;
+}
+
+int ti_list_chromosomes(const char *fn)
+{
+ ti_index_t *idx;
+ char **names;
+ int i;
+ khint_t k;
+ idx = ti_index_load(fn);
+ names = calloc(idx->n, sizeof(void*));
+ for (k = kh_begin(idx->tname); k < kh_end(idx->tname); ++k)
+ if (kh_exist(idx->tname, k))
+ names[kh_val(idx->tname, k)] = (char*)kh_key(idx->tname, k);
+ for (i = 0; i < idx->n; ++i) printf("%s\n", names[i]);
+ free(names);
+ ti_index_destroy(idx);
+ return 0;
+}
+
ti_index_t *ti_index_load(const char *fn)
{
ti_index_t *idx;
- idx = ti_index_load_local(fn);
- if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
- char *fnidx = calloc(strlen(fn) + 5, 1);
- strcat(strcpy(fnidx, fn), ".tbi");
- fprintf(stderr, "[ti_index_load] attempting to download the remote index file.\n");
- download_from_remote(fnidx);
- idx = ti_index_load_local(fn);
- }
+ char *fname = get_local_version(fn);
+ if (fname == 0) return 0;
+ idx = ti_index_load_local(fname);
+ free(fname);
if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n");
return idx;
}
bins = (uint16_t*)calloc(MAX_BIN, 2);
n_bins = reg2bins(beg, end, bins);
index = idx->index[tid];
- min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
+ min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+ : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
for (i = n_off = 0; i < n_bins; ++i) {
if ((k = kh_get(i, index, bins[i])) != kh_end(index))
n_off += kh_value(index, k).n;
else netclose(fp->fd);
}
free(fp->host); free(fp->port);
- free(fp->response); free(fp->retr); // FTP specific
+ free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
free(fp->path); free(fp->http_host); // HTTP specific
free(fp);
return 0;
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
+#include <sys/stat.h>
#include "bgzf.h"
#include "tabix.h"
-#define PACKAGE_VERSION "0.1.2 (r543)"
+#define PACKAGE_VERSION "0.1.5 (r560)"
static int fetch_func(int l, const char *s, void *data)
{
int main(int argc, char *argv[])
{
- int c, skip = -1, meta = -1;
+ int c, skip = -1, meta = -1, list_chrms = 0, force = 0;
ti_conf_t conf = ti_conf_gff;
- while ((c = getopt(argc, argv, "p:s:b:e:0S:c:")) >= 0) {
+ while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lf")) >= 0) {
switch (c) {
case '0': conf.preset |= TI_FLAG_UCSC; break;
case 'S': skip = atoi(optarg); break;
case 's': conf.sc = atoi(optarg); break;
case 'b': conf.bc = atoi(optarg); break;
case 'e': conf.ec = atoi(optarg); break;
+ case 'l': list_chrms = 1; break;
+ case 'f': force = 1; break;
}
}
if (skip >= 0) conf.line_skip = skip;
fprintf(stderr, " -S INT skip first INT lines [0]\n");
fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n");
fprintf(stderr, " -0 zero-based coordinate\n");
+ fprintf(stderr, " -l list chromosome names\n");
+ fprintf(stderr, " -f force to overwrite the index\n");
fprintf(stderr, "\n");
return 1;
}
- if (optind + 1 == argc)
+ if (list_chrms)
+ return ti_list_chromosomes(argv[optind]);
+ if (optind + 1 == argc) {
+ if (force == 0) {
+ struct stat buf;
+ char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
+ strcat(strcpy(fnidx, argv[optind]), ".tbi");
+ if (stat(fnidx, &buf) == 0) {
+ fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
+ free(fnidx);
+ return 1;
+ }
+ free(fnidx);
+ }
return ti_index_build(argv[optind], &conf);
+ }
{ // retrieve
BGZF *fp;
fp = bgzf_open(argv[optind], "r");
-.TH tabix 1 "16 March 2009" "tabix-0.1.1" "Bioinformatics tools"
+.TH tabix 1 "5 May 2010" "tabix-0.1.5" "Bioinformatics tools"
.SH NAME
.PP
bgzip - Block compression/decompression utility
.RI [ file ]
.PP
.B tabix
-.RB [ \-0 ]
+.RB [ \-0lf ]
.RB [ \-p
.R gff|bed|sam|vcf]
.RB [ \-s
.B -0
Specify that the position in the data file is 0-based (e.g. UCSC files)
rather than 1-based.
+.TP
+.B -f
+Force to overwrite the index file if it is present.
+.TP
+.B -l
+List the sequence names stored in the index file.
.RE
.SH EXAMPLE
+/* The MIT License
+
+ Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@live.co.uk> */
+
#ifndef __TABIDX_H
#define __TABIDX_H
int ti_index_build(const char *fn, const ti_conf_t *conf);
ti_index_t *ti_index_load(const char *fn);
+ int ti_list_chromosomes(const char *fn);
void ti_index_destroy(ti_index_t *idx);
int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end);
int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func);
--- /dev/null
+\documentclass[10pt]{article}
+\usepackage{color}
+\definecolor{gray}{rgb}{0.7,0.7,0.7}
+
+\setlength{\topmargin}{0.0cm}
+\setlength{\textheight}{21.5cm}
+\setlength{\oddsidemargin}{0cm}
+\setlength{\textwidth}{16.5cm}
+\setlength{\columnsep}{0.6cm}
+
+\begin{document}
+
+\title{The Tabix index file format}
+\author{Heng Li}
+\date{}
+
+\maketitle
+
+\begin{center}
+\begin{tabular}{|l|l|l|l|l|l|l|}
+\hline
+\multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\
+\hline\hline
+\multicolumn{4}{|l|}{\tt magic} & Magic string & {\tt char[4]} & TBI$\backslash$1 \\
+\hline
+\multicolumn{4}{|l|}{\tt n\_ref} & \# sequences & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt format} & Format (0: generic; 1: SAM; 2: VCF) & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_seq} & Column for the sequence name & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_beg} & Column for the start of a region & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_end} & Column for the end of a region & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt meta} & Leading character for comment lines & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt skip} & \# lines to skip at the beginning & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt names} & Concatenated names, each zero terminated & {\tt char[l\_nm]} & \\
+\hline
+\multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}}\\
+\cline{2-7}
+\hspace{0.1cm} & \multicolumn{3}{l|}{\tt n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\
+\cline{2-7}
+ & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\
+\cline{3-7}
+ & \hspace{0.1cm} & \multicolumn{2}{l|}{\tt bin} & Distinct bin number & {\tt uint32\_t} & \\
+\cline{3-7}
+ & & \multicolumn{2}{l|}{\tt n\_chunk} & \# chunks & {\tt int32\_t} & \\
+\cline{3-7}
+ & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\
+\cline{4-7}
+ & & \hspace{0.1cm} & {\tt cnk\_beg} & Virtual file offset of the start of the chunk & {\tt uint64\_t} & \\
+\cline{4-7}
+ & & & {\tt cnk\_end} & Virtual file offset of the end of the chunk & {\tt uint64\_t} & \\
+\cline{2-7}
+ & \multicolumn{3}{l|}{\tt n\_intv} & \# 16kb intervals (for the linear index) & {\tt int32\_t} & \\
+\cline{2-7}
+ & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct intervals (n=n\_intv)}} \\
+\cline{3-7}
+ & & \multicolumn{2}{l|}{\tt ioff} & File offset of the first record in the interval & {\tt uint64\_t} & \\
+\hline
+\end{tabular}
+\end{center}
+
+{\bf Notes:}
+
+\begin{itemize}
+\item The index file is BGZF compressed.
+\item All integers are little-endian.
+\item When {\tt (format\&0x10000)} is true, the coordinate follows the
+ {\tt BED} rule (i.e. half-closed-half-open and zero based); otherwise,
+ the coordinate follows the {\tt GFF} rule (closed and one based).
+\item For the SAM format, the end of a region equals {\tt POS} plus the
+ reference length in the alignment, inferred from {\tt CIGAR}. For the
+ VCF format, the end of a region equals {\tt POS} plus the size of the
+ deletion.
+\item Field {\tt col\_beg} may equal {\tt col\_end}, and in this case,
+ the end of a region is {\tt end}={\tt beg+1}.
+\item Example. For {\tt GFF}, {\tt format}=0, {\tt col\_seq}=1, {\tt
+ col\_beg}=4, {\tt col\_end}=5, {\tt meta}=`{\tt \#}' and {\tt
+ skip}=0. For {\tt BED}, {\tt format}=0x10000, {\tt col\_seq}=1, {\tt
+ col\_beg}=2, {\tt col\_end}=3, {\tt meta}=`{\tt \#}' and {\tt
+ skip}=0.
+\item Given a zero-based, half-closed and half-open region {\tt
+ [beg,end)}, the {\tt bin} number is calculated with the following C
+ function:
+\begin{verbatim}
+int reg2bin(int beg, int end) {
+ --end;
+ if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
+ if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
+ if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
+ if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
+ if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);
+ return 0;
+}
+\end{verbatim}
+\item The list of bins that may overlap a region {\tt [beg,end)} can be
+ obtained with the following C function.
+\begin{verbatim}
+#define MAX_BIN (((1<<18)-1)/7)
+int reg2bins(int rbeg, int rend, uint16_t list[MAX_BIN])
+{
+ int i = 0, k;
+ --rend;
+ list[i++] = 0;
+ for (k = 1 + (rbeg>>26); k <= 1 + (rend>>26); ++k) list[i++] = k;
+ for (k = 9 + (rbeg>>23); k <= 9 + (rend>>23); ++k) list[i++] = k;
+ for (k = 73 + (rbeg>>20); k <= 73 + (rend>>20); ++k) list[i++] = k;
+ for (k = 585 + (rbeg>>17); k <= 585 + (rend>>17); ++k) list[i++] = k;
+ for (k = 4681 + (rbeg>>14); k <= 4681 + (rend>>14); ++k) list[i++] = k;
+ return i; // #elements in list[]
+}
+\end{verbatim}
+\end{itemize}
+
+\end{document}
\ No newline at end of file
+++ /dev/null
-tabix(1) Bioinformatics tools tabix(1)
-
-
-
-NAME
- bgzip - Block compression/decompression utility
-
- tabix - Generic indexer for TAB-delimited genome position files
-
-SYNOPSIS
- bgzip [-cdh] [-b virtualOffset] [-s size] [file]
-
- tabix [-0] [-p gff|bed|sam|vcf] [-s seqCol] [-b begCol] [-e endCol] [-S
- lineSkip] [-c metaChar] in.tab.bgz [region1 [region2 [...]]]
-
-
-DESCRIPTION
- Tabix indexes a TAB-delimited genome position file in.tab.bgz and cre-
- ates an index file in.tab.bgz.tbi when region is absent from the com-
- mand-line. The input data file must be position sorted and compressed
- by bgzip which has a gzip(1) like interface. After indexing, tabix is
- able to quickly retrieve data lines overlapping regions specified in
- the format "chr:beginPos-endPos". Fast data retrieval also works over
- network if URI is given as a file name and in this case the index file
- will be downloaded if it is not present locally.
-
-
-OPTIONS OF TABIX
- -p STR Input format for indexing. Valid values are: gff, bed, sam,
- vcf and psltab. This option should not be applied together
- with any of -s, -b, -e, -c and -0; it is not used for data
- retrieval because this setting is stored in the index file.
- [gff]
-
- -s INT Column of sequence name. Option -s, -b, -e, -S, -c and -0 are
- all stored in the index file and thus not used in data
- retrieval. [1]
-
- -b INT Column of start chromosomal position. [4]
-
- -e INT Column of end chromosomal position. [5]
-
- -S INT Skip first INT lines in the data file. [0]
-
- -c CHAR Skip lines started with character CHAR. [#]
-
- -0 Specify that the position in the data file is 0-based (e.g.
- UCSC files) rather than 1-based.
-
-
-EXAMPLE
- grep -v ^"#" unsorted.gff | sort -k1,1 -k4,4n | bgzip -c >
- sorted.gff.gz;
-
- tabix -p gff sorted.gff.gz;
-
- tabix sorted.gff.gz chr1:10,000,000-20,000,000;
-
-
-NOTES
- It is straightforward to achieve overlap queries using the standard B-
- tree index (with or without binning) implemented in all SQL databases,
- or the R-tree index in PostgreSQL and Oracle. But there are still many
- reasons to use tabix. Firstly, tabix directly works with a lot of
- widely used TAB-delimited formats such as GFF/GTF and BED. We do not
- need to design database schema or specialized binary formats. Data do
- not need to be duplicated in different formats, either. Secondly, tabix
- works on compressed data files while most SQL databases do not. The
- GenCode annotation GTF can be compressed down to 4%. Thirdly, tabix is
- fast. The same indexing algorithm is known to work efficiently for an
- alignment with a few billion short reads. SQL databases probably cannot
- easily handle data at this scale. Last but not the least, tabix sup-
- ports remote data retrieval. One can put the data file and the index at
- an FTP or HTTP server, and other users or even web services will be
- able to get a slice without downloading the entire file.
-
-
-AUTHOR
- Tabix was written by Heng Li. The BGZF library was originally imple-
- mented by Bob Handsaker and modified by Heng Li for remote file access
- and in-memory caching.
-
-
-SEE ALSO
- samtools(1)
-
-
-
-tabix-0.1.0 2 November 2009 tabix(1)