From c70c92b7a385548d8e670638a93a083401f49e4d Mon Sep 17 00:00:00 2001
From: Charles Plessy <plessy@debian.org>
Date: Thu, 19 Aug 2010 15:49:36 +0900
Subject: [PATCH] Imported Upstream version 0.1.5

---
 Makefile         |   8 +-
 NEWS             |  14 ++
 TabixReader.java | 387 +++++++++++++++++++++++++++++++++++++++++++++++
 bgzip.c          |  86 +++++++----
 index.c          |  86 ++++++++---
 knetfile.c       |   2 +-
 main.c           |  27 +++-
 tabix.1          |  10 +-
 tabix.h          |  28 ++++
 tabix.tex        | 121 +++++++++++++++
 tabix.txt        |  89 -----------
 11 files changed, 707 insertions(+), 151 deletions(-)
 create mode 100644 NEWS
 create mode 100644 TabixReader.java
 create mode 100644 tabix.tex
 delete mode 100644 tabix.txt
diff --git a/Makefile b/Makefile
index 3d7153d..a51c175 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,9 @@ tabix:lib $(AOBJS)
 bgzip:bgzip.o bgzf.o knetfile.o
 		$(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz
 
+TabixReader.class:TabixReader.java
+		javac -cp .:sam.jar TabixReader.java
+
 kstring.o:kstring.h
 knetfile.o:knetfile.h
 bgzf.o:bgzf.h knetfile.h
@@ -44,7 +47,10 @@ index.o:bgzf.h tabix.h khash.h ksort.h kstring.h
 main.o:tabix.h kstring.h bgzf.h
 bgzip.o:bgzf.h
 
+tabix.pdf:tabix.tex
+		pdflatex tabix.tex
+
 cleanlocal:
-		rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+		rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a tabix.aux tabix.log tabix.pdf *.class
 
 clean:cleanlocal-recur
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..aaa95e0
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,14 @@
+Beta Release 0.1.5 (5 May, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Clarify that tabix is released under MIT/X11.
+
+ * Improve the robustness of indexing and retrieval.
+
+ * Reduce the number of seek calls when the specified region starts from
+   a 16kb block with no data. The index format is the same, but the
+   content is changed a little.
+
+(0.1.5: 5 May 2010, r560)
diff --git a/TabixReader.java b/TabixReader.java
new file mode 100644
index 0000000..acaff24
--- /dev/null
+++ b/TabixReader.java
@@ -0,0 +1,387 @@
+/* The MIT License
+
+   Copyright (c) 2010 Broad Institute.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <hengli@broadinstitute.org> */
+
+import net.sf.samtools.util.BlockCompressedInputStream;
+
+import java.io.*;
+import java.nio.*;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.lang.StringBuffer;
+
+public class TabixReader
+{
+	private String mFn;
+	private BlockCompressedInputStream mFp;
+
+	private int mPreset;
+	private int mSc;
+	private int mBc;
+	private int mEc;
+	private int mMeta;
+	private int mSkip;
+	private String[] mSeq;
+
+	private HashMap<String, Integer> mChr2tid;
+
+	private static int MAX_BIN = 37450;
+	private static int TAD_MIN_CHUNK_GAP = 32768;
+	private static int TAD_LIDX_SHIFT = 14;
+
+	private class TPair64 implements Comparable<TPair64> {
+		long u, v;
+		public TPair64(final long _u, final long _v) {
+			u = _u; v = _v;
+		}
+		public TPair64(final TPair64 p) {
+			u = p.u; v = p.v;
+		}
+		public int compareTo(final TPair64 p) {
+			return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
+		}
+	};
+
+	private class TIndex {
+		HashMap<Integer, TPair64[]> b; // binning index
+		long[] l; // linear index
+	};
+	private TIndex[] mIndex;
+
+	private class TIntv {
+		int tid, beg, end;
+	};
+
+	private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
+		return (u < v) ^ (u < 0) ^ (v < 0);
+	}
+
+	/**
+	 * The constructor
+	 *
+	 * @param fn File name of the data file
+	 */
+	public TabixReader(final String fn) throws IOException {
+		mFn = fn;
+		mFp = new BlockCompressedInputStream(new File(fn));
+		readIndex();
+	}
+
+	private static int reg2bins(final int beg, final int _end, final int[] list) {
+		int i = 0, k, end = _end;
+		--end;
+		list[i++] = 0;
+		for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
+		for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
+		for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
+		for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
+		for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+		return i;
+	}
+
+	public static int readInt(final InputStream is) throws IOException {
+		byte[] buf = new byte[4];
+		is.read(buf);
+		return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
+	}
+
+	public static long readLong(final InputStream is) throws IOException {
+		byte[] buf = new byte[8];
+		is.read(buf);
+		return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
+	}
+
+	public static String readLine(final InputStream is) throws IOException {
+		StringBuffer buf = new StringBuffer();
+		int c;
+		while ((c = is.read()) >= 0 && c != '\n')
+			buf.append((char)c);
+		if (c < 0) return null;
+		return buf.toString();
+	}
+
+	/**
+	 * Read the Tabix index from a file
+	 *
+	 * @param fp File pointer
+	 */
+	public void readIndex(final File fp) throws IOException {
+		if (fp == null) return;
+		BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
+		byte[] buf = new byte[4];
+
+		is.read(buf, 0, 4); // read "TBI\1"
+		mSeq = new String[readInt(is)]; // # sequences
+		mChr2tid = new HashMap<String, Integer>();
+		mPreset = readInt(is);
+		mSc = readInt(is);
+		mBc = readInt(is);
+		mEc = readInt(is);
+		mMeta = readInt(is);
+		mSkip = readInt(is);
+		// read sequence dictionary
+		int i, j, k, l = readInt(is);
+		buf = new byte[l];
+		is.read(buf);
+		for (i = j = k = 0; i < buf.length; ++i) {
+			if (buf[i] == 0) {
+				byte[] b = new byte[i - j];
+				System.arraycopy(buf, j, b, 0, b.length);
+				String s = new String(b);
+				mChr2tid.put(s, k);
+				mSeq[k++] = s;
+				j = i + 1;
+			}
+		}
+		// read the index
+		mIndex = new TIndex[mSeq.length];
+		for (i = 0; i < mSeq.length; ++i) {
+			// the binning index
+			int n_bin = readInt(is);
+			mIndex[i] = new TIndex();
+			mIndex[i].b = new HashMap<Integer, TPair64[]>();
+			for (j = 0; j < n_bin; ++j) {
+				int bin = readInt(is);
+				TPair64[] chunks = new TPair64[readInt(is)];
+				for (k = 0; k < chunks.length; ++k) {
+					long u = readLong(is);
+					long v = readLong(is);
+					chunks[k] = new TPair64(u, v); // in C, this is inefficient
+				}
+				mIndex[i].b.put(bin, chunks);
+			}
+			// the linear index
+			mIndex[i].l = new long[readInt(is)];
+			for (k = 0; k < mIndex[i].l.length; ++k)
+				mIndex[i].l[k] = readLong(is);
+		}
+		// close
+		is.close();
+	}
+
+	/**
+	 * Read the Tabix index from the default file.
+	 */
+	public void readIndex() throws IOException {
+		readIndex(new File(mFn + ".tbi"));
+	}
+
+	/**
+	 * Read one line from the data file.
+	 */
+	public String readLine() throws IOException {
+		return readLine(mFp);
+	}
+
+	private int chr2tid(final String chr) {
+		if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
+		else return -1;
+	}
+
+	/**
+	 * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
+	 *
+	 * @param reg Region string
+	 * @return An array where the three elements are sequence_id,
+	 *         region_begin and region_end. On failure, sequence_id==-1.
+	 */
+	public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
+		String chr;
+		int colon, hyphen;
+		int[] ret = new int[3];
+		colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
+		chr = colon >= 0? reg.substring(0, colon) : reg;
+		ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen)) - 1 : 0;
+		ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
+		ret[0] = chr2tid(chr);
+		return ret;
+	}
+
+	private TIntv getIntv(final String s) {
+		TIntv intv = new TIntv();
+		int col = 0, end = 0, beg = 0;
+		while ((end = s.indexOf('\t', beg)) >= 0) {
+			++col;
+			if (col == mSc) {
+				intv.tid = chr2tid(s.substring(beg, end));
+			} else if (col == mBc) {
+				intv.beg = intv.end = Integer.parseInt(s.substring(beg, end));
+				if ((mPreset&0x10000) != 0) ++intv.end;
+				else --intv.beg;
+				if (intv.beg < 0) intv.beg = 0;
+				if (intv.end < 1) intv.end = 1;
+			} else { // FIXME: SAM/VCF supports are not tested yet
+				if ((mPreset&0xffff) == 0) { // generic
+					if (col == mEc)
+						intv.end = Integer.parseInt(s.substring(beg, end));
+				} else if ((mPreset&0xffff) == 1) { // SAM
+					if (col == 6) { // CIGAR
+						int l = 0, i, j;
+						String cigar = s.substring(beg, end);
+						for (i = j = 0; i < cigar.length(); ++i) {
+							if (cigar.charAt(i) > '9') {
+								int op = cigar.charAt(i);
+								if (op == 'M' || op == 'D' || op == 'N')
+									l += Integer.parseInt(cigar.substring(j, i));
+							}
+						}
+						intv.end = intv.beg + l;
+					}
+				} else if ((mPreset&0xffff) == 2) { // VCF
+					if (col == 5) {
+						String alt = s.substring(beg, end);
+						int i, max = 1;
+						for (i = 0; i < alt.length(); ++i) {
+							if (alt.charAt(i) == 'D') { // deletion
+								int j;
+							    for (j = i; j < alt.length() && alt.charAt(j) >= '0' && alt.charAt(j) <= '9'; ++j);
+								int l = Integer.parseInt(alt.substring(i, j));
+								if (max < l) max = l;
+								i = j - 1;
+							}
+						}
+						intv.end = intv.beg + max;
+					}
+				}
+			}
+			beg = end + 1;
+		}
+		return intv;
+	}
+
+	public class Iterator {
+		private int i, n_seeks;
+		private int tid, beg, end;
+		private TPair64[] off;
+		private long curr_off;
+		private boolean iseof;
+
+		public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
+			i = -1; n_seeks = 0; curr_off = 0; iseof = false;
+			off = _off; tid = _tid; beg = _beg; end = _end;
+		}
+
+		public String next() throws IOException {
+			if (iseof) return null;
+			for (;;) {
+				if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
+					if (i == off.length - 1) break; // no more chunks
+					if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+					if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+						mFp.seek(off[i+1].u);
+						curr_off = mFp.getFilePointer();
+						++n_seeks;
+					}
+					++i;
+				}
+				String s;
+				if ((s = readLine(mFp)) != null) {
+					TIntv intv;
+					char[] str = s.toCharArray();
+					curr_off = mFp.getFilePointer();
+					if (str.length == 0 || str[0] == mMeta) continue;
+					intv = getIntv(s);
+					if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
+					else if (intv.end > beg && intv.beg < end) return s; // overlap; return
+				} else break; // end of file
+			}
+			iseof = true;
+			return null;
+		}
+	};
+
+	public Iterator query(final int tid, final int beg, final int end) {
+		TPair64[] off, chunks;
+		long min_off;
+		TIndex idx = mIndex[tid];
+		int[] bins = new int[MAX_BIN];
+		int i, l, n_off, n_bins = reg2bins(beg, end, bins);
+		min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
+		for (i = n_off = 0; i < n_bins; ++i) {
+			if ((chunks = idx.b.get(bins[i])) != null)
+				n_off += chunks.length;
+		}
+		if (n_off == 0) return null;
+		off = new TPair64[n_off];
+		for (i = n_off = 0; i < n_bins; ++i)
+			if ((chunks = idx.b.get(bins[i])) != null)
+				for (int j = 0; j < chunks.length; ++j)
+					if (less64(min_off, chunks[j].v))
+						off[n_off++] = new TPair64(chunks[j]);
+		Arrays.sort(off, 0, n_off);
+		// resolve completely contained adjacent blocks
+		for (i = 1, l = 0; i < n_off; ++i) {
+			if (less64(off[l].v, off[i].v)) {
+				++l;
+				off[l].u = off[i].u; off[l].v = off[i].v;
+			}
+		}
+		n_off = l + 1;
+		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+		for (i = 1; i < n_off; ++i)
+			if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
+		// merge adjacent blocks
+		for (i = 1, l = 0; i < n_off; ++i) {
+			if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+			else {
+				++l;
+				off[l].u = off[i].u;
+				off[l].v = off[i].v;
+			}
+		}
+		n_off = l + 1;
+		// return
+		TPair64[] ret = new TPair64[n_off];
+		for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
+		return new TabixReader.Iterator(tid, beg, end, ret);
+	}
+	
+	public Iterator query(final String reg) {
+		int[] x = parseReg(reg);
+		return query(x[0], x[1], x[2]);
+	}
+
+	public static void main(String[] args) {
+		if (args.length < 1) {
+			System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
+			System.exit(1);
+		}
+		try {
+			TabixReader tr = new TabixReader(args[0]);
+			String s;
+			if (args.length == 1) { // no region is specified; print the whole file
+				while ((s = tr.readLine()) != null)
+					System.out.println(s);
+			} else { // a region is specified; random access
+				TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
+				while ((s = iter.next()) != null)
+					System.out.println(s);
+			}
+		} catch (IOException e) {
+		}
+	}
+}
diff --git a/bgzip.c b/bgzip.c
index 381ebc5..d144632 100644
--- a/bgzip.c
+++ b/bgzip.c
@@ -28,26 +28,18 @@
 #include <unistd.h>
 #include <errno.h>
 #include <sys/select.h>
+#include <sys/stat.h>
 #include "bgzf.h"
 
 static const int WINDOW_SIZE = 64 * 1024;
 
-static int is_ready(int fd)
-{
-	fd_set fdset;
-	struct timeval timeout;
-	FD_ZERO(&fdset);
-	FD_SET(fd, &fdset);
-	timeout.tv_sec = 0; timeout.tv_usec = 100000;
-	return select(1, &fdset, NULL, NULL, &timeout) == 1 ? 1 : 0;
-}
-
 static int bgzip_main_usage()
 {
 	fprintf(stderr, "\n");
 	fprintf(stderr, "Usage:   bgzip [options] [file] ...\n\n");
 	fprintf(stderr, "Options: -c      write on standard output, keep original files unchanged\n");
 	fprintf(stderr, "         -d      decompress\n");
+	fprintf(stderr, "         -f      overwrite files without asking\n");
 	fprintf(stderr, "         -b INT  decompress at virtual file pointer INT\n");
 	fprintf(stderr, "         -s INT  decompress INT bytes in the uncompressed file\n");
 	fprintf(stderr, "         -h      give this help\n");
@@ -108,27 +100,38 @@ int main(int argc, char **argv)
 		return 1;
 	}
 	if (compress == 1) {
-		int f_src, f_dst = -1;
-		if (is_ready(fileno(stdin))) pstdout = 1;
-		if (argc > optind && !pstdout) {
+		struct stat sbuf;
+		int f_src = fileno(stdin);
+		int f_dst = fileno(stdout);
+
+		if ( argc>optind )
+		{
+			if ( stat(argv[optind],&sbuf)<0 ) 
+			{ 
+				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+				return 1; 
+			}
+
 			if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
-				fprintf(stderr, "[bgzip] Cannot open file: %s\n", argv[optind]);
+				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
 				return 1;
 			}
-			if (pstdout) {
+
+			if (pstdout)
 				f_dst = fileno(stdout);
-			} else {
-				char *name = malloc(sizeof(strlen(argv[optind]) + 5));
+			else
+			{
+				char *name = malloc(strlen(argv[optind]) + 5);
 				strcpy(name, argv[optind]);
 				strcat(name, ".gz");
 				f_dst = write_open(name, is_forced);
 				if (f_dst < 0) return 1;
 				free(name);
 			}
-		} else if (pstdout) { 
-			f_src = fileno(stdin);
-			f_dst = fileno(stdout);
-		} else return bgzip_main_usage();
+		}
+		else if (!pstdout && isatty(fileno((FILE *)stdout)) )
+			return bgzip_main_usage();
+
 		fp = bgzf_fdopen(f_dst, "w");
 		buffer = malloc(WINDOW_SIZE);
 		while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
@@ -140,25 +143,44 @@ int main(int argc, char **argv)
 		close(f_src);
 		return 0;
 	} else {
-		int f_dst, is_stdin = 0;
-		if (argc == optind) pstdout = 1;
-		if (is_ready(fileno(stdin))) is_stdin = 1;
-		if (argc <= optind && !is_stdin) return bgzip_main_usage();
-		if (argc > optind && !pstdout) {
+		struct stat sbuf;
+		int f_dst;
+
+		if ( argc>optind )
+		{
+			if ( stat(argv[optind],&sbuf)<0 )
+			{
+				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+				return 1;
+			}
 			char *name;
-			if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) {
+			int len = strlen(argv[optind]);
+			if ( strcmp(argv[optind]+len-3,".gz") )
+			{
 				fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
 				return 1;
 			}
+			fp = bgzf_open(argv[optind], "r");
+			if (fp == NULL) {
+				fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
+				return 1;
+			}
+
 			name = strdup(argv[optind]);
 			name[strlen(name) - 3] = '\0';
 			f_dst = write_open(name, is_forced);
 			free(name);
-		} else f_dst = fileno(stdout);
-		fp = (argc == optind)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[optind], "r");
-		if (fp == NULL) {
-			fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
-			return 1;
+		}
+		else if (!pstdout && isatty(fileno((FILE *)stdin)) )
+			return bgzip_main_usage();
+		else
+		{
+			f_dst = fileno(stdout);
+			fp = bgzf_fdopen(fileno(stdin), "r");
+			if (fp == NULL) {
+				fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
+				return 1;
+			}
 		}
 		buffer = malloc(WINDOW_SIZE);
 		if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
diff --git a/index.c b/index.c
index 704b037..cfb2938 100644
--- a/index.c
+++ b/index.c
@@ -1,5 +1,6 @@
 #include <ctype.h>
 #include <assert.h>
+#include <sys/stat.h>
 #include "khash.h"
 #include "ksort.h"
 #include "kstring.h"
@@ -161,6 +162,9 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
 				// here ->beg is 0-based.
 				intv->beg = intv->end = strtol(str->s + b, &s, 0);
 				if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg;
+				else ++intv->end;
+				if (intv->beg < 0) intv->beg = 0;
+				if (intv->end < 1) intv->end = 1;
 			} else {
 				if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) {
 					if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0);
@@ -174,6 +178,7 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
 							if (op == 'M' || op == 'D' || op == 'N') l += x;
 							s = t + 1;
 						}
+						if (l == 0) l = 1;
 						intv->end = intv->beg + l;
 					}
 				} else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) {
@@ -262,6 +267,17 @@ static void merge_chunks(ti_index_t *idx)
 	} // ~for(i)
 }
 
+static void fill_missing(ti_index_t *idx)
+{
+	int i, j;
+	for (i = 0; i < idx->n; ++i) {
+		ti_lidx_t *idx2 = &idx->index2[i];
+		for (j = 1; j < idx2->n; ++j)
+			if (idx2->offset[j] == 0)
+				idx2->offset[j] = idx2->offset[j-1];
+	}
+}
+
 ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
 {
 	int ret;
@@ -316,6 +332,7 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
 	}
 	if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp));
 	merge_chunks(idx);
+	fill_missing(idx);
 
 	free(str->s); free(str);
 	return idx;
@@ -523,22 +540,10 @@ static ti_index_t *ti_index_load_core(BGZF *fp)
 	return idx;
 }
 
-ti_index_t *ti_index_load_local(const char *_fn)
+ti_index_t *ti_index_load_local(const char *fnidx)
 {
 	BGZF *fp;
-	char *fnidx, *fn;
-
-	if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
-		const char *p;
-		int l = strlen(_fn);
-		for (p = _fn + l - 1; p >= _fn; --p)
-			if (*p == '/') break;
-		fn = strdup(p + 1);
-	} else fn = strdup(_fn);
-	fnidx = (char*)calloc(strlen(fn) + 5, 1);
-	strcpy(fnidx, fn); strcat(fnidx, ".tbi");
 	fp = bgzf_open(fnidx, "r");
-	free(fnidx); free(fn);
 	if (fp) {
 		ti_index_t *idx = ti_index_load_core(fp);
 		bgzf_close(fp);
@@ -584,17 +589,53 @@ static void download_from_remote(const char *url)
 }
 #endif
 
+static char *get_local_version(const char *fn)
+{
+    struct stat sbuf;
+	char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
+	strcat(strcpy(fnidx, fn), ".tbi");
+	if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
+		char *p, *url;
+		int l = strlen(fnidx);
+		for (p = fnidx + l - 1; p >= fnidx; --p)
+			if (*p == '/') break;
+		url = fnidx; fnidx = strdup(p + 1);
+		if (stat(fnidx, &sbuf) == 0) {
+			free(url);
+			return fnidx;
+		}
+		fprintf(stderr, "[%s] downloading the index file...\n", __func__);
+		download_from_remote(url);
+		free(url);
+	}
+    if (stat(fnidx, &sbuf) == 0) return fnidx;
+	free(fnidx); return 0;
+}
+
+int ti_list_chromosomes(const char *fn)
+{
+	ti_index_t *idx;
+	char **names;
+	int i;
+	khint_t k;
+	idx = ti_index_load(fn);
+	names = calloc(idx->n, sizeof(void*));
+	for (k = kh_begin(idx->tname); k < kh_end(idx->tname); ++k)
+		if (kh_exist(idx->tname, k))
+			names[kh_val(idx->tname, k)] = (char*)kh_key(idx->tname, k);
+	for (i = 0; i < idx->n; ++i) printf("%s\n", names[i]);
+	free(names);
+	ti_index_destroy(idx);
+	return 0;
+}
+
 ti_index_t *ti_index_load(const char *fn)
 {
 	ti_index_t *idx;
-	idx = ti_index_load_local(fn);
-	if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
-		char *fnidx = calloc(strlen(fn) + 5, 1);
-		strcat(strcpy(fnidx, fn), ".tbi");
-		fprintf(stderr, "[ti_index_load] attempting to download the remote index file.\n");
-		download_from_remote(fnidx);
-		idx = ti_index_load_local(fn);
-	}
+    char *fname = get_local_version(fn);
+	if (fname == 0) return 0;
+	idx = ti_index_load_local(fname);
+    free(fname);
 	if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n");
 	return idx;
 }
@@ -709,7 +750,8 @@ pair64_t *get_chunk_coordinates(const ti_index_t *idx, int tid, int beg, int end
 	bins = (uint16_t*)calloc(MAX_BIN, 2);
 	n_bins = reg2bins(beg, end, bins);
 	index = idx->index[tid];
-	min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
+	min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+		: idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
 	for (i = n_off = 0; i < n_bins; ++i) {
 		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
 			n_off += kh_value(index, k).n;
diff --git a/knetfile.c b/knetfile.c
index 994babb..7c96a3e 100644
--- a/knetfile.c
+++ b/knetfile.c
@@ -584,7 +584,7 @@ int knet_close(knetFile *fp)
 		else netclose(fp->fd);
 	}
 	free(fp->host); free(fp->port);
-	free(fp->response); free(fp->retr); // FTP specific
+	free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
 	free(fp->path); free(fp->http_host); // HTTP specific
 	free(fp);
 	return 0;
diff --git a/main.c b/main.c
index e0417b9..520d663 100644
--- a/main.c
+++ b/main.c
@@ -2,10 +2,11 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <sys/stat.h>
 #include "bgzf.h"
 #include "tabix.h"
 
-#define PACKAGE_VERSION "0.1.2 (r543)"
+#define PACKAGE_VERSION "0.1.5 (r560)"
 
 static int fetch_func(int l, const char *s, void *data)
 {
@@ -15,9 +16,9 @@ static int fetch_func(int l, const char *s, void *data)
 
 int main(int argc, char *argv[])
 {
-	int c, skip = -1, meta = -1;
+	int c, skip = -1, meta = -1, list_chrms = 0, force = 0;
 	ti_conf_t conf = ti_conf_gff;
-	while ((c = getopt(argc, argv, "p:s:b:e:0S:c:")) >= 0) {
+	while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lf")) >= 0) {
 		switch (c) {
 		case '0': conf.preset |= TI_FLAG_UCSC; break;
 		case 'S': skip = atoi(optarg); break;
@@ -36,6 +37,8 @@ int main(int argc, char *argv[])
 		case 's': conf.sc = atoi(optarg); break;
 		case 'b': conf.bc = atoi(optarg); break;
 		case 'e': conf.ec = atoi(optarg); break;
+        case 'l': list_chrms = 1; break;
+		case 'f': force = 1; break;
 		}
 	}
 	if (skip >= 0) conf.line_skip = skip;
@@ -52,11 +55,27 @@ int main(int argc, char *argv[])
 		fprintf(stderr, "         -S INT     skip first INT lines [0]\n");
 		fprintf(stderr, "         -c CHAR    symbol for comment/meta lines [#]\n");
 		fprintf(stderr, "         -0         zero-based coordinate\n");
+		fprintf(stderr, "         -l         list chromosome names\n");
+		fprintf(stderr, "         -f         force to overwrite the index\n");
 		fprintf(stderr, "\n");
 		return 1;
 	}
-	if (optind + 1 == argc)
+    if (list_chrms)
+        return ti_list_chromosomes(argv[optind]);
+	if (optind + 1 == argc) {
+		if (force == 0) {
+			struct stat buf;
+			char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
+			strcat(strcpy(fnidx, argv[optind]), ".tbi");
+			if (stat(fnidx, &buf) == 0) {
+				fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
+				free(fnidx);
+				return 1;
+			}
+			free(fnidx);
+		}
 		return ti_index_build(argv[optind], &conf);
+	}
 	{ // retrieve
 		BGZF *fp;
 		fp = bgzf_open(argv[optind], "r");
diff --git a/tabix.1 b/tabix.1
index aa6dc75..2657829 100644
--- a/tabix.1
+++ b/tabix.1
@@ -1,4 +1,4 @@
-.TH tabix 1 "16 March 2009" "tabix-0.1.1" "Bioinformatics tools"
+.TH tabix 1 "5 May 2010" "tabix-0.1.5" "Bioinformatics tools"
 .SH NAME
 .PP
 bgzip - Block compression/decompression utility
@@ -15,7 +15,7 @@ tabix - Generic indexer for TAB-delimited genome position files
 .RI [ file ]
 .PP
 .B tabix
-.RB [ \-0 ]
+.RB [ \-0lf ]
 .RB [ \-p
 .R gff|bed|sam|vcf]
 .RB [ \-s
@@ -81,6 +81,12 @@ Skip lines started with character CHAR. [#]
 .B -0
 Specify that the position in the data file is 0-based (e.g. UCSC files)
 rather than 1-based.
+.TP
+.B -f
+Force to overwrite the index file if it is present.
+.TP
+.B -l
+List the sequence names stored in the index file.
 .RE
 
 .SH EXAMPLE
diff --git a/tabix.h b/tabix.h
index e608c79..0df29a0 100644
--- a/tabix.h
+++ b/tabix.h
@@ -1,3 +1,30 @@
+/* The MIT License
+
+   Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@live.co.uk> */
+
 #ifndef __TABIDX_H
 #define __TABIDX_H
 
@@ -32,6 +59,7 @@ extern "C" {
 
 	int ti_index_build(const char *fn, const ti_conf_t *conf);
 	ti_index_t *ti_index_load(const char *fn);
+    int ti_list_chromosomes(const char *fn);
 	void ti_index_destroy(ti_index_t *idx);
 	int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end);
 	int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func);
diff --git a/tabix.tex b/tabix.tex
new file mode 100644
index 0000000..669d673
--- /dev/null
+++ b/tabix.tex
@@ -0,0 +1,121 @@
+\documentclass[10pt]{article}
+\usepackage{color}
+\definecolor{gray}{rgb}{0.7,0.7,0.7}
+
+\setlength{\topmargin}{0.0cm}
+\setlength{\textheight}{21.5cm}
+\setlength{\oddsidemargin}{0cm} 
+\setlength{\textwidth}{16.5cm}
+\setlength{\columnsep}{0.6cm}
+
+\begin{document}
+
+\title{The Tabix index file format}
+\author{Heng Li}
+\date{}
+
+\maketitle
+
+\begin{center}
+\begin{tabular}{|l|l|l|l|l|l|l|}
+\hline
+\multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\
+\hline\hline
+\multicolumn{4}{|l|}{\tt magic} & Magic string & {\tt char[4]} & TBI$\backslash$1 \\
+\hline
+\multicolumn{4}{|l|}{\tt n\_ref} & \# sequences & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt format} & Format (0: generic; 1: SAM; 2: VCF) & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_seq} & Column for the sequence name & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_beg} & Column for the start of a region & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_end} & Column for the end of a region & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt meta} & Leading character for comment lines & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt skip} & \# lines to skip at the beginning & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt names} & Concatenated names, each zero terminated & {\tt char[l\_nm]} & \\
+\hline
+\multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}}\\
+\cline{2-7}
+\hspace{0.1cm} & \multicolumn{3}{l|}{\tt n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\
+\cline{2-7}
+ & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\
+\cline{3-7}
+ & \hspace{0.1cm} & \multicolumn{2}{l|}{\tt bin} & Distinct bin number & {\tt uint32\_t} & \\
+\cline{3-7}
+ & & \multicolumn{2}{l|}{\tt n\_chunk} & \# chunks & {\tt int32\_t} & \\
+\cline{3-7}
+ & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\
+\cline{4-7}
+ & & \hspace{0.1cm} & {\tt cnk\_beg} & Virtual file offset of the start of the chunk & {\tt uint64\_t} & \\
+\cline{4-7}
+ & & & {\tt cnk\_end} & Virtual file offset of the end of the chunk & {\tt uint64\_t} & \\
+\cline{2-7}
+ & \multicolumn{3}{l|}{\tt n\_intv} & \# 16kb intervals (for the linear index) & {\tt int32\_t} & \\
+\cline{2-7}
+ & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct intervals (n=n\_intv)}} \\
+\cline{3-7}
+ & & \multicolumn{2}{l|}{\tt ioff} & File offset of the first record in the interval & {\tt uint64\_t} & \\
+\hline
+\end{tabular}
+\end{center}
+
+{\bf Notes:}
+
+\begin{itemize}
+\item The index file is BGZF compressed.
+\item All integers are little-endian.
+\item When {\tt (format\&0x10000)} is true, the coordinate follows the
+  {\tt BED} rule (i.e. half-closed-half-open and zero based); otherwise,
+  the coordinate follows the {\tt GFF} rule (closed and one based).
+\item For the SAM format, the end of a region equals {\tt POS} plus the
+  reference length in the alignment, inferred from {\tt CIGAR}. For the
+  VCF format, the end of a region equals {\tt POS} plus the size of the
+  deletion.
+\item Field {\tt col\_beg} may equal {\tt col\_end}, and in this case,
+  the end of a region is {\tt end}={\tt beg+1}.
+\item Example. For {\tt GFF}, {\tt format}=0, {\tt col\_seq}=1, {\tt
+    col\_beg}=4, {\tt col\_end}=5, {\tt meta}=`{\tt \#}' and {\tt
+    skip}=0. For {\tt BED}, {\tt format}=0x10000, {\tt col\_seq}=1, {\tt
+    col\_beg}=2, {\tt col\_end}=3, {\tt meta}=`{\tt \#}' and {\tt
+    skip}=0.
+\item Given a zero-based, half-closed and half-open region {\tt
+    [beg,end)}, the {\tt bin} number is calculated with the following C
+  function:
+\begin{verbatim}
+int reg2bin(int beg, int end) {
+  --end;
+  if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
+  if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
+  if (beg>>20 == end>>20) return  ((1<<9)-1)/7 + (beg>>20);
+  if (beg>>23 == end>>23) return  ((1<<6)-1)/7 + (beg>>23);
+  if (beg>>26 == end>>26) return  ((1<<3)-1)/7 + (beg>>26);
+  return 0;
+}
+\end{verbatim}
+\item The list of bins that may overlap a region {\tt [beg,end)} can be
+  obtained with the following C function.
+\begin{verbatim}
+#define MAX_BIN (((1<<18)-1)/7)
+int reg2bins(int rbeg, int rend, uint16_t list[MAX_BIN])
+{
+  int i = 0, k;
+  --rend;
+  list[i++] = 0;
+  for (k =    1 + (rbeg>>26); k <=    1 + (rend>>26); ++k) list[i++] = k;
+  for (k =    9 + (rbeg>>23); k <=    9 + (rend>>23); ++k) list[i++] = k;
+  for (k =   73 + (rbeg>>20); k <=   73 + (rend>>20); ++k) list[i++] = k;
+  for (k =  585 + (rbeg>>17); k <=  585 + (rend>>17); ++k) list[i++] = k;
+  for (k = 4681 + (rbeg>>14); k <= 4681 + (rend>>14); ++k) list[i++] = k;
+  return i; // #elements in list[]
+}
+\end{verbatim}
+\end{itemize}
+
+\end{document}
\ No newline at end of file
diff --git a/tabix.txt b/tabix.txt
deleted file mode 100644
index 5ba263b..0000000
--- a/tabix.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-tabix(1)                     Bioinformatics tools                     tabix(1)
-
-
-
-NAME
-       bgzip - Block compression/decompression utility
-
-       tabix - Generic indexer for TAB-delimited genome position files
-
-SYNOPSIS
-       bgzip [-cdh] [-b virtualOffset] [-s size] [file]
-
-       tabix [-0] [-p gff|bed|sam|vcf] [-s seqCol] [-b begCol] [-e endCol] [-S
-       lineSkip] [-c metaChar] in.tab.bgz [region1 [region2 [...]]]
-
-
-DESCRIPTION
-       Tabix indexes a TAB-delimited genome position file in.tab.bgz and  cre-
-       ates  an  index file in.tab.bgz.tbi when region is absent from the com-
-       mand-line. The input data file must be position sorted  and  compressed
-       by  bgzip  which has a gzip(1) like interface. After indexing, tabix is
-       able to quickly retrieve data lines overlapping  regions  specified  in
-       the  format  "chr:beginPos-endPos". Fast data retrieval also works over
-       network if URI is given as a file name and in this case the index  file
-       will be downloaded if it is not present locally.
-
-
-OPTIONS OF TABIX
-       -p STR    Input  format  for indexing. Valid values are: gff, bed, sam,
-                 vcf and psltab. This option should not  be  applied  together
-                 with  any  of  -s, -b, -e, -c and -0; it is not used for data
-                 retrieval because this setting is stored in the  index  file.
-                 [gff]
-
-       -s INT    Column of sequence name. Option -s, -b, -e, -S, -c and -0 are
-                 all stored in the index  file  and  thus  not  used  in  data
-                 retrieval. [1]
-
-       -b INT    Column of start chromosomal position. [4]
-
-       -e INT    Column of end chromosomal position. [5]
-
-       -S INT    Skip first INT lines in the data file. [0]
-
-       -c CHAR   Skip lines started with character CHAR. [#]
-
-       -0        Specify  that  the position in the data file is 0-based (e.g.
-                 UCSC files) rather than 1-based.
-
-
-EXAMPLE
-       grep  -v  ^"#"  unsorted.gff  |  sort  -k1,1  -k4,4n  |  bgzip   -c   >
-       sorted.gff.gz;
-
-       tabix -p gff sorted.gff.gz;
-
-       tabix sorted.gff.gz chr1:10,000,000-20,000,000;
-
-
-NOTES
-       It  is straightforward to achieve overlap queries using the standard B-
-       tree index (with or without binning) implemented in all SQL  databases,
-       or  the R-tree index in PostgreSQL and Oracle. But there are still many
-       reasons to use tabix. Firstly, tabix  directly  works  with  a  lot  of
-       widely  used  TAB-delimited  formats such as GFF/GTF and BED. We do not
-       need to design database schema or specialized binary formats.  Data  do
-       not need to be duplicated in different formats, either. Secondly, tabix
-       works on compressed data files while most SQL  databases  do  not.  The
-       GenCode annotation GTF can be compressed down to 4%.  Thirdly, tabix is
-       fast. The same indexing algorithm is known to work efficiently  for  an
-       alignment with a few billion short reads. SQL databases probably cannot
-       easily handle data at this scale. Last but not the  least,  tabix  sup-
-       ports remote data retrieval. One can put the data file and the index at
-       an FTP or HTTP server, and other users or even  web  services  will  be
-       able to get a slice without downloading the entire file.
-
-
-AUTHOR
-       Tabix  was  written  by Heng Li. The BGZF library was originally imple-
-       mented by Bob Handsaker and modified by Heng Li for remote file  access
-       and in-memory caching.
-
-
-SEE ALSO
-       samtools(1)
-
-
-
-tabix-0.1.0                     2 November 2009                       tabix(1)
-- 
2.30.2