Imported Upstream version 0.1.5

author Charles Plessy <plessy@debian.org>

Thu, 19 Aug 2010 06:49:36 +0000 (15:49 +0900)

committer Charles Plessy <plessy@debian.org>

Thu, 19 Aug 2010 06:49:36 +0000 (15:49 +0900)
author Charles Plessy <plessy@debian.org>
Thu, 19 Aug 2010 06:49:36 +0000 (15:49 +0900)
committer Charles Plessy <plessy@debian.org>
Thu, 19 Aug 2010 06:49:36 +0000 (15:49 +0900)
diff --git a/Makefile b/Makefile

index 3d7153d4a58bcfb8508eaef041f7f50d22e3191c..a51c1757ad139d7135da8ad42a8b71a01b66c04d 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,9 @@ tabix:lib $(AOBJS)
  bgzip:bgzip.o bgzf.o knetfile.o
                 $(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz
  
+TabixReader.class:TabixReader.java
+               javac -cp .:sam.jar TabixReader.java
+
  kstring.o:kstring.h
  knetfile.o:knetfile.h
  bgzf.o:bgzf.h knetfile.h
@@ -44,7 +47,10 @@ index.o:bgzf.h tabix.h khash.h ksort.h kstring.h
  main.o:tabix.h kstring.h bgzf.h
  bgzip.o:bgzf.h
  
+tabix.pdf:tabix.tex
+               pdflatex tabix.tex
+
  cleanlocal:
-               rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+               rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a tabix.aux tabix.log tabix.pdf *.class
  
  clean:cleanlocal-recur
diff --git a/NEWS b/NEWS

new file mode 100644 (file)

index 0000000..aaa95e0
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,14 @@
+Beta Release 0.1.5 (5 May, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Clarify that tabix is released under MIT/X11.
+
+ * Improve the robustness of indexing and retrieval.
+
+ * Reduce the number of seek calls when the specified region starts from
+   a 16kb block with no data. The index format is the same, but the
+   content is changed a little.
+
+(0.1.5: 5 May 2010, r560)
diff --git a/TabixReader.java b/TabixReader.java

new file mode 100644 (file)

index 0000000..acaff24
--- /dev/null
+++ b/TabixReader.java
@@ -0,0 +1,387 @@
+/* The MIT License
+
+   Copyright (c) 2010 Broad Institute.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <hengli@broadinstitute.org> */
+
+import net.sf.samtools.util.BlockCompressedInputStream;
+
+import java.io.*;
+import java.nio.*;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.lang.StringBuffer;
+
+public class TabixReader
+{
+       private String mFn;
+       private BlockCompressedInputStream mFp;
+
+       private int mPreset;
+       private int mSc;
+       private int mBc;
+       private int mEc;
+       private int mMeta;
+       private int mSkip;
+       private String[] mSeq;
+
+       private HashMap<String, Integer> mChr2tid;
+
+       private static int MAX_BIN = 37450;
+       private static int TAD_MIN_CHUNK_GAP = 32768;
+       private static int TAD_LIDX_SHIFT = 14;
+
+       private class TPair64 implements Comparable<TPair64> {
+               long u, v;
+               public TPair64(final long _u, final long _v) {
+                       u = _u; v = _v;
+               }
+               public TPair64(final TPair64 p) {
+                       u = p.u; v = p.v;
+               }
+               public int compareTo(final TPair64 p) {
+                       return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
+               }
+       };
+
+       private class TIndex {
+               HashMap<Integer, TPair64[]> b; // binning index
+               long[] l; // linear index
+       };
+       private TIndex[] mIndex;
+
+       private class TIntv {
+               int tid, beg, end;
+       };
+
+       private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
+               return (u < v) ^ (u < 0) ^ (v < 0);
+       }
+
+       /**
+        * The constructor
+        *
+        * @param fn File name of the data file
+        */
+       public TabixReader(final String fn) throws IOException {
+               mFn = fn;
+               mFp = new BlockCompressedInputStream(new File(fn));
+               readIndex();
+       }
+
+       private static int reg2bins(final int beg, final int _end, final int[] list) {
+               int i = 0, k, end = _end;
+               --end;
+               list[i++] = 0;
+               for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
+               for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
+               for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
+               for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
+               for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+               return i;
+       }
+
+       public static int readInt(final InputStream is) throws IOException {
+               byte[] buf = new byte[4];
+               is.read(buf);
+               return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
+       }
+
+       public static long readLong(final InputStream is) throws IOException {
+               byte[] buf = new byte[8];
+               is.read(buf);
+               return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
+       }
+
+       public static String readLine(final InputStream is) throws IOException {
+               StringBuffer buf = new StringBuffer();
+               int c;
+               while ((c = is.read()) >= 0 && c != '\n')
+                       buf.append((char)c);
+               if (c < 0) return null;
+               return buf.toString();
+       }
+
+       /**
+        * Read the Tabix index from a file
+        *
+        * @param fp File pointer
+        */
+       public void readIndex(final File fp) throws IOException {
+               if (fp == null) return;
+               BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
+               byte[] buf = new byte[4];
+
+               is.read(buf, 0, 4); // read "TBI\1"
+               mSeq = new String[readInt(is)]; // # sequences
+               mChr2tid = new HashMap<String, Integer>();
+               mPreset = readInt(is);
+               mSc = readInt(is);
+               mBc = readInt(is);
+               mEc = readInt(is);
+               mMeta = readInt(is);
+               mSkip = readInt(is);
+               // read sequence dictionary
+               int i, j, k, l = readInt(is);
+               buf = new byte[l];
+               is.read(buf);
+               for (i = j = k = 0; i < buf.length; ++i) {
+                       if (buf[i] == 0) {
+                               byte[] b = new byte[i - j];
+                               System.arraycopy(buf, j, b, 0, b.length);
+                               String s = new String(b);
+                               mChr2tid.put(s, k);
+                               mSeq[k++] = s;
+                               j = i + 1;
+                       }
+               }
+               // read the index
+               mIndex = new TIndex[mSeq.length];
+               for (i = 0; i < mSeq.length; ++i) {
+                       // the binning index
+                       int n_bin = readInt(is);
+                       mIndex[i] = new TIndex();
+                       mIndex[i].b = new HashMap<Integer, TPair64[]>();
+                       for (j = 0; j < n_bin; ++j) {
+                               int bin = readInt(is);
+                               TPair64[] chunks = new TPair64[readInt(is)];
+                               for (k = 0; k < chunks.length; ++k) {
+                                       long u = readLong(is);
+                                       long v = readLong(is);
+                                       chunks[k] = new TPair64(u, v); // in C, this is inefficient
+                               }
+                               mIndex[i].b.put(bin, chunks);
+                       }
+                       // the linear index
+                       mIndex[i].l = new long[readInt(is)];
+                       for (k = 0; k < mIndex[i].l.length; ++k)
+                               mIndex[i].l[k] = readLong(is);
+               }
+               // close
+               is.close();
+       }
+
+       /**
+        * Read the Tabix index from the default file.
+        */
+       public void readIndex() throws IOException {
+               readIndex(new File(mFn + ".tbi"));
+       }
+
+       /**
+        * Read one line from the data file.
+        */
+       public String readLine() throws IOException {
+               return readLine(mFp);
+       }
+
+       private int chr2tid(final String chr) {
+               if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
+               else return -1;
+       }
+
+       /**
+        * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
+        *
+        * @param reg Region string
+        * @return An array where the three elements are sequence_id,
+        *         region_begin and region_end. On failure, sequence_id==-1.
+        */
+       public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
+               String chr;
+               int colon, hyphen;
+               int[] ret = new int[3];
+               colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
+               chr = colon >= 0? reg.substring(0, colon) : reg;
+               ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen)) - 1 : 0;
+               ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
+               ret[0] = chr2tid(chr);
+               return ret;
+       }
+
+       private TIntv getIntv(final String s) {
+               TIntv intv = new TIntv();
+               int col = 0, end = 0, beg = 0;
+               while ((end = s.indexOf('\t', beg)) >= 0) {
+                       ++col;
+                       if (col == mSc) {
+                               intv.tid = chr2tid(s.substring(beg, end));
+                       } else if (col == mBc) {
+                               intv.beg = intv.end = Integer.parseInt(s.substring(beg, end));
+                               if ((mPreset&0x10000) != 0) ++intv.end;
+                               else --intv.beg;
+                               if (intv.beg < 0) intv.beg = 0;
+                               if (intv.end < 1) intv.end = 1;
+                       } else { // FIXME: SAM/VCF supports are not tested yet
+                               if ((mPreset&0xffff) == 0) { // generic
+                                       if (col == mEc)
+                                               intv.end = Integer.parseInt(s.substring(beg, end));
+                               } else if ((mPreset&0xffff) == 1) { // SAM
+                                       if (col == 6) { // CIGAR
+                                               int l = 0, i, j;
+                                               String cigar = s.substring(beg, end);
+                                               for (i = j = 0; i < cigar.length(); ++i) {
+                                                       if (cigar.charAt(i) > '9') {
+                                                               int op = cigar.charAt(i);
+                                                               if (op == 'M' || op == 'D' || op == 'N')
+                                                                       l += Integer.parseInt(cigar.substring(j, i));
+                                                       }
+                                               }
+                                               intv.end = intv.beg + l;
+                                       }
+                               } else if ((mPreset&0xffff) == 2) { // VCF
+                                       if (col == 5) {
+                                               String alt = s.substring(beg, end);
+                                               int i, max = 1;
+                                               for (i = 0; i < alt.length(); ++i) {
+                                                       if (alt.charAt(i) == 'D') { // deletion
+                                                               int j;
+                                                           for (j = i; j < alt.length() && alt.charAt(j) >= '0' && alt.charAt(j) <= '9'; ++j);
+                                                               int l = Integer.parseInt(alt.substring(i, j));
+                                                               if (max < l) max = l;
+                                                               i = j - 1;
+                                                       }
+                                               }
+                                               intv.end = intv.beg + max;
+                                       }
+                               }
+                       }
+                       beg = end + 1;
+               }
+               return intv;
+       }
+
+       public class Iterator {
+               private int i, n_seeks;
+               private int tid, beg, end;
+               private TPair64[] off;
+               private long curr_off;
+               private boolean iseof;
+
+               public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
+                       i = -1; n_seeks = 0; curr_off = 0; iseof = false;
+                       off = _off; tid = _tid; beg = _beg; end = _end;
+               }
+
+               public String next() throws IOException {
+                       if (iseof) return null;
+                       for (;;) {
+                               if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
+                                       if (i == off.length - 1) break; // no more chunks
+                                       if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+                                       if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+                                               mFp.seek(off[i+1].u);
+                                               curr_off = mFp.getFilePointer();
+                                               ++n_seeks;
+                                       }
+                                       ++i;
+                               }
+                               String s;
+                               if ((s = readLine(mFp)) != null) {
+                                       TIntv intv;
+                                       char[] str = s.toCharArray();
+                                       curr_off = mFp.getFilePointer();
+                                       if (str.length == 0 || str[0] == mMeta) continue;
+                                       intv = getIntv(s);
+                                       if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
+                                       else if (intv.end > beg && intv.beg < end) return s; // overlap; return
+                               } else break; // end of file
+                       }
+                       iseof = true;
+                       return null;
+               }
+       };
+
+       public Iterator query(final int tid, final int beg, final int end) {
+               TPair64[] off, chunks;
+               long min_off;
+               TIndex idx = mIndex[tid];
+               int[] bins = new int[MAX_BIN];
+               int i, l, n_off, n_bins = reg2bins(beg, end, bins);
+               min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
+               for (i = n_off = 0; i < n_bins; ++i) {
+                       if ((chunks = idx.b.get(bins[i])) != null)
+                               n_off += chunks.length;
+               }
+               if (n_off == 0) return null;
+               off = new TPair64[n_off];
+               for (i = n_off = 0; i < n_bins; ++i)
+                       if ((chunks = idx.b.get(bins[i])) != null)
+                               for (int j = 0; j < chunks.length; ++j)
+                                       if (less64(min_off, chunks[j].v))
+                                               off[n_off++] = new TPair64(chunks[j]);
+               Arrays.sort(off, 0, n_off);
+               // resolve completely contained adjacent blocks
+               for (i = 1, l = 0; i < n_off; ++i) {
+                       if (less64(off[l].v, off[i].v)) {
+                               ++l;
+                               off[l].u = off[i].u; off[l].v = off[i].v;
+                       }
+               }
+               n_off = l + 1;
+               // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+               for (i = 1; i < n_off; ++i)
+                       if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
+               // merge adjacent blocks
+               for (i = 1, l = 0; i < n_off; ++i) {
+                       if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+                       else {
+                               ++l;
+                               off[l].u = off[i].u;
+                               off[l].v = off[i].v;
+                       }
+               }
+               n_off = l + 1;
+               // return
+               TPair64[] ret = new TPair64[n_off];
+               for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
+               return new TabixReader.Iterator(tid, beg, end, ret);
+       }
+       
+       public Iterator query(final String reg) {
+               int[] x = parseReg(reg);
+               return query(x[0], x[1], x[2]);
+       }
+
+       public static void main(String[] args) {
+               if (args.length < 1) {
+                       System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
+                       System.exit(1);
+               }
+               try {
+                       TabixReader tr = new TabixReader(args[0]);
+                       String s;
+                       if (args.length == 1) { // no region is specified; print the whole file
+                               while ((s = tr.readLine()) != null)
+                                       System.out.println(s);
+                       } else { // a region is specified; random access
+                               TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
+                               while ((s = iter.next()) != null)
+                                       System.out.println(s);
+                       }
+               } catch (IOException e) {
+               }
+       }
+}
diff --git a/bgzip.c b/bgzip.c

index 381ebc5d86190d79a808b911e7a15c35baa00bcb..d144632c64a1f7b8f7ce2d060c49a680e50f7915 100644 (file)
--- a/bgzip.c
+++ b/bgzip.c
@@ -28,26 +28,18 @@
  #include <unistd.h>
  #include <errno.h>
  #include <sys/select.h>
+#include <sys/stat.h>
  #include "bgzf.h"
  
  static const int WINDOW_SIZE = 64 * 1024;
  
-static int is_ready(int fd)
-{
-       fd_set fdset;
-       struct timeval timeout;
-       FD_ZERO(&fdset);
-       FD_SET(fd, &fdset);
-       timeout.tv_sec = 0; timeout.tv_usec = 100000;
-       return select(1, &fdset, NULL, NULL, &timeout) == 1 ? 1 : 0;
-}
-
  static int bgzip_main_usage()
  {
         fprintf(stderr, "\n");
         fprintf(stderr, "Usage:   bgzip [options] [file] ...\n\n");
         fprintf(stderr, "Options: -c      write on standard output, keep original files unchanged\n");
         fprintf(stderr, "         -d      decompress\n");
+       fprintf(stderr, "         -f      overwrite files without asking\n");
         fprintf(stderr, "         -b INT  decompress at virtual file pointer INT\n");
         fprintf(stderr, "         -s INT  decompress INT bytes in the uncompressed file\n");
         fprintf(stderr, "         -h      give this help\n");
@@ -108,27 +100,38 @@ int main(int argc, char **argv)
                 return 1;
         }
         if (compress == 1) {
-               int f_src, f_dst = -1;
-               if (is_ready(fileno(stdin))) pstdout = 1;
-               if (argc > optind && !pstdout) {
+               struct stat sbuf;
+               int f_src = fileno(stdin);
+               int f_dst = fileno(stdout);
+
+               if ( argc>optind )
+               {
+                       if ( stat(argv[optind],&sbuf)<0 ) 
+                       { 
+                               fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                               return 1; 
+                       }
+
                         if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
-                               fprintf(stderr, "[bgzip] Cannot open file: %s\n", argv[optind]);
+                               fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
                                 return 1;
                         }
-                       if (pstdout) {
+
+                       if (pstdout)
                                 f_dst = fileno(stdout);
-                       } else {
-                               char *name = malloc(sizeof(strlen(argv[optind]) + 5));
+                       else
+                       {
+                               char *name = malloc(strlen(argv[optind]) + 5);
                                 strcpy(name, argv[optind]);
                                 strcat(name, ".gz");
                                 f_dst = write_open(name, is_forced);
                                 if (f_dst < 0) return 1;
                                 free(name);
                         }
-               } else if (pstdout) { 
-                       f_src = fileno(stdin);
-                       f_dst = fileno(stdout);
-               } else return bgzip_main_usage();
+               }
+               else if (!pstdout && isatty(fileno((FILE *)stdout)) )
+                       return bgzip_main_usage();
+
                 fp = bgzf_fdopen(f_dst, "w");
                 buffer = malloc(WINDOW_SIZE);
                 while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
@@ -140,25 +143,44 @@ int main(int argc, char **argv)
                 close(f_src);
                 return 0;
         } else {
-               int f_dst, is_stdin = 0;
-               if (argc == optind) pstdout = 1;
-               if (is_ready(fileno(stdin))) is_stdin = 1;
-               if (argc <= optind && !is_stdin) return bgzip_main_usage();
-               if (argc > optind && !pstdout) {
+               struct stat sbuf;
+               int f_dst;
+
+               if ( argc>optind )
+               {
+                       if ( stat(argv[optind],&sbuf)<0 )
+                       {
+                               fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                               return 1;
+                       }
                         char *name;
-                       if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) {
+                       int len = strlen(argv[optind]);
+                       if ( strcmp(argv[optind]+len-3,".gz") )
+                       {
                                 fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
                                 return 1;
                         }
+                       fp = bgzf_open(argv[optind], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
+                               return 1;
+                       }
+
                         name = strdup(argv[optind]);
                         name[strlen(name) - 3] = '\0';
                         f_dst = write_open(name, is_forced);
                         free(name);
-               } else f_dst = fileno(stdout);
-               fp = (argc == optind)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[optind], "r");
-               if (fp == NULL) {
-                       fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
-                       return 1;
+               }
+               else if (!pstdout && isatty(fileno((FILE *)stdin)) )
+                       return bgzip_main_usage();
+               else
+               {
+                       f_dst = fileno(stdout);
+                       fp = bgzf_fdopen(fileno(stdin), "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
+                               return 1;
+                       }
                 }
                 buffer = malloc(WINDOW_SIZE);
                 if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
diff --git a/index.c b/index.c

index 704b037d2add76bf0c9d02624c3f449ac2bd314b..cfb2938ee4ee9d490cd70f5635608b8f21abe291 100644 (file)
--- a/index.c
+++ b/index.c
@@ -1,5 +1,6 @@
  #include <ctype.h>
  #include <assert.h>
+#include <sys/stat.h>
  #include "khash.h"
  #include "ksort.h"
  #include "kstring.h"
@@ -161,6 +162,9 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
                                 // here ->beg is 0-based.
                                 intv->beg = intv->end = strtol(str->s + b, &s, 0);
                                 if (!(idx->conf.preset&TI_FLAG_UCSC)) --intv->beg;
+                               else ++intv->end;
+                               if (intv->beg < 0) intv->beg = 0;
+                               if (intv->end < 1) intv->end = 1;
                         } else {
                                 if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) {
                                         if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0);
@@ -174,6 +178,7 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
                                                         if (op == 'M' || op == 'D' || op == 'N') l += x;
                                                         s = t + 1;
                                                 }
+                                               if (l == 0) l = 1;
                                                 intv->end = intv->beg + l;
                                         }
                                 } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) {
@@ -262,6 +267,17 @@ static void merge_chunks(ti_index_t *idx)
         } // ~for(i)
  }
  
+static void fill_missing(ti_index_t *idx)
+{
+       int i, j;
+       for (i = 0; i < idx->n; ++i) {
+               ti_lidx_t *idx2 = &idx->index2[i];
+               for (j = 1; j < idx2->n; ++j)
+                       if (idx2->offset[j] == 0)
+                               idx2->offset[j] = idx2->offset[j-1];
+       }
+}
+
  ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
  {
         int ret;
@@ -316,6 +332,7 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
         }
         if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp));
         merge_chunks(idx);
+       fill_missing(idx);
  
         free(str->s); free(str);
         return idx;
@@ -523,22 +540,10 @@ static ti_index_t *ti_index_load_core(BGZF *fp)
         return idx;
  }
  
-ti_index_t *ti_index_load_local(const char *_fn)
+ti_index_t *ti_index_load_local(const char *fnidx)
  {
         BGZF *fp;
-       char *fnidx, *fn;
-
-       if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
-               const char *p;
-               int l = strlen(_fn);
-               for (p = _fn + l - 1; p >= _fn; --p)
-                       if (*p == '/') break;
-               fn = strdup(p + 1);
-       } else fn = strdup(_fn);
-       fnidx = (char*)calloc(strlen(fn) + 5, 1);
-       strcpy(fnidx, fn); strcat(fnidx, ".tbi");
         fp = bgzf_open(fnidx, "r");
-       free(fnidx); free(fn);
         if (fp) {
                 ti_index_t *idx = ti_index_load_core(fp);
                 bgzf_close(fp);
@@ -584,17 +589,53 @@ static void download_from_remote(const char *url)
  }
  #endif
  
+static char *get_local_version(const char *fn)
+{
+    struct stat sbuf;
+       char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
+       strcat(strcpy(fnidx, fn), ".tbi");
+       if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
+               char *p, *url;
+               int l = strlen(fnidx);
+               for (p = fnidx + l - 1; p >= fnidx; --p)
+                       if (*p == '/') break;
+               url = fnidx; fnidx = strdup(p + 1);
+               if (stat(fnidx, &sbuf) == 0) {
+                       free(url);
+                       return fnidx;
+               }
+               fprintf(stderr, "[%s] downloading the index file...\n", __func__);
+               download_from_remote(url);
+               free(url);
+       }
+    if (stat(fnidx, &sbuf) == 0) return fnidx;
+       free(fnidx); return 0;
+}
+
+int ti_list_chromosomes(const char *fn)
+{
+       ti_index_t *idx;
+       char **names;
+       int i;
+       khint_t k;
+       idx = ti_index_load(fn);
+       names = calloc(idx->n, sizeof(void*));
+       for (k = kh_begin(idx->tname); k < kh_end(idx->tname); ++k)
+               if (kh_exist(idx->tname, k))
+                       names[kh_val(idx->tname, k)] = (char*)kh_key(idx->tname, k);
+       for (i = 0; i < idx->n; ++i) printf("%s\n", names[i]);
+       free(names);
+       ti_index_destroy(idx);
+       return 0;
+}
+
  ti_index_t *ti_index_load(const char *fn)
  {
         ti_index_t *idx;
-       idx = ti_index_load_local(fn);
-       if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
-               char *fnidx = calloc(strlen(fn) + 5, 1);
-               strcat(strcpy(fnidx, fn), ".tbi");
-               fprintf(stderr, "[ti_index_load] attempting to download the remote index file.\n");
-               download_from_remote(fnidx);
-               idx = ti_index_load_local(fn);
-       }
+    char *fname = get_local_version(fn);
+       if (fname == 0) return 0;
+       idx = ti_index_load_local(fname);
+    free(fname);
         if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n");
         return idx;
  }
@@ -709,7 +750,8 @@ pair64_t *get_chunk_coordinates(const ti_index_t *idx, int tid, int beg, int end
         bins = (uint16_t*)calloc(MAX_BIN, 2);
         n_bins = reg2bins(beg, end, bins);
         index = idx->index[tid];
-       min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
+       min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+               : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
         for (i = n_off = 0; i < n_bins; ++i) {
                 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
                         n_off += kh_value(index, k).n;
diff --git a/knetfile.c b/knetfile.c

index 994babb6a384e2757098bd6b042970a4dd54a801..7c96a3eaf3f9c62c5764a180a4244f287419e055 100644 (file)
--- a/knetfile.c
+++ b/knetfile.c
@@ -584,7 +584,7 @@ int knet_close(knetFile *fp)
                 else netclose(fp->fd);
         }
         free(fp->host); free(fp->port);
-       free(fp->response); free(fp->retr); // FTP specific
+       free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
         free(fp->path); free(fp->http_host); // HTTP specific
         free(fp);
         return 0;
diff --git a/main.c b/main.c

index e0417b9d8b3fd2f5d293e4094de799561f8b83c8..520d6630679216b0eea8fd9fd91183e2ac779b70 100644 (file)
--- a/main.c
+++ b/main.c
@@ -2,10 +2,11 @@
  #include <unistd.h>
  #include <stdlib.h>
  #include <stdio.h>
+#include <sys/stat.h>
  #include "bgzf.h"
  #include "tabix.h"
  
-#define PACKAGE_VERSION "0.1.2 (r543)"
+#define PACKAGE_VERSION "0.1.5 (r560)"
  
  static int fetch_func(int l, const char *s, void *data)
  {
@@ -15,9 +16,9 @@ static int fetch_func(int l, const char *s, void *data)
  
  int main(int argc, char *argv[])
  {
-       int c, skip = -1, meta = -1;
+       int c, skip = -1, meta = -1, list_chrms = 0, force = 0;
         ti_conf_t conf = ti_conf_gff;
-       while ((c = getopt(argc, argv, "p:s:b:e:0S:c:")) >= 0) {
+       while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lf")) >= 0) {
                 switch (c) {
                 case '0': conf.preset |= TI_FLAG_UCSC; break;
                 case 'S': skip = atoi(optarg); break;
@@ -36,6 +37,8 @@ int main(int argc, char *argv[])
                 case 's': conf.sc = atoi(optarg); break;
                 case 'b': conf.bc = atoi(optarg); break;
                 case 'e': conf.ec = atoi(optarg); break;
+        case 'l': list_chrms = 1; break;
+               case 'f': force = 1; break;
                 }
         }
         if (skip >= 0) conf.line_skip = skip;
@@ -52,11 +55,27 @@ int main(int argc, char *argv[])
                 fprintf(stderr, "         -S INT     skip first INT lines [0]\n");
                 fprintf(stderr, "         -c CHAR    symbol for comment/meta lines [#]\n");
                 fprintf(stderr, "         -0         zero-based coordinate\n");
+               fprintf(stderr, "         -l         list chromosome names\n");
+               fprintf(stderr, "         -f         force to overwrite the index\n");
                 fprintf(stderr, "\n");
                 return 1;
         }
-       if (optind + 1 == argc)
+    if (list_chrms)
+        return ti_list_chromosomes(argv[optind]);
+       if (optind + 1 == argc) {
+               if (force == 0) {
+                       struct stat buf;
+                       char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
+                       strcat(strcpy(fnidx, argv[optind]), ".tbi");
+                       if (stat(fnidx, &buf) == 0) {
+                               fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
+                               free(fnidx);
+                               return 1;
+                       }
+                       free(fnidx);
+               }
                 return ti_index_build(argv[optind], &conf);
+       }
         { // retrieve
                 BGZF *fp;
                 fp = bgzf_open(argv[optind], "r");
diff --git a/tabix.1 b/tabix.1

index aa6dc75c9f0e7a4a14e93f27b5b45bb39b8c3eb3..26578296f04f545c115a2edbd3a26eb5536c6ecc 100644 (file)
--- a/tabix.1
+++ b/tabix.1
@@ -1,4 +1,4 @@
-.TH tabix 1 "16 March 2009" "tabix-0.1.1" "Bioinformatics tools"
+.TH tabix 1 "5 May 2010" "tabix-0.1.5" "Bioinformatics tools"
  .SH NAME
  .PP
  bgzip - Block compression/decompression utility
@@ -15,7 +15,7 @@ tabix - Generic indexer for TAB-delimited genome position files
  .RI [ file ]
  .PP
  .B tabix
-.RB [ \-0 ]
+.RB [ \-0lf ]
  .RB [ \-p
  .R gff|bed|sam|vcf]
  .RB [ \-s
@@ -81,6 +81,12 @@ Skip lines started with character CHAR. [#]
  .B -0
  Specify that the position in the data file is 0-based (e.g. UCSC files)
  rather than 1-based.
+.TP
+.B -f
+Force to overwrite the index file if it is present.
+.TP
+.B -l
+List the sequence names stored in the index file.
  .RE
  
  .SH EXAMPLE
diff --git a/tabix.h b/tabix.h

index e608c790dd69842504734935500514713c7519f9..0df29a0f8ea889f03684eb540bd517cbb76d861c 100644 (file)
--- a/tabix.h
+++ b/tabix.h
@@ -1,3 +1,30 @@
+/* The MIT License
+
+   Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@live.co.uk> */
+
  #ifndef __TABIDX_H
  #define __TABIDX_H
  
@@ -32,6 +59,7 @@ extern "C" {
  
         int ti_index_build(const char *fn, const ti_conf_t *conf);
         ti_index_t *ti_index_load(const char *fn);
+    int ti_list_chromosomes(const char *fn);
         void ti_index_destroy(ti_index_t *idx);
         int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end);
         int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func);
diff --git a/tabix.tex b/tabix.tex

new file mode 100644 (file)

index 0000000..669d673
--- /dev/null
+++ b/tabix.tex
@@ -0,0 +1,121 @@
+\documentclass[10pt]{article}
+\usepackage{color}
+\definecolor{gray}{rgb}{0.7,0.7,0.7}
+
+\setlength{\topmargin}{0.0cm}
+\setlength{\textheight}{21.5cm}
+\setlength{\oddsidemargin}{0cm} 
+\setlength{\textwidth}{16.5cm}
+\setlength{\columnsep}{0.6cm}
+
+\begin{document}
+
+\title{The Tabix index file format}
+\author{Heng Li}
+\date{}
+
+\maketitle
+
+\begin{center}
+\begin{tabular}{|l|l|l|l|l|l|l|}
+\hline
+\multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\
+\hline\hline
+\multicolumn{4}{|l|}{\tt magic} & Magic string & {\tt char[4]} & TBI$\backslash$1 \\
+\hline
+\multicolumn{4}{|l|}{\tt n\_ref} & \# sequences & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt format} & Format (0: generic; 1: SAM; 2: VCF) & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_seq} & Column for the sequence name & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_beg} & Column for the start of a region & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt col\_end} & Column for the end of a region & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt meta} & Leading character for comment lines & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt skip} & \# lines to skip at the beginning & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\
+\hline
+\multicolumn{4}{|l|}{\tt names} & Concatenated names, each zero terminated & {\tt char[l\_nm]} & \\
+\hline
+\multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}}\\
+\cline{2-7}
+\hspace{0.1cm} & \multicolumn{3}{l|}{\tt n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\
+\cline{2-7}
+ & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\
+\cline{3-7}
+ & \hspace{0.1cm} & \multicolumn{2}{l|}{\tt bin} & Distinct bin number & {\tt uint32\_t} & \\
+\cline{3-7}
+ & & \multicolumn{2}{l|}{\tt n\_chunk} & \# chunks & {\tt int32\_t} & \\
+\cline{3-7}
+ & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\
+\cline{4-7}
+ & & \hspace{0.1cm} & {\tt cnk\_beg} & Virtual file offset of the start of the chunk & {\tt uint64\_t} & \\
+\cline{4-7}
+ & & & {\tt cnk\_end} & Virtual file offset of the end of the chunk & {\tt uint64\_t} & \\
+\cline{2-7}
+ & \multicolumn{3}{l|}{\tt n\_intv} & \# 16kb intervals (for the linear index) & {\tt int32\_t} & \\
+\cline{2-7}
+ & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct intervals (n=n\_intv)}} \\
+\cline{3-7}
+ & & \multicolumn{2}{l|}{\tt ioff} & File offset of the first record in the interval & {\tt uint64\_t} & \\
+\hline
+\end{tabular}
+\end{center}
+
+{\bf Notes:}
+
+\begin{itemize}
+\item The index file is BGZF compressed.
+\item All integers are little-endian.
+\item When {\tt (format\&0x10000)} is true, the coordinate follows the
+  {\tt BED} rule (i.e. half-closed-half-open and zero based); otherwise,
+  the coordinate follows the {\tt GFF} rule (closed and one based).
+\item For the SAM format, the end of a region equals {\tt POS} plus the
+  reference length in the alignment, inferred from {\tt CIGAR}. For the
+  VCF format, the end of a region equals {\tt POS} plus the size of the
+  deletion.
+\item Field {\tt col\_beg} may equal {\tt col\_end}, and in this case,
+  the end of a region is {\tt end}={\tt beg+1}.
+\item Example. For {\tt GFF}, {\tt format}=0, {\tt col\_seq}=1, {\tt
+    col\_beg}=4, {\tt col\_end}=5, {\tt meta}=`{\tt \#}' and {\tt
+    skip}=0. For {\tt BED}, {\tt format}=0x10000, {\tt col\_seq}=1, {\tt
+    col\_beg}=2, {\tt col\_end}=3, {\tt meta}=`{\tt \#}' and {\tt
+    skip}=0.
+\item Given a zero-based, half-closed and half-open region {\tt
+    [beg,end)}, the {\tt bin} number is calculated with the following C
+  function:
+\begin{verbatim}
+int reg2bin(int beg, int end) {
+  --end;
+  if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
+  if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
+  if (beg>>20 == end>>20) return  ((1<<9)-1)/7 + (beg>>20);
+  if (beg>>23 == end>>23) return  ((1<<6)-1)/7 + (beg>>23);
+  if (beg>>26 == end>>26) return  ((1<<3)-1)/7 + (beg>>26);
+  return 0;
+}
+\end{verbatim}
+\item The list of bins that may overlap a region {\tt [beg,end)} can be
+  obtained with the following C function.
+\begin{verbatim}
+#define MAX_BIN (((1<<18)-1)/7)
+int reg2bins(int rbeg, int rend, uint16_t list[MAX_BIN])
+{
+  int i = 0, k;
+  --rend;
+  list[i++] = 0;
+  for (k =    1 + (rbeg>>26); k <=    1 + (rend>>26); ++k) list[i++] = k;
+  for (k =    9 + (rbeg>>23); k <=    9 + (rend>>23); ++k) list[i++] = k;
+  for (k =   73 + (rbeg>>20); k <=   73 + (rend>>20); ++k) list[i++] = k;
+  for (k =  585 + (rbeg>>17); k <=  585 + (rend>>17); ++k) list[i++] = k;
+  for (k = 4681 + (rbeg>>14); k <= 4681 + (rend>>14); ++k) list[i++] = k;
+  return i; // #elements in list[]
+}
+\end{verbatim}
+\end{itemize}
+
+\end{document}
+\ No newline at end of file
diff --git a/tabix.txt b/tabix.txt

deleted file mode 100644 (file)

index 5ba263b..0000000
--- a/tabix.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-tabix(1)                     Bioinformatics tools                     tabix(1)
-
-
-
-NAME
-       bgzip - Block compression/decompression utility
-
-       tabix - Generic indexer for TAB-delimited genome position files
-
-SYNOPSIS
-       bgzip [-cdh] [-b virtualOffset] [-s size] [file]
-
-       tabix [-0] [-p gff|bed|sam|vcf] [-s seqCol] [-b begCol] [-e endCol] [-S
-       lineSkip] [-c metaChar] in.tab.bgz [region1 [region2 [...]]]
-
-
-DESCRIPTION
-       Tabix indexes a TAB-delimited genome position file in.tab.bgz and  cre-
-       ates  an  index file in.tab.bgz.tbi when region is absent from the com-
-       mand-line. The input data file must be position sorted  and  compressed
-       by  bgzip  which has a gzip(1) like interface. After indexing, tabix is
-       able to quickly retrieve data lines overlapping  regions  specified  in
-       the  format  "chr:beginPos-endPos". Fast data retrieval also works over
-       network if URI is given as a file name and in this case the index  file
-       will be downloaded if it is not present locally.
-
-
-OPTIONS OF TABIX
-       -p STR    Input  format  for indexing. Valid values are: gff, bed, sam,
-                 vcf and psltab. This option should not  be  applied  together
-                 with  any  of  -s, -b, -e, -c and -0; it is not used for data
-                 retrieval because this setting is stored in the  index  file.
-                 [gff]
-
-       -s INT    Column of sequence name. Option -s, -b, -e, -S, -c and -0 are
-                 all stored in the index  file  and  thus  not  used  in  data
-                 retrieval. [1]
-
-       -b INT    Column of start chromosomal position. [4]
-
-       -e INT    Column of end chromosomal position. [5]
-
-       -S INT    Skip first INT lines in the data file. [0]
-
-       -c CHAR   Skip lines started with character CHAR. [#]
-
-       -0        Specify  that  the position in the data file is 0-based (e.g.
-                 UCSC files) rather than 1-based.
-
-
-EXAMPLE
-       grep  -v  ^"#"  unsorted.gff  |  sort  -k1,1  -k4,4n  |  bgzip   -c   >
-       sorted.gff.gz;
-
-       tabix -p gff sorted.gff.gz;
-
-       tabix sorted.gff.gz chr1:10,000,000-20,000,000;
-
-
-NOTES
-       It  is straightforward to achieve overlap queries using the standard B-
-       tree index (with or without binning) implemented in all SQL  databases,
-       or  the R-tree index in PostgreSQL and Oracle. But there are still many
-       reasons to use tabix. Firstly, tabix  directly  works  with  a  lot  of
-       widely  used  TAB-delimited  formats such as GFF/GTF and BED. We do not
-       need to design database schema or specialized binary formats.  Data  do
-       not need to be duplicated in different formats, either. Secondly, tabix
-       works on compressed data files while most SQL  databases  do  not.  The
-       GenCode annotation GTF can be compressed down to 4%.  Thirdly, tabix is
-       fast. The same indexing algorithm is known to work efficiently  for  an
-       alignment with a few billion short reads. SQL databases probably cannot
-       easily handle data at this scale. Last but not the  least,  tabix  sup-
-       ports remote data retrieval. One can put the data file and the index at
-       an FTP or HTTP server, and other users or even  web  services  will  be
-       able to get a slice without downloading the entire file.
-
-
-AUTHOR
-       Tabix  was  written  by Heng Li. The BGZF library was originally imple-
-       mented by Bob Handsaker and modified by Heng Li for remote file  access
-       and in-memory caching.
-
-
-SEE ALSO
-       samtools(1)
-
-
-
-tabix-0.1.0                     2 November 2009                       tabix(1)
author	Charles Plessy <plessy@debian.org>
	Thu, 19 Aug 2010 06:49:36 +0000 (15:49 +0900)
committer	Charles Plessy <plessy@debian.org>
	Thu, 19 Aug 2010 06:49:36 +0000 (15:49 +0900)
Makefile		patch \| blob \| history
NEWS	[new file with mode: 0644]	patch \| blob
TabixReader.java	[new file with mode: 0644]	patch \| blob
bgzip.c		patch \| blob \| history
index.c		patch \| blob \| history
knetfile.c		patch \| blob \| history
main.c		patch \| blob \| history
tabix.1		patch \| blob \| history
tabix.h		patch \| blob \| history
tabix.tex	[new file with mode: 0644]	patch \| blob
tabix.txt	[deleted file]	patch \| blob \| history