3 Copyright (c) 2010 Broad Institute.
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <hengli@broadinstitute.org> */
28 import net.sf.samtools.util.BlockCompressedInputStream;
32 import java.util.HashMap;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.lang.StringBuffer;
37 public class TabixReader
40 private BlockCompressedInputStream mFp;
48 private String[] mSeq;
50 private HashMap<String, Integer> mChr2tid;
52 private static int MAX_BIN = 37450;
53 private static int TAD_MIN_CHUNK_GAP = 32768;
54 private static int TAD_LIDX_SHIFT = 14;
56 private class TPair64 implements Comparable<TPair64> {
58 public TPair64(final long _u, final long _v) {
61 public TPair64(final TPair64 p) {
64 public int compareTo(final TPair64 p) {
65 return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
69 private class TIndex {
70 HashMap<Integer, TPair64[]> b; // binning index
71 long[] l; // linear index
73 private TIndex[] mIndex;
79 private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
80 return (u < v) ^ (u < 0) ^ (v < 0);
86 * @param fn File name of the data file
88 public TabixReader(final String fn) throws IOException {
90 mFp = new BlockCompressedInputStream(new File(fn));
94 private static int reg2bins(final int beg, final int _end, final int[] list) {
95 int i = 0, k, end = _end;
98 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
99 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
100 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
101 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
102 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
106 public static int readInt(final InputStream is) throws IOException {
107 byte[] buf = new byte[4];
109 return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
112 public static long readLong(final InputStream is) throws IOException {
113 byte[] buf = new byte[8];
115 return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
118 public static String readLine(final InputStream is) throws IOException {
119 StringBuffer buf = new StringBuffer();
121 while ((c = is.read()) >= 0 && c != '\n')
123 if (c < 0) return null;
124 return buf.toString();
128 * Read the Tabix index from a file
130 * @param fp File pointer
132 public void readIndex(final File fp) throws IOException {
133 if (fp == null) return;
134 BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
135 byte[] buf = new byte[4];
137 is.read(buf, 0, 4); // read "TBI\1"
138 mSeq = new String[readInt(is)]; // # sequences
139 mChr2tid = new HashMap<String, Integer>();
140 mPreset = readInt(is);
146 // read sequence dictionary
147 int i, j, k, l = readInt(is);
150 for (i = j = k = 0; i < buf.length; ++i) {
152 byte[] b = new byte[i - j];
153 System.arraycopy(buf, j, b, 0, b.length);
154 String s = new String(b);
161 mIndex = new TIndex[mSeq.length];
162 for (i = 0; i < mSeq.length; ++i) {
164 int n_bin = readInt(is);
165 mIndex[i] = new TIndex();
166 mIndex[i].b = new HashMap<Integer, TPair64[]>();
167 for (j = 0; j < n_bin; ++j) {
168 int bin = readInt(is);
169 TPair64[] chunks = new TPair64[readInt(is)];
170 for (k = 0; k < chunks.length; ++k) {
171 long u = readLong(is);
172 long v = readLong(is);
173 chunks[k] = new TPair64(u, v); // in C, this is inefficient
175 mIndex[i].b.put(bin, chunks);
178 mIndex[i].l = new long[readInt(is)];
179 for (k = 0; k < mIndex[i].l.length; ++k)
180 mIndex[i].l[k] = readLong(is);
187 * Read the Tabix index from the default file.
189 public void readIndex() throws IOException {
190 readIndex(new File(mFn + ".tbi"));
194 * Read one line from the data file.
196 public String readLine() throws IOException {
197 return readLine(mFp);
200 private int chr2tid(final String chr) {
201 if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
206 * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
208 * @param reg Region string
209 * @return An array where the three elements are sequence_id,
210 * region_begin and region_end. On failure, sequence_id==-1.
212 public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
215 int[] ret = new int[3];
216 colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
217 chr = colon >= 0? reg.substring(0, colon) : reg;
218 ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen)) - 1 : 0;
219 ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
220 ret[0] = chr2tid(chr);
224 private TIntv getIntv(final String s) {
225 TIntv intv = new TIntv();
226 int col = 0, end = 0, beg = 0;
227 while ((end = s.indexOf('\t', beg)) >= 0) {
230 intv.tid = chr2tid(s.substring(beg, end));
231 } else if (col == mBc) {
232 intv.beg = intv.end = Integer.parseInt(s.substring(beg, end));
233 if ((mPreset&0x10000) != 0) ++intv.end;
235 if (intv.beg < 0) intv.beg = 0;
236 if (intv.end < 1) intv.end = 1;
237 } else { // FIXME: SAM/VCF supports are not tested yet
238 if ((mPreset&0xffff) == 0) { // generic
240 intv.end = Integer.parseInt(s.substring(beg, end));
241 } else if ((mPreset&0xffff) == 1) { // SAM
242 if (col == 6) { // CIGAR
244 String cigar = s.substring(beg, end);
245 for (i = j = 0; i < cigar.length(); ++i) {
246 if (cigar.charAt(i) > '9') {
247 int op = cigar.charAt(i);
248 if (op == 'M' || op == 'D' || op == 'N')
249 l += Integer.parseInt(cigar.substring(j, i));
252 intv.end = intv.beg + l;
254 } else if ((mPreset&0xffff) == 2) { // VCF
256 String alt = s.substring(beg, end);
258 for (i = 0; i < alt.length(); ++i) {
259 if (alt.charAt(i) == 'D') { // deletion
261 for (j = i; j < alt.length() && alt.charAt(j) >= '0' && alt.charAt(j) <= '9'; ++j);
262 int l = Integer.parseInt(alt.substring(i, j));
263 if (max < l) max = l;
267 intv.end = intv.beg + max;
276 public class Iterator {
277 private int i, n_seeks;
278 private int tid, beg, end;
279 private TPair64[] off;
280 private long curr_off;
281 private boolean iseof;
283 public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
284 i = -1; n_seeks = 0; curr_off = 0; iseof = false;
285 off = _off; tid = _tid; beg = _beg; end = _end;
288 public String next() throws IOException {
289 if (iseof) return null;
291 if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
292 if (i == off.length - 1) break; // no more chunks
293 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
294 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
295 mFp.seek(off[i+1].u);
296 curr_off = mFp.getFilePointer();
302 if ((s = readLine(mFp)) != null) {
304 char[] str = s.toCharArray();
305 curr_off = mFp.getFilePointer();
306 if (str.length == 0 || str[0] == mMeta) continue;
308 if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
309 else if (intv.end > beg && intv.beg < end) return s; // overlap; return
310 } else break; // end of file
317 public Iterator query(final int tid, final int beg, final int end) {
318 TPair64[] off, chunks;
320 TIndex idx = mIndex[tid];
321 int[] bins = new int[MAX_BIN];
322 int i, l, n_off, n_bins = reg2bins(beg, end, bins);
323 min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
324 for (i = n_off = 0; i < n_bins; ++i) {
325 if ((chunks = idx.b.get(bins[i])) != null)
326 n_off += chunks.length;
328 if (n_off == 0) return null;
329 off = new TPair64[n_off];
330 for (i = n_off = 0; i < n_bins; ++i)
331 if ((chunks = idx.b.get(bins[i])) != null)
332 for (int j = 0; j < chunks.length; ++j)
333 if (less64(min_off, chunks[j].v))
334 off[n_off++] = new TPair64(chunks[j]);
335 Arrays.sort(off, 0, n_off);
336 // resolve completely contained adjacent blocks
337 for (i = 1, l = 0; i < n_off; ++i) {
338 if (less64(off[l].v, off[i].v)) {
340 off[l].u = off[i].u; off[l].v = off[i].v;
344 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
345 for (i = 1; i < n_off; ++i)
346 if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
347 // merge adjacent blocks
348 for (i = 1, l = 0; i < n_off; ++i) {
349 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
358 TPair64[] ret = new TPair64[n_off];
359 for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
360 return new TabixReader.Iterator(tid, beg, end, ret);
363 public Iterator query(final String reg) {
364 int[] x = parseReg(reg);
365 return query(x[0], x[1], x[2]);
368 public static void main(String[] args) {
369 if (args.length < 1) {
370 System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
374 TabixReader tr = new TabixReader(args[0]);
376 if (args.length == 1) { // no region is specified; print the whole file
377 while ((s = tr.readLine()) != null)
378 System.out.println(s);
379 } else { // a region is specified; random access
380 TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
381 while ((s = iter.next()) != null)
382 System.out.println(s);
384 } catch (IOException e) {