3 Copyright (c) 2010 Broad Institute.
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <hengli@broadinstitute.org> */
28 import net.sf.samtools.util.BlockCompressedInputStream;
32 import java.util.HashMap;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.lang.StringBuffer;
37 public class TabixReader
40 private BlockCompressedInputStream mFp;
48 private String[] mSeq;
50 private HashMap<String, Integer> mChr2tid;
52 private static int MAX_BIN = 37450;
53 private static int TAD_MIN_CHUNK_GAP = 32768;
54 private static int TAD_LIDX_SHIFT = 14;
56 private class TPair64 implements Comparable<TPair64> {
58 public TPair64(final long _u, final long _v) {
61 public TPair64(final TPair64 p) {
64 public int compareTo(final TPair64 p) {
65 return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
69 private class TIndex {
70 HashMap<Integer, TPair64[]> b; // binning index
71 long[] l; // linear index
73 private TIndex[] mIndex;
79 private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
80 return (u < v) ^ (u < 0) ^ (v < 0);
86 * @param fn File name of the data file
88 public TabixReader(final String fn) throws IOException {
90 mFp = new BlockCompressedInputStream(new File(fn));
94 private static int reg2bins(final int beg, final int _end, final int[] list) {
95 int i = 0, k, end = _end;
96 if (beg >= end) return 0;
97 if (end >= 1<<29) end = 1<<29;
100 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
101 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
102 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
103 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
104 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
108 public static int readInt(final InputStream is) throws IOException {
109 byte[] buf = new byte[4];
111 return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
114 public static long readLong(final InputStream is) throws IOException {
115 byte[] buf = new byte[8];
117 return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
120 public static String readLine(final InputStream is) throws IOException {
121 StringBuffer buf = new StringBuffer();
123 while ((c = is.read()) >= 0 && c != '\n')
125 if (c < 0) return null;
126 return buf.toString();
130 * Read the Tabix index from a file
132 * @param fp File pointer
134 public void readIndex(final File fp) throws IOException {
135 if (fp == null) return;
136 BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
137 byte[] buf = new byte[4];
139 is.read(buf, 0, 4); // read "TBI\1"
140 mSeq = new String[readInt(is)]; // # sequences
141 mChr2tid = new HashMap<String, Integer>();
142 mPreset = readInt(is);
148 // read sequence dictionary
149 int i, j, k, l = readInt(is);
152 for (i = j = k = 0; i < buf.length; ++i) {
154 byte[] b = new byte[i - j];
155 System.arraycopy(buf, j, b, 0, b.length);
156 String s = new String(b);
163 mIndex = new TIndex[mSeq.length];
164 for (i = 0; i < mSeq.length; ++i) {
166 int n_bin = readInt(is);
167 mIndex[i] = new TIndex();
168 mIndex[i].b = new HashMap<Integer, TPair64[]>();
169 for (j = 0; j < n_bin; ++j) {
170 int bin = readInt(is);
171 TPair64[] chunks = new TPair64[readInt(is)];
172 for (k = 0; k < chunks.length; ++k) {
173 long u = readLong(is);
174 long v = readLong(is);
175 chunks[k] = new TPair64(u, v); // in C, this is inefficient
177 mIndex[i].b.put(bin, chunks);
180 mIndex[i].l = new long[readInt(is)];
181 for (k = 0; k < mIndex[i].l.length; ++k)
182 mIndex[i].l[k] = readLong(is);
189 * Read the Tabix index from the default file.
191 public void readIndex() throws IOException {
192 readIndex(new File(mFn + ".tbi"));
196 * Read one line from the data file.
198 public String readLine() throws IOException {
199 return readLine(mFp);
202 private int chr2tid(final String chr) {
203 if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
208 * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
210 * @param reg Region string
211 * @return An array where the three elements are sequence_id,
212 * region_begin and region_end. On failure, sequence_id==-1.
214 public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
217 int[] ret = new int[3];
218 colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
219 chr = colon >= 0? reg.substring(0, colon) : reg;
220 ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen >= 0? hyphen : reg.length())) - 1 : 0;
221 ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
222 ret[0] = chr2tid(chr);
226 private TIntv getIntv(final String s) {
227 TIntv intv = new TIntv();
228 int col = 0, end = 0, beg = 0;
229 while ((end = s.indexOf('\t', beg)) >= 0 || end == -1) {
232 intv.tid = chr2tid(s.substring(beg, end));
233 } else if (col == mBc) {
234 intv.beg = intv.end = Integer.parseInt(s.substring(beg, end==-1?s.length():end));
235 if ((mPreset&0x10000) != 0) ++intv.end;
237 if (intv.beg < 0) intv.beg = 0;
238 if (intv.end < 1) intv.end = 1;
239 } else { // FIXME: SAM supports are not tested yet
240 if ((mPreset&0xffff) == 0) { // generic
242 intv.end = Integer.parseInt(s.substring(beg, end));
243 } else if ((mPreset&0xffff) == 1) { // SAM
244 if (col == 6) { // CIGAR
246 String cigar = s.substring(beg, end);
247 for (i = j = 0; i < cigar.length(); ++i) {
248 if (cigar.charAt(i) > '9') {
249 int op = cigar.charAt(i);
250 if (op == 'M' || op == 'D' || op == 'N')
251 l += Integer.parseInt(cigar.substring(j, i));
254 intv.end = intv.beg + l;
256 } else if ((mPreset&0xffff) == 2) { // VCF
258 alt = end >= 0? s.substring(beg, end) : s.substring(beg);
259 if (col == 4) { // REF
260 if (alt.length() > 0) intv.end = intv.beg + alt.length();
261 } else if (col == 8) { // INFO
262 int e_off = -1, i = alt.indexOf("END=");
263 if (i == 0) e_off = 4;
265 i = alt.indexOf(";END=");
266 if (i >= 0) e_off = i + 5;
269 i = alt.indexOf(";", e_off);
270 intv.end = Integer.parseInt(i > e_off? alt.substring(e_off, i) : alt.substring(e_off));
275 if (end == -1) break;
281 public class Iterator {
282 private int i, n_seeks;
283 private int tid, beg, end;
284 private TPair64[] off;
285 private long curr_off;
286 private boolean iseof;
288 public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
289 i = -1; n_seeks = 0; curr_off = 0; iseof = false;
290 off = _off; tid = _tid; beg = _beg; end = _end;
293 public String next() throws IOException {
294 if (iseof) return null;
296 if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
297 if (i == off.length - 1) break; // no more chunks
298 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
299 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
300 mFp.seek(off[i+1].u);
301 curr_off = mFp.getFilePointer();
307 if ((s = readLine(mFp)) != null) {
309 char[] str = s.toCharArray();
310 curr_off = mFp.getFilePointer();
311 if (str.length == 0 || str[0] == mMeta) continue;
313 if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
314 else if (intv.end > beg && intv.beg < end) return s; // overlap; return
315 } else break; // end of file
322 public Iterator query(final int tid, final int beg, final int end) {
323 TPair64[] off, chunks;
325 TIndex idx = mIndex[tid];
326 int[] bins = new int[MAX_BIN];
327 int i, l, n_off, n_bins = reg2bins(beg, end, bins);
328 if (idx.l.length > 0)
329 min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
331 for (i = n_off = 0; i < n_bins; ++i) {
332 if ((chunks = idx.b.get(bins[i])) != null)
333 n_off += chunks.length;
335 if (n_off == 0) return null;
336 off = new TPair64[n_off];
337 for (i = n_off = 0; i < n_bins; ++i)
338 if ((chunks = idx.b.get(bins[i])) != null)
339 for (int j = 0; j < chunks.length; ++j)
340 if (less64(min_off, chunks[j].v))
341 off[n_off++] = new TPair64(chunks[j]);
342 if (n_off == 0) return null;
343 Arrays.sort(off, 0, n_off);
344 // resolve completely contained adjacent blocks
345 for (i = 1, l = 0; i < n_off; ++i) {
346 if (less64(off[l].v, off[i].v)) {
348 off[l].u = off[i].u; off[l].v = off[i].v;
352 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
353 for (i = 1; i < n_off; ++i)
354 if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
355 // merge adjacent blocks
356 for (i = 1, l = 0; i < n_off; ++i) {
357 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
366 TPair64[] ret = new TPair64[n_off];
367 for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
368 return new TabixReader.Iterator(tid, beg, end, ret);
371 public Iterator query(final String reg) {
372 int[] x = parseReg(reg);
373 return query(x[0], x[1], x[2]);
376 public static void main(String[] args) {
377 if (args.length < 1) {
378 System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
382 TabixReader tr = new TabixReader(args[0]);
384 if (args.length == 1) { // no region is specified; print the whole file
385 while ((s = tr.readLine()) != null)
386 System.out.println(s);
387 } else { // a region is specified; random access
388 TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
389 while (iter != null && (s = iter.next()) != null)
390 System.out.println(s);
392 } catch (IOException e) {