3 Copyright (c) 2010 Broad Institute.
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <hengli@broadinstitute.org> */
28 import net.sf.samtools.util.BlockCompressedInputStream;
32 import java.util.HashMap;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.lang.StringBuffer;
37 public class TabixReader
40 private BlockCompressedInputStream mFp;
48 private String[] mSeq;
50 private HashMap<String, Integer> mChr2tid;
52 private static int MAX_BIN = 37450;
53 private static int TAD_MIN_CHUNK_GAP = 32768;
54 private static int TAD_LIDX_SHIFT = 14;
56 private class TPair64 implements Comparable<TPair64> {
58 public TPair64(final long _u, final long _v) {
61 public TPair64(final TPair64 p) {
64 public int compareTo(final TPair64 p) {
65 return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
69 private class TIndex {
70 HashMap<Integer, TPair64[]> b; // binning index
71 long[] l; // linear index
73 private TIndex[] mIndex;
79 private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
80 return (u < v) ^ (u < 0) ^ (v < 0);
86 * @param fn File name of the data file
88 public TabixReader(final String fn) throws IOException {
90 mFp = new BlockCompressedInputStream(new File(fn));
94 private static int reg2bins(final int beg, final int _end, final int[] list) {
95 int i = 0, k, end = _end;
96 if (beg >= end) return 0;
97 if (end >= 1<<29) end = 1<<29;
100 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
101 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
102 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
103 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
104 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
108 public static int readInt(final InputStream is) throws IOException {
109 byte[] buf = new byte[4];
111 return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
114 public static long readLong(final InputStream is) throws IOException {
115 byte[] buf = new byte[8];
117 return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
120 public static String readLine(final InputStream is) throws IOException {
121 StringBuffer buf = new StringBuffer();
123 while ((c = is.read()) >= 0 && c != '\n')
125 if (c < 0) return null;
126 return buf.toString();
130 * Read the Tabix index from a file
132 * @param fp File pointer
134 public void readIndex(final File fp) throws IOException {
135 if (fp == null) return;
136 BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
137 byte[] buf = new byte[4];
139 is.read(buf, 0, 4); // read "TBI\1"
140 mSeq = new String[readInt(is)]; // # sequences
141 mChr2tid = new HashMap<String, Integer>();
142 mPreset = readInt(is);
148 // read sequence dictionary
149 int i, j, k, l = readInt(is);
152 for (i = j = k = 0; i < buf.length; ++i) {
154 byte[] b = new byte[i - j];
155 System.arraycopy(buf, j, b, 0, b.length);
156 String s = new String(b);
163 mIndex = new TIndex[mSeq.length];
164 for (i = 0; i < mSeq.length; ++i) {
166 int n_bin = readInt(is);
167 mIndex[i] = new TIndex();
168 mIndex[i].b = new HashMap<Integer, TPair64[]>();
169 for (j = 0; j < n_bin; ++j) {
170 int bin = readInt(is);
171 TPair64[] chunks = new TPair64[readInt(is)];
172 for (k = 0; k < chunks.length; ++k) {
173 long u = readLong(is);
174 long v = readLong(is);
175 chunks[k] = new TPair64(u, v); // in C, this is inefficient
177 mIndex[i].b.put(bin, chunks);
180 mIndex[i].l = new long[readInt(is)];
181 for (k = 0; k < mIndex[i].l.length; ++k)
182 mIndex[i].l[k] = readLong(is);
189 * Read the Tabix index from the default file.
191 public void readIndex() throws IOException {
192 readIndex(new File(mFn + ".tbi"));
196 * Read one line from the data file.
198 public String readLine() throws IOException {
199 return readLine(mFp);
202 private int chr2tid(final String chr) {
203 if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
208 * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
210 * @param reg Region string
211 * @return An array where the three elements are sequence_id,
212 * region_begin and region_end. On failure, sequence_id==-1.
214 public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
217 int[] ret = new int[3];
218 colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
219 chr = colon >= 0? reg.substring(0, colon) : reg;
220 ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen)) - 1 : 0;
221 ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
222 ret[0] = chr2tid(chr);
226 private TIntv getIntv(final String s) {
227 TIntv intv = new TIntv();
228 int col = 0, end = 0, beg = 0;
229 while ((end = s.indexOf('\t', beg)) >= 0) {
232 intv.tid = chr2tid(s.substring(beg, end));
233 } else if (col == mBc) {
234 intv.beg = intv.end = Integer.parseInt(s.substring(beg, end));
235 if ((mPreset&0x10000) != 0) ++intv.end;
237 if (intv.beg < 0) intv.beg = 0;
238 if (intv.end < 1) intv.end = 1;
239 } else { // FIXME: SAM/VCF supports are not tested yet
240 if ((mPreset&0xffff) == 0) { // generic
242 intv.end = Integer.parseInt(s.substring(beg, end));
243 } else if ((mPreset&0xffff) == 1) { // SAM
244 if (col == 6) { // CIGAR
246 String cigar = s.substring(beg, end);
247 for (i = j = 0; i < cigar.length(); ++i) {
248 if (cigar.charAt(i) > '9') {
249 int op = cigar.charAt(i);
250 if (op == 'M' || op == 'D' || op == 'N')
251 l += Integer.parseInt(cigar.substring(j, i));
254 intv.end = intv.beg + l;
256 } else if ((mPreset&0xffff) == 2) { // VCF
258 String alt = s.substring(beg, end);
260 for (i = 0; i < alt.length(); ++i) {
261 if (alt.charAt(i) == 'D') { // deletion
263 for (j = i; j < alt.length() && alt.charAt(j) >= '0' && alt.charAt(j) <= '9'; ++j);
264 int l = Integer.parseInt(alt.substring(i, j));
265 if (max < l) max = l;
269 intv.end = intv.beg + max;
278 public class Iterator {
279 private int i, n_seeks;
280 private int tid, beg, end;
281 private TPair64[] off;
282 private long curr_off;
283 private boolean iseof;
285 public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
286 i = -1; n_seeks = 0; curr_off = 0; iseof = false;
287 off = _off; tid = _tid; beg = _beg; end = _end;
290 public String next() throws IOException {
291 if (iseof) return null;
293 if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
294 if (i == off.length - 1) break; // no more chunks
295 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
296 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
297 mFp.seek(off[i+1].u);
298 curr_off = mFp.getFilePointer();
304 if ((s = readLine(mFp)) != null) {
306 char[] str = s.toCharArray();
307 curr_off = mFp.getFilePointer();
308 if (str.length == 0 || str[0] == mMeta) continue;
310 if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
311 else if (intv.end > beg && intv.beg < end) return s; // overlap; return
312 } else break; // end of file
319 public Iterator query(final int tid, final int beg, final int end) {
320 TPair64[] off, chunks;
322 TIndex idx = mIndex[tid];
323 int[] bins = new int[MAX_BIN];
324 int i, l, n_off, n_bins = reg2bins(beg, end, bins);
325 if (idx.l.length > 0)
326 min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
328 for (i = n_off = 0; i < n_bins; ++i) {
329 if ((chunks = idx.b.get(bins[i])) != null)
330 n_off += chunks.length;
332 if (n_off == 0) return null;
333 off = new TPair64[n_off];
334 for (i = n_off = 0; i < n_bins; ++i)
335 if ((chunks = idx.b.get(bins[i])) != null)
336 for (int j = 0; j < chunks.length; ++j)
337 if (less64(min_off, chunks[j].v))
338 off[n_off++] = new TPair64(chunks[j]);
339 Arrays.sort(off, 0, n_off);
340 // resolve completely contained adjacent blocks
341 for (i = 1, l = 0; i < n_off; ++i) {
342 if (less64(off[l].v, off[i].v)) {
344 off[l].u = off[i].u; off[l].v = off[i].v;
348 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
349 for (i = 1; i < n_off; ++i)
350 if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
351 // merge adjacent blocks
352 for (i = 1, l = 0; i < n_off; ++i) {
353 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
362 TPair64[] ret = new TPair64[n_off];
363 for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
364 return new TabixReader.Iterator(tid, beg, end, ret);
367 public Iterator query(final String reg) {
368 int[] x = parseReg(reg);
369 return query(x[0], x[1], x[2]);
372 public static void main(String[] args) {
373 if (args.length < 1) {
374 System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
378 TabixReader tr = new TabixReader(args[0]);
380 if (args.length == 1) { // no region is specified; print the whole file
381 while ((s = tr.readLine()) != null)
382 System.out.println(s);
383 } else { // a region is specified; random access
384 TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
385 while ((s = iter.next()) != null)
386 System.out.println(s);
388 } catch (IOException e) {