Imported Upstream version 0.2.2
[tabix.git] / index.c
diff --git a/index.c b/index.c
index 61d7d6d6fdc370d09d9b4b916e629cf48e1aa8c2..8eddcecb34bab21f8d4f6de7a88639f8c67d0358 100644 (file)
--- a/index.c
+++ b/index.c
@@ -19,7 +19,7 @@ typedef struct {
 } pair64_t;
 
 #define pair64_lt(a,b) ((a).u < (b).u)
-KSORT_INIT(off, pair64_t, pair64_lt)
+KSORT_INIT(offt, pair64_t, pair64_lt)
 
 typedef struct {
        uint32_t m, n;
@@ -47,7 +47,6 @@ struct __ti_iter_t {
        int tid, beg, end, n_off, i, finished;
        uint64_t curr_off;
        kstring_t str;
-       BGZF *fp;
        const ti_index_t *idx;
        pair64_t *off;
 };
@@ -193,17 +192,19 @@ static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
                                        }
                                } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) {
                                        // FIXME: the following is NOT tested and is likely to be buggy
-                                       if (id == 5) { // ALT
-                                               char *t;
-                                               int max = 1;
-                                               for (s = str->s + b; s < str->s + i;) {
-                                                       if (s[i] == 'D') {
-                                                               long x = strtol(s + 1, &t, 10);
-                                                               if (x > max) max = x;
-                                                               s = t + 1;
-                                                       } else ++s;
+                                       if (id == 4) {
+                                               if (b < i) intv->end = intv->beg + (i - b);
+                                       } else if (id == 8) { // look for "END="
+                                               int c = str->s[i];
+                                               str->s[i] = 0;
+                                               s = strstr(str->s + b, "END=");
+                                               if (s == str->s + b) s += 4;
+                                               else if (s) {
+                                                       s = strstr(str->s + b, ";END=");
+                                                       if (s) s += 5;
                                                }
-                                               intv->end = intv->beg + max;
+                                               if (s) intv->end = strtol(s, &s, 0);
+                                               str->s[i] = c;
                                        }
                                }
                        }
@@ -686,12 +687,19 @@ int ti_index_build(const char *fn, const ti_conf_t *conf)
  * parse a region in the format chr:beg-end *
  ********************************************/
 
-int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end)
+int ti_get_tid(const ti_index_t *idx, const char *name)
+{
+       khiter_t iter;
+       const khash_t(s) *h = idx->tname;
+       iter = kh_get(s, h, name); /* get the tid */
+       if (iter == kh_end(h)) return -1;
+       return kh_value(h, iter);
+}
+
+int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end)
 {
        char *s, *p;
        int i, l, k;
-       khiter_t iter;
-       khash_t(s) *h = idx->tname;
        l = strlen(str);
        p = s = (char*)malloc(l+1);
        /* squeeze out "," */
@@ -700,12 +708,10 @@ int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int
        s[k] = 0;
        for (i = 0; i != k; ++i) if (s[i] == ':') break;
        s[i] = 0;
-       iter = kh_get(s, h, s); /* get the tid */
-       if (iter == kh_end(h)) { // name not found
-               *tid = -1; free(s);
+       if ((*tid = ti_get_tid(idx, s)) < 0) {
+               free(s);
                return -1;
        }
-       *tid = kh_value(h, iter);
        if (i == k) { /* dump the whole sequence */
                *begin = 0; *end = 1<<29; free(s);
                return 0;
@@ -731,6 +737,8 @@ int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int
 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
 {
        int i = 0, k;
+       if (beg >= end) return 0;
+       if (end >= 1u<<29) end = 1u<<29;
        --end;
        list[i++] = 0;
        for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
@@ -741,15 +749,15 @@ static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
        return i;
 }
 
-ti_iter_t ti_first(BGZF *fp)
+ti_iter_t ti_iter_first()
 {
        ti_iter_t iter;
        iter = calloc(1, sizeof(struct __ti_iter_t));
-       iter->fp = fp; iter->from_first = 1;
+       iter->from_first = 1;
        return iter;
 }
 
-ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end)
+ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end)
 {
        uint16_t *bins;
        int i, n_bins, n_off;
@@ -759,10 +767,11 @@ ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end)
        uint64_t min_off;
        ti_iter_t iter = 0;
 
+       if (beg < 0) beg = 0;
+       if (end < beg) return 0;
        // initialize the iterator
        iter = calloc(1, sizeof(struct __ti_iter_t));
-       iter->fp = fp; iter->idx = idx;
-       iter->tid = tid; iter->beg = beg; iter->end = end; iter->i = -1;
+       iter->idx = idx; iter->tid = tid; iter->beg = beg; iter->end = end; iter->i = -1;
        // random access
        bins = (uint16_t*)calloc(MAX_BIN, 2);
        n_bins = reg2bins(beg, end, bins);
@@ -797,7 +806,7 @@ ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end)
        free(bins);
        {
                int l;
-               ks_introsort(off, n_off, off);
+               ks_introsort(offt, n_off, off);
                // resolve completely contained adjacent blocks
                for (i = 1, l = 0; i < n_off; ++i)
                        if (off[l].v < off[i].v)
@@ -818,12 +827,12 @@ ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end)
        return iter;
 }
 
-const char *ti_iter_read(ti_iter_t iter, int *len)
+const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len)
 {
        if (iter->finished) return 0;
        if (iter->from_first) {
                int ret;
-               if ((ret = ti_readline(iter->fp, &iter->str)) < 0) {
+               if ((ret = ti_readline(fp, &iter->str)) < 0) {
                        iter->finished = 1;
                        return 0;
                } else {
@@ -838,14 +847,14 @@ const char *ti_iter_read(ti_iter_t iter, int *len)
                        if (iter->i == iter->n_off - 1) break; // no more chunks
                        if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
                        if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
-                               bgzf_seek(iter->fp, iter->off[iter->i+1].u, SEEK_SET);
-                               iter->curr_off = bgzf_tell(iter->fp);
+                               bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
+                               iter->curr_off = bgzf_tell(fp);
                        }
                        ++iter->i;
                }
-               if ((ret = ti_readline(iter->fp, &iter->str)) >= 0) {
+               if ((ret = ti_readline(fp, &iter->str)) >= 0) {
                        ti_intv_t intv;
-                       iter->curr_off = bgzf_tell(iter->fp);
+                       iter->curr_off = bgzf_tell(fp);
                        if (iter->str.s[0] == iter->idx->conf.meta_char) continue;
                        get_intv((ti_index_t*)iter->idx, &iter->str, &intv);
                        if (intv.tid != iter->tid || intv.beg >= iter->end) break; // no need to proceed
@@ -872,9 +881,76 @@ int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *d
        ti_iter_t iter;
        const char *s;
        int len;
-       iter = ti_query(fp, idx, tid, beg, end);
-       while ((s = ti_iter_read(iter, &len)) != 0)
+       iter = ti_iter_query(idx, tid, beg, end);
+       while ((s = ti_iter_read(fp, iter, &len)) != 0)
                func(len, s, data);
        ti_iter_destroy(iter);
        return 0;
 }
+
+/*******************
+ * High-level APIs *
+ *******************/
+
+tabix_t *ti_open(const char *fn, const char *fnidx)
+{
+       tabix_t *t;
+       BGZF *fp;
+       if ((fp = bgzf_open(fn, "r")) == 0) return 0;
+       t = calloc(1, sizeof(tabix_t));
+       t->fn = strdup(fn);
+       if (fnidx) t->fnidx = strdup(fnidx);
+       t->fp = fp;
+       return t;
+}
+
+void ti_close(tabix_t *t)
+{
+       if (t) {
+               bgzf_close(t->fp);
+               if (t->idx) ti_index_destroy(t->idx);
+               free(t->fn); free(t->fnidx);
+               free(t);
+       }
+}
+
+int ti_lazy_index_load(tabix_t *t)
+{
+       if (t->idx == 0) { // load index
+               if (t->fnidx) t->idx = ti_index_load_local(t->fnidx);
+               else t->idx = ti_index_load(t->fn);
+               if (t->idx == 0) return -1; // fail to load index
+       }
+       return 0;
+}
+
+ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end)
+{
+       if (tid < 0) return ti_iter_first();
+       if (ti_lazy_index_load(t) != 0) return 0;
+       return ti_iter_query(t->idx, tid, beg, end);    
+}
+
+ti_iter_t ti_querys(tabix_t *t, const char *reg)
+{
+       int tid, beg, end;
+       if (reg == 0) return ti_iter_first();
+       if (ti_lazy_index_load(t) != 0) return 0;
+       if (ti_parse_region(t->idx, reg, &tid, &beg, &end) < 0) return 0;
+       return ti_iter_query(t->idx, tid, beg, end);
+}
+
+ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end)
+{
+       int tid;
+       if (name == 0) return ti_iter_first();
+       // then need to load the index
+       if (ti_lazy_index_load(t) != 0) return 0;
+       if ((tid = ti_get_tid(t->idx, name)) < 0) return 0;
+       return ti_iter_query(t->idx, tid, beg, end);
+}
+
+const char *ti_read(tabix_t *t, ti_iter_t iter, int *len)
+{
+       return ti_iter_read(t->fp, iter, len);
+}