From 36e5c2e93cc97d22f8b33768602d511697fa1adf Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Thu, 19 Aug 2010 15:49:44 +0900 Subject: [PATCH] Imported Upstream version 0.2.0 --- ChangeLog | 101 +++++++++++++++++++++++++++++++++++++++ Makefile | 2 +- NEWS | 18 +++++++ TabixReader.java | 6 ++- example.gtf.gz | Bin 0 -> 3778 bytes example.gtf.gz.tbi | Bin 0 -> 260 bytes index.c | 107 +++++++++++++++++++++++++++++++++--------- main.c | 26 ++++------ perl/MANIFEST | 8 ++++ perl/Makefile.PL | 8 ++++ perl/Tabix.pm | 76 ++++++++++++++++++++++++++++++ perl/Tabix.xs | 71 ++++++++++++++++++++++++++++ perl/TabixIterator.pm | 41 ++++++++++++++++ perl/t/01local.t | 28 +++++++++++ perl/t/02remote.t | 28 +++++++++++ perl/typemap | 3 ++ tabix.1 | 2 +- tabix.h | 55 +++++++++++++++++----- 18 files changed, 526 insertions(+), 54 deletions(-) create mode 100644 example.gtf.gz create mode 100644 example.gtf.gz.tbi create mode 100644 perl/MANIFEST create mode 100644 perl/Makefile.PL create mode 100644 perl/Tabix.pm create mode 100644 perl/Tabix.xs create mode 100644 perl/TabixIterator.pm create mode 100644 perl/t/01local.t create mode 100644 perl/t/02remote.t create mode 100644 perl/typemap diff --git a/ChangeLog b/ChangeLog index 19d4f3e..907d72b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,104 @@ +------------------------------------------------------------------------ +r573 | lh3lh3 | 2010-05-11 12:08:30 -0400 (Tue, 11 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/Makefile + +Added -fPIC + +------------------------------------------------------------------------ +r572 | lh3lh3 | 2010-05-11 11:59:07 -0400 (Tue, 11 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/perl/MANIFEST + +update + +------------------------------------------------------------------------ +r571 | lh3lh3 | 2010-05-11 11:56:54 -0400 (Tue, 11 May 2010) | 4 lines +Changed paths: + A /trunk/tabix/example.gtf.gz + A /trunk/tabix/example.gtf.gz.tbi + M /trunk/tabix/index.c + M /trunk/tabix/main.c + M /trunk/tabix/perl/MANIFEST + M /trunk/tabix/perl/Tabix.pm + M /trunk/tabix/perl/Tabix.xs + A /trunk/tabix/perl/TabixIterator.pm + A /trunk/tabix/perl/t + A /trunk/tabix/perl/t/01local.t + A /trunk/tabix/perl/t/02remote.t + M /trunk/tabix/tabix.1 + M /trunk/tabix/tabix.h + + * improved C/Perl APIs + * added test for Perl + * added an tiny example + +------------------------------------------------------------------------ +r570 | lh3lh3 | 2010-05-11 01:04:21 -0400 (Tue, 11 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/TabixReader.java + +fixed the same issue in java + +------------------------------------------------------------------------ +r569 | lh3lh3 | 2010-05-11 01:03:24 -0400 (Tue, 11 May 2010) | 3 lines +Changed paths: + M /trunk/tabix/index.c + M /trunk/tabix/perl/Tabix.pm + M /trunk/tabix/perl/Tabix.xs + + * fixed a potential issue in index.c + * improve perl APIs + +------------------------------------------------------------------------ +r568 | lh3lh3 | 2010-05-10 23:46:21 -0400 (Mon, 10 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/perl/Tabix.xs + +return an array from get_names() + +------------------------------------------------------------------------ +r567 | lh3lh3 | 2010-05-10 23:38:46 -0400 (Mon, 10 May 2010) | 4 lines +Changed paths: + M /trunk/tabix/TabixReader.java + M /trunk/tabix/index.c + A /trunk/tabix/perl + A /trunk/tabix/perl/MANIFEST + A /trunk/tabix/perl/Makefile.PL + A /trunk/tabix/perl/Tabix.pm + A /trunk/tabix/perl/Tabix.xs + A /trunk/tabix/perl/typemap + M /trunk/tabix/tabix.h + + * added the initial perl binding. The interface needs to be improved. + * added a new API for perl binding + * fixed a potential bug in java. + +------------------------------------------------------------------------ +r565 | lh3lh3 | 2010-05-09 23:24:35 -0400 (Sun, 09 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/main.c + +Release tabix-0.1.6 + +------------------------------------------------------------------------ +r564 | lh3lh3 | 2010-05-09 23:01:49 -0400 (Sun, 09 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/index.c + +fixed a typo + +------------------------------------------------------------------------ +r563 | lh3lh3 | 2010-05-09 22:58:26 -0400 (Sun, 09 May 2010) | 2 lines +Changed paths: + A /trunk/tabix/ChangeLog + M /trunk/tabix/NEWS + M /trunk/tabix/index.c + M /trunk/tabix/main.c + M /trunk/tabix/tabix.h + +If nothing bad happens, this will become 0.1.6 + ------------------------------------------------------------------------ r562 | lh3lh3 | 2010-05-09 19:43:56 -0400 (Sun, 09 May 2010) | 2 lines Changed paths: diff --git a/Makefile b/Makefile index a51c175..03a4996 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CC= gcc -CFLAGS= -g -Wall -O2 #-m64 #-arch ppc +CFLAGS= -g -Wall -O2 -fPIC #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE LOBJS= bgzf.o kstring.o knetfile.o index.o AOBJS= main.o diff --git a/NEWS b/NEWS index 8cb7f0d..290f2d8 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,21 @@ +Beta Release 0.2.0 (11 May, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Fixed an issue for random access given an interval end larger than + 2^29. + + * Updated the Java binding. + + * Added a Perl module using XS. + + * Improved the C APIs. + +(0.2.0: 11 May 2010, r574) + + + Beta Release 0.1.6 (9 May, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/TabixReader.java b/TabixReader.java index acaff24..752f71c 100644 --- a/TabixReader.java +++ b/TabixReader.java @@ -93,6 +93,8 @@ public class TabixReader private static int reg2bins(final int beg, final int _end, final int[] list) { int i = 0, k, end = _end; + if (beg >= end) return 0; + if (end >= 1<<29) end = 1<<29; --end; list[i++] = 0; for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; @@ -320,7 +322,9 @@ public class TabixReader TIndex idx = mIndex[tid]; int[] bins = new int[MAX_BIN]; int i, l, n_off, n_bins = reg2bins(beg, end, bins); - min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT]; + if (idx.l.length > 0) + min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT]; + else min_off = 0; for (i = n_off = 0; i < n_bins; ++i) { if ((chunks = idx.b.get(bins[i])) != null) n_off += chunks.length; diff --git a/example.gtf.gz b/example.gtf.gz new file mode 100644 index 0000000000000000000000000000000000000000..693db0c6372a12d4299eb20a49cfb54ebba94882 GIT binary patch literal 3778 zcmV;z4n6T7iwFb&00000{{{d;LjnMM3+f&>0tNYFDc2fB#RU&N}fbM1uCNn!_<#oo)7Yny#4wJi??T=-hTY$ z!{YPR^je?EG8yjiVpAMd|EZ5mt^ z4Y-huYTKar*{A<~UmhKQe|&ga-mk82AMWl~UpC)cKixe2SRWn#=j{B?v&|P*H{a|J zTwVa2Fn$5-4K#n3r^8x{cDJ=}?RI`biTeeAE&pErb;N!?`g-&C&Fbdb{a22T&#$iD zfBsm$Z~Mnc+w@I2hUoW#zR)`*oMyWtgq{eUE4iRTBJ_!&*Q3=6y5<-? z#7C-f_*`KGM-iwRTX_ehtPAr;}(0)$R&kH#MhtJ?(NTyas~WBdEELyxK%% zfeQ>R!VwhKet6AQU-iYB3KSm(s@34df{Ko4cSW^+dicJHN~s|3K{3Kgh_DJ~n0h9p zo}5ax;7sxi+od&dHQA6HCFy*&VLWMg#w47~1QO2OWxxXXU?Ygc`-pehQX$MX^Xj&x zhw7gjS-lO}wsA5=PCicB4qk2uASW-U!XGy{VIJX6P!ZJrK+SdJHKIB#buK52F?wSR z=M03j!G%y&0FA+gckDJn;n?+g&61M_g3cgDPzb8e8H~XRok5&4P&T=LMva+)f9y6u zp)>HMnvyHBkdkT?RM2TQIiWA`4m^cXv)fEmvKPM@&IUPi(-vEU5Uoa`C zxm&35K+V)dHKL0VIAL@V64Jm=M7<(Z#^Dv>j&1V!;b02;qqMG~o8xz$`2Vmnitx-r}@|bj9 zyNfnB#vDIzjU{$*-T_giiV6?X-V3R@MjYM44@I>fRC85TBaSu9;Gx75 z#<5MDP9*rys?BR7*YNP8 z>9c*uLCqFySPxVwvi7oWL|xYGP-CA=N+(_At}jkZDUcJ}QkS%K`?r;%iu7RT}xG@hy!3a8eU) z=YDQ92C2rN!4#u`5qy@=&Swb|*g%O$6J=U5MVmUJO>L4z129fHu&oYtCP1Mf7z=GN zB_$*wsMc(x^^H@U8ldi3tqXL^l#!xP#mkg};k$FGX-uCdIXjm}vaoKuZk2O&-R#-| z(k-&X08&h_RY15Z3|TUeR)ZH0#0#UAUFw(EY2_aJAmnGf2;lz zf;=Eb!~}Va7LjdF@o(YPGf}r1On^EUZxUMHY%U6NAA&3{8`ti#F8dVBOB$52ZZ$xf zQ3*PC8Q0SIU34lk#|jp`wh2God^tM))kaYEi?_FTYxic{KN%6l28Qq_z#~6_&Y7Qz zw=oKk_q4oAdz;6*)!>Elj<+5>U~^HK`%-3ozS5aqUOc5Y7Deo%V0S5Vh1--zU?J{F zRsNiCIo`%SE8Bw9C!<~jbro(j<@*WhlmVd^8f?Z9SRm1am{7VRC1+%ZcUEDv@t_~A zFxn)PpIgc@l5!w5ELj5(Wde7DAnf6=i@3SMOc@Uy*6PRHYVfKsCn4^#2F2}m#z3fo z-xwa;CSin7GM(Jn`5$jTc*yZQa+|LGuag=FqU%Fu>abclHlaV*ZL=l$pzyg)bFxlcUnOE?YI9?Sijj?Ias{ ztHD&rM*y7!*=q_=qI;BRWfhpEjn;R1>nm&?;$B?IP-u6r4({%CtBJXrVJKMNO z=G_jHTXMhs6$w@Ml&h#S;1yvVRri6C+h_QyJPRJ-PYu6ve-(Dab@-4)5B64riLno_ zlWTPzkGtcB7OH^D@+GvC-6vHh%8L?S1+#=pNV>Sy3Sf8f`Oc+4e2pEwTX4b4coAIp zj`?SNLrvWQ;w$bocHB%|cD_Rpky|sIRGd+aX!a1%H05dlOnsQ6dx>-}ZE$EG`@@>r zU91iqpzzY|q>+NT7X_gu6Jv-BaWhPr-NW^=Dnomwmhz`JQf8K@Rac~~Y;&$sy^=E{ zH0Y1KoLdcE73UGO?sPw~#%PHbMn_oS$^7!3!eYU*}T%hIqzm6E)!MJt$}Ui;Am1(+ zVusk?e_E{xWmyTN+q-od%C2}zX^{6syenl(>gKYTWGWC@AnEZ)qqUygT1!C$T90Wo z-JBxhN^<316c1DgaH4cDF_F8d*Q?93)9Ya+FuIF!M~8#7tox5=&c@Pe@G7u)k<=YK z>ZNBGiGXjQRMPM<`lKwPYGa9D@wxM!%*{`yyz$R6dT>eT4}Y=NZZM$Qvu5XQ?mXeg6+CNcltn0096WiwFb&00000{{{d;LjnLP z1MS+;ZrU&u2H@-LDNt^wR`#)dY^zs@b*r?NN-OOyQ3O$o)CB}twY~l9kkAkaBu#>b zWYjh)J$1s@(;wSP{Cm~&zYp(!4*jcMybPk%WI0>J<0$wQ_Wb@Wf@DNrQn&y@P04Hc z7QXi2s`nDS1yL}54kJ@{KN{UU-hCtM2M7@Urs93jtZtOnNs8LP`ZRf2Qtzh!)F1Xe zJDf)aG3PV~%9Qdhz0c;;{$MhHsnB}+s`nQpzZ?Hum!BXNyfsq-^DO}B=B=8+|)Qab5V zNOM4C4=A`%zSu9#G(&Yr8c_&YUwU}|bbVVQEt^4SLzhF#I?h~?O3sAI{rS6gMkYai z#JNy#R?fAI%hR`^pzG9Qoth?sklbdkP((QAPMrlbch*!RoS}2&IayU_YMsi{Bh^;O zh|}&uJMC(zky&Q~%B4q#>Cuj}>*r(rthi(@H%{QJsm*8e)y?iY%nbw%#2$p8&bdQp zh7{durG00}oF7L&JWz^qk5Q!M;Y@9U(*wv1B;xL-W*p9<@nk-oMb8yBh`#pkC?$gu z+wHOHYzw91UXX3lIC$>)gTa>v^WOqDMfoqTKZ$;Mq~uH=Ujrcm%7n^y|#%yTyt7&U-RQ-P!T;GH@( z-KKxXz=58xX0@ zXqTPcD7QI5<$Y%BEM+BUzT=#117*${Fu7Z!wsWE2tek5Zmn{Pu3%X7{)~V}6FhPLa sIk)Wm50iKxG}SKv03VA81ONa4009360763o02=@U000000000001%faGynhq literal 0 HcmV?d00001 diff --git a/example.gtf.gz.tbi b/example.gtf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..eb3f24c6ba9cd8cf5c4551ced2edfd08518ce9e7 GIT binary patch literal 260 zcmb2|=3rp}f&Xj_PR>jW&l$K2Kc%E3H88j-q%kIiDTK9k6*3kUxlL5;Fi3j9_Q__Z zf|k9PW5^o|wuQE>3Xcpv?2>S*c08K+XGW#KEom*j$NOX07S=9!WU%j<8OK|0(e{tw z@7gca1r$o`|02-xt-Z_Miszxc1@}T`pqhVIw+mPa05xs?*K)xTsK!2OOPjQqky-%% z;X_P!^$oUv%XeS;|Bb^|=`=HqFNqIdt1xa2&yz__6kvj}sjjX2{KmopQ;*n&H}%16RVB7#QTy{3*@A4E8pN004_)Vut_# literal 0 HcmV?d00001 diff --git a/index.c b/index.c index 61d7d6d..b3479fb 100644 --- a/index.c +++ b/index.c @@ -47,7 +47,6 @@ struct __ti_iter_t { int tid, beg, end, n_off, i, finished; uint64_t curr_off; kstring_t str; - BGZF *fp; const ti_index_t *idx; pair64_t *off; }; @@ -686,12 +685,19 @@ int ti_index_build(const char *fn, const ti_conf_t *conf) * parse a region in the format chr:beg-end * ********************************************/ -int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end) +int ti_get_tid(const ti_index_t *idx, const char *name) +{ + khiter_t iter; + const khash_t(s) *h = idx->tname; + iter = kh_get(s, h, name); /* get the tid */ + if (iter == kh_end(h)) return -1; + return kh_value(h, iter); +} + +int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end) { char *s, *p; int i, l, k; - khiter_t iter; - khash_t(s) *h = idx->tname; l = strlen(str); p = s = (char*)malloc(l+1); /* squeeze out "," */ @@ -700,12 +706,10 @@ int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int s[k] = 0; for (i = 0; i != k; ++i) if (s[i] == ':') break; s[i] = 0; - iter = kh_get(s, h, s); /* get the tid */ - if (iter == kh_end(h)) { // name not found - *tid = -1; free(s); + if ((*tid = ti_get_tid(idx, s)) < 0) { + free(s); return -1; } - *tid = kh_value(h, iter); if (i == k) { /* dump the whole sequence */ *begin = 0; *end = 1<<29; free(s); return 0; @@ -731,6 +735,8 @@ int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) { int i = 0, k; + if (beg >= end) return 0; + if (end >= 1u<<29) end = 1u<<29; --end; list[i++] = 0; for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; @@ -741,15 +747,15 @@ static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) return i; } -ti_iter_t ti_first(BGZF *fp) +ti_iter_t ti_iter_first() { ti_iter_t iter; iter = calloc(1, sizeof(struct __ti_iter_t)); - iter->fp = fp; iter->from_first = 1; + iter->from_first = 1; return iter; } -ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end) +ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end) { uint16_t *bins; int i, n_bins, n_off; @@ -759,10 +765,11 @@ ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end) uint64_t min_off; ti_iter_t iter = 0; + if (beg < 0) beg = 0; + if (end < beg) return 0; // initialize the iterator iter = calloc(1, sizeof(struct __ti_iter_t)); - iter->fp = fp; iter->idx = idx; - iter->tid = tid; iter->beg = beg; iter->end = end; iter->i = -1; + iter->idx = idx; iter->tid = tid; iter->beg = beg; iter->end = end; iter->i = -1; // random access bins = (uint16_t*)calloc(MAX_BIN, 2); n_bins = reg2bins(beg, end, bins); @@ -818,12 +825,12 @@ ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end) return iter; } -const char *ti_iter_read(ti_iter_t iter, int *len) +const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len) { if (iter->finished) return 0; if (iter->from_first) { int ret; - if ((ret = ti_readline(iter->fp, &iter->str)) < 0) { + if ((ret = ti_readline(fp, &iter->str)) < 0) { iter->finished = 1; return 0; } else { @@ -838,14 +845,14 @@ const char *ti_iter_read(ti_iter_t iter, int *len) if (iter->i == iter->n_off - 1) break; // no more chunks if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek - bgzf_seek(iter->fp, iter->off[iter->i+1].u, SEEK_SET); - iter->curr_off = bgzf_tell(iter->fp); + bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET); + iter->curr_off = bgzf_tell(fp); } ++iter->i; } - if ((ret = ti_readline(iter->fp, &iter->str)) >= 0) { + if ((ret = ti_readline(fp, &iter->str)) >= 0) { ti_intv_t intv; - iter->curr_off = bgzf_tell(iter->fp); + iter->curr_off = bgzf_tell(fp); if (iter->str.s[0] == iter->idx->conf.meta_char) continue; get_intv((ti_index_t*)iter->idx, &iter->str, &intv); if (intv.tid != iter->tid || intv.beg >= iter->end) break; // no need to proceed @@ -872,9 +879,67 @@ int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *d ti_iter_t iter; const char *s; int len; - iter = ti_query(fp, idx, tid, beg, end); - while ((s = ti_iter_read(iter, &len)) != 0) + iter = ti_iter_query(idx, tid, beg, end); + while ((s = ti_iter_read(fp, iter, &len)) != 0) func(len, s, data); ti_iter_destroy(iter); return 0; } + +/******************* + * High-level APIs * + *******************/ + +tabix_t *ti_open(const char *fn, const char *fnidx) +{ + tabix_t *t; + BGZF *fp; + if ((fp = bgzf_open(fn, "r")) == 0) return 0; + t = calloc(1, sizeof(tabix_t)); + t->fn = strdup(fn); + if (fnidx) t->fnidx = strdup(fnidx); + t->fp = fp; + return t; +} + +void ti_close(tabix_t *t) +{ + if (t) { + bgzf_close(t->fp); + if (t->idx) ti_index_destroy(t->idx); + free(t->fn); free(t->fnidx); + free(t); + } +} + +int ti_lazy_index_load(tabix_t *t) +{ + if (t->idx == 0) { // load index + if (t->fnidx) t->idx = ti_index_load_local(t->fnidx); + else t->idx = ti_index_load(t->fn); + if (t->idx == 0) return -1; // fail to load index + } + return 0; +} + +ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end) +{ + if (tid < 0) return ti_iter_first(); + if (ti_lazy_index_load(t) != 0) return 0; + return ti_iter_query(t->idx, tid, beg, end); +} + +ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end) +{ + int tid; + if (name == 0) return ti_iter_first(); + // then need to load the index + if (ti_lazy_index_load(t) != 0) return 0; + if ((tid = ti_get_tid(t->idx, name)) < 0) return 0; + return ti_iter_query(t->idx, tid, beg, end); +} + +const char *ti_read(tabix_t *t, ti_iter_t iter, int *len) +{ + return ti_iter_read(t->fp, iter, len); +} diff --git a/main.c b/main.c index a05939d..a2a4565 100644 --- a/main.c +++ b/main.c @@ -84,9 +84,8 @@ int main(int argc, char *argv[]) return ti_index_build(argv[optind], &conf); } { // retrieve - BGZF *fp; - fp = bgzf_open(argv[optind], "r"); - if (fp == 0) { + tabix_t *t; + if ((t = ti_open(argv[optind], 0)) == 0) { fprintf(stderr, "[main] fail to open the data file.\n"); return 1; } @@ -94,36 +93,29 @@ int main(int argc, char *argv[]) ti_iter_t iter; const char *s; int len; - iter = ti_first(fp); - while ((s = ti_iter_read(iter, &len)) != 0) { + iter = ti_query(t, 0, 0, 0); + while ((s = ti_read(t, iter, &len)) != 0) { fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); } else { // retrieve from specified regions - ti_index_t *idx; int i; - idx = ti_index_load(argv[optind]); - if (idx == 0) { - bgzf_close(fp); - fprintf(stderr, "[main] fail to load the index.\n"); - return 1; - } + ti_lazy_index_load(t); for (i = optind + 1; i < argc; ++i) { int tid, beg, end; - if (ti_parse_region(idx, argv[i], &tid, &beg, &end) == 0) { + if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { ti_iter_t iter; const char *s; int len; - iter = ti_query(fp, idx, tid, beg, end); - while ((s = ti_iter_read(iter, &len)) != 0) { + iter = ti_queryi(t, tid, beg, end); + while ((s = ti_read(t, iter, &len)) != 0) { fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); } else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); } - ti_index_destroy(idx); } - bgzf_close(fp); + ti_close(t); } return 0; } diff --git a/perl/MANIFEST b/perl/MANIFEST new file mode 100644 index 0000000..877da96 --- /dev/null +++ b/perl/MANIFEST @@ -0,0 +1,8 @@ +MANIFEST +typemap +Tabix.xs +Tabix.pm +TabixIterator.pm +Makefile.PL +t/01local.t +t/02remote.t \ No newline at end of file diff --git a/perl/Makefile.PL b/perl/Makefile.PL new file mode 100644 index 0000000..3ea6e8d --- /dev/null +++ b/perl/Makefile.PL @@ -0,0 +1,8 @@ +use ExtUtils::MakeMaker; +WriteMakefile( + NAME => 'Tabix', + VERSION_FROM => 'Tabix.pm', + LIBS => ['-lz -L.. -ltabix'], + DEFINE => '-D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE', + INC => '-I..', + ); diff --git a/perl/Tabix.pm b/perl/Tabix.pm new file mode 100644 index 0000000..fd7165d --- /dev/null +++ b/perl/Tabix.pm @@ -0,0 +1,76 @@ +package Tabix; + +use strict; +use warnings; +use Carp qw/croak/; + +use TabixIterator; + +require Exporter; + +our @ISA = qw/Exporter/; +our @EXPORT = qw/tabix_open tabix_close tabix_read tabix_query tabix_getnames tabix_iter_free/; + +our $VERSION = '0.2.0'; + +require XSLoader; +XSLoader::load('Tabix', $VERSION); + +sub new { + my $invocant = shift; + my %args = @_; + $args{-data} || croak("-data argument required"); + my $class = ref($invocant) || $invocant; + my $self = {}; + bless($self, $class); + $self->open($args{-data}, $args{-index}); + return $self; +} + +sub open { + my ($self, $fn, $fnidx) = @_; + $self->close; + $self->{_fn} = $fn; + $self->{_fnidx} = $fnidx; + $self->{_} = $fnidx? tabix_open($fn, $fnidx) : tabix_open($fn); +} + +sub close { + my $self = shift; + if ($self->{_}) { + tabix_close($self->{_}); + delete($self->{_}); delete($self->{_fn}); delete($self->{_fnidx}); + } +} + +sub DESTROY { + my $self = shift; + $self->close; +} + +sub query { + my $self = shift; + my $iter; + if (@_) { + $iter = tabix_query($self->{_}, @_); + } else { + $iter = tabix_query($self->{_}); + } + my $i = TabixIterator->new; + $i->set($iter); + return $i; +} + +sub read { + my $self = shift; + my $iter = shift; + return tabix_read($self->{_}, $iter->get); +} + +sub getnames { + my $self = shift; + return tabix_getnames($self->{_}); +} + +1; +__END__ diff --git a/perl/Tabix.xs b/perl/Tabix.xs new file mode 100644 index 0000000..50dabb1 --- /dev/null +++ b/perl/Tabix.xs @@ -0,0 +1,71 @@ +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +#include +#include "tabix.h" + +MODULE = Tabix PACKAGE = Tabix + +tabix_t* +tabix_open(fn, fnidx=0) + char *fn + char *fnidx + CODE: + RETVAL = ti_open(fn, fnidx); + OUTPUT: + RETVAL + +void +tabix_close(t) + tabix_t *t + CODE: + ti_close(t); + +ti_iter_t +tabix_query(t, seq=0, beg=0, end=0x7fffffff) + tabix_t *t + const char *seq + int beg + int end + PREINIT: + CODE: + RETVAL = ti_query(t, seq, beg, end); + OUTPUT: + RETVAL + +SV* +tabix_read(t, iter) + tabix_t *t + ti_iter_t iter + PREINIT: + const char *s; + int len; + CODE: + s = ti_read(t, iter, &len); + if (s == 0) + return XSRETURN_EMPTY; + RETVAL = newSVpv(s, len); + OUTPUT: + RETVAL + +void +tabix_getnames(t) + tabix_t *t + PREINIT: + const char **names; + int i, n; + PPCODE: + ti_lazy_index_load(t); + names = ti_seqname(t->idx, &n); + for (i = 0; i < n; ++i) + XPUSHs(sv_2mortal(newSVpv(names[i], 0))); + free(names); + +MODULE = Tabix PACKAGE = TabixIterator + +void +tabix_iter_free(iter) + ti_iter_t iter + CODE: + ti_iter_destroy(iter); diff --git a/perl/TabixIterator.pm b/perl/TabixIterator.pm new file mode 100644 index 0000000..335194a --- /dev/null +++ b/perl/TabixIterator.pm @@ -0,0 +1,41 @@ +package TabixIterator; + +use strict; +use warnings; +use Carp qw/croak/; + +require Exporter; + +our @ISA = qw/Exporter/; +our @EXPORT = qw/tabix_iter_free/; + +our $VERSION = '0.2.0'; + +require XSLoader; +XSLoader::load('Tabix', $VERSION); + +sub new { + my $invocant = shift; + my $class = ref($invocant) || $invocant; + my $self = {}; + bless($self, $class); + return $self; +} + +sub set { + my ($self, $iter) = @_; + $self->{_} = $iter; +} + +sub get { + my $self = shift; + return $self->{_}; +} + +sub DESTROY { + my $self = shift; + tabix_iter_free($self->{_}) if ($self->{_}); +} + +1; +__END__ diff --git a/perl/t/01local.t b/perl/t/01local.t new file mode 100644 index 0000000..4eb6534 --- /dev/null +++ b/perl/t/01local.t @@ -0,0 +1,28 @@ +#-*-Perl-*- +use Test::More tests => 9; +BEGIN { use_ok('Tabix') }; + +{ # C-like low-level interface + my $t = tabix_open("../example.gtf.gz"); + ok($t); + my $iter = tabix_query($t, "chr1", 0, 2000); + ok($iter); + $_ = 0; + ++$_ while (tabix_read($t, $iter)); + is($_, 6); + tabix_iter_free($iter); + @_ = tabix_getnames($t); + is(scalar(@_), 2); +} + +{ # OOP high-level interface + my $t = Tabix->new(-data=>"../example.gtf.gz"); + ok($t); + my $iter = $t->query("chr1", 3000, 5000); + ok($iter); + $_ = 0; + ++$_ while ($t->read($iter)); + is($_, 27); + @_ = $t->getnames; + is($_[1], "chr2"); +} diff --git a/perl/t/02remote.t b/perl/t/02remote.t new file mode 100644 index 0000000..0668e8f --- /dev/null +++ b/perl/t/02remote.t @@ -0,0 +1,28 @@ +#-*-Perl-*- +use Test::More tests => 9; +BEGIN { use_ok('Tabix') }; + +{ # FTP access + my $t = Tabix->new(-data=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz"); + ok($t); + my $iter = $t->query("1", 1000000, 1100000); + ok($iter); + $_ = 0; + ++$_ while ($t->read($iter)); + is($_, 306); + @_ = $t->getnames; + is(scalar(@_), 22); +} + +{ # FTP access plus FTP index + my $t = Tabix->new(-data=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz", + -index=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz.tbi"); + ok($t); + my $iter = $t->query("19", 10000000, 10100000); + ok($iter); + $_ = 0; + ++$_ while ($t->read($iter)); + is($_, 268); + @_ = $t->getnames; + is(scalar(@_), 22); +} diff --git a/perl/typemap b/perl/typemap new file mode 100644 index 0000000..a312f99 --- /dev/null +++ b/perl/typemap @@ -0,0 +1,3 @@ +TYPEMAP +tabix_t* T_PTROBJ +ti_iter_t T_PTROBJ \ No newline at end of file diff --git a/tabix.1 b/tabix.1 index 2657829..3b82c76 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,4 +1,4 @@ -.TH tabix 1 "5 May 2010" "tabix-0.1.5" "Bioinformatics tools" +.TH tabix 1 "11 May 2010" "tabix-0.2.0" "Bioinformatics tools" .SH NAME .PP bgzip - Block compression/decompression utility diff --git a/tabix.h b/tabix.h index a002d8a..95a8491 100644 --- a/tabix.h +++ b/tabix.h @@ -46,6 +46,12 @@ typedef struct __ti_index_t ti_index_t; struct __ti_iter_t; typedef struct __ti_iter_t *ti_iter_t; +typedef struct { + BGZF *fp; + ti_index_t *idx; + char *fn, *fnidx; +} tabix_t; + typedef struct { int32_t preset; int32_t sc, bc, ec; // seq col., beg col. and end col. @@ -58,6 +64,31 @@ extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_ extern "C" { #endif + /******************* + * High-level APIs * + *******************/ + + tabix_t *ti_open(const char *fn, const char *fnidx); + int ti_lazy_index_load(tabix_t *t); + void ti_close(tabix_t *t); + ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end); + ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end); + const char *ti_read(tabix_t *t, ti_iter_t iter, int *len); + + /* Destroy the iterator */ + void ti_iter_destroy(ti_iter_t iter); + + /* Get the list of sequence names. Each "char*" pointer points to a + * internal member of the index, so DO NOT modify the returned + * pointer; otherwise the index will be corrupted. The returned + * pointer should be freed by a single free() call by the routine + * calling this function. The number of sequences is returned at *n. */ + const char **ti_seqname(const ti_index_t *idx, int *n); + + /****************** + * Low-level APIs * + ******************/ + /* Build the index for file . File .tbi will be generated * and overwrite the file of the same name. Return -1 on failure. */ int ti_index_build(const char *fn, const ti_conf_t *conf); @@ -67,32 +98,30 @@ extern "C" { * downloaded. Return NULL on failure. */ ti_index_t *ti_index_load(const char *fn); + ti_index_t *ti_index_load_local(const char *fnidx); + /* Destroy the index */ void ti_index_destroy(ti_index_t *idx); - /* Get the list of sequence names. Each "char*" pointer points to a - * internal member of the index, so DO NOT modify the returned - * pointer; otherwise the index will be corrupted. The returned - * pointer should be freed by a single free() call by the routine - * calling this function. The number of sequences is returned at *n. */ - const char **ti_seqname(const ti_index_t *idx, int *n); - /* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. */ - int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end); + int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end); + + int ti_get_tid(const ti_index_t *idx, const char *name); /* Get the iterator pointing to the first record at the current file * position. If the file is just openned, the iterator points to the * first record in the file. */ - ti_iter_t ti_first(BGZF *fp); + ti_iter_t ti_iter_first(void); /* Get the iterator pointing to the first record in region tid:beg-end */ - ti_iter_t ti_query(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end); + ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end); /* Get the data line pointed by the iterator and iterate to the next record. */ - const char *ti_iter_read(ti_iter_t iter, int *len); + const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len); - /* Destroy the iterator */ - void ti_iter_destroy(ti_iter_t iter); + /******************* + * Deprecated APIs * + *******************/ /* The callback version for random access */ int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func); -- 2.30.2