From 3be5ff47495762af7f2ebec145bc8f9c7674593d Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Thu, 9 Dec 2010 18:56:35 +0900 Subject: [PATCH] Imported Upstream version 0.2.3 --- ChangeLog | 136 +++++++++++++++++++++++++++++++++++++++++++++++ NEWS | 22 ++++++++ TabixReader.java | 5 +- bgzf.c | 26 +++++++++ bgzf.h | 7 +++ bgzip.c | 15 ++++-- index.c | 24 +++++++-- main.c | 39 +++++++++++--- 8 files changed, 254 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 907d72b..3594496 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,139 @@ +------------------------------------------------------------------------ +r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines +Changed paths: + M /trunk/tabix/ChangeLog + M /trunk/tabix/index.c + +Fixed a minor bug in generating index + +------------------------------------------------------------------------ +r855 | petulda | 2010-11-25 11:50:13 -0500 (Thu, 25 Nov 2010) | 1 line +Changed paths: + M /trunk/tabix/main.c + +Disable "unknown target name or minus interval" warning. +------------------------------------------------------------------------ +r775 | petulda | 2010-10-26 15:02:30 -0400 (Tue, 26 Oct 2010) | 1 line +Changed paths: + M /trunk/tabix/main.c + +Added -h option to print header lines +------------------------------------------------------------------------ +r742 | jmarshall | 2010-09-27 06:47:23 -0400 (Mon, 27 Sep 2010) | 2 lines +Changed paths: + M /trunk/tabix + +Add svn:ignore properties for intermediate and generated files. + +------------------------------------------------------------------------ +r725 | lh3lh3 | 2010-09-15 13:01:53 -0400 (Wed, 15 Sep 2010) | 2 lines +Changed paths: + M /trunk/tabix/bgzip.c + +patches by Peter Chines + +------------------------------------------------------------------------ +r714 | lh3lh3 | 2010-09-07 10:13:25 -0400 (Tue, 07 Sep 2010) | 2 lines +Changed paths: + M /trunk/tabix/TabixReader.java + M /trunk/tabix/index.c + M /trunk/tabix/main.c + +fixed a bug in C/Java when n_off == 0 + +------------------------------------------------------------------------ +r712 | lh3lh3 | 2010-09-03 09:21:23 -0400 (Fri, 03 Sep 2010) | 2 lines +Changed paths: + M /trunk/tabix/TabixReader.java + +fixed a bug in parsing region strings + +------------------------------------------------------------------------ +r700 | petulda | 2010-08-25 10:42:37 -0400 (Wed, 25 Aug 2010) | 1 line +Changed paths: + M /trunk/tabix/main.c + +Fix: Exit with an error rather than segfault when index is not present and region is queried +------------------------------------------------------------------------ +r696 | petulda | 2010-08-24 10:24:12 -0400 (Tue, 24 Aug 2010) | 1 line +Changed paths: + M /trunk/tabix/bgzf.c + M /trunk/tabix/bgzf.h + M /trunk/tabix/index.c + M /trunk/tabix/main.c + +Complain about not-bgzipped files and check for noncontinuous chromosome blocks +------------------------------------------------------------------------ +r603 | lh3lh3 | 2010-06-28 10:49:39 -0400 (Mon, 28 Jun 2010) | 2 lines +Changed paths: + M /trunk/tabix/NEWS + M /trunk/tabix/TabixReader.java + M /trunk/tabix/index.c + M /trunk/tabix/main.c + +Release tabix-0.2.2 + +------------------------------------------------------------------------ +r597 | lh3lh3 | 2010-06-13 21:08:29 -0400 (Sun, 13 Jun 2010) | 3 lines +Changed paths: + M /trunk/tabix/index.c + +Change the namespace of sorting, to avoid function name collision with samtools. + + +------------------------------------------------------------------------ +r582 | lh3lh3 | 2010-06-03 10:40:25 -0400 (Thu, 03 Jun 2010) | 2 lines +Changed paths: + M /trunk/tabix/NEWS + M /trunk/tabix/main.c + M /trunk/tabix/tabix.py + +Release tabix-0.2.1 + +------------------------------------------------------------------------ +r581 | lh3lh3 | 2010-05-24 14:24:24 -0400 (Mon, 24 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/tabix.py + +OOP interface with the help from Aaron Quinlan + +------------------------------------------------------------------------ +r580 | lh3lh3 | 2010-05-23 23:36:05 -0400 (Sun, 23 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/tabix.py + +minor change + +------------------------------------------------------------------------ +r579 | lh3lh3 | 2010-05-23 23:25:24 -0400 (Sun, 23 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/tabix.py + +For Snow Leopard compatibility + +------------------------------------------------------------------------ +r575 | lh3lh3 | 2010-05-12 19:31:27 -0400 (Wed, 12 May 2010) | 4 lines +Changed paths: + M /trunk/tabix/Makefile + M /trunk/tabix/index.c + M /trunk/tabix/tabix.h + A /trunk/tabix/tabix.py + + * optionally generate shared library for Mac and Linux + * added a python script that directly calls the shared library + * added a new API for easy python access + +------------------------------------------------------------------------ +r574 | lh3lh3 | 2010-05-11 12:14:27 -0400 (Tue, 11 May 2010) | 2 lines +Changed paths: + M /trunk/tabix/ChangeLog + M /trunk/tabix/NEWS + M /trunk/tabix/perl/Tabix.pm + M /trunk/tabix/perl/TabixIterator.pm + M /trunk/tabix/tabix.1 + +Release tabix-0.2.0 + ------------------------------------------------------------------------ r573 | lh3lh3 | 2010-05-11 12:08:30 -0400 (Tue, 11 May 2010) | 2 lines Changed paths: diff --git a/NEWS b/NEWS index a6f3c70..ff28ded 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,25 @@ +Beta Release 0.2.3 (8 December, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Fixed a minor bug where the first record in a headerless file may be + missed. + + * Added an option to print header lines. + + * Fixed a rare bug which may occasionally happen when retrieving data + from a region without any records. + + * Enhanced error reporting. + + * Fixed a bug in bgzip which may delete the original file even if not + intended. + +(0.2.3: 8 December 2010, r876) + + + Beta Release 0.2.2 (28 June, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/TabixReader.java b/TabixReader.java index 40d3951..5874202 100644 --- a/TabixReader.java +++ b/TabixReader.java @@ -217,7 +217,7 @@ public class TabixReader int[] ret = new int[3]; colon = reg.indexOf(':'); hyphen = reg.indexOf('-'); chr = colon >= 0? reg.substring(0, colon) : reg; - ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen)) - 1 : 0; + ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen >= 0? hyphen : reg.length())) - 1 : 0; ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff; ret[0] = chr2tid(chr); return ret; @@ -339,6 +339,7 @@ public class TabixReader for (int j = 0; j < chunks.length; ++j) if (less64(min_off, chunks[j].v)) off[n_off++] = new TPair64(chunks[j]); + if (n_off == 0) return null; Arrays.sort(off, 0, n_off); // resolve completely contained adjacent blocks for (i = 1, l = 0; i < n_off; ++i) { @@ -385,7 +386,7 @@ public class TabixReader System.out.println(s); } else { // a region is specified; random access TabixReader.Iterator iter = tr.query(args[1]); // get the iterator - while ((s = iter.next()) != null) + while (iter != null && (s = iter.next()) != null) System.out.println(s); } } catch (IOException e) { diff --git a/bgzf.c b/bgzf.c index 7a936a8..94e6194 100644 --- a/bgzf.c +++ b/bgzf.c @@ -111,6 +111,32 @@ report_error(BGZF* fp, const char* message) { fp->error = message; } +int is_bgzipped(const char *fn) +{ + BGZF *fp; + uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; + int n; + + if ((fp = bgzf_open(fn, "r")) == 0) + { + fprintf(stderr, "[is_bgzipped] failed to open the file: %s\n",fn); + return -1; + } + +#ifdef _USE_KNETFILE + n = knet_read(fp->x.fpr, buf, 10); +#else + n = fread(buf, 1, 10, fp->file); +#endif + bgzf_close(fp); + + if ( n!=10 ) + return -1; + + if ( !memcmp(magic, buf, 10) ) return 1; + return 0; +} + static BGZF *bgzf_read_init() { BGZF *fp; diff --git a/bgzf.h b/bgzf.h index f544a67..70f497e 100644 --- a/bgzf.h +++ b/bgzf.h @@ -62,6 +62,13 @@ typedef struct { extern "C" { #endif +/* + * Checks the magic string of the file. Returns 1 + * for bgzipped files, -1 on errors and 0 for files + * without the bgzip magic string. + */ +int is_bgzipped(const char *path); + /* * Open an existing file descriptor for reading or writing. * Mode must be either "r" or "w". diff --git a/bgzip.c b/bgzip.c index d144632..ebcafa2 100644 --- a/bgzip.c +++ b/bgzip.c @@ -138,7 +138,7 @@ int main(int argc, char **argv) if (bgzf_write(fp, buffer, c) < 0) fail(fp); // f_dst will be closed here if (bgzf_close(fp) < 0) fail(fp); - if (argc > optind) unlink(argv[optind]); + if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; @@ -166,10 +166,15 @@ int main(int argc, char **argv) return 1; } - name = strdup(argv[optind]); - name[strlen(name) - 3] = '\0'; - f_dst = write_open(name, is_forced); - free(name); + if (pstdout) { + f_dst = fileno(stdout); + } + else { + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) return bgzip_main_usage(); diff --git a/index.c b/index.c index 8eddcec..4ed32c2 100644 --- a/index.c +++ b/index.c @@ -240,7 +240,7 @@ static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t l->list[l->n].u = beg; l->list[l->n++].v = end; } -static inline void insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_t offset) +static inline uint64_t insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_t offset) { int i, beg, end; beg = _beg >> TAD_LIDX_SHIFT; @@ -259,6 +259,7 @@ static inline void insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_ if (index2->offset[i] == 0) index2->offset[i] = offset; } if (index2->n < end + 1) index2->n = end + 1; + return (uint64_t)beg<<32 | end; } static void merge_chunks(ti_index_t *idx) @@ -299,7 +300,7 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) ti_index_t *idx; uint32_t last_bin, save_bin; int32_t last_coor, last_tid, save_tid; - uint64_t save_off, last_off, lineno = 0; + uint64_t save_off, last_off, lineno = 0, offset0 = (uint64_t)-1, tmp; kstring_t *str; str = calloc(1, sizeof(kstring_t)); @@ -322,13 +323,19 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) } get_intv(idx, str, &intv); if (last_tid != intv.tid) { // change of chromosomes + if (last_tid>intv.tid ) + { + fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted?\n",(unsigned long long)lineno); + exit(1); + } last_tid = intv.tid; last_bin = 0xffffffffu; } else if (last_coor > intv.beg) { fprintf(stderr, "[ti_index_core] the file out of order at line %llu\n", (unsigned long long)lineno); exit(1); } - insert_offset2(&idx->index2[intv.tid], intv.beg, intv.end, last_off); + tmp = insert_offset2(&idx->index2[intv.tid], intv.beg, intv.end, last_off); + if (last_off == 0) offset0 = tmp; if (intv.bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); @@ -348,6 +355,10 @@ ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp)); merge_chunks(idx); fill_missing(idx); + if (offset0 != (uint64_t)-1 && idx->n && idx->index2[0].offset) { + int i, beg = offset0>>32, end = offset0&0xffffffffu; + for (i = beg; i <= end; ++i) idx->index2[0].offset[i] = 0; + } free(str->s); free(str); return idx; @@ -645,8 +656,8 @@ ti_index_t *ti_index_load(const char *fn) char *fname = get_local_version(fn); if (fname == 0) return 0; idx = ti_index_load_local(fname); + if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load the index: %s\n", fname); free(fname); - if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n"); return idx; } @@ -656,7 +667,7 @@ int ti_index_build2(const char *fn, const ti_conf_t *conf, const char *_fnidx) BGZF *fp, *fpidx; ti_index_t *idx; if ((fp = bgzf_open(fn, "r")) == 0) { - fprintf(stderr, "[ti_index_build2] fail to open the BAM file.\n"); + fprintf(stderr, "[ti_index_build2] fail to open the file: %s\n", fn); return -1; } idx = ti_index_core(fp, conf); @@ -803,6 +814,9 @@ ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end) if (p->list[j].v > min_off) off[n_off++] = p->list[j]; } } + if (n_off == 0) { + free(bins); free(off); return iter; + } free(bins); { int l; diff --git a/main.c b/main.c index 84e5cfc..8ae6b2e 100644 --- a/main.c +++ b/main.c @@ -6,13 +6,13 @@ #include "bgzf.h" #include "tabix.h" -#define PACKAGE_VERSION "0.2.2 (r603)" +#define PACKAGE_VERSION "0.2.3 (r876)" int main(int argc, char *argv[]) { - int c, skip = -1, meta = -1, list_chrms = 0, force = 0; + int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0; ti_conf_t conf = ti_conf_gff; - while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lf")) >= 0) { + while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhf")) >= 0) { switch (c) { case '0': conf.preset |= TI_FLAG_UCSC; break; case 'S': skip = atoi(optarg); break; @@ -32,6 +32,7 @@ int main(int argc, char *argv[]) case 'b': conf.bc = atoi(optarg); break; case 'e': conf.ec = atoi(optarg); break; case 'l': list_chrms = 1; break; + case 'h': print_header = 1; break; case 'f': force = 1; break; } } @@ -49,6 +50,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); fprintf(stderr, " -0 zero-based coordinate\n"); + fprintf(stderr, " -h print the VCF header\n"); fprintf(stderr, " -l list chromosome names\n"); fprintf(stderr, " -f force to overwrite the index\n"); fprintf(stderr, "\n"); @@ -81,6 +83,11 @@ int main(int argc, char *argv[]) } free(fnidx); } + if ( is_bgzipped(argv[optind])!=1 ) + { + fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]); + return 1; + } return ti_index_build(argv[optind], &conf); } { // retrieve @@ -100,19 +107,35 @@ int main(int argc, char *argv[]) ti_iter_destroy(iter); } else { // retrieve from specified regions int i; - ti_lazy_index_load(t); + if ( ti_lazy_index_load(t) ) + { + fprintf(stderr,"[tabix] failed to load the index file.\n"); + return 1; + } + + ti_iter_t iter; + const char *s; + int len; + if ( print_header ) + { + // If requested, print the header lines here + iter = ti_query(t, 0, 0, 0); + while ((s = ti_read(t, iter, &len)) != 0) { + if ( *s != '#' ) break; + fputs(s, stdout); fputc('\n', stdout); + } + ti_iter_destroy(iter); + } for (i = optind + 1; i < argc; ++i) { int tid, beg, end; if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { - ti_iter_t iter; - const char *s; - int len; iter = ti_queryi(t, tid, beg, end); while ((s = ti_read(t, iter, &len)) != 0) { fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); - } else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); + } + // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); } } ti_close(t); -- 2.30.2