X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=tabix.git;a=blobdiff_plain;f=main.c;h=46642510d6dfdccf0ab5a628826f1fe9844ae310;hp=8ae6b2e69d78c0b7251635041344e90e02782c85;hb=2112b1dba17e822d8028151605b4cdc01f0a4c3d;hpb=3be5ff47495762af7f2ebec145bc8f9c7674593d diff --git a/main.c b/main.c index 8ae6b2e..4664251 100644 --- a/main.c +++ b/main.c @@ -3,17 +3,109 @@ #include #include #include +#include #include "bgzf.h" #include "tabix.h" -#define PACKAGE_VERSION "0.2.3 (r876)" +#define PACKAGE_VERSION "0.2.4 (r949)" + +#define error(...) { fprintf(stderr,__VA_ARGS__); return -1; } + +int reheader_file(const char *header, const char *file, int meta) +{ + BGZF *fp = bgzf_open(file,"r"); + if (bgzf_read_block(fp) != 0 || !fp->block_length) + return -1; + + char *buffer = fp->uncompressed_block; + int skip_until = 0; + + if ( buffer[0]==meta ) + { + skip_until = 1; + + // Skip the header + while (1) + { + if ( buffer[skip_until]=='\n' ) + { + skip_until++; + if ( skip_until>=fp->block_length ) + { + if (bgzf_read_block(fp) != 0 || !fp->block_length) + error("no body?\n"); + skip_until = 0; + } + // The header has finished + if ( buffer[skip_until]!=meta ) break; + } + skip_until++; + if ( skip_until>=fp->block_length ) + { + if (bgzf_read_block(fp) != 0 || !fp->block_length) + error("no body?\n"); + skip_until = 0; + } + } + } + + FILE *fh = fopen(header,"r"); + if ( !fh ) + error("%s: %s", header,strerror(errno)); + int page_size = getpagesize(); + char *buf = valloc(page_size); + BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w"); + ssize_t nread; + while ( (nread=fread(buf,1,page_size-1,fh))>0 ) + { + if ( nreaderror); + } + fclose(fh); + + if ( fp->block_length - skip_until > 0 ) + { + if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) + error("Error: %s\n",fp->error); + } + if (bgzf_flush(bgzf_out) < 0) + error("Error: %s\n",bgzf_out->error); + + while (1) + { +#ifdef _USE_KNETFILE + nread = knet_read(fp->x.fpr, buf, page_size); +#else + nread = fread(buf, 1, page_size, fp->file); +#endif + if ( nread<=0 ) + break; + +#ifdef _USE_KNETFILE + int count = fwrite(buf, 1, nread, bgzf_out->x.fpw); +#else + int count = fwrite(buf, 1, nread, bgzf_out->file); +#endif + if (count != nread) + error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); + } + + if (bgzf_close(bgzf_out) < 0) + error("Error: %s\n",bgzf_out->error); + + return 0; +} + int main(int argc, char *argv[]) { - int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0; + int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0; ti_conf_t conf = ti_conf_gff; - while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhf")) >= 0) { + const char *reheader = NULL; + while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) { switch (c) { + case 'B': bed_reg = 1; break; case '0': conf.preset |= TI_FLAG_UCSC; break; case 'S': skip = atoi(optarg); break; case 'c': meta = optarg[0]; break; @@ -34,6 +126,7 @@ int main(int argc, char *argv[]) case 'l': list_chrms = 1; break; case 'h': print_header = 1; break; case 'f': force = 1; break; + case 'r': reheader = optarg; break; } } if (skip >= 0) conf.line_skip = skip; @@ -46,11 +139,13 @@ int main(int argc, char *argv[]) fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n"); fprintf(stderr, " -s INT sequence name column [1]\n"); fprintf(stderr, " -b INT start column [4]\n"); - fprintf(stderr, " -e INT end column [5]\n"); + fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); + fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n"); + fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n"); fprintf(stderr, " -0 zero-based coordinate\n"); - fprintf(stderr, " -h print the VCF header\n"); + fprintf(stderr, " -h print the header lines\n"); fprintf(stderr, " -l list chromosome names\n"); fprintf(stderr, " -f force to overwrite the index\n"); fprintf(stderr, "\n"); @@ -71,27 +166,49 @@ int main(int argc, char *argv[]) ti_index_destroy(idx); return 0; } + if (reheader) + return reheader_file(reheader,argv[optind],conf.meta_char); + + struct stat stat_tbi,stat_vcf; + char *fnidx = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(fnidx, argv[optind]), ".tbi"); + if (optind + 1 == argc) { if (force == 0) { - struct stat buf; - char *fnidx = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(fnidx, argv[optind]), ".tbi"); - if (stat(fnidx, &buf) == 0) { - fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n"); - free(fnidx); - return 1; + if (stat(fnidx, &stat_tbi) == 0) + { + // Before complaining, check if the VCF file isn't newer. This is a common source of errors, + // people tend not to notice that tabix failed + stat(argv[optind], &stat_vcf); + if ( stat_vcf.st_mtime <= stat_tbi.st_mtime ) + { + fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n"); + free(fnidx); + return 1; + } } - free(fnidx); } - if ( is_bgzipped(argv[optind])!=1 ) + if ( bgzf_check_bgzf(argv[optind])!=1 ) { fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]); + free(fnidx); return 1; } return ti_index_build(argv[optind], &conf); } { // retrieve tabix_t *t; + // Common source of errors: new VCF is used with an old index + stat(fnidx, &stat_tbi); + stat(argv[optind], &stat_vcf); + if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime ) + { + fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n"); + free(fnidx); + return 1; + } + free(fnidx); + if ((t = ti_open(argv[optind], 0)) == 0) { fprintf(stderr, "[main] fail to open the data file.\n"); return 1; @@ -106,36 +223,65 @@ int main(int argc, char *argv[]) } ti_iter_destroy(iter); } else { // retrieve from specified regions - int i; - if ( ti_lazy_index_load(t) ) - { + int i, len; + ti_iter_t iter; + const char *s; + const ti_conf_t *idxconf; + + if (ti_lazy_index_load(t) < 0 && bed_reg == 0) { fprintf(stderr,"[tabix] failed to load the index file.\n"); return 1; } + idxconf = ti_get_conf(t->idx); - ti_iter_t iter; - const char *s; - int len; if ( print_header ) { // If requested, print the header lines here iter = ti_query(t, 0, 0, 0); while ((s = ti_read(t, iter, &len)) != 0) { - if ( *s != '#' ) break; + if ((int)(*s) != idxconf->meta_char) break; fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); } - for (i = optind + 1; i < argc; ++i) { - int tid, beg, end; - if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { - iter = ti_queryi(t, tid, beg, end); - while ((s = ti_read(t, iter, &len)) != 0) { - fputs(s, stdout); fputc('\n', stdout); + if (bed_reg) { + extern int bed_overlap(const void *_h, const char *chr, int beg, int end); + extern void *bed_read(const char *fn); + extern void bed_destroy(void *_h); + + const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available + void *bed = bed_read(argv[optind+1]); // load the BED file + ti_interval_t intv; + + if (bed == 0) { + fprintf(stderr, "[main] fail to read the BED file.\n"); + return 1; + } + iter = ti_query(t, 0, 0, 0); + while ((s = ti_read(t, iter, &len)) != 0) { + int c; + ti_get_intv(conf_, len, (char*)s, &intv); + c = *intv.se; *intv.se = '\0'; + if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) { + *intv.se = c; + puts(s); } - ti_iter_destroy(iter); - } - // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); + *intv.se = c; + } + ti_iter_destroy(iter); + bed_destroy(bed); + } else { + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { + iter = ti_queryi(t, tid, beg, end); + while ((s = ti_read(t, iter, &len)) != 0) { + fputs(s, stdout); fputc('\n', stdout); + } + ti_iter_destroy(iter); + } + // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); + } } } ti_close(t);