#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
+#include <sys/stat.h>
+#include <errno.h>
#include "bgzf.h"
#include "tabix.h"
-#define PACKAGE_VERSION "0.1.2 (r543)"
+#define PACKAGE_VERSION "0.2.5 (r964)"
-static int fetch_func(int l, const char *s, void *data)
+#define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
+
+int reheader_file(const char *header, const char *file, int meta)
{
- printf("%s\n", s);
- return 0;
+ BGZF *fp = bgzf_open(file,"r");
+ if (bgzf_read_block(fp) != 0 || !fp->block_length)
+ return -1;
+
+ char *buffer = fp->uncompressed_block;
+ int skip_until = 0;
+
+ if ( buffer[0]==meta )
+ {
+ skip_until = 1;
+
+ // Skip the header
+ while (1)
+ {
+ if ( buffer[skip_until]=='\n' )
+ {
+ skip_until++;
+ if ( skip_until>=fp->block_length )
+ {
+ if (bgzf_read_block(fp) != 0 || !fp->block_length)
+ error("no body?\n");
+ skip_until = 0;
+ }
+ // The header has finished
+ if ( buffer[skip_until]!=meta ) break;
+ }
+ skip_until++;
+ if ( skip_until>=fp->block_length )
+ {
+ if (bgzf_read_block(fp) != 0 || !fp->block_length)
+ error("no body?\n");
+ skip_until = 0;
+ }
+ }
+ }
+
+ FILE *fh = fopen(header,"r");
+ if ( !fh )
+ error("%s: %s", header,strerror(errno));
+ int page_size = getpagesize();
+ char *buf = valloc(page_size);
+ BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w");
+ ssize_t nread;
+ while ( (nread=fread(buf,1,page_size-1,fh))>0 )
+ {
+ if ( nread<page_size-1 && buf[nread-1]!='\n' )
+ buf[nread++] = '\n';
+ if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error);
+ }
+ fclose(fh);
+
+ if ( fp->block_length - skip_until > 0 )
+ {
+ if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
+ error("Error: %s\n",fp->error);
+ }
+ if (bgzf_flush(bgzf_out) < 0)
+ error("Error: %s\n",bgzf_out->error);
+
+ while (1)
+ {
+#ifdef _USE_KNETFILE
+ nread = knet_read(fp->x.fpr, buf, page_size);
+#else
+ nread = fread(buf, 1, page_size, fp->file);
+#endif
+ if ( nread<=0 )
+ break;
+
+#ifdef _USE_KNETFILE
+ int count = fwrite(buf, 1, nread, bgzf_out->x.fpw);
+#else
+ int count = fwrite(buf, 1, nread, bgzf_out->file);
+#endif
+ if (count != nread)
+ error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
+ }
+
+ if (bgzf_close(bgzf_out) < 0)
+ error("Error: %s\n",bgzf_out->error);
+
+ return 0;
}
+
int main(int argc, char *argv[])
{
- int c, skip = -1, meta = -1;
+ int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0;
ti_conf_t conf = ti_conf_gff;
- while ((c = getopt(argc, argv, "p:s:b:e:0S:c:")) >= 0) {
+ const char *reheader = NULL;
+ while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) {
switch (c) {
+ case 'B': bed_reg = 1; break;
case '0': conf.preset |= TI_FLAG_UCSC; break;
case 'S': skip = atoi(optarg); break;
case 'c': meta = optarg[0]; break;
if (strcmp(optarg, "gff") == 0) conf = ti_conf_gff;
else if (strcmp(optarg, "bed") == 0) conf = ti_conf_bed;
else if (strcmp(optarg, "sam") == 0) conf = ti_conf_sam;
- else if (strcmp(optarg, "vcf") == 0) conf = ti_conf_vcf;
+ else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf = ti_conf_vcf;
else if (strcmp(optarg, "psltbl") == 0) conf = ti_conf_psltbl;
else {
fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg);
case 's': conf.sc = atoi(optarg); break;
case 'b': conf.bc = atoi(optarg); break;
case 'e': conf.ec = atoi(optarg); break;
+ case 'l': list_chrms = 1; break;
+ case 'h': print_header = 1; break;
+ case 'f': force = 1; break;
+ case 'r': reheader = optarg; break;
}
}
if (skip >= 0) conf.line_skip = skip;
fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n");
fprintf(stderr, " -s INT sequence name column [1]\n");
fprintf(stderr, " -b INT start column [4]\n");
- fprintf(stderr, " -e INT end column [5]\n");
+ fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n");
fprintf(stderr, " -S INT skip first INT lines [0]\n");
fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n");
+ fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n");
+ fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n");
fprintf(stderr, " -0 zero-based coordinate\n");
+ fprintf(stderr, " -h print the header lines\n");
+ fprintf(stderr, " -l list chromosome names\n");
+ fprintf(stderr, " -f force to overwrite the index\n");
fprintf(stderr, "\n");
return 1;
}
- if (optind + 1 == argc)
+ if (list_chrms) {
+ ti_index_t *idx;
+ int i, n;
+ const char **names;
+ idx = ti_index_load(argv[optind]);
+ if (idx == 0) {
+ fprintf(stderr, "[main] fail to load the index file.\n");
+ return 1;
+ }
+ names = ti_seqname(idx, &n);
+ for (i = 0; i < n; ++i) printf("%s\n", names[i]);
+ free(names);
+ ti_index_destroy(idx);
+ return 0;
+ }
+ if (reheader)
+ return reheader_file(reheader,argv[optind],conf.meta_char);
+
+ struct stat stat_tbi,stat_vcf;
+ char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
+ strcat(strcpy(fnidx, argv[optind]), ".tbi");
+
+ if (optind + 1 == argc) {
+ if (force == 0) {
+ if (stat(fnidx, &stat_tbi) == 0)
+ {
+ // Before complaining, check if the VCF file isn't newer. This is a common source of errors,
+ // people tend not to notice that tabix failed
+ stat(argv[optind], &stat_vcf);
+ if ( stat_vcf.st_mtime <= stat_tbi.st_mtime )
+ {
+ fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
+ free(fnidx);
+ return 1;
+ }
+ }
+ }
+ if ( bgzf_check_bgzf(argv[optind])!=1 )
+ {
+ fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
+ free(fnidx);
+ return 1;
+ }
return ti_index_build(argv[optind], &conf);
+ }
{ // retrieve
- BGZF *fp;
- fp = bgzf_open(argv[optind], "r");
- if (fp == 0) {
+ tabix_t *t;
+ // Common source of errors: new VCF is used with an old index
+ stat(fnidx, &stat_tbi);
+ stat(argv[optind], &stat_vcf);
+ if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
+ {
+ fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n");
+ free(fnidx);
+ return 1;
+ }
+ free(fnidx);
+
+ if ((t = ti_open(argv[optind], 0)) == 0) {
fprintf(stderr, "[main] fail to open the data file.\n");
return 1;
}
if (strcmp(argv[optind+1], ".") == 0) { // retrieve all
- kstring_t *str = calloc(1, sizeof(kstring_t));
- while (ti_readline(fp, str) >= 0) { // FIXME: check return code for error
- fputs(str->s, stdout); fputc('\n', stdout);
+ ti_iter_t iter;
+ const char *s;
+ int len;
+ iter = ti_query(t, 0, 0, 0);
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ fputs(s, stdout); fputc('\n', stdout);
}
- free(str->s); free(str);
+ ti_iter_destroy(iter);
} else { // retrieve from specified regions
- ti_index_t *idx;
- int i;
- idx = ti_index_load(argv[optind]);
- if (idx == 0) {
- bgzf_close(fp);
- fprintf(stderr, "[main] fail to load the index.\n");
- return 1;
- }
- for (i = optind + 1; i < argc; ++i) {
- int tid, beg, end;
- if (ti_parse_region(idx, argv[i], &tid, &beg, &end) == 0) {
- ti_fetch(fp, idx, tid, beg, end, 0, fetch_func);
- } else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
+ int i, len;
+ ti_iter_t iter;
+ const char *s;
+ const ti_conf_t *idxconf;
+
+ if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
+ fprintf(stderr,"[tabix] failed to load the index file.\n");
+ return 1;
+ }
+ idxconf = ti_get_conf(t->idx);
+
+ if ( print_header )
+ {
+ // If requested, print the header lines here
+ iter = ti_query(t, 0, 0, 0);
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ if ((int)(*s) != idxconf->meta_char) break;
+ fputs(s, stdout); fputc('\n', stdout);
+ }
+ ti_iter_destroy(iter);
+ }
+ if (bed_reg) {
+ extern int bed_overlap(const void *_h, const char *chr, int beg, int end);
+ extern void *bed_read(const char *fn);
+ extern void bed_destroy(void *_h);
+
+ const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available
+ void *bed = bed_read(argv[optind+1]); // load the BED file
+ ti_interval_t intv;
+
+ if (bed == 0) {
+ fprintf(stderr, "[main] fail to read the BED file.\n");
+ return 1;
+ }
+ iter = ti_query(t, 0, 0, 0);
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ int c;
+ ti_get_intv(conf_, len, (char*)s, &intv);
+ c = *intv.se; *intv.se = '\0';
+ if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) {
+ *intv.se = c;
+ puts(s);
+ }
+ *intv.se = c;
+ }
+ ti_iter_destroy(iter);
+ bed_destroy(bed);
+ } else {
+ for (i = optind + 1; i < argc; ++i) {
+ int tid, beg, end;
+ if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
+ iter = ti_queryi(t, tid, beg, end);
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ fputs(s, stdout); fputc('\n', stdout);
+ }
+ ti_iter_destroy(iter);
+ }
+ // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
+ }
}
- ti_index_destroy(idx);
}
- bgzf_close(fp);
+ ti_close(t);
}
return 0;
}