11 #define PACKAGE_VERSION "0.2.5 (r1005)"
13 #define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
15 int reheader_file(const char *header, const char *file, int meta)
17 BGZF *fp = bgzf_open(file,"r");
18 if (bgzf_read_block(fp) != 0 || !fp->block_length)
21 char *buffer = fp->uncompressed_block;
24 if ( buffer[0]==meta )
31 if ( buffer[skip_until]=='\n' )
34 if ( skip_until>=fp->block_length )
36 if (bgzf_read_block(fp) != 0 || !fp->block_length)
40 // The header has finished
41 if ( buffer[skip_until]!=meta ) break;
44 if ( skip_until>=fp->block_length )
46 if (bgzf_read_block(fp) != 0 || !fp->block_length)
53 FILE *fh = fopen(header,"r");
55 error("%s: %s", header,strerror(errno));
56 int page_size = getpagesize();
57 char *buf = valloc(page_size);
58 BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
60 while ( (nread=fread(buf,1,page_size-1,fh))>0 )
62 if ( nread<page_size-1 && buf[nread-1]!='\n' )
64 if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
68 if ( fp->block_length - skip_until > 0 )
70 if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
71 error("Error: %d\n",fp->errcode);
73 if (bgzf_flush(bgzf_out) < 0)
74 error("Error: %d\n",bgzf_out->errcode);
79 nread = knet_read(fp->fp, buf, page_size);
81 nread = fread(buf, 1, page_size, fp->fp);
86 int count = fwrite(buf, 1, nread, bgzf_out->fp);
88 error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
91 if (bgzf_close(bgzf_out) < 0)
92 error("Error: %d\n",bgzf_out->errcode);
98 int main(int argc, char *argv[])
100 int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, print_only_header = 0, bed_reg = 0;
101 ti_conf_t conf = ti_conf_gff, *conf_ptr = NULL;
102 const char *reheader = NULL;
103 while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhHfBr:")) >= 0) {
105 case 'B': bed_reg = 1; break;
106 case '0': conf.preset |= TI_FLAG_UCSC; break;
107 case 'S': skip = atoi(optarg); break;
108 case 'c': meta = optarg[0]; break;
110 if (strcmp(optarg, "gff") == 0) conf_ptr = &ti_conf_gff;
111 else if (strcmp(optarg, "bed") == 0) conf_ptr = &ti_conf_bed;
112 else if (strcmp(optarg, "sam") == 0) conf_ptr = &ti_conf_sam;
113 else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf_ptr = &ti_conf_vcf;
114 else if (strcmp(optarg, "psltbl") == 0) conf_ptr = &ti_conf_psltbl;
116 fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg);
120 case 's': conf.sc = atoi(optarg); break;
121 case 'b': conf.bc = atoi(optarg); break;
122 case 'e': conf.ec = atoi(optarg); break;
123 case 'l': list_chrms = 1; break;
124 case 'h': print_header = 1; break;
125 case 'H': print_only_header = 1; break;
126 case 'f': force = 1; break;
127 case 'r': reheader = optarg; break;
130 if (optind == argc) {
131 fprintf(stderr, "\n");
132 fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n");
133 fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
134 fprintf(stderr, "Usage: tabix <in.tab.bgz> [region1 [region2 [...]]]\n\n");
135 fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n");
136 fprintf(stderr, " -s INT sequence name column [1]\n");
137 fprintf(stderr, " -b INT start column [4]\n");
138 fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n");
139 fprintf(stderr, " -S INT skip first INT lines [0]\n");
140 fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n");
141 fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n");
142 fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n");
143 fprintf(stderr, " -0 zero-based coordinate\n");
144 fprintf(stderr, " -h print also the header lines\n");
145 fprintf(stderr, " -H print only the header lines\n");
146 fprintf(stderr, " -l list chromosome names\n");
147 fprintf(stderr, " -f force to overwrite the index\n");
148 fprintf(stderr, "\n");
153 int l = strlen(argv[optind]);
154 int strcasecmp(const char *s1, const char *s2);
155 if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &ti_conf_gff;
156 else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &ti_conf_bed;
157 else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &ti_conf_sam;
158 else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &ti_conf_vcf;
159 else if (l>=10 && strcasecmp(argv[optind]+l-10, ".psltbl.gz") == 0) conf_ptr = &ti_conf_psltbl;
164 if (skip >= 0) conf.line_skip = skip;
165 if (meta >= 0) conf.meta_char = meta;
170 idx = ti_index_load(argv[optind]);
172 fprintf(stderr, "[main] fail to load the index file.\n");
175 names = ti_seqname(idx, &n);
176 for (i = 0; i < n; ++i) printf("%s\n", names[i]);
178 ti_index_destroy(idx);
182 return reheader_file(reheader,argv[optind],conf.meta_char);
184 struct stat stat_tbi,stat_vcf;
185 char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
186 strcat(strcpy(fnidx, argv[optind]), ".tbi");
188 if (optind + 1 == argc && !print_only_header) {
190 if (stat(fnidx, &stat_tbi) == 0)
192 // Before complaining, check if the VCF file isn't newer. This is a common source of errors,
193 // people tend not to notice that tabix failed
194 stat(argv[optind], &stat_vcf);
195 if ( stat_vcf.st_mtime <= stat_tbi.st_mtime )
197 fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
203 if ( bgzf_is_bgzf(argv[optind])!=1 )
205 fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
211 // Building the index but the file type was neither recognised nor given. If no custom change
212 // has been made, warn the user that GFF is used
213 if ( conf.preset==ti_conf_gff.preset
214 && conf.sc==ti_conf_gff.sc
215 && conf.bc==ti_conf_gff.bc
216 && conf.ec==ti_conf_gff.ec
217 && conf.meta_char==ti_conf_gff.meta_char
218 && conf.line_skip==ti_conf_gff.line_skip )
219 fprintf(stderr,"[tabix] The file type not recognised and -p not given, using the preset [gff].\n");
221 return ti_index_build(argv[optind], &conf);
225 // On some systems, stat on non-existent files returns undefined value for sm_mtime, the user had to use -f
226 int is_remote = (strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx) ? 1 : 0;
229 // Common source of errors: new VCF is used with an old index
230 stat(fnidx, &stat_tbi);
231 stat(argv[optind], &stat_vcf);
232 if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
234 fprintf(stderr, "[tabix] the index file either does not exist or is older than the vcf file. Please reindex.\n");
241 if ((t = ti_open(argv[optind], 0)) == 0) {
242 fprintf(stderr, "[main] fail to open the data file.\n");
245 if ( print_only_header )
250 if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
251 fprintf(stderr,"[tabix] failed to load the index file.\n");
254 const ti_conf_t *idxconf = ti_get_conf(t->idx);
255 iter = ti_query(t, 0, 0, 0);
256 while ((s = ti_read(t, iter, &len)) != 0) {
257 if ((int)(*s) != idxconf->meta_char) break;
258 fputs(s, stdout); fputc('\n', stdout);
260 ti_iter_destroy(iter);
264 if (strcmp(argv[optind+1], ".") == 0) { // retrieve all
268 iter = ti_query(t, 0, 0, 0);
269 while ((s = ti_read(t, iter, &len)) != 0) {
270 fputs(s, stdout); fputc('\n', stdout);
272 ti_iter_destroy(iter);
273 } else { // retrieve from specified regions
277 const ti_conf_t *idxconf;
279 if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
280 fprintf(stderr,"[tabix] failed to load the index file.\n");
283 idxconf = ti_get_conf(t->idx);
287 // If requested, print the header lines here
288 iter = ti_query(t, 0, 0, 0);
289 while ((s = ti_read(t, iter, &len)) != 0) {
290 if ((int)(*s) != idxconf->meta_char) break;
291 fputs(s, stdout); fputc('\n', stdout);
293 ti_iter_destroy(iter);
296 extern int bed_overlap(const void *_h, const char *chr, int beg, int end);
297 extern void *bed_read(const char *fn);
298 extern void bed_destroy(void *_h);
300 const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available
301 void *bed = bed_read(argv[optind+1]); // load the BED file
305 fprintf(stderr, "[main] fail to read the BED file.\n");
308 iter = ti_query(t, 0, 0, 0);
309 while ((s = ti_read(t, iter, &len)) != 0) {
311 ti_get_intv(conf_, len, (char*)s, &intv);
312 c = *intv.se; *intv.se = '\0';
313 if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) {
319 ti_iter_destroy(iter);
322 for (i = optind + 1; i < argc; ++i) {
324 if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
325 iter = ti_queryi(t, tid, beg, end);
326 while ((s = ti_read(t, iter, &len)) != 0) {
327 fputs(s, stdout); fputc('\n', stdout);
329 ti_iter_destroy(iter);
331 // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");