Imported Upstream version 0.2.5
[tabix.git] / main.c
1 #include <string.h>
2 #include <unistd.h>
3 #include <stdlib.h>
4 #include <stdio.h>
5 #include <sys/stat.h>
6 #include <errno.h>
7 #include "bgzf.h"
8 #include "tabix.h"
9
10 #define PACKAGE_VERSION "0.2.5 (r964)"
11
12 #define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
13
14 int reheader_file(const char *header, const char *file, int meta)
15 {
16     BGZF *fp = bgzf_open(file,"r");
17     if (bgzf_read_block(fp) != 0 || !fp->block_length)
18         return -1;
19     
20     char *buffer = fp->uncompressed_block;
21     int skip_until = 0;
22
23     if ( buffer[0]==meta )
24     {
25         skip_until = 1;
26
27         // Skip the header
28         while (1)
29         {
30             if ( buffer[skip_until]=='\n' )
31             {
32                 skip_until++;
33                 if ( skip_until>=fp->block_length )
34                 {
35                     if (bgzf_read_block(fp) != 0 || !fp->block_length)
36                         error("no body?\n");
37                     skip_until = 0;
38                 }
39                 // The header has finished
40                 if ( buffer[skip_until]!=meta ) break;
41             }
42             skip_until++;
43             if ( skip_until>=fp->block_length )
44             {
45                 if (bgzf_read_block(fp) != 0 || !fp->block_length)
46                     error("no body?\n");
47                 skip_until = 0;
48             }
49         }
50     }
51
52     FILE *fh = fopen(header,"r");
53     if ( !fh )
54         error("%s: %s", header,strerror(errno));
55     int page_size = getpagesize();
56     char *buf = valloc(page_size);
57     BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w");
58     ssize_t nread;
59     while ( (nread=fread(buf,1,page_size-1,fh))>0 )
60     {
61         if ( nread<page_size-1 && buf[nread-1]!='\n' )
62             buf[nread++] = '\n';
63         if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error);
64     }
65     fclose(fh);
66
67     if ( fp->block_length - skip_until > 0 )
68     {
69         if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) 
70             error("Error: %s\n",fp->error);
71     }
72     if (bgzf_flush(bgzf_out) < 0) 
73         error("Error: %s\n",bgzf_out->error);
74
75     while (1)
76     {
77 #ifdef _USE_KNETFILE
78         nread = knet_read(fp->x.fpr, buf, page_size);
79 #else
80         nread = fread(buf, 1, page_size, fp->file);
81 #endif
82         if ( nread<=0 ) 
83             break;
84
85 #ifdef _USE_KNETFILE
86         int count = fwrite(buf, 1, nread, bgzf_out->x.fpw);
87 #else
88         int count = fwrite(buf, 1, nread, bgzf_out->file);
89 #endif
90         if (count != nread)
91             error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
92     }
93
94     if (bgzf_close(bgzf_out) < 0) 
95         error("Error: %s\n",bgzf_out->error);
96    
97     return 0;
98 }
99
100
101 int main(int argc, char *argv[])
102 {
103         int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0;
104         ti_conf_t conf = ti_conf_gff;
105     const char *reheader = NULL;
106         while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) {
107                 switch (c) {
108                 case 'B': bed_reg = 1; break;
109                 case '0': conf.preset |= TI_FLAG_UCSC; break;
110                 case 'S': skip = atoi(optarg); break;
111                 case 'c': meta = optarg[0]; break;
112                 case 'p':
113                         if (strcmp(optarg, "gff") == 0) conf = ti_conf_gff;
114                         else if (strcmp(optarg, "bed") == 0) conf = ti_conf_bed;
115                         else if (strcmp(optarg, "sam") == 0) conf = ti_conf_sam;
116                         else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf = ti_conf_vcf;
117                         else if (strcmp(optarg, "psltbl") == 0) conf = ti_conf_psltbl;
118                         else {
119                                 fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg);
120                                 return 1;
121                         }
122                         break;
123                 case 's': conf.sc = atoi(optarg); break;
124                 case 'b': conf.bc = atoi(optarg); break;
125                 case 'e': conf.ec = atoi(optarg); break;
126         case 'l': list_chrms = 1; break;
127         case 'h': print_header = 1; break;
128                 case 'f': force = 1; break;
129         case 'r': reheader = optarg; break;
130                 }
131         }
132         if (skip >= 0) conf.line_skip = skip;
133         if (meta >= 0) conf.meta_char = meta;
134         if (optind == argc) {
135                 fprintf(stderr, "\n");
136                 fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n");
137                 fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
138                 fprintf(stderr, "Usage:   tabix <in.tab.bgz> [region1 [region2 [...]]]\n\n");
139                 fprintf(stderr, "Options: -p STR     preset: gff, bed, sam, vcf, psltbl [gff]\n");
140                 fprintf(stderr, "         -s INT     sequence name column [1]\n");
141                 fprintf(stderr, "         -b INT     start column [4]\n");
142                 fprintf(stderr, "         -e INT     end column; can be identical to '-b' [5]\n");
143                 fprintf(stderr, "         -S INT     skip first INT lines [0]\n");
144                 fprintf(stderr, "         -c CHAR    symbol for comment/meta lines [#]\n");
145             fprintf(stderr, "         -r FILE    replace the header with the content of FILE [null]\n");
146                 fprintf(stderr, "         -B         region1 is a BED file (entire file will be read)\n");
147                 fprintf(stderr, "         -0         zero-based coordinate\n");
148                 fprintf(stderr, "         -h         print the header lines\n");
149                 fprintf(stderr, "         -l         list chromosome names\n");
150                 fprintf(stderr, "         -f         force to overwrite the index\n");
151                 fprintf(stderr, "\n");
152                 return 1;
153         }
154     if (list_chrms) {
155                 ti_index_t *idx;
156                 int i, n;
157                 const char **names;
158                 idx = ti_index_load(argv[optind]);
159                 if (idx == 0) {
160                         fprintf(stderr, "[main] fail to load the index file.\n");
161                         return 1;
162                 }
163                 names = ti_seqname(idx, &n);
164                 for (i = 0; i < n; ++i) printf("%s\n", names[i]);
165                 free(names);
166                 ti_index_destroy(idx);
167                 return 0;
168         }
169     if (reheader)
170         return reheader_file(reheader,argv[optind],conf.meta_char);
171
172         struct stat stat_tbi,stat_vcf;
173     char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
174         strcat(strcpy(fnidx, argv[optind]), ".tbi");
175
176         if (optind + 1 == argc) {
177                 if (force == 0) {
178                         if (stat(fnidx, &stat_tbi) == 0) 
179             {
180                 // Before complaining, check if the VCF file isn't newer. This is a common source of errors,
181                 //  people tend not to notice that tabix failed
182                 stat(argv[optind], &stat_vcf);
183                 if ( stat_vcf.st_mtime <= stat_tbi.st_mtime )
184                 {
185                     fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
186                     free(fnidx);
187                     return 1;
188                 }
189                         }
190                 }
191         if ( bgzf_check_bgzf(argv[optind])!=1 )
192         {
193             fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
194             free(fnidx);
195             return 1;
196         }
197                 return ti_index_build(argv[optind], &conf);
198         }
199         { // retrieve
200                 tabix_t *t;
201         // Common source of errors: new VCF is used with an old index
202         stat(fnidx, &stat_tbi);
203         stat(argv[optind], &stat_vcf);
204         if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
205         {
206             fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n");
207             free(fnidx);
208             return 1;
209         }
210         free(fnidx);
211
212                 if ((t = ti_open(argv[optind], 0)) == 0) {
213                         fprintf(stderr, "[main] fail to open the data file.\n");
214                         return 1;
215                 }
216                 if (strcmp(argv[optind+1], ".") == 0) { // retrieve all
217                         ti_iter_t iter;
218                         const char *s;
219                         int len;
220                         iter = ti_query(t, 0, 0, 0);
221                         while ((s = ti_read(t, iter, &len)) != 0) {
222                                 fputs(s, stdout); fputc('\n', stdout);
223                         }
224                         ti_iter_destroy(iter);
225                 } else { // retrieve from specified regions
226                         int i, len;
227             ti_iter_t iter;
228             const char *s;
229                         const ti_conf_t *idxconf;
230
231                         if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
232                 fprintf(stderr,"[tabix] failed to load the index file.\n");
233                 return 1;
234             }
235                         idxconf = ti_get_conf(t->idx);
236
237             if ( print_header )
238             {
239                 // If requested, print the header lines here
240                 iter = ti_query(t, 0, 0, 0);
241                 while ((s = ti_read(t, iter, &len)) != 0) {
242                     if ((int)(*s) != idxconf->meta_char) break;
243                     fputs(s, stdout); fputc('\n', stdout);
244                 }
245                 ti_iter_destroy(iter);
246             }
247                         if (bed_reg) {
248                                 extern int bed_overlap(const void *_h, const char *chr, int beg, int end);
249                                 extern void *bed_read(const char *fn);
250                                 extern void bed_destroy(void *_h);
251
252                                 const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available
253                                 void *bed = bed_read(argv[optind+1]); // load the BED file
254                                 ti_interval_t intv;
255
256                                 if (bed == 0) {
257                                         fprintf(stderr, "[main] fail to read the BED file.\n");
258                                         return 1;
259                                 }
260                                 iter = ti_query(t, 0, 0, 0);
261                                 while ((s = ti_read(t, iter, &len)) != 0) {
262                                         int c;
263                                         ti_get_intv(conf_, len, (char*)s, &intv);
264                                         c = *intv.se; *intv.se = '\0';
265                                         if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) {
266                                                 *intv.se = c;
267                                                 puts(s);
268                                         }
269                                         *intv.se = c;
270                                 }
271                 ti_iter_destroy(iter);
272                                 bed_destroy(bed);
273                         } else {
274                                 for (i = optind + 1; i < argc; ++i) {
275                                         int tid, beg, end;
276                                         if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
277                                                 iter = ti_queryi(t, tid, beg, end);
278                                                         while ((s = ti_read(t, iter, &len)) != 0) {
279                                                         fputs(s, stdout); fputc('\n', stdout);
280                                                 }
281                                                 ti_iter_destroy(iter);
282                                         } 
283                     // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
284                                 }
285                         }
286                 }
287                 ti_close(t);
288         }
289         return 0;
290 }