12 int32_t line_len, line_blen;
16 KHASH_MAP_INIT_STR(s, faidx1_t)
22 #define ftello(fp) ftell(fp)
23 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
25 extern off_t ftello(FILE *stream);
26 extern int fseeko(FILE *stream, off_t offset, int whence);
29 #define razf_read(fp, buf, size) fread(buf, 1, size, fp)
30 #define razf_open(fn, mode) fopen(fn, mode)
31 #define razf_close(fp) fclose(fp)
32 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
33 #define razf_tell(fp) ftello(fp)
47 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
50 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
55 if (idx->n == idx->m) {
56 idx->m = idx->m? idx->m<<1 : 16;
57 idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
59 idx->name[idx->n] = strdup(name);
60 k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
61 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
62 kh_value(idx->hash, k) = t;
66 faidx_t *fai_build_core(RAZF *rz)
69 int l_name, m_name, ret;
70 int line_len, line_blen, state;
76 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
77 idx->hash = kh_init(s);
78 name = 0; l_name = m_name = 0;
79 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
80 while (razf_read(rz, &c, 1)) {
81 if (c == '\n') { // an empty line
83 offset = razf_tell(rz);
85 } else if ((state == 0 && len < 0) || state == 2) continue;
87 if (c == '>') { // fasta header
89 fai_insert_index(idx, name, len, line_len, line_blen, offset);
91 while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
92 if (m_name < l_name + 2) {
95 name = (char*)realloc(name, m_name);
101 fprintf(pysamerr, "[fai_build_core] the last entry has no sequence\n");
102 free(name); fai_destroy(idx);
105 if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
107 offset = razf_tell(rz);
110 fprintf(pysamerr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
111 free(name); fai_destroy(idx);
114 if (state == 2) state = 3;
118 if (isgraph(c)) ++l2;
119 } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
120 if (state == 3 && l2) {
121 fprintf(pysamerr, "[fai_build_core] different line length in sequence '%s'.\n", name);
122 free(name); fai_destroy(idx);
126 if (state == 1) line_len = l1, line_blen = l2, state = 0;
127 else if (state == 0) {
128 if (l1 != line_len || l2 != line_blen) state = 2;
132 fai_insert_index(idx, name, len, line_len, line_blen, offset);
137 void fai_save(const faidx_t *fai, FILE *fp)
141 for (i = 0; i < fai->n; ++i) {
143 k = kh_get(s, fai->hash, fai->name[i]);
144 x = kh_value(fai->hash, k);
146 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
148 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
153 faidx_t *fai_read(FILE *fp)
157 int len, line_len, line_blen;
163 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
164 fai->hash = kh_init(s);
165 buf = (char*)calloc(0x10000, 1);
166 while (!feof(fp) && fgets(buf, 0x10000, fp)) {
167 for (p = buf; *p && isgraph(*p); ++p);
170 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
172 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
174 fai_insert_index(fai, buf, len, line_len, line_blen, offset);
180 void fai_destroy(faidx_t *fai)
183 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
185 kh_destroy(s, fai->hash);
186 if (fai->rz) razf_close(fai->rz);
190 int fai_build(const char *fn)
196 str = (char*)calloc(strlen(fn) + 5, 1);
197 sprintf(str, "%s.fai", fn);
198 rz = razf_open(fn, "r");
200 fprintf(pysamerr, "[fai_build] fail to open the FASTA file %s\n",fn);
204 fai = fai_build_core(rz);
206 fp = fopen(str, "wb");
208 fprintf(pysamerr, "[fai_build] fail to write FASTA index %s\n",str);
209 fai_destroy(fai); free(str);
220 FILE *download_and_open(const char *fn)
222 const int buf_size = 1 * 1024 * 1024;
226 const char *url = fn;
229 for (p = fn + l - 1; p >= fn; --p)
230 if (*p == '/') break;
233 // First try to open a local copy
238 // If failed, download from remote and open
239 fp_remote = knet_open(url, "rb");
240 if (fp_remote == 0) {
241 fprintf(pysamerr, "[download_from_remote] fail to open remote file %s\n",url);
244 if ((fp = fopen(fn, "wb")) == 0) {
245 fprintf(pysamerr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
246 knet_close(fp_remote);
249 buf = (uint8_t*)calloc(buf_size, 1);
250 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
251 fwrite(buf, 1, l, fp);
254 knet_close(fp_remote);
256 return fopen(fn, "r");
260 faidx_t *fai_load(const char *fn)
265 str = (char*)calloc(strlen(fn) + 5, 1);
266 sprintf(str, "%s.fai", fn);
269 if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
271 fp = download_and_open(str);
274 fprintf(pysamerr, "[fai_load] failed to open remote FASTA index %s\n", str);
281 fp = fopen(str, "rb");
283 fprintf(pysamerr, "[fai_load] build FASTA index.\n");
285 fp = fopen(str, "rb");
287 fprintf(pysamerr, "[fai_load] fail to open FASTA index.\n");
296 fai->rz = razf_open(fn, "rb");
299 fprintf(pysamerr, "[fai_load] fail to open FASTA file.\n");
305 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
308 int i, l, k, name_end;
316 name_end = l = strlen(str);
317 s = (char*)malloc(l+1);
319 for (i = k = 0; i < l; ++i)
320 if (!isspace(str[i])) s[k++] = str[i];
322 // determine the sequence name
323 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
324 if (i >= 0) name_end = i;
325 if (name_end < l) { // check if this is really the end
327 for (i = name_end + 1; i < l; ++i) {
328 if (s[i] == '-') ++n_hyphen;
329 else if (!isdigit(s[i]) && s[i] != ',') break;
331 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
333 iter = kh_get(s, h, s);
334 if (iter == kh_end(h)) { // cannot find the sequence name
335 iter = kh_get(s, h, str); // try str as the name
336 if (iter == kh_end(h)) {
339 } else s[name_end] = ':', name_end = l;
341 } else iter = kh_get(s, h, str);
342 val = kh_value(h, iter);
343 // parse the interval
345 for (i = k = name_end + 1; i < l; ++i)
346 if (s[i] != ',') s[k++] = s[i];
348 beg = atoi(s + name_end + 1);
349 for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
350 end = i < k? atoi(s + i + 1) : val.len;
352 } else beg = 0, end = val.len;
353 if (beg >= val.len) beg = val.len;
354 if (end >= val.len) end = val.len;
355 if (beg > end) beg = end;
358 // now retrieve the sequence
360 s = (char*)malloc(end - beg + 2);
361 razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
362 while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
363 if (isgraph(c)) s[l++] = c;
369 int faidx_main(int argc, char *argv[])
372 fprintf(pysamerr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
375 if (argc == 2) fai_build(argv[1]);
380 fai = fai_load(argv[1]);
381 if (fai == 0) return 1;
382 for (i = 2; i != argc; ++i) {
383 printf(">%s\n", argv[i]);
384 s = fai_fetch(fai, argv[i], &l);
385 for (j = 0; j < l; j += 60) {
386 for (k = 0; k < 60 && k < l - j; ++k)
398 int faidx_fetch_nseq(const faidx_t *fai)
403 char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
412 iter = kh_get(s, fai->hash, c_name);
413 if(iter == kh_end(fai->hash)) return 0;
414 val = kh_value(fai->hash, iter);
415 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
416 if(p_beg_i < 0) p_beg_i = 0;
417 else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
418 if(p_end_i < 0) p_end_i = 0;
419 else if(val.len <= p_end_i) p_end_i = val.len - 1;
421 // Now retrieve the sequence
423 seq = (char*)malloc(p_end_i - p_beg_i + 2);
424 razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
425 while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
426 if (isgraph(c)) seq[l++] = c;
433 int main(int argc, char *argv[]) { return faidx_main(argc, argv); }