X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=faidx.c;h=f0798fccc8057a5e33fdb04afaae1af084badbc9;hp=055445f266cbdb429fe65c2aeb58c763af86d1d9;hb=2347ad62090e036841008e054c234868e21f7a80;hpb=4a17fa7e1f91b2fe04ad334a63fc2b0d5e859d8a diff --git a/faidx.c b/faidx.c index 055445f..f0798fc 100644 --- a/faidx.c +++ b/faidx.c @@ -2,11 +2,13 @@ #include #include #include +#include #include "faidx.h" #include "khash.h" typedef struct { - uint64_t len:32, line_len:16, line_blen:16; + int32_t line_len, line_blen; + int64_t len; uint64_t offset; } faidx1_t; KHASH_MAP_INIT_STR(s, faidx1_t) @@ -28,6 +30,9 @@ extern int fseeko(FILE *stream, off_t offset, int whence); #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) #define razf_tell(fp) ftello(fp) #endif +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif struct __faidx_t { RAZF *rz; @@ -60,10 +65,11 @@ faidx_t *fai_build_core(RAZF *rz) { char c, *name; int l_name, m_name, ret; - int len, line_len, line_blen, state; + int line_len, line_blen, state; int l1, l2; faidx_t *idx; uint64_t offset; + int64_t len; idx = (faidx_t*)calloc(1, sizeof(faidx_t)); idx->hash = kh_init(s); @@ -115,11 +121,6 @@ faidx_t *fai_build_core(RAZF *rz) return 0; } ++l1; len += l2; - if (l2 >= 0x10000) { - fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); - free(name); fai_destroy(idx); - return 0; - } if (state == 1) line_len = l1, line_blen = l2, state = 0; else if (state == 0) { if (l1 != line_len || l2 != line_blen) state = 2; @@ -194,7 +195,7 @@ int fai_build(const char *fn) sprintf(str, "%s.fai", fn); rz = razf_open(fn, "r"); if (rz == 0) { - fprintf(stderr, "[fai_build] fail to open the FASTA file.\n"); + fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); free(str); return -1; } @@ -202,7 +203,7 @@ int fai_build(const char *fn) razf_close(rz); fp = fopen(str, "wb"); if (fp == 0) { - fprintf(stderr, "[fai_build] fail to write FASTA index.\n"); + fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); fai_destroy(fai); free(str); return -1; } @@ -213,6 +214,47 @@ int fai_build(const char *fn) return 0; } +#ifdef _USE_KNETFILE +FILE *download_and_open(const char *fn) +{ + const int buf_size = 1 * 1024 * 1024; + uint8_t *buf; + FILE *fp; + knetFile *fp_remote; + const char *url = fn; + const char *p; + int l = strlen(fn); + for (p = fn + l - 1; p >= fn; --p) + if (*p == '/') break; + fn = p + 1; + + // First try to open a local copy + fp = fopen(fn, "r"); + if (fp) + return fp; + + // If failed, download from remote and open + fp_remote = knet_open(url, "rb"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); + return NULL; + } + if ((fp = fopen(fn, "wb")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); + knet_close(fp_remote); + return NULL; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); + + return fopen(fn, "r"); +} +#endif + faidx_t *fai_load(const char *fn) { char *str; @@ -220,19 +262,35 @@ faidx_t *fai_load(const char *fn) faidx_t *fai; str = (char*)calloc(strlen(fn) + 5, 1); sprintf(str, "%s.fai", fn); - fp = fopen(str, "rb"); + +#ifdef _USE_KNETFILE + if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) + { + fp = download_and_open(str); + if ( !fp ) + { + fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); + free(str); + return 0; + } + } + else +#endif + fp = fopen(str, "rb"); if (fp == 0) { fprintf(stderr, "[fai_load] build FASTA index.\n"); fai_build(fn); - fp = fopen(str, "r"); + fp = fopen(str, "rb"); if (fp == 0) { fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); free(str); return 0; } } + fai = fai_read(fp); fclose(fp); + fai->rz = razf_open(fn, "rb"); free(str); if (fai->rz == 0) { @@ -244,8 +302,8 @@ faidx_t *fai_load(const char *fn) char *fai_fetch(const faidx_t *fai, const char *str, int *len) { - char *s, *p, c; - int i, l, k; + char *s, c; + int i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; @@ -253,31 +311,43 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) beg = end = -1; h = fai->hash; - l = strlen(str); - p = s = (char*)malloc(l+1); - /* squeeze out "," */ - for (i = k = 0; i != l; ++i) - if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; - s[k] = 0; - for (i = 0; i != k; ++i) if (s[i] == ':') break; - s[i] = 0; - iter = kh_get(s, h, s); /* get the ref_id */ - if (iter == kh_end(h)) { - *len = 0; - free(s); return 0; - } + name_end = l = strlen(str); + s = (char*)malloc(l+1); + // remove space + for (i = k = 0; i < l; ++i) + if (!isspace(str[i])) s[k++] = str[i]; + s[k] = 0; l = k; + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name + s[name_end] = 0; + iter = kh_get(s, h, s); + if (iter == kh_end(h)) { // cannot find the sequence name + iter = kh_get(s, h, str); // try str as the name + if (iter == kh_end(h)) { + *len = 0; + free(s); return 0; + } else s[name_end] = ':', name_end = l; + } + } else iter = kh_get(s, h, str); val = kh_value(h, iter); - if (i == k) { /* dump the whole sequence */ - beg = 0; end = val.len; - } else { - for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; - beg = atoi(p); - if (i < k) { - p = s + i + 1; - end = atoi(p); - } else end = val.len; - } - if (beg > 0) --beg; + // parse the interval + if (name_end < l) { + for (i = k = name_end + 1; i < l; ++i) + if (s[i] != ',') s[k++] = s[i]; + s[k] = 0; + beg = atoi(s + name_end + 1); + for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; + end = i < k? atoi(s + i + 1) : val.len; + if (beg > 0) --beg; + } else beg = 0, end = val.len; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; @@ -287,7 +357,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) l = 0; s = (char*)malloc(end - beg + 2); razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); - while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg) + while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) if (isgraph(c)) s[l++] = c; s[l] = '\0'; *len = l; @@ -323,6 +393,40 @@ int faidx_main(int argc, char *argv[]) return 0; } +int faidx_fetch_nseq(const faidx_t *fai) +{ + return fai->n; +} + +char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) +{ + int l; + char c; + khiter_t iter; + faidx1_t val; + char *seq=NULL; + + // Adjust position + iter = kh_get(s, fai->hash, c_name); + if(iter == kh_end(fai->hash)) return 0; + val = kh_value(fai->hash, iter); + if(p_end_i < p_beg_i) p_beg_i = p_end_i; + if(p_beg_i < 0) p_beg_i = 0; + else if(val.len <= p_beg_i) p_beg_i = val.len - 1; + if(p_end_i < 0) p_end_i = 0; + else if(val.len <= p_end_i) p_end_i = val.len - 1; + + // Now retrieve the sequence + l = 0; + seq = (char*)malloc(p_end_i - p_beg_i + 2); + razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); + while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) + if (isgraph(c)) seq[l++] = c; + seq[l] = '\0'; + *len = l; + return seq; +} + #ifdef FAIDX_MAIN int main(int argc, char *argv[]) { return faidx_main(argc, argv); } #endif