6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
// Constants and container instantiations used throughout the indexer.
// BAM_MIN_CHUNK_GAP: with true (uncompressed) offsets, chunks closer than
// this many bytes are merged to avoid extra seeks.
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
// Bin count for the UCSC binning scheme; the value BAM_MAX_BIN itself is
// used below as a pseudo-bin key holding per-reference summary data.
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
// Sort chunk pairs by their start offset (.u).
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
// Hash map: bin number (int) -> chunk list (bam_binlist_t).
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
// In-memory BAM index: per-reference binning hash plus linear index
// (remaining fields elided in this view).
66 struct __bam_index_t {
68 	uint64_t n_no_coor; // unmapped reads without coordinate
73 // requirement: len <= LEN_MASK
// Append the chunk [beg,end) (virtual file offsets) to bin `bin`'s chunk
// list in hash `h`, creating the list on first use and growing it as needed.
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
79 	k = kh_put(i, h, bin, &ret);
81 	if (ret) { // not present
// NOTE(review): 16 is hard-coded sizeof(pair64_t), i.e. two uint64_t.
83 		if ((l->list = (pair64_t*)calloc(l->m, 16)) == NULL) {
84 			fprintf(stderr, "[%s] failed to allocate bam_binlist node.\n", __func__);
90 		if ((l->list = (pair64_t*)realloc(l->list, l->m * 16)) == NULL) {
91 			fprintf(stderr, "[%s] failed to resize bam_binlist node.\n", __func__);
// Record the new chunk and bump the count.
95 	l->list[l->n].u = beg; l->list[l->n++].v = end;
// Record `offset` in the linear index for every 16kb window overlapped by
// alignment `b`, growing the window array as needed; only the first offset
// seen for each window is kept (earliest file position wins).
98 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
101 	beg = b->core.pos >> BAM_LIDX_SHIFT;
// NOTE(review): assumes bam_calend() > 0 (a mapped record); the caller
// filters on BAM_FUNMAP before calling — confirm for other call sites.
102 	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
103 	if (index2->m < end + 1) {
104 		int old_m = index2->m;
106 		kroundup32(index2->m);
// NOTE(review): 8 is hard-coded sizeof(uint64_t); realloc overwrites the
// old pointer directly (it would leak on failure — error path elided here).
107 		if ((index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8)) == NULL) {
108 			fprintf(stderr, "[%s] failed to resize offset array.\n", __func__);
// Zero-fill the newly grown tail so unset windows remain detectable as 0.
111 		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
114 	if (index2->offset[beg] == 0) index2->offset[beg] = offset;
116 	for (i = beg; i <= end; ++i)
117 		if (index2->offset[i] == 0) index2->offset[i] = offset;
// Merge adjacent chunks within each bin when a separate seek between them
// is not worthwhile; the BAM_MAX_BIN meta bin is skipped.
122 static void merge_chunks(bam_index_t *idx)
124 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
128 	for (i = 0; i < idx->n; ++i) {
129 		index = idx->index[i];
130 		for (k = kh_begin(index); k != kh_end(index); ++k) {
132 			if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
133 			p = &kh_value(index, k);
135 			for (l = 1; l < p->n; ++l) {
136 #ifdef BAM_TRUE_OFFSET
// True offsets: merge when the gap between chunks is under BAM_MIN_CHUNK_GAP.
137 				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
// Virtual offsets: merge when both ends fall in the same BGZF block
// (the bits above the low 16 identify the compressed block start).
139 				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
141 				else p->list[++m] = p->list[l];
146 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
// Replace zero (never-populated) linear-index slots with the previous
// window's offset so every window has a usable lower-bound file offset.
149 static void fill_missing(bam_index_t *idx)
152 	for (i = 0; i < idx->n; ++i) {
153 		bam_lidx_t *idx2 = &idx->index2[i];
154 		for (j = 1; j < idx2->n; ++j)
155 			if (idx2->offset[j] == 0)
156 				idx2->offset[j] = idx2->offset[j-1];
// Build a BAM index in memory with a single pass over a coordinate-sorted
// BAM stream. Returns the populated index (error/cleanup paths are partly
// elided in this view).
160 bam_index_t *bam_index_core(bamFile fp)
166 	uint32_t last_bin, save_bin;
167 	int32_t last_coor, last_tid, save_tid;
169 	uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
171 	if ((idx = (bam_index_t*)calloc(1, sizeof(bam_index_t))) == NULL) {
172 		fprintf(stderr, "[%s] failed to allocate bam_index structure.\n", __func__);
175 	if ((b = (bam1_t*)calloc(1, sizeof(bam1_t))) == NULL) {
176 		fprintf(stderr, "[%s] failed to allocate bam1_t buffer.\n", __func__);
// Only n_targets is needed from the header; free it immediately.
180 	h = bam_header_read(fp);
183 	idx->n = h->n_targets;
184 	bam_header_destroy(h);
185 	if ((idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*))) == NULL) {
186 		fprintf(stderr, "[%s] failed to allocate bam_index bin hash.\n", __func__);
191 	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
192 	if ((idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t))) == NULL) {
193 		fprintf(stderr, "[%s] failed to allocate bam_index linear index.\n", __func__);
// 0xffffffffu is the "nothing seen yet" sentinel for tid/bin/coordinate.
200 	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
201 	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
202 	n_mapped = n_unmapped = n_no_coor = off_end = 0;
203 	off_beg = off_end = bam_tell(fp);
204 	while ((ret = bam_read1(fp, b)) >= 0) {
205 		if (c->tid < 0) ++n_no_coor;
206 		if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
208 			last_bin = 0xffffffffu;
// Sort-order violations are detected against the previous record.
209 		} else if ((uint32_t)last_tid > (uint32_t)c->tid) {
210 			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
211 					bam1_qname(b), last_tid+1, c->tid+1);
213 		} else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
214 			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
215 					bam1_qname(b), last_coor, c->pos, c->tid+1);
// Mapped records feed the linear index with the record's start offset.
218 		if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
219 		if (c->bin != last_bin) { // then possibly write the binning index
220 			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
221 				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
222 			if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
// The meta bin (BAM_MAX_BIN) stores the reference's file-offset span and
// its mapped/unmapped counts as two pseudo-chunks.
224 				insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
225 				insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
226 				n_mapped = n_unmapped = 0;
230 			save_bin = last_bin = c->bin;
232 			if (save_tid < 0) break;
// The file offset must strictly advance after every record read.
234 		if (bam_tell(fp) <= last_off) {
235 			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
236 					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
239 		if (c->flag & BAM_FUNMAP) ++n_unmapped;
241 		last_off = bam_tell(fp);
242 		last_coor = b->core.pos;
// Flush the final bin and the final reference's meta element.
245 	insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
246 	insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
247 	insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
// Trailing records must all be coordinate-less; anything else means the
// input is not sorted.
252 	while ((ret = bam_read1(fp, b)) >= 0) {
254 		if (c->tid >= 0 && n_no_coor) {
255 			fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
260 	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
261 	free(b->data); free(b);
262 	idx->n_no_coor = n_no_coor;
// Free an index: each bin's chunk list, each per-reference hash and linear
// offset array, then the top-level arrays. Safe to call with NULL.
266 void bam_index_destroy(bam_index_t *idx)
270 	if (idx == 0) return;
271 	for (i = 0; i < idx->n; ++i) {
272 		khash_t(i) *index = idx->index[i];
273 		bam_lidx_t *index2 = idx->index2 + i;
274 		for (k = kh_begin(index); k != kh_end(index); ++k) {
275 			if (kh_exist(index, k))
276 				free(kh_value(index, k).list);
278 		kh_destroy(i, index);
279 		free(index2->offset);
281 	free(idx->index); free(idx->index2);
// Serialize the index to `fp` in .bai format: magic, n_ref, then per
// reference the binning index and the linear index, and finally n_no_coor.
// On big-endian hosts scalar fields are written byte-swapped; arrays are
// swapped in place, written, then swapped back so the in-memory index
// stays usable after saving.
285 void bam_index_save(const bam_index_t *idx, FILE *fp)
289 	if (fwrite("BAI\1", 1, 4, fp) < 4) {
290 		fprintf(stderr, "[%s] failed to write magic number.\n", __func__);
// Big-endian path (the surrounding #if/else structure is elided here).
295 		if (fwrite(bam_swap_endian_4p(&x), 4, 1, fp) < 1) {
296 			fprintf(stderr, "[%s] failed to write n_ref.\n", __func__);
301 		if (fwrite(&idx->n, 4, 1, fp) < 1) {
302 			fprintf(stderr, "[%s] failed to write n_ref.\n", __func__);
306 	for (i = 0; i < idx->n; ++i) {
307 		khash_t(i) *index = idx->index[i];
308 		bam_lidx_t *index2 = idx->index2 + i;
309 		// write binning index
310 		size = kh_size(index);
311 		if (bam_is_be) { // big endian
313 			if (fwrite(bam_swap_endian_4p(&x), 4, 1, fp) < 1) {
314 				fprintf(stderr, "[%s] failed to write n_bin.\n", __func__);
319 			if (fwrite(&size, 4, 1, fp) < 1) {
320 				fprintf(stderr, "[%s] failed to write n_bin.\n", __func__);
324 		for (k = kh_begin(index); k != kh_end(index); ++k) {
325 			if (kh_exist(index, k)) {
326 				bam_binlist_t *p = &kh_value(index, k);
327 				if (bam_is_be) { // big endian
329 					x = kh_key(index, k);
330 					if (fwrite(bam_swap_endian_4p(&x), 4, 1, fp) < 1) {
331 						fprintf(stderr, "[%s] failed to write bin.\n", __func__);
335 					if (fwrite(bam_swap_endian_4p(&x), 4, 1, fp) < 1) {
336 						fprintf(stderr, "[%s] failed to write n_chunk.\n", __func__);
// Swap the chunk array in place, write it, then swap it back below.
339 					for (x = 0; (int)x < p->n; ++x) {
340 						bam_swap_endian_8p(&p->list[x].u);
341 						bam_swap_endian_8p(&p->list[x].v);
343 					if (fwrite(p->list, 16, p->n, fp) < p->n) {
344 						fprintf(stderr, "[%s] failed to write %d chunks.\n", __func__, p->n);
348 					for (x = 0; (int)x < p->n; ++x) {
349 						bam_swap_endian_8p(&p->list[x].u);
350 						bam_swap_endian_8p(&p->list[x].v);
// Little-endian path: write bin key, chunk count and chunk list directly.
354 					if (fwrite(&kh_key(index, k), 4, 1, fp) < 1) {
355 						fprintf(stderr, "[%s] failed to write bin.\n", __func__);
358 					if (fwrite(&p->n, 4, 1, fp) < 1) {
359 						fprintf(stderr, "[%s] failed to write n_chunk.\n", __func__);
362 					if (fwrite(p->list, 16, p->n, fp) < p->n) {
363 						fprintf(stderr, "[%s] failed to write %d chunks.\n", __func__, p->n);
369 		// write linear index (index2)
372 			if (fwrite(bam_swap_endian_4p(&x), 4, 1, fp) < 1) {
373 				fprintf(stderr, "[%s] failed to write n_intv.\n", __func__);
378 			if (fwrite(&index2->n, 4, 1, fp) < 1) {
379 				fprintf(stderr, "[%s] failed to write n_intv.\n", __func__);
383 		if (bam_is_be) { // big endian
// Same swap/write/swap-back dance for the linear-index offsets.
385 			for (x = 0; (int)x < index2->n; ++x)
386 				bam_swap_endian_8p(&index2->offset[x]);
387 			if (fwrite(index2->offset, 8, index2->n, fp) < index2->n) {
388 				fprintf(stderr, "[%s] failed to write ioffset.\n", __func__);
391 			for (x = 0; (int)x < index2->n; ++x)
392 				bam_swap_endian_8p(&index2->offset[x]);
395 			if (fwrite(index2->offset, 8, index2->n, fp) < index2->n) {
396 				fprintf(stderr, "[%s] failed to write ioffset.\n", __func__);
401 	{ // write the count of coordinate-less reads
402 		uint64_t x = idx->n_no_coor;
403 		if (bam_is_be) bam_swap_endian_8p(&x);
404 		if (fwrite(&x, 8, 1, fp) < 1) {
405 			fprintf(stderr, "[%s] failed to write n_no_coor.\n", __func__);
// Read a .bai index from `fp`: magic, n_ref, then per reference the binning
// index and linear index, plus the optional trailing n_no_coor counter.
// Returns the index, or NULL on error (error paths are partly elided here).
411 static bam_index_t *bam_index_load_core(FILE *fp)
417 		fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
420 	if (fread(magic, 1, 4, fp) < 4) {
421 		fprintf(stderr, "[%s] failed to read magic number.\n", __func__);
425 	if (strncmp(magic, "BAI\1", 4)) {
426 		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
430 	if ((idx = (bam_index_t*)calloc(1, sizeof(bam_index_t))) == NULL) {
431 		fprintf(stderr, "[%s] failed to allocate bam_index structure.\n", __func__);
435 	if (fread(&idx->n, 4, 1, fp) < 1) {
436 		fprintf(stderr, "[%s] failed to read n_ref.\n", __func__);
441 	if (bam_is_be) bam_swap_endian_4p(&idx->n);
442 	if ((idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*))) == NULL) {
443 		fprintf(stderr, "[%s] failed to allocate bam_index bin hash.\n", __func__);
448 	if ((idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t))) == NULL) {
449 		fprintf(stderr, "[%s] failed to allocate bam_index linear index.\n", __func__);
456 	for (i = 0; i < idx->n; ++i) {
458 		bam_lidx_t *index2 = idx->index2 + i;
463 		index = idx->index[i] = kh_init(i);
464 		// load binning index
465 		if (fread(&size, 4, 1, fp) < 1) {
466 			fprintf(stderr, "[%s] failed to read n_bin.\n", __func__);
472 		if (bam_is_be) bam_swap_endian_4p(&size);
473 		for (j = 0; j < (int)size; ++j) {
474 			if (fread(&key, 4, 1, fp) < 1) {
475 				fprintf(stderr, "[%s] failed to read bin.\n", __func__);
481 			if (bam_is_be) bam_swap_endian_4p(&key);
482 			k = kh_put(i, index, key, &ret);
483 			p = &kh_value(index, k);
484 			if (fread(&p->n, 4, 1, fp) < 1) {
485 				fprintf(stderr, "[%s] failed to read n_chunk.\n", __func__);
491 			if (bam_is_be) bam_swap_endian_4p(&p->n);
493 			if ((p->list = (pair64_t*)malloc(p->m * 16)) == NULL) {
494 				fprintf(stderr, "[%s] failed to allocate chunk array.\n", __func__);
500 			if (fread(p->list, 16, p->n, fp) < p->n) {
501 				fprintf(stderr, "[%s] failed to read %d chunks.\n", __func__, p->n);
// Big-endian host: swap each chunk pair after reading.
509 				for (x = 0; x < p->n; ++x) {
510 					bam_swap_endian_8p(&p->list[x].u);
511 					bam_swap_endian_8p(&p->list[x].v);
// Load the linear index for this reference.
516 		if (fread(&index2->n, 4, 1, fp) < 1) {
// NOTE(review): message lacks the trailing ".\n" its siblings have.
517 			fprintf(stderr, "[%s] failed to read n_intv", __func__);
523 		if (bam_is_be) bam_swap_endian_4p(&index2->n);
525 		index2->m = index2->n;
// NOTE(review): "falied" typo in the message string below; left untouched
// here because changing it alters runtime output.
526 		if ((index2->offset = (uint64_t*)calloc(index2->m, 8)) == NULL) {
527 			fprintf(stderr, "[%s] falied to allocate interval array.\n", __func__);
// NOTE(review): argument order differs from every other fread in this file
// (fread(ptr, size, nmemb, fp)). As written this requests 8 items of
// index2->n bytes each and compares the return (at most 8) against
// index2->n, so the check misfires when index2->n > 8. Likely should be
// fread(index2->offset, 8, index2->n, fp) -- confirm against upstream.
533 		if (fread(index2->offset, index2->n, 8, fp) < index2->n) {
534 			fprintf(stderr, "[%s] failed to read %d intervals.", __func__, index2->n);
535 			free(index2->offset);
542 			for (j = 0; j < index2->n; ++j) {
543 				bam_swap_endian_8p(&index2->offset[j]);
// Older index files lack the trailing n_no_coor field; default it to 0.
548 	if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
549 	if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
// Open and load "<fn>.bai" (or "<base>.bai" for files ending in "bam")
// from the local filesystem. For ftp:// and http:// URLs only the basename
// is used, i.e. the index is looked up in the working directory.
553 bam_index_t *bam_index_load_local(const char *_fn)
558 	if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
// For remote URLs, scan back to the last '/' to isolate the basename.
561 		for (p = _fn + l - 1; p >= _fn; --p)
562 			if (*p == '/') break;
564 	} else fn = strdup(_fn);
565 	if ((fnidx = (char*)calloc(strlen(fn) + 5, 1)) == NULL) {
566 		fprintf(stderr, "[%s] failed to allocate space for index file name.\n", __func__);
570 	strcpy(fnidx, fn); strcat(fnidx, ".bai");
571 	fp = fopen(fnidx, "rb");
572 	if (fp == 0) { // try "{base}.bai"
573 		char *s = strstr(fn, "bam");
574 		if (s == fn + strlen(fn) - 3) {
// Patch the final character so "...bam" becomes "...bai".
// NOTE(review): presumably an elided line re-copies fn into fnidx first --
// confirm against the full source.
576 			fnidx[strlen(fn)-1] = 'i';
577 			fp = fopen(fnidx, "rb");
580 	free(fnidx); free(fn);
582 		bam_index_t *idx = bam_index_load_core(fp);
587 		fprintf(stderr, "[%s] failed to open index file.\n", __func__);
// Download a remote (ftp:// or http://) file into the current working
// directory, named by its basename, streaming through a 1MB buffer.
593 static void download_from_remote(const char *url)
595 	const int buf_size = 1 * 1024 * 1024;
// Silently ignore anything that is not an ftp/http URL.
601 	if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
603 	for (fn = (char*)url + l - 1; fn >= url; --fn)
604 		if (*fn == '/') break;
605 	++fn; // fn now points to the file name
606 	fp_remote = knet_open(url, "r");
607 	if (fp_remote == 0) {
608 		fprintf(stderr, "[%s] failed to open remote file.\n", __func__);
611 	if ((fp = fopen(fn, "wb")) == 0) {
612 		fprintf(stderr, "[%s] failed to create file in the working directory.\n", __func__);
613 		knet_close(fp_remote);
616 	if ((buf = (uint8_t*)calloc(buf_size, 1)) == NULL) {
617 		fprintf(stderr, "[%s] failed to allocate buffer.\n", __func__);
// Stream until knet_read reports 0 bytes.
621 	while ((l = knet_read(fp_remote, buf, buf_size)) != 0) {
622 		if (fwrite(buf, 1, l, fp) < l) {
623 			fprintf(stderr, "[%s] failed to write to destination file.\n", __func__);
629 	knet_close(fp_remote);
// No-op stub -- presumably the #else branch used when knet (remote access)
// support is compiled out; confirm against the full source.
632 static void download_from_remote(const char *url)
// Load the index for `fn`: try locally first; for remote BAMs, download
// "<fn>.bai" into the working directory and retry the local load.
638 bam_index_t *bam_index_load(const char *fn)
641 	idx = bam_index_load_local(fn);
642 	if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
643 		char *fnidx = calloc(strlen(fn) + 5, 1);
645 			fprintf(stderr, "[%s] failed to allocate space for index filename.\n", __func__);
648 		strcat(strcpy(fnidx, fn), ".bai");
649 		fprintf(stderr, "[%s] attempting to download the remote index file.\n", __func__);
650 		download_from_remote(fnidx);
651 		idx = bam_index_load_local(fn);
653 	if (idx == 0) fprintf(stderr, "[%s] fail to load BAM index.\n", __func__);
// Index BAM file `fn` and write the result to `_fnidx`, defaulting to
// "<fn>.bai" when _fnidx is NULL. Returns nonzero on failure (error and
// cleanup paths are partly elided in this view).
657 int bam_index_build2(const char *fn, const char *_fnidx)
663 	if ((fp = bam_open(fn, "r")) == 0) {
664 		fprintf(stderr, "[%s] failure to open the BAM file.\n", __func__);
667 	idx = bam_index_core(fp);
670 		fprintf(stderr, "[%s] fail to index the BAM file.\n", __func__);
// Default output name: append ".bai" to the BAM file name.
674 		fnidx = (char*)calloc(strlen(fn) + 5, 1);
675 		strcpy(fnidx, fn); strcat(fnidx, ".bai");
676 	} else fnidx = strdup(_fnidx);
677 	fpidx = fopen(fnidx, "wb");
679 		fprintf(stderr, "[%s] fail to create the index file.\n", __func__);
683 	bam_index_save(idx, fpidx);
684 	bam_index_destroy(idx);
// Convenience wrapper: index `fn` using the default "<fn>.bai" output name.
690 int bam_index_build(const char *fn)
692 	return bam_index_build2(fn, 0);
// "samtools index" command-line entry point: argv[1] is the BAM, optional
// argv[2] overrides the index file name.
695 int bam_index(int argc, char *argv[])
698 		fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
701 	if (argc >= 3) bam_index_build2(argv[1], argv[2]);
702 	else bam_index_build(argv[1]);
// "samtools idxstats": for each reference, print name, length, and the
// mapped/unmapped read counts stored in its BAM_MAX_BIN meta bin, followed
// by a "*" line with the coordinate-less read count.
706 int bam_idxstats(int argc, char *argv[])
709 	bam_header_t *header;
713 		fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
716 	fp = bam_open(argv[1], "r");
717 	if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
718 	header = bam_header_read(fp);
720 	idx = bam_index_load(argv[1]);
721 	if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
722 	for (i = 0; i < idx->n; ++i) {
724 		khash_t(i) *h = idx->index[i];
725 		printf("%s\t%d", header->target_name[i], header->target_len[i]);
726 		k = kh_get(i, h, BAM_MAX_BIN);
// The meta bin's second pseudo-chunk is (n_mapped, n_unmapped).
728 			printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
729 		else printf("\t0\t0");
732 	printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
733 	bam_header_destroy(header);
734 	bam_index_destroy(idx);
// UCSC binning: fill `list` with every bin that may contain alignments
// overlapping [beg,end); returns the number of bins written.
738 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
741 	if (beg >= end) return 0;
// Clamp to the 512Mbp (2^29) addressable range.
742 	if (end >= 1u<<29) end = 1u<<29;
// One loop per bin level: 64Mbp, 8Mbp, 1Mbp, 128Kbp, 16Kbp.
745 	for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
746 	for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
747 	for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
748 	for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
749 	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
// Does alignment `b` overlap [beg,end) on the reference? A record without
// CIGAR operations is treated as covering one base at its position.
753 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
755 	uint32_t rbeg = b->core.pos;
756 	uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
757 	return (rend > beg && rbeg < end);
// Region-iterator state: target region, chunk cursor and termination flag
// (the chunk list and current-offset fields are elided in this view).
760 struct __bam_iter_t {
761 	int from_first; // read from the first record; no random access
762 	int tid, beg, end, n_off, i, finished;
// Create an iterator over region [beg,end) on reference `tid`: compute the
// candidate bins, collect their chunks, drop chunks ending before the
// linear-index lower bound, then sort and merge the rest.
767 // bam_fetch helper: builds the per-region chunk list consumed by bam_iter_read.
768 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
771 	int i, n_bins, n_off;
778 	if (beg < 0) beg = 0;
779 	if (end < beg) return 0;
781 	if ((iter = calloc(1, sizeof(struct __bam_iter_t))) == NULL) {
782 		fprintf(stderr, "[%s] failed to allocate bam iterator.\n", __func__);
785 	iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
787 	if ((bins = (uint16_t*)calloc(BAM_MAX_BIN, 2)) == NULL) {
788 		fprintf(stderr, "[%s] failed to allocate bin array.\n", __func__);
791 	n_bins = reg2bins(beg, end, bins);
792 	index = idx->index[tid];
// The linear index supplies the smallest file offset that could contain
// the region's first 16kb window; chunks ending before it are skippable.
793 	if (idx->index2[tid].n > 0) {
794 		min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
795 			: idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
796 		if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
797 			int n = beg >> BAM_LIDX_SHIFT;
798 			if (n > idx->index2[tid].n) n = idx->index2[tid].n;
// Walk back to the nearest non-zero slot to recover a usable bound.
799 			for (i = n - 1; i >= 0; --i)
800 				if (idx->index2[tid].offset[i] != 0) break;
801 			if (i >= 0) min_off = idx->index2[tid].offset[i];
803 	} else min_off = 0; // tabix 0.1.2 may produce such index files
// First pass: count candidate chunks so a single allocation suffices.
804 	for (i = n_off = 0; i < n_bins; ++i) {
805 		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
806 			n_off += kh_value(index, k).n;
809 		free(bins); return iter;
811 	if ((off = (pair64_t*)calloc(n_off, 16)) == NULL) {
812 		fprintf(stderr, "[%s] failed to allocate offset pair array.\n", __func__);
// Second pass: copy only the chunks that end after min_off.
815 	for (i = n_off = 0; i < n_bins; ++i) {
816 		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
818 			bam_binlist_t *p = &kh_value(index, k);
819 			for (j = 0; j < p->n; ++j)
820 				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
825 		free(off); return iter;
828 		bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
830 			fprintf(stderr, "[%s] failure to allocate bam1_t buffer.\n", __func__);
// Sort chunks by start offset, then make the list strictly increasing and
// non-overlapping before merging adjacent ones.
834 		ks_introsort(off, n_off, off);
835 		// resolve completely contained adjacent blocks
836 		for (i = 1, l = 0; i < n_off; ++i)
837 			if (off[l].v < off[i].v)
840 		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
841 		for (i = 1; i < n_off; ++i)
842 			if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
843 		{ // merge adjacent blocks
844 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
845 			for (i = 1, l = 0; i < n_off; ++i) {
846 #ifdef BAM_TRUE_OFFSET
847 				if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
// Virtual offsets: merge chunks meeting within the same BGZF block.
849 				if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
851 				else off[++l] = off[i];
858 		iter->n_off = n_off; iter->off = off;
// Return the sorted/merged chunk list for a region (kept for pysam
// compatibility); the chunk count is written to *cnt_off.
862 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
863 { // for pysam compatibility
866 	iter = bam_iter_query(idx, tid, beg, end);
// NOTE(review): iter is dereferenced without a NULL check, but
// bam_iter_query can return 0 (end < beg, allocation failure) -- confirm
// that all callers guarantee a valid region.
867 	off = iter->off; *cnt_off = iter->n_off;
// Free a region iterator and its chunk list; safe to call with NULL.
872 void bam_iter_destroy(bam_iter_t iter)
874 	if (iter) { free(iter->off); free(iter); }
// Read the next alignment overlapping the iterator's region into `b`.
// Returns the record length (>= 0) on success, -1 at end of region/file,
// or a negative error code. With iter==NULL or from_first set, this is a
// plain sequential read.
877 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
880 	if (iter && iter->finished) return -1;
881 	if (iter == 0 || iter->from_first) {
882 		ret = bam_read1(fp, b);
883 		if (ret < 0 && iter) iter->finished = 1;
886 	if (iter->off == 0) return -1;
// Walk the sorted chunk list, seeking only between non-adjacent chunks.
888 		if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
889 			if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
890 			if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
891 			if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
892 				bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
893 				iter->curr_off = bam_tell(fp);
897 		if ((ret = bam_read1(fp, b)) >= 0) {
898 			iter->curr_off = bam_tell(fp);
// Past the region's end (or onto another reference): stop early, and use
// record validation to distinguish clean end-of-region from corruption.
899 			if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
900 				ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
903 			else if (is_overlap(iter->beg, iter->end, b)) return ret;
904 		} else break; // end of file or error
// Invoke `func(b, data)` for every alignment overlapping [beg,end) on
// reference `tid`. Returns 0 on normal end of region, or the negative
// error code from bam_iter_read otherwise.
910 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
916 	iter = bam_iter_query(idx, tid, beg, end);
917 	while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
918 	bam_iter_destroy(iter);
920 	return (ret == -1)? 0 : ret;