8 #include "bam_endian.h"
16 Alignment indexing. Before indexing, the BAM file must be sorted by the
17 leftmost coordinate of its alignments. For indexing, BAM uses two indices:
18 a UCSC binning index and a simple linear index. The binning index is
19 efficient for alignments spanning long distances, while the auxiliary
20 linear index helps reduce unnecessary seek calls, especially for alignments spanning short distances.
23 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
24 Stein and is explained by Kent et al. (2002). In this scheme, each bin
25 represents a contiguous genomic region which can be fully contained in
26 another bin; each alignment is associated with the bin that represents
27 the smallest region containing the entire alignment. The binning
28 scheme is essentially another representation of an R-tree: a distinct
29 bin uniquely corresponds to a distinct internal node in an R-tree, and
30 bin A is a child of bin B if region A is contained in B.
32 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
33 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
34 585-4680 128Kbp and bins 4681-37448 span 16Kbp regions. To find the
35 alignments overlapping a region [rbeg,rend), we calculate the list of
36 bins that may overlap the region and then test the alignments in those
37 bins to confirm the overlaps. If the specified region is short,
38 typically only the alignments in six bins (one per level) need to be
39 examined, so the overlapping alignments can be fetched quickly.
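As an illustrative sketch (not part of the original source), the bin of an
alignment covering the zero-based, half-open interval [beg,end) can be
computed with the standard calculation from the SAM specification (bam.h
provides it as bam_reg2bin()); the level offsets are the same constants that
reg2bins() uses further below:

static inline int example_reg2bin(uint32_t beg, uint32_t end)
{
	--end; // make the end coordinate inclusive
	if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); // 16Kbp bins 4681-37448
	if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); // 128Kbp bins 585-4680
	if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);  // 1Mbp bins 73-584
	if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);  // 8Mbp bins 9-72
	if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);  // 64Mbp bins 1-8
	return 0; // bin 0 spans the whole 512Mbp range
}

For example, an alignment covering [9999,20000) crosses a 16Kbp boundary
(9999>>14 != 19999>>14) but stays inside a single 128Kbp window, so it is
assigned to bin 585+0 = 585.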
43 #define BAM_MIN_CHUNK_GAP 32768
44 // 1<<14 is the size of the smallest (16Kbp) bin.
45 #define BAM_LIDX_SHIFT 14
47 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
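// Where the bin numbering comes from: level k (k = 0..5) contains 8^k bins,
// so the first bin of level k is (8^k-1)/7, i.e. 0, 1, 9, 73, 585 and 4681;
// the total over all six levels is (8^6-1)/7 = 37449 bins, numbered 0-37448.
// BAM_MAX_BIN is one past that total and is used below as the key of a
// pseudo-bin that stores per-reference metadata instead of real chunks.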
53 #define pair64_lt(a,b) ((a).u < (b).u)
54 KSORT_INIT(off, pair64_t, pair64_lt)
66 KHASH_MAP_INIT_INT(i, bam_binlist_t)
68 struct __bam_index_t {
70 uint64_t n_no_coor; // unmapped reads without coordinate
75 // requirement: len <= LEN_MASK
76 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
81 k = kh_put(i, h, bin, &ret);
83 if (ret) { // not present
85 l->list = (pair64_t*)calloc(l->m, 16);
89 l->list = (pair64_t*)realloc(l->list, l->m * 16);
91 l->list[l->n].u = beg; l->list[l->n++].v = end;
94 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
97 beg = b->core.pos >> BAM_LIDX_SHIFT;
98 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
99 if (index2->m < end + 1) {
100 int old_m = index2->m;
102 kroundup32(index2->m);
103 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
104 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
107 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
109 for (i = beg; i <= end; ++i)
110 if (index2->offset[i] == 0) index2->offset[i] = offset;
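// Net effect: index2->offset[w] ends up holding the virtual file offset of the
// first record overlapping the w-th 16Kbp window (pos>>BAM_LIDX_SHIFT == w);
// fill_missing() below copies values forward into windows no record touches,
// and bam_iter_query() later uses this value as a lower bound on where to seek.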
115 static void merge_chunks(bam_index_t *idx)
117 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
121 for (i = 0; i < idx->n; ++i) {
122 index = idx->index[i];
123 for (k = kh_begin(index); k != kh_end(index); ++k) {
125 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
126 p = &kh_value(index, k);
128 for (l = 1; l < p->n; ++l) {
129 #ifdef BAM_TRUE_OFFSET
130 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
132 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
134 else p->list[++m] = p->list[l];
139 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
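/* Background for the BAM_VIRTUAL_OFFSET16 branch above: a BGZF "virtual"
 * offset packs the compressed (file) offset of a BGZF block together with the
 * offset inside that block's uncompressed data into one 64-bit value. A
 * minimal sketch of the packing (macro names are illustrative, not part of
 * this file): */
#define EXAMPLE_MAKE_VOFFSET(coffset, uoffset) (((uint64_t)(coffset) << 16) | ((uoffset) & 0xffff))
#define EXAMPLE_VOFFSET_COFFSET(voffset) ((uint64_t)(voffset) >> 16)
/* Two chunks whose end and start lie in the same compressed block therefore
 * satisfy p->list[m].v>>16 == p->list[l].u>>16; reading them as separate
 * chunks would decompress that block twice, so merge_chunks() joins them. */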
142 static void fill_missing(bam_index_t *idx)
145 for (i = 0; i < idx->n; ++i) {
146 bam_lidx_t *idx2 = &idx->index2[i];
147 for (j = 1; j < idx2->n; ++j)
148 if (idx2->offset[j] == 0)
149 idx2->offset[j] = idx2->offset[j-1];
153 bam_index_t *bam_index_core(bamFile fp)
159 uint32_t last_bin, save_bin;
160 int32_t last_coor, last_tid, save_tid;
162 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
164 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
165 b = (bam1_t*)calloc(1, sizeof(bam1_t));
166 h = bam_header_read(fp);
169 idx->n = h->n_targets;
170 bam_header_destroy(h);
171 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
172 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
173 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
175 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
176 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
177 n_mapped = n_unmapped = n_no_coor = off_end = 0;
178 off_beg = off_end = bam_tell(fp);
179 while ((ret = bam_read1(fp, b)) >= 0) {
180 if (c->tid < 0) ++n_no_coor;
181 if (last_tid != c->tid) { // change of chromosomes
183 last_bin = 0xffffffffu;
184 } else if (last_coor > c->pos) {
185 fprintf(pysamerr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
186 bam1_qname(b), last_coor, c->pos, c->tid+1);
189 if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
190 if (c->bin != last_bin) { // then possibly write the binning index
191 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens for the first record
192 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
193 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
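// The BAM_MAX_BIN pseudo-bin holds two pseudo-chunks per reference: the first
// records the virtual-offset range [off_beg,off_end) spanned by this
// reference's records, the second the mapped/unmapped read counts; these are
// what bam_idxstats() reports further below.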
195 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
196 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
197 n_mapped = n_unmapped = 0;
201 save_bin = last_bin = c->bin;
203 if (save_tid < 0) break;
205 if (bam_tell(fp) <= last_off) {
206 fprintf(pysamerr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
207 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
210 if (c->flag & BAM_FUNMAP) ++n_unmapped;
212 last_off = bam_tell(fp);
213 last_coor = b->core.pos;
216 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
217 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
218 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
223 while ((ret = bam_read1(fp, b)) >= 0) {
225 if (c->tid >= 0 && n_no_coor) {
226 fprintf(pysamerr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
231 if (ret < -1) fprintf(pysamerr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
232 free(b->data); free(b);
233 idx->n_no_coor = n_no_coor;
237 void bam_index_destroy(bam_index_t *idx)
241 if (idx == 0) return;
242 for (i = 0; i < idx->n; ++i) {
243 khash_t(i) *index = idx->index[i];
244 bam_lidx_t *index2 = idx->index2 + i;
245 for (k = kh_begin(index); k != kh_end(index); ++k) {
246 if (kh_exist(index, k))
247 free(kh_value(index, k).list);
249 kh_destroy(i, index);
250 free(index2->offset);
252 free(idx->index); free(idx->index2);
256 void bam_index_save(const bam_index_t *idx, FILE *fp)
260 fwrite("BAI\1", 1, 4, fp);
263 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
264 } else fwrite(&idx->n, 4, 1, fp);
265 for (i = 0; i < idx->n; ++i) {
266 khash_t(i) *index = idx->index[i];
267 bam_lidx_t *index2 = idx->index2 + i;
268 // write binning index
269 size = kh_size(index);
270 if (bam_is_be) { // big endian
272 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
273 } else fwrite(&size, 4, 1, fp);
274 for (k = kh_begin(index); k != kh_end(index); ++k) {
275 if (kh_exist(index, k)) {
276 bam_binlist_t *p = &kh_value(index, k);
277 if (bam_is_be) { // big endian
279 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
280 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
281 for (x = 0; (int)x < p->n; ++x) {
282 bam_swap_endian_8p(&p->list[x].u);
283 bam_swap_endian_8p(&p->list[x].v);
285 fwrite(p->list, 16, p->n, fp);
286 for (x = 0; (int)x < p->n; ++x) {
287 bam_swap_endian_8p(&p->list[x].u);
288 bam_swap_endian_8p(&p->list[x].v);
291 fwrite(&kh_key(index, k), 4, 1, fp);
292 fwrite(&p->n, 4, 1, fp);
293 fwrite(p->list, 16, p->n, fp);
297 // write linear index (index2)
300 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
301 } else fwrite(&index2->n, 4, 1, fp);
302 if (bam_is_be) { // big endian
304 for (x = 0; (int)x < index2->n; ++x)
305 bam_swap_endian_8p(&index2->offset[x]);
306 fwrite(index2->offset, 8, index2->n, fp);
307 for (x = 0; (int)x < index2->n; ++x)
308 bam_swap_endian_8p(&index2->offset[x]);
309 } else fwrite(index2->offset, 8, index2->n, fp);
311 { // write the number of coordinate-less reads (n_no_coor)
312 uint64_t x = idx->n_no_coor;
313 if (bam_is_be) bam_swap_endian_8p(&x);
314 fwrite(&x, 8, 1, fp);
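/* For reference, the index written above has the following on-disk layout
 * (integers little-endian; this matches the BAI format in the SAM spec):
 *   magic "BAI\1"                                    char[4]
 *   n_ref                                            int32
 *   for each reference:
 *     n_bin                                          int32
 *     for each bin: bin id (uint32), n_chunk (int32),
 *                   n_chunk x {chunk_beg, chunk_end} (uint64 pairs)
 *     n_intv                                         int32
 *     n_intv x ioffset (the 16Kbp linear index)      uint64
 *   n_no_coor (count of coordinate-less reads)       uint64, treated as optional by readers
 */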
319 static bam_index_t *bam_index_load_core(FILE *fp)
325 fprintf(pysamerr, "[bam_index_load_core] fail to load index.\n");
328 fread(magic, 1, 4, fp);
329 if (strncmp(magic, "BAI\1", 4)) {
330 fprintf(pysamerr, "[bam_index_load] wrong magic number.\n");
334 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
335 fread(&idx->n, 4, 1, fp);
336 if (bam_is_be) bam_swap_endian_4p(&idx->n);
337 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
338 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
339 for (i = 0; i < idx->n; ++i) {
341 bam_lidx_t *index2 = idx->index2 + i;
346 index = idx->index[i] = kh_init(i);
347 // load binning index
348 fread(&size, 4, 1, fp);
349 if (bam_is_be) bam_swap_endian_4p(&size);
350 for (j = 0; j < (int)size; ++j) {
351 fread(&key, 4, 1, fp);
352 if (bam_is_be) bam_swap_endian_4p(&key);
353 k = kh_put(i, index, key, &ret);
354 p = &kh_value(index, k);
355 fread(&p->n, 4, 1, fp);
356 if (bam_is_be) bam_swap_endian_4p(&p->n);
358 p->list = (pair64_t*)malloc(p->m * 16);
359 fread(p->list, 16, p->n, fp);
362 for (x = 0; x < p->n; ++x) {
363 bam_swap_endian_8p(&p->list[x].u);
364 bam_swap_endian_8p(&p->list[x].v);
369 fread(&index2->n, 4, 1, fp);
370 if (bam_is_be) bam_swap_endian_4p(&index2->n);
371 index2->m = index2->n;
372 index2->offset = (uint64_t*)calloc(index2->m, 8);
373 fread(index2->offset, index2->n, 8, fp);
375 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
377 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
378 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
382 bam_index_t *bam_index_load_local(const char *_fn)
387 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
390 for (p = _fn + l - 1; p >= _fn; --p)
391 if (*p == '/') break;
393 } else fn = strdup(_fn);
394 fnidx = (char*)calloc(strlen(fn) + 5, 1);
395 strcpy(fnidx, fn); strcat(fnidx, ".bai");
396 fp = fopen(fnidx, "rb");
397 if (fp == 0) { // try "{base}.bai"
398 char *s = strstr(fn, "bam");
399 if (s == fn + strlen(fn) - 3) {
401 fnidx[strlen(fn)-1] = 'i';
402 fp = fopen(fnidx, "rb");
405 free(fnidx); free(fn);
407 bam_index_t *idx = bam_index_load_core(fp);
414 static void download_from_remote(const char *url)
416 const int buf_size = 1 * 1024 * 1024;
422 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
424 for (fn = (char*)url + l - 1; fn >= url; --fn)
425 if (*fn == '/') break;
426 ++fn; // fn now points to the file name
427 fp_remote = knet_open(url, "r");
428 if (fp_remote == 0) {
429 fprintf(pysamerr, "[download_from_remote] fail to open remote file.\n");
432 if ((fp = fopen(fn, "wb")) == 0) {
433 fprintf(pysamerr, "[download_from_remote] fail to create file in the working directory.\n");
434 knet_close(fp_remote);
437 buf = (uint8_t*)calloc(buf_size, 1);
438 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
439 fwrite(buf, 1, l, fp);
442 knet_close(fp_remote);
445 static void download_from_remote(const char *url)
451 bam_index_t *bam_index_load(const char *fn)
454 idx = bam_index_load_local(fn);
455 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
456 char *fnidx = calloc(strlen(fn) + 5, 1);
457 strcat(strcpy(fnidx, fn), ".bai");
458 fprintf(pysamerr, "[bam_index_load] attempting to download the remote index file.\n");
459 download_from_remote(fnidx);
460 idx = bam_index_load_local(fn);
462 if (idx == 0) fprintf(pysamerr, "[bam_index_load] fail to load BAM index.\n");
466 int bam_index_build2(const char *fn, const char *_fnidx)
472 if ((fp = bam_open(fn, "r")) == 0) {
473 fprintf(pysamerr, "[bam_index_build2] fail to open the BAM file.\n");
476 idx = bam_index_core(fp);
479 fnidx = (char*)calloc(strlen(fn) + 5, 1);
480 strcpy(fnidx, fn); strcat(fnidx, ".bai");
481 } else fnidx = strdup(_fnidx);
482 fpidx = fopen(fnidx, "wb");
484 fprintf(pysamerr, "[bam_index_build2] fail to create the index file.\n");
488 bam_index_save(idx, fpidx);
489 bam_index_destroy(idx);
495 int bam_index_build(const char *fn)
497 return bam_index_build2(fn, 0);
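/* Illustrative programmatic usage sketch (the file names are hypothetical,
 * not part of this file): */
int example_build_index(void)
{
	// default: writes "example.bam.bai" next to the BAM; returns 0 on success
	if (bam_index_build("example.bam") != 0) return -1;
	// explicit index name, equivalent to `samtools index example.bam example.bai`
	return bam_index_build2("example.bam", "example.bai");
}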
500 int bam_index(int argc, char *argv[])
503 fprintf(pysamerr, "Usage: samtools index <in.bam> [out.index]\n");
506 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
507 else bam_index_build(argv[1]);
511 int bam_idxstats(int argc, char *argv[])
514 bam_header_t *header;
518 fprintf(pysamerr, "Usage: samtools idxstats <in.bam>\n");
521 fp = bam_open(argv[1], "r");
522 if (fp == 0) { fprintf(pysamerr, "[%s] fail to open BAM.\n", __func__); return 1; }
523 header = bam_header_read(fp);
525 idx = bam_index_load(argv[1]);
526 if (idx == 0) { fprintf(pysamerr, "[%s] fail to load the index.\n", __func__); return 1; }
527 for (i = 0; i < idx->n; ++i) {
529 khash_t(i) *h = idx->index[i];
530 printf("%s\t%d", header->target_name[i], header->target_len[i]);
531 k = kh_get(i, h, BAM_MAX_BIN);
533 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
534 else printf("\t0\t0");
537 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
538 bam_header_destroy(header);
539 bam_index_destroy(idx);
543 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
546 if (beg >= end) return 0;
547 if (end >= 1u<<29) end = 1u<<29;
550 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
551 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
552 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
553 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
554 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
558 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
560 uint32_t rbeg = b->core.pos;
561 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
562 return (rend > beg && rbeg < end);
565 struct __bam_iter_t {
566 int from_first; // read from the first record; no random access
567 int tid, beg, end, n_off, i, finished;
572 // bam_fetch helper function: retrieves the index chunks overlapping [beg,end) on tid and wraps them in an iterator
573 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
576 int i, n_bins, n_off;
583 if (beg < 0) beg = 0;
584 if (end < beg) return 0;
586 iter = calloc(1, sizeof(struct __bam_iter_t));
587 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
589 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
590 n_bins = reg2bins(beg, end, bins);
591 index = idx->index[tid];
592 if (idx->index2[tid].n > 0) {
593 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
594 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
595 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
596 int n = beg>>BAM_LIDX_SHIFT;
597 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
598 for (i = n - 1; i >= 0; --i)
599 if (idx->index2[tid].offset[i] != 0) break;
600 if (i >= 0) min_off = idx->index2[tid].offset[i];
602 } else min_off = 0; // tabix 0.1.2 may produce such index files
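// min_off is the lower bound obtained from the linear index: any chunk whose
// end offset (.v) is <= min_off cannot contain a record overlapping [beg,end),
// so such chunks are filtered out when the chunk list is collected below.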
603 for (i = n_off = 0; i < n_bins; ++i) {
604 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
605 n_off += kh_value(index, k).n;
608 free(bins); return iter;
610 off = (pair64_t*)calloc(n_off, 16);
611 for (i = n_off = 0; i < n_bins; ++i) {
612 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
614 bam_binlist_t *p = &kh_value(index, k);
615 for (j = 0; j < p->n; ++j)
616 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
621 free(off); return iter;
624 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
626 ks_introsort(off, n_off, off);
627 // resolve completely contained adjacent blocks
628 for (i = 1, l = 0; i < n_off; ++i)
629 if (off[l].v < off[i].v)
632 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
633 for (i = 1; i < n_off; ++i)
634 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
635 { // merge adjacent blocks
636 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
637 for (i = 1, l = 0; i < n_off; ++i) {
638 #ifdef BAM_TRUE_OFFSET
639 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
641 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
643 else off[++l] = off[i];
650 iter->n_off = n_off; iter->off = off;
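// At this point the candidate chunks gathered from the overlapping bins have
// been sorted by start offset, chunks wholly contained in their predecessor
// dropped, overlaps between neighbours trimmed, and chunks starting in the
// same BGZF block in which their predecessor ends merged, leaving a minimal
// list of file ranges for bam_iter_read() to scan.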
654 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
655 { // for pysam compatibility
658 iter = bam_iter_query(idx, tid, beg, end);
659 off = iter->off; *cnt_off = iter->n_off;
664 void bam_iter_destroy(bam_iter_t iter)
666 if (iter) { free(iter->off); free(iter); }
669 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
672 if (iter && iter->finished) return -1;
673 if (iter == 0 || iter->from_first) {
674 ret = bam_read1(fp, b);
675 if (ret < 0 && iter) iter->finished = 1;
678 if (iter->off == 0) return -1;
680 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
681 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
682 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
683 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
684 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
685 iter->curr_off = bam_tell(fp);
689 if ((ret = bam_read1(fp, b)) >= 0) {
690 iter->curr_off = bam_tell(fp);
691 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
692 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
695 else if (is_overlap(iter->beg, iter->end, b)) return ret;
696 } else break; // end of file or error
702 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
708 iter = bam_iter_query(idx, tid, beg, end);
709 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
710 bam_iter_destroy(iter);
712 return (ret == -1)? 0 : ret;
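/* Illustrative end-to-end sketch of the fetch API above (file name, region and
 * callback are hypothetical, not part of this file): count the records that
 * overlap reference 0, positions [100000,200000). */
static int example_count_cb(const bam1_t *b, void *data)
{
	(void)b;       // the alignment itself is not inspected here
	++*(int*)data; // just count one overlapping record
	return 0;
}

int example_count_region(void)
{
	int n = 0;
	bamFile fp;
	bam_index_t *idx;
	bam_header_t *h;

	fp = bam_open("example.bam", "r");
	idx = bam_index_load("example.bam"); // loads example.bam.bai
	if (fp == 0 || idx == 0) {
		if (idx) bam_index_destroy(idx);
		if (fp) bam_close(fp);
		return -1;
	}
	h = bam_header_read(fp); // header gives target names and lengths
	bam_fetch(fp, idx, 0, 100000, 200000, &n, example_count_cb);
	bam_header_destroy(h);
	bam_index_destroy(idx);
	bam_close(fp);
	return n;
}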