8 #include "bam_endian.h"
16 Alignment indexing. Before indexing, BAM must be sorted based on the
17 leftmost coordinate of alignments. In indexing, BAM uses two indices:
18 a UCSC binning index and a simple linear index. The binning index is
19 efficient for alignments spanning long distances, while the auxiliary
20 linear index helps to reduce unnecessary seek calls especially for
23 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
24 Stein and is explained by Kent et al. (2002). In this scheme, each bin
25 represents a contiguous genomic region which can be fully contained in
26 another bin; each alignment is associated with a bin which represents
27 the smallest region containing the entire alignment. The binning
28 scheme is essentially another representation of an R-tree. A distinct bin
29 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
30 a child of Bin B if region A is contained in B.
32 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
33 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
34 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
35 find the alignments overlapped with a region [rbeg,rend), we need to
36 calculate the list of bins that may overlap the region and test
37 the alignments in the bins to confirm the overlaps. If the specified
38 region is short, typically only a few alignments in six bins need to
39 be retrieved. The overlapping alignments can be quickly fetched.
/* Chunks whose gap (in true file offsets) is below this are merged into one
 * chunk by merge_chunks() in BAM_TRUE_OFFSET builds. */
43 #define BAM_MIN_CHUNK_GAP 32768
44 // 1<<14 is the size of minimum bin.
45 #define BAM_LIDX_SHIFT 14
/* Number of distinct bins (0..37449) plus one pseudo-bin used to store
 * per-reference metadata (offset range and mapped/unmapped counts). */
47 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
/* Order chunks by start offset `u`; instantiates ks_introsort(off, ...). */
53 #define pair64_lt(a,b) ((a).u < (b).u)
54 KSORT_INIT(off, pair64_t, pair64_lt)
/* Hash map "i": bin number (int) -> bam_binlist_t, that bin's chunk list. */
66 KHASH_MAP_INIT_INT(i, bam_binlist_t)
/* In-memory BAM index. NOTE(review): other members (n, index, index2 — all
 * used below) are declared on lines omitted from this listing. */
68 struct __bam_index_t {
70 uint64_t n_no_coor; // unmapped reads without coordinate
75 // requirement: len <= LEN_MASK
/* Append chunk [beg,end) (virtual file offsets) to the list of `bin` in
 * hash `h`, creating the bin entry and growing its chunk array on demand. */
76 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
81 k = kh_put(i, h, bin, &ret);
83 if (ret) { // not present
/* 16 == sizeof(pair64_t): two uint64_t per chunk. */
85 l->list = (pair64_t*)calloc(l->m, 16);
/* NOTE(review): realloc return unchecked — aborts on OOM are implicit here. */
89 l->list = (pair64_t*)realloc(l->list, l->m * 16);
91 l->list[l->n].u = beg; l->list[l->n++].v = end;
/* Update the linear index for read `b`: every 16kb window (BAM_LIDX_SHIFT)
 * overlapped by the alignment records the earliest file `offset` seen. */
94 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
97 beg = b->core.pos >> BAM_LIDX_SHIFT;
/* bam_calend() gives the exclusive reference end; -1 makes it inclusive
 * before computing the last overlapped window. */
98 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
/* Grow the offset array to cover window `end`, zero-filling new slots
 * (0 means "window not seen yet"). */
99 if (index2->m < end + 1) {
100 int old_m = index2->m;
102 kroundup32(index2->m);
103 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
104 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
/* Only the first offset per window is kept. */
107 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
109 for (i = beg; i <= end; ++i)
110 if (index2->offset[i] == 0) index2->offset[i] = offset;
/* Coalesce adjacent chunks inside every bin to reduce seek calls: chunks
 * closer than BAM_MIN_CHUNK_GAP (true-offset builds) or starting in the
 * same BGZF block (virtual-offset builds) are fused. The BAM_MAX_BIN
 * pseudo-bin holds metadata, not chunks, and is skipped. */
115 static void merge_chunks(bam_index_t *idx)
117 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
121 for (i = 0; i < idx->n; ++i) {
122 index = idx->index[i];
123 for (k = kh_begin(index); k != kh_end(index); ++k) {
125 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
126 p = &kh_value(index, k);
/* m tracks the last kept chunk; l scans the rest of the list. */
128 for (l = 1; l < p->n; ++l) {
129 #ifdef BAM_TRUE_OFFSET
130 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
/* Virtual offsets keep the compressed-block address in the high 48 bits;
 * equal `>>16` means both offsets fall in the same BGZF block. */
132 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
134 else p->list[++m] = p->list[l];
139 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
/* Post-pass over each linear index: propagate the previous window's offset
 * into zero ("never seen") slots so later lookups never hit an empty entry. */
142 static void fill_missing(bam_index_t *idx)
145 for (i = 0; i < idx->n; ++i) {
146 bam_lidx_t *idx2 = &idx->index2[i];
147 for (j = 1; j < idx2->n; ++j)
148 if (idx2->offset[j] == 0)
149 idx2->offset[j] = idx2->offset[j-1];
/* Single pass over a coordinate-sorted BAM stream: builds the binning index
 * (chunk list per bin per reference) and the 16kb linear index, verifies
 * sort order as it goes, and counts mapped/unmapped/coordinate-less reads.
 * Returns a heap-allocated index; caller frees with bam_index_destroy().
 * NOTE(review): `c` is presumably `&b->core`, declared on an omitted line —
 * confirm against the full source. */
153 bam_index_t *bam_index_core(bamFile fp)
159 uint32_t last_bin, save_bin;
160 int32_t last_coor, last_tid, save_tid;
162 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
164 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
165 b = (bam1_t*)calloc(1, sizeof(bam1_t));
/* Header is only needed for n_targets; freed immediately after. */
166 h = bam_header_read(fp);
169 idx->n = h->n_targets;
170 bam_header_destroy(h);
171 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
172 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
173 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
/* 0xffffffffu is the "unset" sentinel for tid/bin state. */
175 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
176 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
177 n_mapped = n_unmapped = n_no_coor = off_end = 0;
178 off_beg = off_end = bam_tell(fp);
179 while ((ret = bam_read1(fp, b)) >= 0) {
180 if (c->tid < 0) ++n_no_coor;
181 if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
183 last_bin = 0xffffffffu;
/* Sort-order violations are fatal diagnostics (exit on omitted lines). */
184 } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
185 fprintf(pysamerr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
186 bam1_qname(b), last_tid+1, c->tid+1);
188 } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
189 fprintf(pysamerr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
190 bam1_qname(b), last_coor, c->pos, c->tid+1);
/* Linear index only covers placed, mapped reads. */
193 if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
194 if (c->bin != last_bin) { // then possibly write the binning index
195 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
196 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
197 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
/* Pseudo-bin BAM_MAX_BIN stores two pairs: (off_beg,off_end) then
 * (n_mapped,n_unmapped) for the finished reference. */
199 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
200 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
201 n_mapped = n_unmapped = 0;
205 save_bin = last_bin = c->bin;
/* Reads without coordinates come last; stop indexing placed reads. */
207 if (save_tid < 0) break;
/* File offset must strictly advance after each record. */
209 if (bam_tell(fp) <= last_off) {
210 fprintf(pysamerr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
211 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
214 if (c->flag & BAM_FUNMAP) ++n_unmapped;
216 last_off = bam_tell(fp);
217 last_coor = b->core.pos;
/* Flush the final bin and metadata for the last reference. */
220 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
221 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
222 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
/* Tail loop: count trailing coordinate-less reads; any placed read here
 * means the file was not sorted. */
227 while ((ret = bam_read1(fp, b)) >= 0) {
229 if (c->tid >= 0 && n_no_coor) {
230 fprintf(pysamerr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
235 if (ret < -1) fprintf(pysamerr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
236 free(b->data); free(b);
237 idx->n_no_coor = n_no_coor;
/* Free an index created by bam_index_core()/bam_index_load(): every bin's
 * chunk list, each khash, each linear-index array, then the containers.
 * Safe to call with NULL. */
241 void bam_index_destroy(bam_index_t *idx)
245 if (idx == 0) return;
246 for (i = 0; i < idx->n; ++i) {
247 khash_t(i) *index = idx->index[i];
248 bam_lidx_t *index2 = idx->index2 + i;
249 for (k = kh_begin(index); k != kh_end(index); ++k) {
250 if (kh_exist(index, k))
251 free(kh_value(index, k).list);
253 kh_destroy(i, index);
254 free(index2->offset);
256 free(idx->index); free(idx->index2);
/* Serialize the index to `fp` in BAI format: "BAI\1" magic, n_ref, then per
 * reference the binning index (bin, n_chunk, chunk pairs) followed by the
 * linear index, and finally the coordinate-less read count. All on-disk
 * integers are little-endian; on big-endian hosts values are byte-swapped
 * (arrays are swapped in place, written, then swapped back).
 * NOTE(review): fwrite return values are unchecked throughout. */
260 void bam_index_save(const bam_index_t *idx, FILE *fp)
264 fwrite("BAI\1", 1, 4, fp);
267 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
268 } else fwrite(&idx->n, 4, 1, fp);
269 for (i = 0; i < idx->n; ++i) {
270 khash_t(i) *index = idx->index[i];
271 bam_lidx_t *index2 = idx->index2 + i;
272 // write binning index
273 size = kh_size(index);
274 if (bam_is_be) { // big endian
276 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
277 } else fwrite(&size, 4, 1, fp);
278 for (k = kh_begin(index); k != kh_end(index); ++k) {
279 if (kh_exist(index, k)) {
280 bam_binlist_t *p = &kh_value(index, k);
281 if (bam_is_be) { // big endian
283 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
284 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
/* Swap chunk list in place, write it, then restore host order. */
285 for (x = 0; (int)x < p->n; ++x) {
286 bam_swap_endian_8p(&p->list[x].u);
287 bam_swap_endian_8p(&p->list[x].v);
289 fwrite(p->list, 16, p->n, fp);
290 for (x = 0; (int)x < p->n; ++x) {
291 bam_swap_endian_8p(&p->list[x].u);
292 bam_swap_endian_8p(&p->list[x].v);
295 fwrite(&kh_key(index, k), 4, 1, fp);
296 fwrite(&p->n, 4, 1, fp);
297 fwrite(p->list, 16, p->n, fp);
301 // write linear index (index2)
304 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
305 } else fwrite(&index2->n, 4, 1, fp);
306 if (bam_is_be) { // big endian
/* Same swap/write/swap-back dance for the linear index. */
308 for (x = 0; (int)x < index2->n; ++x)
309 bam_swap_endian_8p(&index2->offset[x]);
310 fwrite(index2->offset, 8, index2->n, fp);
311 for (x = 0; (int)x < index2->n; ++x)
312 bam_swap_endian_8p(&index2->offset[x]);
313 } else fwrite(index2->offset, 8, index2->n, fp);
315 { // write the number of reads coor-less records.
316 uint64_t x = idx->n_no_coor;
317 if (bam_is_be) bam_swap_endian_8p(&x);
318 fwrite(&x, 8, 1, fp);
/* Deserialize a BAI index from an open FILE*: verify magic, then read the
 * per-reference binning and linear indices, byte-swapping on big-endian
 * hosts. Returns NULL (with a diagnostic) on a NULL fp or bad magic.
 * NOTE(review): fread return values are unchecked except for the trailing
 * n_no_coor field; truncated input is not detected here. */
323 static bam_index_t *bam_index_load_core(FILE *fp)
329 fprintf(pysamerr, "[bam_index_load_core] fail to load index.\n");
332 fread(magic, 1, 4, fp);
333 if (strncmp(magic, "BAI\1", 4)) {
334 fprintf(pysamerr, "[bam_index_load] wrong magic number.\n");
338 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
339 fread(&idx->n, 4, 1, fp);
340 if (bam_is_be) bam_swap_endian_4p(&idx->n);
341 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
342 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
343 for (i = 0; i < idx->n; ++i) {
345 bam_lidx_t *index2 = idx->index2 + i;
350 index = idx->index[i] = kh_init(i);
351 // load binning index
352 fread(&size, 4, 1, fp);
353 if (bam_is_be) bam_swap_endian_4p(&size);
354 for (j = 0; j < (int)size; ++j) {
355 fread(&key, 4, 1, fp);
356 if (bam_is_be) bam_swap_endian_4p(&key);
357 k = kh_put(i, index, key, &ret);
358 p = &kh_value(index, k);
359 fread(&p->n, 4, 1, fp);
360 if (bam_is_be) bam_swap_endian_4p(&p->n);
/* NOTE(review): p->m is presumably set from p->n on an omitted line. */
362 p->list = (pair64_t*)malloc(p->m * 16);
363 fread(p->list, 16, p->n, fp);
/* Convert chunk offsets to host byte order. */
366 for (x = 0; x < p->n; ++x) {
367 bam_swap_endian_8p(&p->list[x].u);
368 bam_swap_endian_8p(&p->list[x].v);
/* Load the linear index for this reference. */
373 fread(&index2->n, 4, 1, fp);
374 if (bam_is_be) bam_swap_endian_4p(&index2->n);
375 index2->m = index2->n;
376 index2->offset = (uint64_t*)calloc(index2->m, 8);
377 fread(index2->offset, index2->n, 8, fp);
379 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
/* Trailing n_no_coor is optional (older index files omit it). */
381 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
382 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
/* Open a local .bai for BAM file `_fn` and parse it. For ftp:// or http://
 * URLs, only the basename is used (the index is expected in the working
 * directory, as fetched by download_from_remote()). Tries "{fn}.bai" first,
 * then "{base}.bai" (replacing the "bam" suffix's trailing 'm' with 'i'). */
386 bam_index_t *bam_index_load_local(const char *_fn)
391 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
/* Scan backwards to the last '/' to isolate the basename. */
394 for (p = _fn + l - 1; p >= _fn; --p)
395 if (*p == '/') break;
397 } else fn = strdup(_fn);
398 fnidx = (char*)calloc(strlen(fn) + 5, 1);
399 strcpy(fnidx, fn); strcat(fnidx, ".bai");
400 fp = fopen(fnidx, "rb");
401 if (fp == 0) { // try "{base}.bai"
402 char *s = strstr(fn, "bam");
403 if (s == fn + strlen(fn) - 3) {
404 /* "foo.bam" -> "foo.bai" */
405 fnidx[strlen(fn)-1] = 'i';
406 fp = fopen(fnidx, "rb");
409 free(fnidx); free(fn);
411 bam_index_t *idx = bam_index_load_core(fp);
/* Fetch a remote ftp:// or http:// file via knet and save it under its
 * basename in the current working directory. Silently returns for non-URL
 * input; prints a diagnostic and returns on open/create failure. */
418 static void download_from_remote(const char *url)
420 const int buf_size = 1 * 1024 * 1024;
426 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
428 for (fn = (char*)url + l - 1; fn >= url; --fn)
429 if (*fn == '/') break;
430 ++fn; // fn now points to the file name
431 fp_remote = knet_open(url, "r");
432 if (fp_remote == 0) {
433 fprintf(pysamerr, "[download_from_remote] fail to open remote file.\n");
436 if ((fp = fopen(fn, "wb")) == 0) {
437 fprintf(pysamerr, "[download_from_remote] fail to create file in the working directory.\n");
438 knet_close(fp_remote);
/* Stream in 1MB blocks until EOF. */
441 buf = (uint8_t*)calloc(buf_size, 1);
442 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
443 fwrite(buf, 1, l, fp);
446 knet_close(fp_remote);
/* No-network build variant (body on omitted lines; presumably a no-op). */
449 static void download_from_remote(const char *url)
/* Load the index for BAM `fn`: try locally first; for remote URLs, download
 * "{fn}.bai" into the working directory and retry. Returns NULL (with a
 * diagnostic) if the index still cannot be loaded. */
455 bam_index_t *bam_index_load(const char *fn)
458 idx = bam_index_load_local(fn);
459 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
460 char *fnidx = calloc(strlen(fn) + 5, 1);
461 strcat(strcpy(fnidx, fn), ".bai");
462 fprintf(pysamerr, "[bam_index_load] attempting to download the remote index file.\n");
463 download_from_remote(fnidx);
464 idx = bam_index_load_local(fn);
466 if (idx == 0) fprintf(pysamerr, "[bam_index_load] fail to load BAM index.\n");
/* Build and write the index for BAM `fn`. Output goes to `_fnidx`, or to
 * "{fn}.bai" when `_fnidx` is NULL. Returns nonzero (via paths on omitted
 * lines) on open/index/create failure. */
470 int bam_index_build2(const char *fn, const char *_fnidx)
476 if ((fp = bam_open(fn, "r")) == 0) {
477 fprintf(pysamerr, "[bam_index_build2] fail to open the BAM file.\n");
480 idx = bam_index_core(fp);
483 fprintf(pysamerr, "[bam_index_build2] fail to index the BAM file.\n");
/* Default index name: input name + ".bai". */
487 fnidx = (char*)calloc(strlen(fn) + 5, 1);
488 strcpy(fnidx, fn); strcat(fnidx, ".bai");
489 } else fnidx = strdup(_fnidx);
490 fpidx = fopen(fnidx, "wb");
492 fprintf(pysamerr, "[bam_index_build2] fail to create the index file.\n");
496 bam_index_save(idx, fpidx);
497 bam_index_destroy(idx);
/* Convenience wrapper: build the index with the default "{fn}.bai" name. */
503 int bam_index_build(const char *fn)
505 return bam_index_build2(fn, 0);
/* Entry point for "samtools index": argv[1] is the BAM, optional argv[2]
 * the output index name. */
508 int bam_index(int argc, char *argv[])
511 fprintf(pysamerr, "Usage: samtools index <in.bam> [out.index]\n");
514 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
515 else bam_index_build(argv[1]);
/* Entry point for "samtools idxstats": print, per reference, the name,
 * length, and mapped/unmapped read counts taken from the BAM_MAX_BIN
 * metadata pseudo-bin, then a final "*" line with the coordinate-less
 * read count. Returns 1 on open/load failure. */
519 int bam_idxstats(int argc, char *argv[])
522 bam_header_t *header;
526 fprintf(pysamerr, "Usage: samtools idxstats <in.bam>\n");
529 fp = bam_open(argv[1], "r");
530 if (fp == 0) { fprintf(pysamerr, "[%s] fail to open BAM.\n", __func__); return 1; }
531 header = bam_header_read(fp);
533 idx = bam_index_load(argv[1]);
534 if (idx == 0) { fprintf(pysamerr, "[%s] fail to load the index.\n", __func__); return 1; }
535 for (i = 0; i < idx->n; ++i) {
537 khash_t(i) *h = idx->index[i];
538 printf("%s\t%d", header->target_name[i], header->target_len[i]);
539 k = kh_get(i, h, BAM_MAX_BIN);
/* list[1] of the pseudo-bin is the (n_mapped, n_unmapped) pair written
 * by bam_index_core(); list[0] is the offset range. */
541 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
542 else printf("\t0\t0");
545 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
546 bam_header_destroy(header);
547 bam_index_destroy(idx);
/* Fill `list` with every bin number that may contain alignments overlapping
 * [beg,end) and return how many were written. One loop per bin level, from
 * 64Mbp bins (1..8) down to 16kb bins (4681..37449); bin 0 is added on a
 * line omitted from this listing. `end` is clamped to the 2^29 maximum. */
551 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
554 if (beg >= end) return 0;
555 if (end >= 1u<<29) end = 1u<<29;
558 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
559 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
560 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
561 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
562 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
/* Does alignment `b` overlap [beg,end)? With no CIGAR, the read is treated
 * as covering a single base at its position. */
566 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
568 uint32_t rbeg = b->core.pos;
569 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
570 return (rend > beg && rbeg < end);
/* State of a region iterator (bam_iter_query / bam_iter_read). Remaining
 * members (curr_off, off chunk array) are on lines omitted from this
 * listing but used below. */
573 struct __bam_iter_t {
574 int from_first; // read from the first record; no random access
575 int tid, beg, end, n_off, i, finished; // region, chunk count/cursor, EOF flag
580 // bam_fetch helper function retrieves
/* Build an iterator over region [beg,end) of reference `tid`: collect the
 * candidate bins' chunks, drop those ending at or before the linear-index
 * lower bound `min_off`, sort by start offset, and merge contained /
 * overlapping / adjacent chunks. Returns a heap-allocated iterator to be
 * freed with bam_iter_destroy(); NULL when end < beg. */
581 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
584 int i, n_bins, n_off;
591 if (beg < 0) beg = 0;
592 if (end < beg) return 0;
594 iter = calloc(1, sizeof(struct __bam_iter_t));
595 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
597 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
598 n_bins = reg2bins(beg, end, bins);
599 index = idx->index[tid];
/* Lower bound from the linear index: skip any chunk that ends before the
 * first 16kb window of the region. */
600 if (idx->index2[tid].n > 0) {
601 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
602 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
603 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
604 int n = beg>>BAM_LIDX_SHIFT;
605 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
/* Walk backwards to the nearest populated window. */
606 for (i = n - 1; i >= 0; --i)
607 if (idx->index2[tid].offset[i] != 0) break;
608 if (i >= 0) min_off = idx->index2[tid].offset[i];
610 } else min_off = 0; // tabix 0.1.2 may produce such index files
/* First pass: count candidate chunks across all bins. */
611 for (i = n_off = 0; i < n_bins; ++i) {
612 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
613 n_off += kh_value(index, k).n;
616 free(bins); return iter;
618 off = (pair64_t*)calloc(n_off, 16);
/* Second pass: gather chunks that may still overlap (end past min_off). */
619 for (i = n_off = 0; i < n_bins; ++i) {
620 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
622 bam_binlist_t *p = &kh_value(index, k);
623 for (j = 0; j < p->n; ++j)
624 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
629 free(off); return iter;
632 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
634 ks_introsort(off, n_off, off);
635 // resolve completely contained adjacent blocks
636 for (i = 1, l = 0; i < n_off; ++i)
637 if (off[l].v < off[i].v)
640 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
641 for (i = 1; i < n_off; ++i)
642 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
643 { // merge adjacent blocks
644 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
645 for (i = 1, l = 0; i < n_off; ++i) {
646 #ifdef BAM_TRUE_OFFSET
647 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
/* Virtual offsets: same BGZF block when the high 48 bits match. */
649 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
651 else off[++l] = off[i];
658 iter->n_off = n_off; iter->off = off;
/* pysam compatibility shim: run a region query and hand back the raw chunk
 * array plus its length in *cnt_off. NOTE(review): ownership of `off`
 * transfers to the caller; the iterator's disposal happens on lines
 * omitted from this listing — confirm it does not free `off`. */
662 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
663 { // for pysam compatibility
666 iter = bam_iter_query(idx, tid, beg, end);
667 off = iter->off; *cnt_off = iter->n_off;
/* Release an iterator and its chunk list; NULL-safe. */
672 void bam_iter_destroy(bam_iter_t iter)
674 if (iter) { free(iter->off); free(iter); }
/* Read the next alignment overlapping the iterator's region into `b`.
 * Seeks chunk by chunk (skipping the seek when chunks are adjacent) and
 * returns bam_read1's result for a hit; negative when the region (or file)
 * is exhausted (-5 distinguishes "past region" from a validation error).
 * With a NULL iterator or from_first set, it degenerates to plain
 * sequential reading. */
677 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
680 if (iter && iter->finished) return -1;
681 if (iter == 0 || iter->from_first) {
682 ret = bam_read1(fp, b);
683 if (ret < 0 && iter) iter->finished = 1;
686 if (iter->off == 0) return -1;
/* Reached the end of the current chunk: advance to the next one. */
688 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
689 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
690 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
691 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
692 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
693 iter->curr_off = bam_tell(fp);
697 if ((ret = bam_read1(fp, b)) >= 0) {
698 iter->curr_off = bam_tell(fp);
699 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
700 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
703 else if (is_overlap(iter->beg, iter->end, b)) return ret;
704 } else break; // end of file or error
/* Call `func(b, data)` on every alignment overlapping [beg,end) of `tid`.
 * Returns 0 on a normal end of region (-1 from the iterator), otherwise
 * the iterator's error code. */
710 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
716 iter = bam_iter_query(idx, tid, beg, end);
717 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
718 bam_iter_destroy(iter);
720 return (ret == -1)? 0 : ret;