5 #include "endian_swap.h"
11 * Concrete reference representation that bulk-loads the reference from
12 * the bit-pair-compacted binary file and stores it in memory also in
13 * bit-pair-compacted format. The user may request reference
14 * characters either on a per-character bases or by "stretch" using
15 * getBase(...) and getStretch(...) respectively.
17 * Most of the complexity in this class is due to the fact that we want
18 * to represent references with ambiguous (non-A/C/G/T) characters but
19 * we don't want to use more than two bits per base. This means we
20 * need a way to encode the ambiguous stretches of the reference in a
21 * way that is external to the bitpair sequence. To accomplish this,
22 * we use the RefRecords vector, which is stored in the .3.ebwt index
23 * file. The bitpairs themselves are stored in the .4.ebwt index file.
25 * Once it has been loaded, a BitPairReference is read-only, and is
26 * safe for many threads to access at once.
28 class BitPairReference {
32 * Load from .3.ebwt/.4.ebwt Bowtie index files.
34 BitPairReference(const string& in,
37 std::vector<string>* infiles,
38 std::vector<String<Dna5> >* origs,
40 bool loadSequence, // as opposed to just records
54 string s3 = in + ".3.ebwt";
55 string s4 = in + ".4.ebwt";
59 if((f3 = open(s3.c_str(), O_RDONLY)) < 0) {
60 cerr << "Could not open reference-string index file " << s3 << " for reading." << endl;
61 cerr << "This is most likely because your index was built with an older version" << endl
62 << "(<= 0.9.8.1) of bowtie-build. Please re-run bowtie-build to generate a new" << endl
63 << "index (or download one from the Bowtie website) and try again." << endl;
67 if((f4 = open(s4.c_str(), O_RDONLY)) < 0) {
68 cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
74 if(verbose_ || startVerbose) {
75 cerr << " Memory-mapping reference index file " << s4 << ": ";
79 if (stat(s4.c_str(), &sbuf) == -1) {
81 cerr << "Error: Could not stat index file " << s4.c_str() << " prior to memory-mapping" << endl;
84 mmFile = (char*)mmap((void *)0, sbuf.st_size,
85 PROT_READ, MAP_SHARED, f4, 0);
86 if(mmFile == (void *)(-1) || mmFile == NULL) {
88 cerr << "Error: Could not memory-map the index file " << s4.c_str() << endl;
93 for(off_t i = 0; i < sbuf.st_size; i += 1024) {
94 sum += (int) mmFile[i];
97 cerr << " Swept the memory-mapped ref index file; checksum: " << sum << ": ";
104 if((f3 = fopen(s3.c_str(), "rb")) == NULL) {
105 cerr << "Could not open reference-string index file " << s3 << " for reading." << endl;
106 cerr << "This is most likely because your index was built with an older version" << endl
107 << "(<= 0.9.8.1) of bowtie-build. Please re-run bowtie-build to generate a new" << endl
108 << "index (or download one from the Bowtie website) and try again." << endl;
112 if((f4 = fopen(s4.c_str(), "rb")) == NULL) {
113 cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
119 // Read endianness sentinel, set 'swap'
122 one = readU32(f3, swap);
125 cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
128 assert_eq(0x1000000, one);
129 swap = true; // have to endian swap U32s
134 sz = readU32(f3, swap);
136 cerr << "Error: number of reference records is 0 in " << s3 << endl;
144 // Cumulative count of all unambiguous characters on a per-
145 // stretch 8-bit alignment (i.e. count of bytes we need to
149 uint32_t unambiglen = 0;
151 // For each unambiguous stretch...
152 for(uint32_t i = 0; i < sz; i++) {
153 recs_.push_back(RefRecord(f3, swap));
155 for(uint32_t i = 0; i < sz; i++) {
158 refLens_.push_back(cumlen);
160 // Stupid hack to get around the fact that, for
161 // colorspace Ebwts, not only do we omit reference
162 // sequences that are *all* gaps from
163 // nPat/plen/refnames, but we also omit reference
164 // sequences that never have a stretch of more than 1
165 // unambiguous character.
166 if(unambiglen > 0 && (!color || maxlen > 1)) {
167 refApproxLens_.push_back(cumlen);
169 // More hackery to detect references that won't be
170 // in the Ebwt even though they have non-zero length
171 bool willBeInEbwt = true;
172 if(recs_[i].len > 0 && color) {
173 // Omit until we prove it should be kept
174 willBeInEbwt = false;
175 for(uint32_t j = i; j < sz; j++) {
176 if(j > i && recs_[j].first) break;
177 if(recs_[j].len >= 2) {
183 if(recs_[i].len > 0 && willBeInEbwt) {
184 // Remember that this is the first record for this
185 // reference sequence (and the last record for the one
187 refRecOffs_.push_back(i);
188 refOffs_.push_back(cumsz);
189 expandIdx_.push_back(nrefs_);
190 shrinkIdx_.push_back(nNoGapRefs_);
192 isGaps_.push_back(false);
194 shrinkIdx_.push_back(nNoGapRefs_);
195 isGaps_.push_back(true);
201 assert_eq(nNoGapRefs_, expandIdx_.size());
202 assert_eq(nrefs_, shrinkIdx_.size());
204 //cerr << "First record in reference index file was not marked as 'first'" << endl;
207 cumsz += recs_[i].len;
208 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
209 cumlen += recs_[i].off;
210 cumlen += recs_[i].len;
212 if(recs_[i].len > 0) {
213 cumlen += recs_[i].off;
214 cumlen += recs_[i].len;
217 unambiglen += recs_[i].len;
218 if(recs_[i].len > maxlen) maxlen = recs_[i].len;
220 if(verbose_ || startVerbose) {
221 cerr << "Read " << nrefs_ << " reference strings (" << nNoGapRefs_ << " non-empty) from " << sz << " records: ";
224 // Store a cap entry for the end of the last reference seq
225 refRecOffs_.push_back(recs_.size());
226 refOffs_.push_back(cumsz);
227 if(unambiglen > 0 && (!color || maxlen > 1)) {
228 refApproxLens_.push_back(cumlen);
230 refLens_.push_back(cumlen);
232 assert_eq(nNoGapRefs_, refApproxLens_.size());
233 assert_eq(sz, recs_.size());
234 MM_FILE_CLOSE(f3); // done with .3.ebwt file
235 // Round cumsz up to nearest byte boundary
236 if((cumsz & 3) != 0) {
237 cumsz += (4 - (cumsz & 3));
239 bufAllocSz_ = cumsz >> 2;
240 assert_eq(0, cumsz & 3); // should be rounded up to nearest 4
241 if(!loadSequence) return;
244 buf_ = (uint8_t*)mmFile;
246 FILE *ftmp = fopen(s4.c_str(), "rb");
247 sanityBuf_ = new uint8_t[cumsz >> 2];
248 size_t ret = fread(sanityBuf_, 1, cumsz >> 2, ftmp);
249 if(ret != (cumsz >> 2)) {
250 cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4 << endl;
254 for(size_t i = 0; i < (cumsz >> 2); i++) {
255 assert_eq(sanityBuf_[i], buf_[i]);
259 cerr << "Shouldn't be at " << __FILE__ << ":" << __LINE__ << " without BOWTIE_MM defined" << endl;
263 bool shmemLeader = true;
265 // Allocate a buffer to hold the reference string
267 buf_ = new uint8_t[cumsz >> 2];
268 if(buf_ == NULL) throw std::bad_alloc();
269 } catch(std::bad_alloc& e) {
270 cerr << "Error: Ran out of memory allocating space for the bitpacked reference. Please" << endl
271 << "re-run on a computer with more memory." << endl;
275 shmemLeader = ALLOC_SHARED_U8(
276 (s4 + "[ref]"), (cumsz >> 2), &buf_,
277 "ref", (verbose_ || startVerbose));
280 // Open the bitpair-encoded reference file
281 FILE *f4 = fopen(s4.c_str(), "rb");
283 cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
284 cerr << "This is most likely because your index was built with an older version" << endl
285 << "(<= 0.9.8.1) of bowtie-build. Please re-run bowtie-build to generate a new" << endl
286 << "index (or download one from the Bowtie website) and try again." << endl;
290 // Read the whole thing in
291 size_t ret = fread(buf_, 1, cumsz >> 2, f4);
292 // Didn't read all of it?
293 if(ret != (cumsz >> 2)) {
294 cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4 << endl;
297 // Make sure there's no more
299 ret = fread(&c, 1, 1, f4);
300 assert_eq(0, ret); // should have failed
302 if(useShmem_) NOTIFY_SHARED(buf_, (cumsz >> 2));
304 if(useShmem_) WAIT_SHARED(buf_, (cumsz >> 2));
308 // Populate byteToU32_
309 bool big = currentlyBigEndian();
310 for(int i = 0; i < 256; i++) {
313 word |= ((i >> 0) & 3) << 24;
314 word |= ((i >> 2) & 3) << 16;
315 word |= ((i >> 4) & 3) << 8;
316 word |= ((i >> 6) & 3) << 0;
318 word |= ((i >> 0) & 3) << 0;
319 word |= ((i >> 2) & 3) << 8;
320 word |= ((i >> 4) & 3) << 16;
321 word |= ((i >> 6) & 3) << 24;
323 byteToU32_[i] = word;
328 // Compare the sequence we just read from the compact index
329 // file to the true reference sequence.
330 std::vector<seqan::String<seqan::Dna5> > *os; // for holding references
331 std::vector<seqan::String<seqan::Dna5> > osv; // for holding references
332 if(infiles != NULL) {
334 for(size_t i = 0; i < infiles->size(); i++) {
335 // Remove initial backslash; that's almost
336 // certainly being used to protect the first
337 // character of the sequence from getopts (e.g.,
338 // when the first char is -)
339 if((*infiles)[i].at(0) == '\\') {
340 (*infiles)[i].erase(0, 1);
342 osv.push_back(String<Dna5>((*infiles)[i]));
345 readSequenceFiles<seqan::String<seqan::Dna5>, seqan::Fasta>(*infiles, osv);
349 assert(origs != NULL);
353 // Never mind; reference is always letters, even if index
354 // and alignment run are colorspace
356 // If we're building a colorspace index, we need to convert
357 // osv to colors first
359 // for(size_t i = 0; i < os->size(); i++) {
360 // size_t olen = seqan::length((*os)[i]);
361 // for(size_t j = 0; j < olen-1; j++) {
362 // int b1 = (int)(*os)[i][j];
363 // assert_geq(b1, 0); assert_leq(b1, 4);
364 // int b2 = (int)(*os)[i][j+1];
365 // assert_geq(b2, 0); assert_leq(b2, 4);
366 // (*os)[i][j] = (Dna5)dinuc2color[b1][b2];
367 // assert((b1 != 4 && b2 != 4) || (int)(*os)[i][j] == 4);
369 // seqan::resize((*os)[i], olen-1);
372 // Go through the loaded reference files base-by-base and
373 // sanity check against what we get by calling getBase and
376 int longestStretch = 0;
378 for(size_t i = 0; i < os->size(); i++) {
379 size_t olen = seqan::length((*os)[i]);
380 for(size_t j = 0; j < olen; j++) {
381 if((int)(*os)[i][j] < 4) {
383 if(curStretch > longestStretch) longestStretch = curStretch;
388 if(longestStretch == 0 || (color && longestStretch == 1)) {
392 size_t olenU32 = (olen + 12) / 4;
393 uint32_t *buf = new uint32_t[olenU32];
394 uint8_t *bufadj = (uint8_t*)buf;
395 bufadj += getStretch(buf, refi, 0, olen);
396 for(size_t j = 0; j < olen; j++) {
397 assert_eq((int)(*os)[i][j], (int)bufadj[j]);
398 assert_eq((int)(*os)[i][j], (int)getBase(refi, j));
407 ~BitPairReference() {
408 if(buf_ != NULL && !useMm_ && !useShmem_) delete[] buf_;
409 if(sanityBuf_ != NULL) delete[] sanityBuf_;
413 * Return a single base of the reference. Calling this repeatedly
414 * is not an efficient way to retrieve bases from the reference;
415 * use loadStretch() instead.
417 * This implementation scans linearly through the records for the
418 * unambiguous stretches of the target reference sequence. When
419 * there are many records, binary search would be more appropriate.
421 int getBase(uint32_t tidx, uint32_t toff) const {
422 uint32_t reci = refRecOffs_[tidx]; // first record for target reference sequence
423 uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
424 assert_gt(recf, reci);
425 uint32_t bufOff = refOffs_[tidx];
427 // For all records pertaining to the target reference sequence...
428 for(uint32_t i = reci; i < recf; i++) {
429 assert_geq(toff, off);
434 assert_geq(toff, off);
435 uint32_t recOff = off + recs_[i].len;
439 assert_lt(bufOff, bufSz_);
440 const uint32_t bufElt = (bufOff) >> 2;
441 const uint32_t shift = (bufOff & 3) << 1;
442 return ((buf_[bufElt] >> shift) & 3);
444 bufOff += recs_[i].len;
446 assert_geq(toff, off);
447 } // end for loop over records
452 * Load a stretch of the reference string into memory at 'dest'.
454 * This implementation scans linearly through the records for the
455 * unambiguous stretches of the target reference sequence. When
456 * there are many records, binary search would be more appropriate.
458 int getStretchNaive(uint32_t *destU32,
461 uint32_t count) const
463 uint8_t *dest = (uint8_t*)destU32;
464 uint32_t reci = refRecOffs_[tidx]; // first record for target reference sequence
465 uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
466 assert_gt(recf, reci);
468 uint32_t bufOff = refOffs_[tidx];
470 // For all records pertaining to the target reference sequence...
471 for(uint32_t i = reci; i < recf; i++) {
472 assert_geq(toff, off);
474 for(; toff < off && count > 0; toff++) {
478 if(count == 0) break;
479 assert_geq(toff, off);
480 if(toff < off + recs_[i].len) {
481 bufOff += (toff - off); // move bufOff pointer forward
483 bufOff += recs_[i].len;
486 for(; toff < off && count > 0; toff++) {
487 assert_lt(bufOff, bufSz_);
488 const uint32_t bufElt = (bufOff) >> 2;
489 const uint32_t shift = (bufOff & 3) << 1;
490 dest[cur++] = (buf_[bufElt] >> shift) & 3;
494 if(count == 0) break;
495 assert_geq(toff, off);
496 } // end for loop over records
497 // In any chars are left after scanning all the records,
498 // they must be ambiguous
508 * Load a stretch of the reference string into memory at 'dest'.
510 * This implementation scans linearly through the records for the
511 * unambiguous stretches of the target reference sequence. When
512 * there are many records, binary search would be more appropriate.
514 int getStretch(uint32_t *destU32,
517 uint32_t count) const
519 ASSERT_ONLY(uint32_t origCount = count);
520 ASSERT_ONLY(uint32_t origToff = toff);
521 if(count == 0) return 0;
522 uint8_t *dest = (uint8_t*)destU32;
524 uint32_t *destU32_2 = new uint32_t[(origCount >> 2) + 2];
525 int off2 = getStretchNaive(destU32_2, tidx, origToff, origCount);
526 uint8_t *dest_2 = ((uint8_t*)destU32_2) + off2;
528 destU32[0] = 0x04040404; // Add Ns, which we might end up using later
529 uint32_t reci = refRecOffs_[tidx]; // first record for target reference sequence
530 uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
531 assert_gt(recf, reci);
532 uint32_t cur = 4; // keep a cushion of 4 bases at the beginning
533 uint32_t bufOff = refOffs_[tidx];
536 bool firstStretch = true;
537 // For all records pertaining to the target reference sequence...
538 for(uint32_t i = reci; i < recf; i++) {
539 ASSERT_ONLY(uint32_t origBufOff = bufOff);
540 assert_geq(toff, off);
544 uint32_t cpycnt = min(off - toff, count);
545 memset(&dest[cur], 4, cpycnt);
549 if(count == 0) break;
551 assert_geq(toff, off);
552 if(toff < off + recs_[i].len) {
553 bufOff += (toff - off); // move bufOff pointer forward
555 bufOff += recs_[i].len;
560 if(toff + 8 < off && count > 8) {
561 // We already added some Ns, so we have to do
562 // a fixup at the beginning of the buffer so
563 // that we can start clobbering at cur >> 2
567 uint32_t curU32 = cur >> 2;
568 // Do the initial few bases
570 const uint32_t bufElt = (bufOff) >> 2;
571 const int low2 = bufOff & 3;
572 destU32[curU32] = byteToU32_[buf_[bufElt]];
573 for(int j = 0; j < low2; j++) {
574 ((char *)(&destU32[curU32]))[j] = 4;
578 const int chars = 4 - low2;
583 assert_eq(0, bufOff & 3);
584 uint32_t bufOffU32 = bufOff >> 2;
585 uint32_t countLim = count >> 2;
586 uint32_t offLim = (off - (toff + 4)) >> 2;
587 uint32_t lim = min(countLim, offLim);
588 // Do the fast thing for as far as possible
589 for(uint32_t j = 0; j < lim; j++) {
590 destU32[curU32] = byteToU32_[buf_[bufOffU32++]];
591 assert_eq(dest[(curU32 << 2) + 0], dest_2[(curU32 << 2) - offset + 0]);
592 assert_eq(dest[(curU32 << 2) + 1], dest_2[(curU32 << 2) - offset + 1]);
593 assert_eq(dest[(curU32 << 2) + 2], dest_2[(curU32 << 2) - offset + 2]);
594 assert_eq(dest[(curU32 << 2) + 3], dest_2[(curU32 << 2) - offset + 3]);
598 assert_leq(toff, off);
599 assert_leq((lim << 2), count);
601 bufOff = bufOffU32 << 2;
604 // Do the slow thing for the rest
605 for(; toff < off && count > 0; toff++) {
606 assert_lt(bufOff, bufSz_);
607 const uint32_t bufElt = (bufOff) >> 2;
608 const uint32_t shift = (bufOff & 3) << 1;
609 dest[cur++] = (buf_[bufElt] >> shift) & 3;
613 firstStretch = false;
616 for(; toff < off && count > 0; toff++) {
617 assert_lt(bufOff, bufSz_);
618 const uint32_t bufElt = (bufOff) >> 2;
619 const uint32_t shift = (bufOff & 3) << 1;
620 dest[cur++] = (buf_[bufElt] >> shift) & 3;
626 if(count == 0) break;
627 assert_eq(recs_[i].len, bufOff - origBufOff);
628 assert_geq(toff, off);
629 } // end for loop over records
630 // In any chars are left after scanning all the records,
631 // they must be ambiguous
643 /// Return the number of reference sequences.
644 uint32_t numRefs() const {
648 /// Return the number of reference sequences that don't consist
649 /// entirely of stretches of ambiguous characters.
650 uint32_t numNonGapRefs() const {
657 uint32_t shrinkIdx(uint32_t idx) const {
658 assert_lt(idx, shrinkIdx_.size());
659 return shrinkIdx_[idx];
665 uint32_t expandIdx(uint32_t idx) const {
666 assert_lt(idx, expandIdx_.size());
667 return expandIdx_[idx];
670 /// Return the lengths of reference sequences.
671 uint32_t approxLen(uint32_t elt) const {
672 assert_lt(elt, nrefs_);
673 return refApproxLens_[elt];
676 /// Return the lengths of reference sequences.
677 uint32_t len(uint32_t elt) const {
678 assert_lt(elt, nrefs_);
679 return refLens_[elt];
682 /// Return true iff ref 'elt' is all gaps
683 bool isAllGaps(uint32_t elt) const {
684 assert_lt(elt, nrefs_);
685 assert_eq(isGaps_.size(), nrefs_);
689 /// Return true iff buf_ and all the vectors are populated.
690 bool loaded() const {
695 * Return constant reference to the RefRecord list.
697 const std::vector<RefRecord>& refRecords() const { return recs_; }
701 uint32_t byteToU32_[256];
703 std::vector<RefRecord> recs_; /// records describing unambiguous stretches
704 std::vector<uint32_t> refApproxLens_; /// approx lens of ref seqs (excludes trailing ambig chars)
705 std::vector<uint32_t> refLens_; /// approx lens of ref seqs (excludes trailing ambig chars)
706 std::vector<uint32_t> refOffs_; /// buf_ begin offsets per ref seq
707 std::vector<uint32_t> refRecOffs_; /// record begin/end offsets per ref seq
708 std::vector<uint32_t> expandIdx_; /// map from small idxs (e.g. w/r/t plen) to large ones (w/r/t refnames)
709 std::vector<uint32_t> shrinkIdx_; /// map from large idxs to small
710 std::vector<bool> isGaps_; /// ref i is all gaps?
711 uint8_t *buf_; /// the whole reference as a big bitpacked byte array
712 uint8_t *sanityBuf_;/// for sanity-checking buf_
713 uint32_t bufSz_; /// size of buf_
714 uint32_t bufAllocSz_;
715 uint32_t nrefs_; /// the number of reference sequences
716 uint32_t nNoGapRefs_; /// the number of reference sequences that aren't totally ambiguous
717 bool loaded_; /// whether it's loaded
718 bool sanity_; /// do sanity checking
719 bool useMm_; /// load the reference as a memory-mapped file
720 bool useShmem_; /// load the reference into shared memory