15 #include <seqan/sequence.h>
16 #include <seqan/index.h>
22 #include "auto_array.h"
25 #include "assert_helpers.h"
27 #include "blockwise_sa.h"
28 #include "endian_swap.h"
30 #include "random_source.h"
33 #include "threading.h"
39 #include "color_dec.h"
40 #include "reference.h"
43 using namespace seqan;
45 #ifndef PREFETCH_LOCALITY
46 // Moderate temporal locality by default (third arg to __builtin_prefetch)
47 #define PREFETCH_LOCALITY 2
50 // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
51 extern uint8_t cCntLUT_4[4][4][256];
54 #define VMSG_NL(args...) \
55 if(this->verbose()) { \
57 tmp << args << endl; \
58 this->verbose(tmp.str()); \
63 #define VMSG(args...) \
64 if(this->verbose()) { \
67 this->verbose(tmp.str()); \
72 * Flags describing type of Ebwt.
75 EBWT_COLOR = 2, // true -> Ebwt is colorspace
76 EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
77 // concatenated string reversed, rather than
78 // each stretch reversed
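// These flag bits are OR'd together into the flags field of the index
// header; e.g. (flags & EBWT_COLOR) != 0 tests for a colorspace index
// (illustrative usage; the actual encoding/decoding lives in the index
// I/O code).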
82 * Extended Burrows-Wheeler transform header. This together with the
83 * actual data arrays and other text-specific parameters defined in
84 * class Ebwt constitute the entire Ebwt.
91 EbwtParams(uint32_t len,
100 init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireReverse);
103 EbwtParams(const EbwtParams& eh) {
104 init(eh._len, eh._lineRate, eh._linesPerSide, eh._offRate,
105 eh._isaRate, eh._ftabChars, eh._color, eh._entireReverse);
108 void init(uint32_t len, int32_t lineRate, int32_t linesPerSide,
109 int32_t offRate, int32_t isaRate, int32_t ftabChars,
110 bool color, bool entireReverse)
113 _entireReverse = entireReverse;
117 _bwtSz = (len/4 + 1);
118 _lineRate = lineRate;
119 _linesPerSide = linesPerSide;
120 _origOffRate = offRate;
122 _offMask = 0xffffffff << _offRate;
124 _isaMask = 0xffffffff << ((_isaRate >= 0) ? _isaRate : 0);
125 _ftabChars = ftabChars;
126 _eftabLen = _ftabChars*2;
127 _eftabSz = _eftabLen*4;
128 _ftabLen = (1 << (_ftabChars*2))+1;
129 _ftabSz = _ftabLen*4;
130 _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
131 _offsSz = _offsLen*4;
132 _isaLen = (_isaRate == -1)? 0 : ((_bwtLen + (1 << _isaRate) - 1) >> _isaRate);
134 _lineSz = 1 << _lineRate;
135 _sideSz = _lineSz * _linesPerSide;
136 _sideBwtSz = _sideSz - 8;
137 _sideBwtLen = _sideBwtSz*4;
138 _numSidePairs = (_bwtSz+(2*_sideBwtSz)-1)/(2*_sideBwtSz);
139 _numSides = _numSidePairs*2;
140 _numLines = _numSides * _linesPerSide;
141 _ebwtTotLen = _numSidePairs * (2*_sideSz);
142 _ebwtTotSz = _ebwtTotLen;
146 uint32_t len() const { return _len; }
147 uint32_t bwtLen() const { return _bwtLen; }
148 uint32_t sz() const { return _sz; }
149 uint32_t bwtSz() const { return _bwtSz; }
150 int32_t lineRate() const { return _lineRate; }
151 int32_t linesPerSide() const { return _linesPerSide; }
152 int32_t origOffRate() const { return _origOffRate; }
153 int32_t offRate() const { return _offRate; }
154 uint32_t offMask() const { return _offMask; }
155 int32_t isaRate() const { return _isaRate; }
156 uint32_t isaMask() const { return _isaMask; }
157 int32_t ftabChars() const { return _ftabChars; }
158 uint32_t eftabLen() const { return _eftabLen; }
159 uint32_t eftabSz() const { return _eftabSz; }
160 uint32_t ftabLen() const { return _ftabLen; }
161 uint32_t ftabSz() const { return _ftabSz; }
162 uint32_t offsLen() const { return _offsLen; }
163 uint32_t offsSz() const { return _offsSz; }
164 uint32_t isaLen() const { return _isaLen; }
165 uint32_t isaSz() const { return _isaSz; }
166 uint32_t lineSz() const { return _lineSz; }
167 uint32_t sideSz() const { return _sideSz; }
168 uint32_t sideBwtSz() const { return _sideBwtSz; }
169 uint32_t sideBwtLen() const { return _sideBwtLen; }
170 uint32_t numSidePairs() const { return _numSidePairs; }
171 uint32_t numSides() const { return _numSides; }
172 uint32_t numLines() const { return _numLines; }
173 uint32_t ebwtTotLen() const { return _ebwtTotLen; }
174 uint32_t ebwtTotSz() const { return _ebwtTotSz; }
175 bool color() const { return _color; }
176 bool entireReverse() const { return _entireReverse; }
179 * Set a new suffix-array sampling rate, which involves updating
180 * rate, mask, sample length, and sample size.
182 void setOffRate(int __offRate) {
183 _offRate = __offRate;
184 _offMask = 0xffffffff << _offRate;
185 _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
186 _offsSz = _offsLen*4;
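// e.g. (illustrative figures): with _bwtLen == 1000 and __offRate == 5
// (one marked row in every 32), _offsLen becomes (1000 + 31) >> 5 == 32
// and _offsSz becomes 128 bytes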
190 * Set a new inverse suffix-array sampling rate, which involves
191 * updating rate, mask, sample length, and sample size.
193 void setIsaRate(int __isaRate) {
194 _isaRate = __isaRate;
195 _isaMask = 0xffffffff << _isaRate;
196 _isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate;
200 /// Check that this EbwtParams is internally consistent
203 assert_gt(_lineRate, 3);
204 assert_geq(_offRate, 0);
205 assert_leq(_ftabChars, 16);
206 assert_geq(_ftabChars, 1);
207 assert_lt(_lineRate, 32);
208 assert_lt(_linesPerSide, 32);
209 assert_lt(_ftabChars, 32);
210 assert_eq(0, _ebwtTotSz % (2*_lineSz));
215 * Pretty-print the header contents to the given output stream.
217 void print(ostream& out) const {
218 out << "Headers:" << endl
219 << " len: " << _len << endl
220 << " bwtLen: " << _bwtLen << endl
221 << " sz: " << _sz << endl
222 << " bwtSz: " << _bwtSz << endl
223 << " lineRate: " << _lineRate << endl
224 << " linesPerSide: " << _linesPerSide << endl
225 << " offRate: " << _offRate << endl
226 << " offMask: 0x" << hex << _offMask << dec << endl
227 << " isaRate: " << _isaRate << endl
228 << " isaMask: 0x" << hex << _isaMask << dec << endl
229 << " ftabChars: " << _ftabChars << endl
230 << " eftabLen: " << _eftabLen << endl
231 << " eftabSz: " << _eftabSz << endl
232 << " ftabLen: " << _ftabLen << endl
233 << " ftabSz: " << _ftabSz << endl
234 << " offsLen: " << _offsLen << endl
235 << " offsSz: " << _offsSz << endl
236 << " isaLen: " << _isaLen << endl
237 << " isaSz: " << _isaSz << endl
238 << " lineSz: " << _lineSz << endl
239 << " sideSz: " << _sideSz << endl
240 << " sideBwtSz: " << _sideBwtSz << endl
241 << " sideBwtLen: " << _sideBwtLen << endl
242 << " numSidePairs: " << _numSidePairs << endl
243 << " numSides: " << _numSides << endl
244 << " numLines: " << _numLines << endl
245 << " ebwtTotLen: " << _ebwtTotLen << endl
246 << " ebwtTotSz: " << _ebwtTotSz << endl
247 << " reverse: " << _entireReverse << endl;
255 int32_t _linesPerSide;
256 int32_t _origOffRate;
273 uint32_t _sideBwtLen;
274 uint32_t _numSidePairs;
277 uint32_t _ebwtTotLen;
284 * Exception to throw when a file-related error occurs.
286 class EbwtFileOpenException : public std::runtime_error {
288 EbwtFileOpenException(const std::string& msg = "") :
289 std::runtime_error(msg) { }
293 * Calculate size of file with given name.
295 static inline int64_t fileSize(const char* name) {
297 f.open(name, std::ios_base::binary | std::ios_base::in);
298 if (!f.good() || f.eof() || !f.is_open()) { return 0; }
299 f.seekg(0, std::ios_base::beg);
300 std::ifstream::pos_type begin_pos = f.tellg();
301 f.seekg(0, std::ios_base::end);
302 return static_cast<int64_t>(f.tellg() - begin_pos);
305 // Forward declarations for Ebwt class
307 template<typename TStr> class EbwtSearchParams;
310 * Extended Burrows-Wheeler transform data.
312 * An Ebwt may be transferred to and from RAM with calls to
313 * evictFromMemory() and loadIntoMemory(). By default, a newly-created
314 * Ebwt is not loaded into memory; if the user would like to use a
315 * newly-created Ebwt to answer queries, they must first call
318 template <typename TStr>
321 typedef typename Value<TStr>::Type TAlphabet;
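// A minimal usage sketch (illustrative only; the constructor and
// loadIntoMemory() arguments are elided, and the exact string type is an
// assumption):
//
//   Ebwt<String<Dna> > ebwt("idx_basename", ...);
//   ebwt.loadIntoMemory(...);   // bring the large index arrays into RAM
//   // ... answer queries ...
//   ebwt.evictFromMemory();     // release the large arrays again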
324 _toBigEndian(currentlyBigEndian()), \
325 _overrideOffRate(__overrideOffRate), \
326 _overrideIsaRate(__overrideIsaRate), \
328 _passMemExc(passMemExc), \
329 _sanity(sanityCheck), \
331 _in1(MM_FILE_INIT), \
332 _in2(MM_FILE_INIT), \
334 _zEbwtByteOff(0xffffffff), \
354 #define Ebwt_STAT_INITS \
361 #define Ebwt_STAT_INITS
364 /// Construct an Ebwt from the given input file
365 Ebwt(const string& in,
367 int needEntireReverse,
369 int32_t __overrideOffRate = -1,
370 int32_t __overrideIsaRate = -1,
372 bool useShmem = false,
373 bool mmSweep = false,
374 bool loadNames = false,
375 const ReferenceMap* rmap = NULL,
376 bool verbose = false,
377 bool startVerbose = false,
378 bool passMemExc = false,
379 bool sanityCheck = false) :
383 assert(!useMm || !useShmem);
386 useShmem_ = useShmem;
387 _in1Str = in + ".1.ebwt";
388 _in2Str = in + ".2.ebwt";
390 color, // expect colorspace reference?
391 __fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
392 true, // stop after loading the header portion?
393 &_eh, // params structure to fill in
395 loadNames, // loadNames
396 startVerbose); // startVerbose
397 // If the offRate has been overridden, reflect that in the
398 // _eh._offRate field
399 if(_overrideOffRate > _eh._offRate) {
400 _eh.setOffRate(_overrideOffRate);
401 assert_eq(_overrideOffRate, _eh._offRate);
404 if(_overrideIsaRate > _eh._isaRate) {
405 _eh.setIsaRate(_overrideIsaRate);
406 assert_eq(_overrideIsaRate, _eh._isaRate);
411 /// Construct an Ebwt from the given header parameters and string
412 /// vector, optionally using a blockwise suffix sorter with the
413 /// given 'bmax' and 'dcv' parameters. The string vector is
414 /// ultimately joined and the joined string is passed to buildToDisk().
417 int32_t linesPerSide,
421 const string& file, // base filename for EBWT files
425 uint32_t bmaxSqrtMult,
428 vector<FileBuf*>& is,
429 vector<RefRecord>& szs,
430 vector<uint32_t>& plens,
432 const RefReadInParams& refparams,
434 int32_t __overrideOffRate = -1,
435 int32_t __overrideIsaRate = -1,
436 bool verbose = false,
437 bool passMemExc = false,
438 bool sanityCheck = false) :
448 refparams.reverse == REF_READ_REVERSE)
450 _in1Str = file + ".1.ebwt";
451 _in2Str = file + ".2.ebwt";
453 ofstream fout1(_in1Str.c_str(), ios::binary);
455 cerr << "Could not open index file for writing: \"" << _in1Str << "\"" << endl
456 << "Please make sure the directory exists and that permissions allow writing by" << endl
457 << "Bowtie." << endl;
460 ofstream fout2(_in2Str.c_str(), ios::binary);
462 cerr << "Could not open index file for writing: \"" << _in2Str << "\"" << endl
463 << "Please make sure the directory exists and that permissions allow writing by" << endl
464 << "Bowtie." << endl;
482 // Close output files
484 int64_t tellpSz1 = (int64_t)fout1.tellp();
485 VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str);
488 if(tellpSz1 > fileSize(_in1Str.c_str())) {
490 cerr << "Index is corrupt: File size for " << _in1Str << " should have been " << tellpSz1
491 << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
494 int64_t tellpSz2 = (int64_t)fout2.tellp();
495 VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str);
497 if(tellpSz2 > fileSize(_in2Str.c_str())) {
499 cerr << "Index is corrupt: File size for " << _in2Str << " should have been " << tellpSz2
500 << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
503 cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
506 // Reopen as input streams
507 VMSG_NL("Re-opening _in1 and _in2 as input streams");
509 VMSG_NL("Sanity-checking Ebwt");
510 assert(!isInMemory());
513 __fw ? -1 : refparams.reverse == REF_READ_REVERSE,
519 sanityCheckAll(refparams.reverse);
521 assert(!isInMemory());
523 VMSG_NL("Returning from Ebwt constructor");
529 * Write the rstarts array given the szs array for the reference.
531 void szsToDisk(const vector<RefRecord>& szs, ostream& os, int reverse) {
535 for(unsigned int i = 0; i < szs.size(); i++) {
536 if(szs[i].len == 0) continue;
537 if(szs[i].first) off = 0;
539 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
540 if(szs[i].first && szs[i].len > 0) seq++;
542 if(szs[i].first) seq++;
544 size_t seqm1 = seq-1;
545 assert_lt(seqm1, _nPat);
547 if(reverse == REF_READ_REVERSE) {
548 // Invert pattern idxs
549 seqm1 = _nPat - seqm1 - 1;
550 // Invert pattern idxs
551 assert_leq(off + szs[i].len, _plen[seqm1]);
552 fwoff = _plen[seqm1] - (off + szs[i].len);
554 writeU32(os, totlen, this->toBe()); // offset from beginning of joined string
555 writeU32(os, seqm1, this->toBe()); // sequence id
556 writeU32(os, fwoff, this->toBe()); // offset into sequence
557 totlen += szs[i].len;
563 * Helper for the constructors above. Takes a vector of text
564 * strings and joins them into a single string with a call to
565 * joinToDisk, which does a join (with padding) and writes some of
566 * the resulting data directly to disk rather than keep it in
567 * memory. It then constructs a suffix-array producer (what kind
568 * depends on 'useBlockwise') for the resulting sequence. The
569 * suffix-array producer can then be used to obtain chunks of the
570 * joined string's suffix array.
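*
* Roughly, the flow implemented below is (a sketch, not verbatim code):
*
*   TStr s;                                   // joined reference
*   joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
*   KarkkainenBlockwiseSA<TStr> bsa(s, bmax, dcv, seed, _sanity, _passMemExc, _verbose);
*   buildToDisk(bsa, s, out1, out2);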
573 vector<FileBuf*>& is,
574 vector<RefRecord>& szs,
575 vector<uint32_t>& plens,
577 const RefReadInParams& refparams,
582 uint32_t bmaxSqrtMult,
587 // Compose text strings into single string
588 VMSG_NL("Calculating joined length");
589 TStr s; // holds the entire joined reference after call to joinToDisk
591 jlen = joinedLen(szs);
592 assert_geq(jlen, sztot);
593 VMSG_NL("Writing header");
594 writeFromMemory(true, out1, out2);
596 VMSG_NL("Reserving space for joined string");
597 seqan::reserve(s, jlen, Exact());
598 VMSG_NL("Joining reference sequences");
599 if(refparams.reverse == REF_READ_REVERSE) {
601 Timer timer(cout, " Time to join reference sequences: ", _verbose);
602 joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
604 Timer timer(cout, " Time to reverse reference sequence: ", _verbose);
605 vector<RefRecord> tmp;
607 reverseRefRecords(szs, tmp, false, false);
608 szsToDisk(tmp, out1, refparams.reverse);
611 Timer timer(cout, " Time to join reference sequences: ", _verbose);
612 joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
613 szsToDisk(szs, out1, refparams.reverse);
615 // Joined reference sequence now in 's'
616 } catch(bad_alloc& e) {
617 // If we throw an allocation exception in the try block,
618 // that means that the joined version of the reference
619 string itself is too large to fit in memory. The only
620 // alternatives are to tell the user to give us more memory
621 // or to try again with a packed representation of the
622 // reference (if we haven't tried that already).
623 cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
624 if(!isPacked() && _passMemExc) {
625 // Pass the exception up so that we can retry using a
626 // packed string representation
629 // There's no point passing this exception on. The fact
630 // that we couldn't allocate the joined string means that
631 // --bmax is irrelevant - the user should re-run with
634 cerr << "Please try running bowtie-build on a computer with more memory." << endl;
636 cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
637 << "mode (-a/--auto), or try again on a computer with more memory." << endl;
639 if(sizeof(void*) == 4) {
640 cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
641 << "this executable is 32-bit." << endl;
645 // Successfully obtained joined reference string
646 assert_geq(length(s), jlen);
647 if(bmax != 0xffffffff) {
648 VMSG_NL("bmax according to bmax setting: " << bmax);
650 else if(bmaxSqrtMult != 0xffffffff) {
651 bmax *= bmaxSqrtMult;
652 VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
654 else if(bmaxDivN != 0xffffffff) {
655 bmax = max<uint32_t>(jlen / bmaxDivN, 1);
656 VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
659 bmax = (uint32_t)sqrt(length(s));
660 VMSG_NL("bmax defaulted to: " << bmax);
664 // Look for bmax/dcv parameters that work.
666 if(!first && bmax < 40 && _passMemExc) {
667 cerr << "Could not find appropriate bmax/dcv settings for building this index." << endl;
669 // Throw an exception so that we can
670 // retry using a packed string representation
673 cerr << "Already tried a packed string representation." << endl;
675 cerr << "Please try indexing this reference on a computer with more memory." << endl;
676 if(sizeof(void*) == 4) {
677 cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
678 << "this executable is 32-bit." << endl;
682 if(dcv > 4096) dcv = 4096;
683 if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
684 dcv <<= 1; // double difference-cover period
686 bmax -= (bmax >> 2); // reduce by 25%
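// (e.g. an initial bmax of 1024 shrinks to 768, 576, 432, ... on
// successive retries, except that every sixth retry doubles dcv
// instead; illustrative numbers)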
688 VMSG("Using parameters --bmax " << bmax);
690 VMSG_NL(" and *no difference cover*");
692 VMSG_NL(" --dcv " << dcv);
697 VMSG_NL(" Doing ahead-of-time memory usage test");
698 // Make a quick-and-dirty attempt to force a bad_alloc iff
699 // we would have thrown one eventually as part of
700 // constructing the DifferenceCoverSample
702 size_t sz = DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
703 AutoArray<uint8_t> tmp(sz);
705 // Likewise with the KarkkainenBlockwiseSA
706 sz = KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
707 AutoArray<uint8_t> tmp2(sz);
708 // Now throw in the 'ftab' and 'isaSample' structures
709 // that we'll eventually allocate in buildToDisk
710 AutoArray<uint32_t> ftab(_eh._ftabLen * 2);
711 AutoArray<uint8_t> side(_eh._sideSz);
712 // Grab another 20M 32-bit words (~80 MB) out of caution
713 AutoArray<uint32_t> extra(20*1024*1024);
714 // If we made it here without throwing bad_alloc, then we
715 // passed the memory-usage stress test
716 VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
722 VMSG_NL("Constructing suffix-array element generator");
723 KarkkainenBlockwiseSA<TStr> bsa(s, bmax, dcv, seed, _sanity, _passMemExc, _verbose);
724 assert(bsa.suffixItrIsReset());
725 assert_eq(bsa.size(), length(s)+1);
726 VMSG_NL("Converting suffix-array elements to index image");
727 buildToDisk(bsa, s, out1, out2);
728 out1.flush(); out2.flush();
729 if(out1.fail() || out2.fail()) {
730 cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
734 } catch(bad_alloc& e) {
736 VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters.");
738 cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl
739 << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
746 // Now write reference sequence names on the end
747 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
748 assert_geq(this->_refnames.size(), this->_nPat);
750 assert_eq(this->_refnames.size(), this->_nPat);
752 for(size_t i = 0; i < this->_refnames.size(); i++) {
753 out1 << this->_refnames[i] << endl;
756 out1.flush(); out2.flush();
757 if(out1.fail() || out2.fail()) {
758 cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
761 VMSG_NL("Returning from initFromVector");
765 * Return the length that the joined string of the given string
766 * list will have. Note that this is indifferent to how the text
767 * fragments correspond to input sequences - it just cares about
768 * the lengths of the fragments.
770 uint32_t joinedLen(vector<RefRecord>& szs) {
772 for(unsigned int i = 0; i < szs.size(); i++) {
780 // Only free buffers if we're *not* using memory-mapped files
782 // Delete everything that was allocated in read(false, ...)
783 if(_fchr != NULL) delete[] _fchr; _fchr = NULL;
784 if(_ftab != NULL) delete[] _ftab; _ftab = NULL;
785 if(_eftab != NULL) delete[] _eftab; _eftab = NULL;
786 if(_offs != NULL && !useShmem_) {
787 delete[] _offs; _offs = NULL;
788 } else if(_offs != NULL && useShmem_) {
791 if(_isa != NULL) delete[] _isa; _isa = NULL;
792 if(_plen != NULL) delete[] _plen; _plen = NULL;
793 if(_rstarts != NULL) delete[] _rstarts; _rstarts = NULL;
794 if(_ebwt != NULL && !useShmem_) {
795 delete[] _ebwt; _ebwt = NULL;
796 } else if(_ebwt != NULL && useShmem_) {
803 cout << (_fw ? "Forward index:" : "Mirror index:") << endl;
804 cout << " mapLFEx: " << mapLFExs_ << endl;
805 cout << " mapLF: " << mapLFs_ << endl;
806 cout << " mapLF(c): " << mapLFcs_ << endl;
807 cout << " mapLF1(c): " << mapLF1cs_ << endl;
808 cout << " mapLF1: " << mapLF1s_ << endl;
813 const EbwtParams& eh() const { return _eh; }
814 uint32_t zOff() const { return _zOff; }
815 uint32_t zEbwtByteOff() const { return _zEbwtByteOff; }
816 int zEbwtBpOff() const { return _zEbwtBpOff; }
817 uint32_t nPat() const { return _nPat; }
818 uint32_t nFrag() const { return _nFrag; }
819 uint32_t* fchr() const { return _fchr; }
820 uint32_t* ftab() const { return _ftab; }
821 uint32_t* eftab() const { return _eftab; }
822 uint32_t* offs() const { return _offs; }
823 uint32_t* isa() const { return _isa; }
824 uint32_t* plen() const { return _plen; }
825 uint32_t* rstarts() const { return _rstarts; }
826 uint8_t* ebwt() const { return _ebwt; }
827 const ReferenceMap* rmap() const { return rmap_; }
828 bool toBe() const { return _toBigEndian; }
829 bool verbose() const { return _verbose; }
830 bool sanityCheck() const { return _sanity; }
831 vector<string>& refnames() { return _refnames; }
832 bool fw() const { return _fw; }
834 /// Return true iff the Ebwt is currently in memory
835 bool isInMemory() const {
838 assert(_ftab != NULL);
839 assert(_eftab != NULL);
840 assert(_fchr != NULL);
841 assert(_offs != NULL);
842 assert(_isa != NULL);
843 assert(_rstarts != NULL);
844 assert_neq(_zEbwtByteOff, 0xffffffff);
845 assert_neq(_zEbwtBpOff, -1);
848 assert(_ftab == NULL);
849 assert(_eftab == NULL);
850 assert(_fchr == NULL);
851 assert(_offs == NULL);
852 assert(_rstarts == NULL);
853 assert_eq(_zEbwtByteOff, 0xffffffff);
854 assert_eq(_zEbwtBpOff, -1);
859 /// Return true iff the Ebwt is currently stored on disk
860 bool isEvicted() const {
861 return !isInMemory();
865 * Load this Ebwt into memory by reading it in from the _in1 and
870 int needEntireReverse,
875 color, // expect index to be colorspace?
876 needEntireReverse, // require reverse index to be concatenated reference reversed
877 false, // stop after loading the header portion?
880 loadNames, // loadNames
881 verbose); // startVerbose
885 * Frees memory associated with the Ebwt.
887 void evictFromMemory() {
888 assert(isInMemory());
893 if(!useShmem_) delete[] _offs;
895 // Keep plen; it's small and the client may want to query it
896 // even when the others are evicted.
899 if(!useShmem_) delete[] _ebwt;
906 // Keep plen; it's small and the client may want to query it
907 // even when the others are evicted.
911 _zEbwtByteOff = 0xffffffff;
916 * Non-static facade for static function ftabHi.
918 uint32_t ftabHi(uint32_t i) const {
919 return Ebwt::ftabHi(_ftab, _eftab, _eh._len, _eh._ftabLen,
924 * Get "high interpretation" of ftab entry at index i. The high
925 * interpretation of a regular ftab entry is just the entry
926 * itself. The high interpretation of an extended entry is the
927 * second corresponding ui32 in the eftab.
929 * It's a static member because it's convenient to ask this
930 * question before the Ebwt is fully initialized.
932 static uint32_t ftabHi(uint32_t *ftab,
939 assert_lt(i, ftabLen);
943 uint32_t efIdx = ftab[i] ^ 0xffffffff;
944 assert_lt(efIdx*2+1, eftabLen);
945 return eftab[efIdx*2+1];
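// (An extended ftab entry stores the bitwise complement of its eftab
//  index: e.g. ftab[i] == ~5u means the low/high values live at
//  eftab[10] and eftab[11] respectively.)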
950 * Non-static facade for static function ftabLo.
952 uint32_t ftabLo(uint32_t i) const {
953 return Ebwt::ftabLo(_ftab, _eftab, _eh._len, _eh._ftabLen,
958 * Get "low interpretation" of ftab entry at index i. The low
959 * interpretation of a regular ftab entry is just the entry
960 * itself. The low interpretation of an extended entry is the
961 * first corresponding ui32 in the eftab.
963 * It's a static member because it's convenient to ask this
964 * question before the Ebwt is fully initialized.
966 static uint32_t ftabLo(uint32_t *ftab,
973 assert_lt(i, ftabLen);
977 uint32_t efIdx = ftab[i] ^ 0xffffffff;
978 assert_lt(efIdx*2+1, eftabLen);
979 return eftab[efIdx*2];
984 * When using read() to create an Ebwt, we have to set a couple of
985 * additional fields in the Ebwt object that aren't part of the
986 * parameter list and are not stored explicitly in the file. Right
987 * now, this just involves initializing _zEbwtByteOff and
988 * _zEbwtBpOff from _zOff.
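*
* Worked example (illustrative, assuming the default side geometry of
* _sideBwtLen == 224 and _sideSz == 64): _zOff == 300 gives sideNum == 1
* (an odd, i.e. forward, side) and sideCharOff == 76, so _zEbwtByteOff
* ends up as 64 + 19 == 83 and _zEbwtBpOff as 0.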
990 void postReadInit(EbwtParams& eh) {
991 uint32_t sideNum = _zOff / eh._sideBwtLen;
992 uint32_t sideCharOff = _zOff % eh._sideBwtLen;
993 uint32_t sideByteOff = sideNum * eh._sideSz;
994 _zEbwtByteOff = sideCharOff >> 2;
995 assert_lt(_zEbwtByteOff, eh._sideBwtSz);
996 _zEbwtBpOff = sideCharOff & 3;
997 assert_lt(_zEbwtBpOff, 4);
998 if((sideNum & 1) == 0) {
999 // This is an even (backward) side
1000 _zEbwtByteOff = eh._sideBwtSz - _zEbwtByteOff - 1;
1001 _zEbwtBpOff = 3 - _zEbwtBpOff;
1002 assert_lt(_zEbwtBpOff, 4);
1004 _zEbwtByteOff += sideByteOff;
1005 assert(repOk(eh)); // Ebwt should be fully initialized now
1009 * Pretty-print the Ebwt to the given output stream.
1011 void print(ostream& out) const {
1016 * Pretty-print the Ebwt and given EbwtParams to the given output
1019 void print(ostream& out, const EbwtParams& eh) const {
1020 eh.print(out); // print params
1021 out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl
1022 << " zOff: " << _zOff << endl
1023 << " zEbwtByteOff: " << _zEbwtByteOff << endl
1024 << " zEbwtBpOff: " << _zEbwtBpOff << endl
1025 << " nPat: " << _nPat << endl
1028 out << "NULL" << endl;
1030 out << "non-NULL, [0] = " << _plen[0] << endl;
1032 out << " rstarts: ";
1033 if(_rstarts == NULL) {
1034 out << "NULL" << endl;
1036 out << "non-NULL, [0] = " << _rstarts[0] << endl;
1040 out << "NULL" << endl;
1042 out << "non-NULL, [0] = " << _ebwt[0] << endl;
1046 out << "NULL" << endl;
1048 out << "non-NULL, [0] = " << _fchr[0] << endl;
1052 out << "NULL" << endl;
1054 out << "non-NULL, [0] = " << _ftab[0] << endl;
1057 if(_eftab == NULL) {
1058 out << "NULL" << endl;
1060 out << "non-NULL, [0] = " << _eftab[0] << endl;
1064 out << "NULL" << endl;
1066 out << "non-NULL, [0] = " << _offs[0] << endl;
1071 static TStr join(vector<TStr>& l, uint32_t seed);
1072 static TStr join(vector<FileBuf*>& l, vector<RefRecord>& szs, uint32_t sztot, const RefReadInParams& refparams, uint32_t seed);
1073 void joinToDisk(vector<FileBuf*>& l, vector<RefRecord>& szs, vector<uint32_t>& plens, uint32_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed);
1074 void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2);
1077 void readIntoMemory(int color, int needEntireReverse, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
1078 void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
1079 void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
1082 void printRangeFw(uint32_t begin, uint32_t end) const;
1083 void printRangeBw(uint32_t begin, uint32_t end) const;
1084 void sanityCheckUpToSide(int upToSide) const;
1085 void sanityCheckAll(int reverse) const;
1086 void restore(TStr& s) const;
1087 void checkOrigs(const vector<String<Dna5> >& os, bool color, bool mirror) const;
1089 // Searching and reporting
1090 void joinedToTextOff(uint32_t qlen, uint32_t off, uint32_t& tidx, uint32_t& textoff, uint32_t& tlen) const;
1091 inline bool report(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<uint32_t>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, uint32_t off, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params) const;
1092 inline bool reportChaseOne(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<uint32_t>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
1093 inline bool reportReconstruct(const String<Dna5>& query, String<char>* quals, String<char>* name, String<Dna5>& lbuf, String<Dna5>& rbuf, const uint32_t *mmui32, const char* refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
1094 inline int rowL(const SideLocus& l) const;
1095 inline uint32_t countUpTo(const SideLocus& l, int c) const;
1096 inline void countUpToEx(const SideLocus& l, uint32_t* pairs) const;
1097 inline uint32_t countFwSide(const SideLocus& l, int c) const;
1098 inline void countFwSideEx(const SideLocus& l, uint32_t *pairs) const;
1099 inline uint32_t countBwSide(const SideLocus& l, int c) const;
1100 inline void countBwSideEx(const SideLocus& l, uint32_t *pairs) const;
1101 inline uint32_t mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
1102 inline void mapLFEx(const SideLocus& l, uint32_t *pairs ASSERT_ONLY(, bool overrideSanity = false)) const;
1103 inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, uint32_t *tops, uint32_t *bots ASSERT_ONLY(, bool overrideSanity = false)) const;
1104 inline uint32_t mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
1105 inline uint32_t mapLF1(uint32_t row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
1106 inline int mapLF1(uint32_t& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
1107 /// Check that in-memory Ebwt is internally consistent with respect
1108 /// to given EbwtParams; assert if not
1109 bool inMemoryRepOk(const EbwtParams& eh) const {
1110 assert_leq(ValueSize<TAlphabet>::VALUE, 4);
1111 assert_geq(_zEbwtBpOff, 0);
1112 assert_lt(_zEbwtBpOff, 4);
1113 assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
1114 assert_lt(_zOff, eh._bwtLen);
1115 assert(_rstarts != NULL);
1116 assert_geq(_nFrag, _nPat);
1120 /// Check that in-memory Ebwt is internally consistent; assert if
1122 bool inMemoryRepOk() const {
1126 /// Check that Ebwt is internally consistent with respect to given
1127 /// EbwtParams; assert if not
1128 bool repOk(const EbwtParams& eh) const {
1129 assert(_eh.repOk());
1131 return inMemoryRepOk(eh);
1136 /// Check that Ebwt is internally consistent; assert if not
1137 bool repOk() const {
1142 int32_t _overrideOffRate;
1143 int32_t _overrideIsaRate;
1147 bool _fw; // true iff this is a forward index
1148 MM_FILE _in1; // input fd for primary index file
1149 MM_FILE _in2; // input fd for secondary index file
1150 string _in1Str; // filename for primary index file
1151 string _in2Str; // filename for secondary index file
1153 uint32_t _zEbwtByteOff;
1155 uint32_t _nPat; /// number of reference texts
1156 uint32_t _nFrag; /// number of fragments
1158 uint32_t* _rstarts; // starting offset of fragments / text indexes
1159 // _fchr, _ftab and _eftab are expected to be relatively small
1160 // (usually < 1MB, perhaps a few MB if _ftabChars is particularly large
1161 // - like, say, 11). For this reason, we don't bother with writing
1162 // them to disk through separate output streams; we
1165 uint32_t* _eftab; // "extended" entries for _ftab
1166 // _offs may be extremely large. E.g. for DNA w/ offRate=4 (one
1167 // offset every 16 rows), the total size of _offs is the same as
1168 // the total size of the input sequence
1171 // _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
1172 // is at least as large as the input sequence.
1174 bool _useMm; /// use memory-mapped files to hold the index
1175 bool useShmem_; /// use shared memory to hold large parts of the index
1176 vector<string> _refnames; /// names of the reference sequences
1177 const ReferenceMap* rmap_; /// mapping into another reference coordinate space
1190 ostream& log() const {
1191 return cout; // TODO: turn this into a parameter
1194 /// Print a verbose message and flush (flushing is helpful for
1196 void verbose(const string& s) const {
1197 if(this->verbose()) {
1199 this->log().flush();
1204 /// Specialization for packed Ebwts - return true
1206 bool Ebwt<String<Dna, Packed<> > >::isPacked() {
1210 /// By default, Ebwts are not packed
1211 template<typename TStr>
1212 bool Ebwt<TStr>::isPacked() {
1217 * Structure encapsulating search parameters, such as whether and how
1218 * to backtrack and how to deal with multiple equally-good hits.
1220 template<typename TStr>
1221 class EbwtSearchParams {
1223 EbwtSearchParams(HitSinkPerThread& sink,
1224 const vector<String<Dna5> >& texts,
1226 bool ebwtFw = true) :
1232 HitSinkPerThread& sink() const { return _sink; }
1233 void setPatId(uint32_t patid) { _patid = patid; }
1234 uint32_t patId() const { return _patid; }
1235 void setFw(bool fw) { _fw = fw; }
1236 bool fw() const { return _fw; }
1238 * Report a hit. Returns true iff caller can call off the search.
1240 bool reportHit(const String<Dna5>& query, // read sequence
1241 String<char>* quals, // read quality values
1242 String<char>* name, // read name
1243 bool color, // true -> read is colorspace
1244 bool colExEnds, // true -> exclude nucleotides at extreme ends after decoding
1245 int snpPhred, // penalty for a SNP
1246 const BitPairReference* ref, // reference (= NULL if not necessary)
1247 const ReferenceMap* rmap, // map to another reference coordinate system
1248 bool ebwtFw, // whether index is forward (true) or mirror (false)
1249 const std::vector<uint32_t>& mmui32, // mismatch list
1250 const std::vector<uint8_t>& refcs, // reference characters
1251 size_t numMms, // # mismatches
1252 U32Pair h, // ref coords
1253 U32Pair mh, // mate's ref coords
1254 bool mfw, // mate's orientation
1255 uint16_t mlen, // mate length
1256 U32Pair a, // arrow pair
1257 uint32_t tlen, // length of text
1258 uint32_t qlen, // length of query
1259 int stratum, // alignment stratum
1260 uint16_t cost, // cost of alignment
1261 uint32_t oms, // approx. # other valid alignments
1267 // Check that no two elements of the mms array are the same
1268 for(size_t i = 0; i < numMms; i++) {
1269 for(size_t j = i+1; j < numMms; j++) {
1270 assert_neq(mmui32[i], mmui32[j]);
1274 // If ebwtFw is true, then 'query' and 'quals' are reversed
1275 // If _fw is false, then 'query' and 'quals' are reverse complemented
1276 assert(!color || ref != NULL);
1277 assert(quals != NULL);
1278 assert(name != NULL);
1279 assert_eq(mmui32.size(), refcs.size());
1280 assert_leq(numMms, mmui32.size());
1283 hit.stratum = stratum;
1288 // Re-reverse the pattern and the quals back to how they
1289 // appeared in the read file
1290 ::reverseInPlace(hit.patSeq);
1291 ::reverseInPlace(hit.quals);
1294 hit.colSeq = hit.patSeq;
1295 hit.colQuals = hit.quals;
1296 hit.crefcs.resize(qlen, 0);
1297 // Turn the mmui32 and refcs arrays into the mm FixedBitset and
1299 for(size_t i = 0; i < numMms; i++) {
1300 if (ebwtFw != _fw) {
1301 // The 3' end is on the left but the mm vector encodes
1302 // mismatches w/r/t the 5' end, so we flip
1303 uint32_t off = qlen - mmui32[i] - 1;
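// (e.g. for a 36-position read, a mismatch recorded at offset 2
//  becomes 36 - 2 - 1 == 33 after the flip)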
1305 hit.crefcs[off] = refcs[i];
1307 hit.cmms.set(mmui32[i]);
1308 hit.crefcs[i] = refcs[i];
1311 assert(ref != NULL);
1313 uint32_t rfbuf[(1024+16)/4];
1314 ASSERT_ONLY(char rfbuf2[1024]);
1321 // TODO: account for indels when calculating these bounds
1323 size_t readf = seqan::length(hit.patSeq);
1325 size_t reff = readf + 1;
1326 bool maqRound = false;
1327 for(size_t i = 0; i < qlen + 1; i++) {
1329 read[i] = (int)hit.patSeq[i];
1330 qual[i] = mmPenalty(maqRound, phredCharToPhredQual(hit.quals[i]));
1332 ASSERT_ONLY(rfbuf2[i] = ref->getBase(h.first, h.second + i));
1334 int offset = ref->getStretch(rfbuf, h.first, h.second, qlen + 1);
1335 char *rf = (char*)rfbuf + offset;
1336 for(size_t i = 0; i < qlen + 1; i++) {
1337 assert_eq(rf[i], rfbuf2[i]);
1338 rf[i] = (1 << rf[i]);
1341 read, // ASCII colors, '0', '1', '2', '3', '.'
1342 qual, // ASCII quals, Phred+33 encoded
1343 readi, // offset of first character within 'read' to consider
1344 readf, // offset of last char (exclusive) in 'read' to consider
1345 rf, // reference sequence, as masks
1346 refi, // offset of first character within 'ref' to consider
1347 reff, // offset of last char (exclusive) in 'ref' to consider
1348 snpPhred, // penalty incurred by a SNP
1349 ns, // decoded nucleotides are appended here
1350 cmm, // where the color mismatches are in the string
1351 nmm, // where nucleotide mismatches are in the string
1352 cmms, // number of color mismatches
1353 nmms);// number of nucleotide mismatches
1354 size_t nqlen = qlen + (colExEnds ? -1 : 1);
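// (n color calls decode to n+1 nucleotides; excluding the extreme
//  bases trims one nucleotide from each end, leaving n-1)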
1355 seqan::resize(hit.patSeq, nqlen);
1356 seqan::resize(hit.quals, nqlen);
1357 hit.refcs.resize(nqlen);
1358 size_t lo = colExEnds ? 1 : 0;
1359 size_t hi = colExEnds ? qlen : qlen+1;
1361 for(size_t i = lo; i < hi; i++, destpos++) {
1362 // Set sequence character
1363 assert_leq(ns[i], 4);
1364 assert_geq(ns[i], 0);
1365 hit.patSeq[destpos] = (Dna5)(int)ns[i];
1366 // Set initial quality
1367 hit.quals[destpos] = '!';
1368 // Color mismatches penalize quality
1370 if(cmm[i-1] == 'M') {
1371 if((int)hit.quals[destpos] + (int)qual[i-1] > 126) {
1372 hit.quals[destpos] = 126;
1374 hit.quals[destpos] += qual[i-1];
1376 } else if((int)hit.colSeq[i-1] != 4) {
1377 hit.quals[destpos] -= qual[i-1];
1382 if((int)hit.quals[destpos] + (int)qual[i] > 126) {
1383 hit.quals[destpos] = 126;
1385 hit.quals[destpos] += qual[i];
1387 } else if((int)hit.patSeq[i] != 4) {
1388 hit.quals[destpos] -= qual[i];
1391 if(hit.quals[destpos] < '!') {
1392 hit.quals[destpos] = '!';
1395 uint32_t off = i - (colExEnds? 1:0);
1396 if(!_fw) off = nqlen - off - 1;
1397 assert_lt(off, nqlen);
1399 hit.refcs[off] = "ACGT"[ref->getBase(h.first, h.second+i)];
1403 // Extreme bases have been removed; that makes the
1404 // nucleotide alignment one character shorter than the
1407 // It also shifts the alignment's offset up by 1
1410 // Extreme bases are included; that makes the
1411 // nucleotide alignment one character longer than the
1416 // Turn the mmui32 and refcs arrays into the mm FixedBitset and
1418 hit.refcs.resize(qlen, 0);
1419 for(size_t i = 0; i < numMms; i++) {
1420 if (ebwtFw != _fw) {
1421 // The 3' end is on the left but the mm vector encodes
1422 // mismatches w/r/t the 5' end, so we flip
1423 uint32_t off = qlen - mmui32[i] - 1;
1425 hit.refcs[off] = refcs[i];
1427 hit.mms.set(mmui32[i]);
1428 hit.refcs[mmui32[i]] = refcs[i];
1432 // Check the hit against the original text, if it's available
1433 if(_texts.size() > 0) {
1434 assert_lt(h.first, _texts.size());
1435 FixedBitset<1024> diffs;
1436 // This type of check assumes that only mismatches are
1437 // possible. If indels are possible, then we either need
1438 // the caller to provide information about indel locations,
1439 // or we need to extend this to a more complicated check.
1440 assert_leq(h.second + qlen, length(_texts[h.first]));
1441 for(size_t i = 0; i < qlen; i++) {
1442 assert_neq(4, (int)_texts[h.first][h.second + i]);
1443 // Forward pattern appears at h
1444 if((int)hit.patSeq[i] != (int)_texts[h.first][h.second + i]) {
1446 // if ebwtFw != _fw the 3' end is on the
1447 // left end of the pattern, but the diff vector
1448 // should encode mismatches w/r/t the 5' end,
1450 if (_fw) diffs.set(qoff);
1451 else diffs.set(qlen - qoff - 1);
1454 if(diffs != hit.mms) {
1455 // Oops, mismatches were not where we expected them;
1456 // print a diagnostic message before asserting
1457 cerr << "Expected " << hit.mms.str() << " mismatches, got " << diffs.str() << endl;
1458 cerr << " Pat: " << hit.patSeq << endl;
1460 for(size_t i = 0; i < qlen; i++) {
1461 cerr << _texts[h.first][h.second + i];
1464 cerr << " mmui32: ";
1465 for(size_t i = 0; i < numMms; i++) {
1466 cerr << mmui32[i] << " ";
1469 cerr << " FW: " << _fw << endl;
1470 cerr << " Ebwt FW: " << ebwtFw << endl;
1472 if(diffs != hit.mms) assert(false);
1475 if(rmap != NULL) rmap->map(hit.h);
1476 hit.patId = ((patid == 0xffffffff) ? _patid : patid);
1477 hit.patName = *name;
1486 assert(hit.repOk());
1487 return sink().reportHit(hit, stratum);
1490 HitSinkPerThread& _sink;
1491 const vector<String<Dna5> >& _texts; // original texts, if available (if not
1492 // available, _texts.size() == 0)
1493 uint32_t _patid; // id of current read
1494 bool _fw; // current read is forward-oriented
1498 * Encapsulates a location in the bwt text in terms of the side it
1499 * occurs in and its offset within the side.
1511 * Construct from row and other relevant information about the Ebwt.
1513 SideLocus(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) {
1514 initFromRow(row, ep, ebwt);
1518 * Init two SideLocus objects from a top/bot pair, using the result
1519 * from one call to initFromRow to possibly avoid a second call.
1521 static void initFromTopBot(uint32_t top,
1523 const EbwtParams& ep,
1524 const uint8_t* ebwt,
1528 const uint32_t sideBwtLen = ep._sideBwtLen;
1529 const uint32_t sideBwtSz = ep._sideBwtSz;
1530 assert_gt(bot, top);
1531 ltop.initFromRow(top, ep, ebwt);
1532 uint32_t spread = bot - top;
1533 if(ltop._charOff + spread < sideBwtLen) {
1534 lbot._charOff = ltop._charOff + spread;
1535 lbot._sideNum = ltop._sideNum;
1536 lbot._sideByteOff = ltop._sideByteOff;
1537 lbot._fw = ltop._fw;
1538 lbot._by = lbot._charOff >> 2;
1539 assert_lt(lbot._by, (int)sideBwtSz);
1540 if(!lbot._fw) lbot._by = sideBwtSz - lbot._by - 1;
1541 lbot._bp = lbot._charOff & 3;
1542 if(!lbot._fw) lbot._bp ^= 3;
1544 lbot.initFromRow(bot, ep, ebwt);
1549 * Calculate SideLocus based on a row and other relevant
1550 * information about the shape of the Ebwt.
1552 void initFromRow(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) {
1553 const uint32_t sideSz = ep._sideSz;
1554 // Side length is hard-coded for now; this allows the compiler
1555 // to do clever things to accelerate / and %.
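// (224 bit-pairs per side corresponds to the usual 64-byte side: 56
// bytes of BWT times 4 bit-pairs per byte, with the remaining 8 bytes
// holding the occ[] counts; i.e. _eh._sideBwtLen under the default
// line geometry.)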
1556 _sideNum = row / 224;
1557 _charOff = row % 224;
1558 _sideByteOff = _sideNum * sideSz;
1559 assert_leq(row, ep._len);
1560 assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
1562 __builtin_prefetch((const void *)(ebwt + _sideByteOff),
1563 0 /* prepare for read */,
1566 // prefetch the adjacent side too
1567 _fw = (_sideNum & 1) != 0; // odd-numbered sides are forward
1568 _by = _charOff >> 2; // byte within side
1569 assert_lt(_by, (int)ep._sideBwtSz);
1570 _bp = _charOff & 3; // bit-pair within byte
1572 _by = ep._sideBwtSz - _by - 1;
1577 /// Return true iff this is an initialized SideLocus
1582 /// Make this look like an invalid SideLocus
1587 const uint8_t *side(const uint8_t* ebwt) const {
1588 return ebwt + _sideByteOff;
1591 const uint8_t *oside(const uint8_t* ebwt) const {
1592 return ebwt + _sideByteOff + (_fw? (-128) : (128));
1595 uint32_t _sideByteOff; // offset of top side within ebwt[]
1596 uint32_t _sideNum; // index of side
1597 uint16_t _charOff; // character offset within side
1598 bool _fw; // side is forward or backward?
1599 int16_t _by; // byte within side (not adjusted for bw sides)
1600 int8_t _bp; // bitpair within byte (not adjusted for bw sides)
1603 #include "ebwt_search_backtrack.h"
1605 ///////////////////////////////////////////////////////////////////////
1607 // Functions for printing and sanity-checking Ebwts
1609 ///////////////////////////////////////////////////////////////////////
1612 * Given a range of positions in the EBWT array within the BWT portion
1613 * of a forward side, print the characters at those positions along
1614 * with a summary occ[] array.
1616 template<typename TStr>
1617 void Ebwt<TStr>::printRangeFw(uint32_t begin, uint32_t end) const {
1618 assert(isInMemory());
1619 uint32_t occ[] = {0, 0, 0, 0};
1620 assert_gt(end, begin);
1621 for(uint32_t i = begin; i < end; i++) {
1622 uint8_t by = this->_ebwt[i];
1623 for(int j = 0; j < 4; j++) {
1624 // Unpack from lowest to highest bit pair
1625 int twoBit = unpack_2b_from_8b(by, j);
1627 cout << "ACGT"[twoBit];
1629 assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
1631 cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
1635 * Given a range of positions in the EBWT array within the BWT portion
1636 * of a backward side, print the characters at those positions along
1637 * with a summary occ[] array.
1639 template<typename TStr>
1640 void Ebwt<TStr>::printRangeBw(uint32_t begin, uint32_t end) const {
1641 assert(isInMemory());
1642 uint32_t occ[] = {0, 0, 0, 0};
1643 assert_gt(end, begin);
1644 for(uint32_t i = end-1; i >= begin; i--) {
1645 uint8_t by = this->_ebwt[i];
1646 for(int j = 3; j >= 0; j--) {
1648 // Unpack from highest to lowest bit pair
1648 int twoBit = unpack_2b_from_8b(by, j);
1650 cout << "ACGT"[twoBit];
1652 assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
1655 cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
1659 * Check that the ebwt array is internally consistent up to (and not
1660 * including) the given side index by re-counting the chars and
1661 * comparing against the embedded occ[] arrays.
1663 template<typename TStr>
1664 void Ebwt<TStr>::sanityCheckUpToSide(int upToSide) const {
1665 assert(isInMemory());
1666 uint32_t occ[] = {0, 0, 0, 0};
1667 uint32_t occ_save[] = {0, 0};
1668 uint32_t cur = 0; // byte pointer
1669 const EbwtParams& eh = this->_eh;
1671 while(cur < (upToSide * eh._sideSz)) {
1672 assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
1673 for(uint32_t i = 0; i < eh._sideBwtSz; i++) {
1674 uint8_t by = this->_ebwt[cur + (fw ? i : eh._sideBwtSz-i-1)];
1675 for(int j = 0; j < 4; j++) {
1676 // Unpack bit pairs in this side's order (low-to-high for fw sides, high-to-low for bw)
1677 int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
1679 //if(_verbose) cout << "ACGT"[twoBit];
1681 assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
1683 assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
1685 // Finished forward bucket; check saved [G] and [T]
1686 // against the two uint32_ts encoded here
1687 ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast<uint32_t*>(&this->_ebwt[cur + eh._sideBwtSz]));
1688 ASSERT_ONLY(uint32_t gs = u32ebwt[0]);
1689 ASSERT_ONLY(uint32_t ts = u32ebwt[1]);
1690 assert_eq(gs, occ_save[0]);
1691 assert_eq(ts, occ_save[1]);
1694 // Finished backward bucket; check current [A] and [C]
1695 // against the two uint32_ts encoded here
1696 ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast<uint32_t*>(&this->_ebwt[cur + eh._sideBwtSz]));
1697 ASSERT_ONLY(uint32_t as = u32ebwt[0]);
1698 ASSERT_ONLY(uint32_t cs = u32ebwt[1]);
1699 assert(as == occ[0] || as == occ[0]-1); // one 'a' is a skipped '$' and doesn't count toward occ[]
1700 assert_eq(cs, occ[1]);
1701 occ_save[0] = occ[2]; // save gs
1702 occ_save[1] = occ[3]; // save ts
1710 * Sanity-check various pieces of the Ebwt
1712 template<typename TStr>
1713 void Ebwt<TStr>::sanityCheckAll(int reverse) const {
1714 const EbwtParams& eh = this->_eh;
1715 assert(isInMemory());
1717 for(uint32_t i = 1; i < eh._ftabLen; i++) {
1718 assert_geq(this->ftabHi(i), this->ftabLo(i-1));
1719 assert_geq(this->ftabLo(i), this->ftabHi(i-1));
1720 assert_leq(this->ftabHi(i), eh._bwtLen+1);
1722 assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);
1725 int seenLen = (eh._bwtLen + 31) >> 5;
1728 seen = new uint32_t[seenLen]; // bitvector marking seen offsets
1729 } catch(bad_alloc& e) {
1730 cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
1733 memset(seen, 0, 4 * seenLen);
1734 uint32_t offsLen = eh._offsLen;
1735 for(uint32_t i = 0; i < offsLen; i++) {
1736 assert_lt(this->_offs[i], eh._bwtLen);
1737 int w = this->_offs[i] >> 5;
1738 int r = this->_offs[i] & 31;
1739 assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
1740 seen[w] |= (1 << r);
1745 assert_gt(this->_nPat, 0);
1748 for(uint32_t i = 0; i < this->_nPat; i++) {
1749 assert_geq(this->_plen[i], 0);
1753 for(uint32_t i = 0; i < this->_nFrag-1; i++) {
1754 assert_gt(this->_rstarts[(i+1)*3], this->_rstarts[i*3]);
1755 if(reverse == REF_READ_REVERSE) {
1756 assert(this->_rstarts[(i*3)+1] >= this->_rstarts[((i+1)*3)+1]);
1758 assert(this->_rstarts[(i*3)+1] <= this->_rstarts[((i+1)*3)+1]);
1763 sanityCheckUpToSide(eh._numSides);
1764 VMSG_NL("Ebwt::sanityCheck passed");
1767 ///////////////////////////////////////////////////////////////////////
1769 // Functions for searching Ebwts
1771 ///////////////////////////////////////////////////////////////////////
1774 * Return the final character in row i (i.e. the i'th character in the
1775 * BWT transform). Note that the 'L' in the name of the function
1776 * stands for 'last', as in the literature.
1778 template<typename TStr>
1779 inline int Ebwt<TStr>::rowL(const SideLocus& l) const {
1780 // Extract and return appropriate bit-pair
1781 #ifdef SIXTY4_FORMAT
1782 return (((uint64_t*)l.side(this->_ebwt))[l._by >> 3] >> ((((l._by & 7) << 2) + l._bp) << 1)) & 3;
1784 return unpack_2b_from_8b(l.side(this->_ebwt)[l._by], l._bp);
1789 * Inline-function version of the above. This does not always seem to
1793 // Use gcc's intrinsic popcountll. I don't recommend it because it
1794 // seems to be somewhat slower than the bit-bashing pop64 routine both
1795 // on an AMD server and on an Intel workstation. On the other hand,
1796 // perhaps when the builtin is used GCC is smart enough to insert a
1797 // pop-count instruction on architectures that have one (e.g. Itanium).
1798 // For now, it's disabled.
1799 #define pop64(x) __builtin_popcountll(x)
1801 __declspec naked int __stdcall pop64
1804 static const uint64_t C55 = 0x5555555555555555ll;
1805 static const uint64_t C33 = 0x3333333333333333ll;
1806 static const uint64_t C0F = 0x0F0F0F0F0F0F0F0Fll;
1808 MOVD MM0, [ESP+4] ;v_low
1809 PUNPCKLDQ MM0, [ESP+8] ;v
1811 PSRLD MM0, 1 ;v >> 1
1812 PAND MM0, [C55] ;(v >> 1) & 0x55555555
1813 PSUBD MM1, MM0 ;w = v - ((v >> 1) & 0x55555555)
1815 PSRLD MM1, 2 ;w >> 2
1816 PAND MM0, [C33] ;w & 0x33333333
1817 PAND MM1, [C33] ;(w >> 2) & 0x33333333
1818 PADDD MM0, MM1 ;x = (w & 0x33333333) +
1819 ; ((w >> 2) & 0x33333333)
1821 PSRLD MM0, 4 ;x >> 4
1822 PADDD MM0, MM1 ;x + (x >> 4)
1823 PAND MM0, [C0F] ;y = (x + (x >> 4)) & 0x0F0F0F0F
1825 PSADBW MM0, MM1 ;sum across all 8 bytes
1826 MOVD EAX, MM0 ;result in EAX per calling
1828 EMMS ;clear MMX state
1829 RET 8 ;pop 8-byte argument off stack
1834 // Use a bytewise LUT version of popcount. This is slower than the
1835 // bit-bashing pop64 routine both on an AMD server and on an Intel
1836 // workstation. It seems to be about the same speed as the GCC builtin
1837 // on Intel, and a bit faster than it on AMD. For now, it's disabled.
1838 const int popcntU8Table[256] = {
1839 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
1840 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
1841 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
1842 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
1843 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
1844 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
1845 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
1846 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
1849 // Use this bytewise population count table
1850 inline static int pop64(uint64_t x) {
1851 const unsigned char * p = (const unsigned char *) &x;
1852 return popcntU8Table[p[0]] +
1853 popcntU8Table[p[1]] +
1854 popcntU8Table[p[2]] +
1855 popcntU8Table[p[3]] +
1856 popcntU8Table[p[4]] +
1857 popcntU8Table[p[5]] +
1858 popcntU8Table[p[6]] +
1859 popcntU8Table[p[7]];
1862 // Use this standard bit-bashing population count
1863 inline static int pop64(uint64_t x) {
1864 x = x - ((x >> 1) & 0x5555555555555555llu);
1865 x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
1866 x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
1875 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
1876 * within a 64-bit argument.
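*
* Sketch of the idea: dwA keeps the high bit of every bit pair and dwNA
* the low bit, each in place.  For c == 2 (binary 10) a pair matches iff
* its high bit is set and its low bit is clear, so the matches are
* exactly the set bits of (dwA >> 1) & ~dwNA, which pop64() then counts.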
1878 inline static int countInU64(int c, uint64_t dw) {
1879 uint64_t dwA = dw & 0xAAAAAAAAAAAAAAAAllu;
1880 uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
1884 tmp = (dwA >> 1) | dwNA;
1887 tmp = ~(dwA >> 1) & dwNA;
1890 tmp = (dwA >> 1) & ~dwNA;
1893 tmp = (dwA >> 1) & dwNA;
1898 tmp = pop64(tmp); // Gets 7.62% in profile
1902 assert_leq(tmp, 32);
1908 * Tricky-bit-bashing bitpair counting for all four two-bit values (0-3)
1909 * within a 64-bit argument.
1911 * Function gets 2.32% in profile
1913 inline static void countInU64Ex(uint64_t dw, uint32_t* arrs) {
1914 uint64_t dwA = dw & 0xAAAAAAAAAAAAAAAAllu;
1915 uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
1916 arrs[0] += (32 - pop64((dwA >> 1) | dwNA));
1917 arrs[1] += pop64(~(dwA >> 1) & dwNA);
1918 arrs[2] += pop64((dwA >> 1) & ~dwNA);
1919 arrs[3] += pop64((dwA >> 1) & dwNA);
1923 * Counts the number of occurrences of character 'c' in the given Ebwt
1924 * side up to (but not including) the given byte/bitpair (by/bp).
1926 * This is a performance-critical function. This is the top search-
1927 * related hit in the time profile.
1929 * Function gets 11.09% in profile
1931 template<typename TStr>
1932 inline uint32_t Ebwt<TStr>::countUpTo(const SideLocus& l, int c) const {
1933 // Count occurrences of c in each 64-bit word (using bit trickery);
1934 // Someday the countInU64() and pop() functions should be
1935 // vectorized/SSE-ized in case that helps.
1937 const uint8_t *side = l.side(this->_ebwt);
1940 for(; i + 7 < l._by; i += 8) {
1941 cCnt += countInU64(c, *(uint64_t*)&side[i]);
1944 for(; i + 2 < l._by; i += 2) {
1945 cCnt += cCntLUT_16b_4[c][*(uint16_t*)&side[i]];
1948 #ifdef SIXTY4_FORMAT
1949 // Calculate number of bit pairs to shift off the end
1950 const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
1951 if(bpShiftoff < 32) {
1952 assert_lt(bpShiftoff, 32);
1953 const uint64_t sw = (*(uint64_t*)&side[i]) << (bpShiftoff << 1);
1954 cCnt += countInU64(c, sw);
1955 if(c == 0) cCnt -= bpShiftoff; // we turned these into As
1958 // Count occurrences of c in the rest of the side (using LUT)
1959 for(; i < l._by; i++) {
1960 cCnt += cCntLUT_4[0][c][side[i]];
1962 // Count occurrences of c in the rest of the byte
1964 cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
1971 * Counts the number of occurrences of each of the four characters in the given Ebwt
1972 * side up to (but not including) the given byte/bitpair (by/bp).
1974 template<typename TStr>
1975 inline void Ebwt<TStr>::countUpToEx(const SideLocus& l, uint32_t* arrs) const {
1977 // Count occurrences of each character in each 64-bit word (using bit trickery);
1978 // note: this does not seem to lend a significant boost to
1979 // performance. If you comment out this whole loop (which won't
1980 // affect correctness - it will just cause the following loop to
1981 // take up the slack) then runtime does not change noticeably.
1982 // Someday the countInU64() and pop() functions should be
1983 // vectorized/SSE-ized in case that helps.
1984 const uint8_t *side = l.side(this->_ebwt);
1985 for(; i+7 < l._by; i += 8) {
1986 countInU64Ex(*(uint64_t*)&side[i], arrs);
1988 #ifdef SIXTY4_FORMAT
1989 // Calculate number of bit pairs to shift off the end
1990 const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
1991 assert_leq(bpShiftoff, 32);
1992 if(bpShiftoff < 32) {
1993 const uint64_t sw = (*(uint64_t*)&l.side(this->_ebwt)[i]) << (bpShiftoff << 1);
1994 countInU64Ex(sw, arrs);
1995 arrs[0] -= bpShiftoff;
1998 // Count occurrences of each character in the rest of the side (using LUT)
1999 for(; i < l._by; i++) {
2000 arrs[0] += cCntLUT_4[0][0][side[i]];
2001 arrs[1] += cCntLUT_4[0][1][side[i]];
2002 arrs[2] += cCntLUT_4[0][2][side[i]];
2003 arrs[3] += cCntLUT_4[0][3][side[i]];
2005 // Count occurrences of each character in the rest of the byte
2007 arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
2008 arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
2009 arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
2010 arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
2016 * Count all occurrences of character c from the beginning of the
2017 * forward side to <by,bp> and add in the occ[] count up to the side
2018 * break just prior to the side.
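*
* Side layout relied on here: the 8 bytes immediately before a forward
* side (the tail of the preceding backward side) hold the running A and
* C counts, and the last 8 bytes of the forward side hold the running G
* and T counts, hence the reads of ac[] at (side - 8) and gt[] at
* (side + _sideSz - 8) below.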
2020 template<typename TStr>
2021 inline uint32_t Ebwt<TStr>::countFwSide(const SideLocus& l, int c) const {
2024 assert_lt(l._by, (int)this->_eh._sideBwtSz);
2025 assert_geq(l._by, 0);
2026 assert_lt(l._bp, 4);
2027 assert_geq(l._bp, 0);
2028 const uint8_t *side = l.side(this->_ebwt);
2029 uint32_t cCnt = countUpTo(l, c);
2030 assert_leq(cCnt, this->_eh._sideBwtLen);
2031 if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2032 // Adjust for the fact that we represented $ with an 'A', but
2033 // shouldn't count it as an 'A' here
2034 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2035 (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
2037 cCnt--; // Adjust for '$' looking like an 'A'
2041 // Now factor in the occ[] count at the side break
2043 const uint32_t *ac = reinterpret_cast<const uint32_t*>(side - 8);
2044 assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
2045 assert_leq(ac[1], this->_eh._len);
2046 ret = ac[c] + cCnt + this->_fchr[c];
2048 const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8); // next
2049 assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2050 ret = gt[c-2] + cCnt + this->_fchr[c];
2053 assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section
2055 assert_leq(cCnt, this->_eh._sideBwtLen);
2057 assert_leq(ret, this->_eh._bwtLen);
2064 * Count all occurrences of character c from the beginning of the
2065 * forward side to <by,bp> and add in the occ[] count up to the side
2066 * break just prior to the side.
2068 template<typename TStr>
2069 inline void Ebwt<TStr>::countFwSideEx(const SideLocus& l, uint32_t* arrs) const
2071 assert_lt(l._by, (int)this->_eh._sideBwtSz);
2072 assert_geq(l._by, 0);
2073 assert_lt(l._bp, 4);
2074 assert_geq(l._bp, 0);
2075 countUpToEx(l, arrs);
2077 assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
2078 assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
2079 assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
2080 assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
2082 assert_leq(arrs[0], this->_eh._sideBwtLen);
2083 assert_leq(arrs[1], this->_eh._sideBwtLen);
2084 assert_leq(arrs[2], this->_eh._sideBwtLen);
2085 assert_leq(arrs[3], this->_eh._sideBwtLen);
2086 const uint8_t *side = l.side(this->_ebwt);
2087 if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2088 // Adjust for the fact that we represented $ with an 'A', but
2089 // shouldn't count it as an 'A' here
2090 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2091 (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
2093 arrs[0]--; // Adjust for '$' looking like an 'A'
2096 // Now factor in the occ[] count at the side break
2097 const uint32_t *ac = reinterpret_cast<const uint32_t*>(side - 8);
2098 const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8);
2100 assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
2101 assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
2102 assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
2103 assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
2105 assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
2106 assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2107 arrs[0] += (ac[0] + this->_fchr[0]);
2108 arrs[1] += (ac[1] + this->_fchr[1]);
2109 arrs[2] += (gt[0] + this->_fchr[2]);
2110 arrs[3] += (gt[1] + this->_fchr[3]);
2112 assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
2113 assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
2114 assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
2115 assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
2120 * Count all instances of character c from <by,bp> to the logical end
2121 * (actual beginning) of the backward side, and subtract that from the
2122 * occ[] count up to the side break.
2124 template<typename TStr>
2125 inline uint32_t Ebwt<TStr>::countBwSide(const SideLocus& l, int c) const {
2128 assert_lt(l._by, (int)this->_eh._sideBwtSz);
2129 assert_geq(l._by, 0);
2130 assert_lt(l._bp, 4);
2131 assert_geq(l._bp, 0);
2132 const uint8_t *side = l.side(this->_ebwt);
2133 uint32_t cCnt = countUpTo(l, c);
2134 if(rowL(l) == c) cCnt++;
2135 assert_leq(cCnt, this->_eh._sideBwtLen);
2136 if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2137 // Adjust for the fact that we represented $ with an 'A', but
2138 // shouldn't count it as an 'A' here
2139 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2140 (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
2146 // Now factor in the occ[] count at the side break
2148 const uint32_t *ac = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8);
2149 assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
2150 assert_leq(ac[1], this->_eh._len);
2151 ret = ac[c] - cCnt + this->_fchr[c];
2153 const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + (2*this->_eh._sideSz) - 8); // next
2154 assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2155 ret = gt[c-2] - cCnt + this->_fchr[c];
2158 assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section
2160 assert_leq(cCnt, this->_eh._sideBwtLen);
2162 assert_lt(ret, this->_eh._bwtLen);
2169 * Count all instances of character c from <by,bp> to the logical end
2170 * (actual beginning) of the backward side, and subtract that from the
2171 * occ[] count up to the side break.
2173 template<typename TStr>
2174 inline void Ebwt<TStr>::countBwSideEx(const SideLocus& l, uint32_t* arrs) const {
2175 assert_lt(l._by, (int)this->_eh._sideBwtSz);
2176 assert_geq(l._by, 0);
2177 assert_lt(l._bp, 4);
2178 assert_geq(l._bp, 0);
2179 const uint8_t *side = l.side(this->_ebwt);
2180 countUpToEx(l, arrs);
2182 assert_leq(arrs[0], this->_eh._sideBwtLen);
2183 assert_leq(arrs[1], this->_eh._sideBwtLen);
2184 assert_leq(arrs[2], this->_eh._sideBwtLen);
2185 assert_leq(arrs[3], this->_eh._sideBwtLen);
2186 if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2187 // Adjust for the fact that we represented $ with an 'A', but
2188 // shouldn't count it as an 'A' here
2189 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2190 (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
2192 arrs[0]--; // Adjust for '$' looking like an 'A'
2195 // Now factor in the occ[] count at the side break
2196 const uint32_t *ac = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8);
2197 const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + (2*this->_eh._sideSz) - 8);
2199 assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
2200 assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
2201 assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
2202 assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
2204 assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
2205 assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2206 arrs[0] = (ac[0] - arrs[0] + this->_fchr[0]);
2207 arrs[1] = (ac[1] - arrs[1] + this->_fchr[1]);
2208 arrs[2] = (gt[0] - arrs[2] + this->_fchr[2]);
2209 arrs[3] = (gt[1] - arrs[3] + this->_fchr[3]);
2211 assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
2212 assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
2213 assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
2214 assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
2219 * Given top and bot loci, calculate counts of all four DNA chars up to
2220 * those loci. Used for more advanced backtracking-search.
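 *
 * Usage sketch (hypothetical caller, for illustration only): a
 * backtracking searcher typically extends a BWT range [top, bot) by all
 * four characters at once and then picks whichever extensions it wants:
 *
 *   uint32_t tops[4] = {0, 0, 0, 0};
 *   uint32_t bots[4] = {0, 0, 0, 0};
 *   ebwt.mapLFEx(ltop, lbot, tops, bots);
 *   // the extended range for character c is [tops[c], bots[c])
 *
 * Note the arrays must be zero-initialized, since the counts are
 * accumulated into them.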
2222 template<typename TStr>
2223 inline void Ebwt<TStr>::mapLFEx(const SideLocus& ltop,
2224 const SideLocus& lbot,
2227 ASSERT_ONLY(, bool overrideSanity)
2230 // TODO: Where there's overlap, reuse the count for the overlapping region
2233 const_cast<Ebwt<TStr>*>(this)->mapLFExs_++;
2235 assert_eq(0, tops[0]); assert_eq(0, bots[0]);
2236 assert_eq(0, tops[1]); assert_eq(0, bots[1]);
2237 assert_eq(0, tops[2]); assert_eq(0, bots[2]);
2238 assert_eq(0, tops[3]); assert_eq(0, bots[3]);
2239 if(ltop._fw) countFwSideEx(ltop, tops); // Forward side
2240 else countBwSideEx(ltop, tops); // Backward side
2241 if(lbot._fw) countFwSideEx(lbot, bots); // Forward side
2242 else countBwSideEx(lbot, bots); // Backward side
2244 if(_sanity && !overrideSanity) {
2245 // Make sure results match up with individual calls to mapLF;
2246 // be sure to override sanity-checking in the callee, or we'll
2247 // have infinite recursion
2248 assert_eq(mapLF(ltop, 0, true), tops[0]);
2249 assert_eq(mapLF(ltop, 1, true), tops[1]);
2250 assert_eq(mapLF(ltop, 2, true), tops[2]);
2251 assert_eq(mapLF(ltop, 3, true), tops[3]);
2252 assert_eq(mapLF(lbot, 0, true), bots[0]);
2253 assert_eq(mapLF(lbot, 1, true), bots[1]);
2254 assert_eq(mapLF(lbot, 2, true), bots[2]);
2255 assert_eq(mapLF(lbot, 3, true), bots[3]);
2262 * Given top and bot loci, calculate counts of all four DNA chars up to
2263 * those loci. Used for more advanced backtracking-search.
2265 template<typename TStr>
2266 inline void Ebwt<TStr>::mapLFEx(const SideLocus& l,
2268 ASSERT_ONLY(, bool overrideSanity)
2271 assert_eq(0, arrs[0]);
2272 assert_eq(0, arrs[1]);
2273 assert_eq(0, arrs[2]);
2274 assert_eq(0, arrs[3]);
2275 if(l._fw) countFwSideEx(l, arrs); // Forward side
2276 else countBwSideEx(l, arrs); // Backward side
2278 if(_sanity && !overrideSanity) {
2279 // Make sure results match up with individual calls to mapLF;
2280 // be sure to override sanity-checking in the callee, or we'll
2281 // have infinite recursion
2282 assert_eq(mapLF(l, 0, true), arrs[0]);
2283 assert_eq(mapLF(l, 1, true), arrs[1]);
2284 assert_eq(mapLF(l, 2, true), arrs[2]);
2285 assert_eq(mapLF(l, 3, true), arrs[3]);
2292 * Given row i, return the row that the LF mapping maps i to.
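 *
 * Conceptually (a sketch of the identity the side-based code below
 * implements): if c is the BWT character in row i, then
 *
 *   LF(i) = _fchr[c] + Occ(c, i)
 *
 * where Occ(c, i) is the number of occurrences of c in BWT[0..i-1]
 * (with the '$' row excluded from the 'A' count). countFwSide() and
 * countBwSide() compute Occ(c, i) one side at a time from the occ[]
 * records plus the in-side counts.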
2294 template<typename TStr>
2295 inline uint32_t Ebwt<TStr>::mapLF(const SideLocus& l
2296 ASSERT_ONLY(, bool overrideSanity)
2300 const_cast<Ebwt<TStr>*>(this)->mapLFs_++;
2303 assert(l.side(this->_ebwt) != NULL);
2307 if(l._fw) ret = countFwSide(l, c); // Forward side
2308 else ret = countBwSide(l, c); // Backward side
2309 assert_lt(ret, this->_eh._bwtLen);
2311 if(_sanity && !overrideSanity) {
2312 // Make sure results match up with results from mapLFEx;
2313 // be sure to override sanity-checking in the callee, or we'll
2314 // have infinite recursion
2315 uint32_t arrs[] = { 0, 0, 0, 0 };
2316 mapLFEx(l, arrs, true);
2317 assert_eq(arrs[c], ret);
2324 * Given row i and character c, return the row that the LF mapping maps
2325 * i to on character c.
2327 template<typename TStr>
2328 inline uint32_t Ebwt<TStr>::mapLF(const SideLocus& l, int c
2329 ASSERT_ONLY(, bool overrideSanity)
2333 const_cast<Ebwt<TStr>*>(this)->mapLFcs_++;
2338 if(l._fw) ret = countFwSide(l, c); // Forward side
2339 else ret = countBwSide(l, c); // Backward side
2340 assert_lt(ret, this->_eh._bwtLen);
2342 if(_sanity && !overrideSanity) {
2343 // Make sure results match up with results from mapLFEx;
2344 // be sure to override sanity-checking in the callee, or we'll
2345 // have infinite recursion
2346 uint32_t arrs[] = { 0, 0, 0, 0 };
2347 mapLFEx(l, arrs, true);
2348 assert_eq(arrs[c], ret);
2355 * Given row i and character c, return the row that the LF mapping maps
2356 * i to on character c, but only if the character at row i is actually c and i is not the '$' row; otherwise return 0xffffffff.
2358 template<typename TStr>
2359 inline uint32_t Ebwt<TStr>::mapLF1(uint32_t row, const SideLocus& l, int c
2360 ASSERT_ONLY(, bool overrideSanity)
2364 const_cast<Ebwt<TStr>*>(this)->mapLF1cs_++;
2366 if(rowL(l) != c || row == _zOff) return 0xffffffff;
2370 if(l._fw) ret = countFwSide(l, c); // Forward side
2371 else ret = countBwSide(l, c); // Backward side
2372 assert_lt(ret, this->_eh._bwtLen);
2374 if(_sanity && !overrideSanity) {
2375 // Make sure results match up with results from mapLFEx;
2376 // be sure to override sanity-checking in the callee, or we'll
2377 // have infinite recursion
2378 uint32_t arrs[] = { 0, 0, 0, 0 };
2379 mapLFEx(l, arrs, true);
2380 assert_eq(arrs[c], ret);
2387 * Given row i, determine the character c in the BWT at that row, apply
2388 * the LF mapping to row in place, and return c (or -1 if row is the '$' row).
2390 template<typename TStr>
2391 inline int Ebwt<TStr>::mapLF1(uint32_t& row, const SideLocus& l
2392 ASSERT_ONLY(, bool overrideSanity)
2396 const_cast<Ebwt<TStr>*>(this)->mapLF1s_++;
2398 if(row == _zOff) return -1;
2402 if(l._fw) row = countFwSide(l, c); // Forward side
2403 else row = countBwSide(l, c); // Backward side
2404 assert_lt(row, this->_eh._bwtLen);
2406 if(_sanity && !overrideSanity) {
2407 // Make sure results match up with results from mapLFEx;
2408 // be sure to override sanity-checking in the callee, or we'll
2409 // have infinite recursion
2410 uint32_t arrs[] = { 0, 0, 0, 0 };
2411 mapLFEx(l, arrs, true);
2412 assert_eq(arrs[c], row);
2419 * Take an offset into the joined text and translate it into the
2420 * reference of the index it falls on, the offset into the reference,
2421 * and the length of the reference. Use a binary search through the
2422 * sorted list of reference fragment ranges to find the fragment containing the offset.
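 *
 * Worked example (hypothetical; assumes a forward index over two
 * references of lengths 100 and 50 joined end-to-end with no padding,
 * so _rstarts == {0,0,0, 100,1,0} and _nFrag == 2):
 * joinedToTextOff(10, 105, tidx, textoff, tlen) yields tidx == 1,
 * textoff == 5 and tlen == 50, whereas joinedToTextOff(10, 95, ...)
 * sets tidx to 0xffffffff because a 10-character hit starting at joined
 * offset 95 would straddle the boundary between the two references.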
2424 template<typename TStr>
2425 void Ebwt<TStr>::joinedToTextOff(uint32_t qlen, uint32_t off,
2428 uint32_t& tlen) const
2431 uint32_t bot = _nFrag; // 1 greater than largest addressable element
2432 uint32_t elt = 0xffffffff;
2433 // Begin binary search
2435 ASSERT_ONLY(uint32_t oldelt = elt);
2436 elt = top + ((bot - top) >> 1);
2437 assert_neq(oldelt, elt); // must have made progress
2438 uint32_t lower = _rstarts[elt*3];
2440 if(elt == _nFrag-1) {
2443 upper = _rstarts[((elt+1)*3)];
2445 assert_gt(upper, lower);
2446 uint32_t fraglen = upper - lower;
2448 if(upper > off) { // 'off' falls within this fragment's range
2449 // off is in this range; check if it falls off
2450 if(off + qlen > upper) {
2451 // it falls off; signal no-go and return
2453 assert_lt(elt, _nFrag-1);
2456 tidx = _rstarts[(elt*3)+1];
2457 assert_lt(tidx, this->_nPat);
2458 assert_leq(fraglen, this->_plen[tidx]);
2459 // it doesn't fall off; now calculate textoff.
2460 // Initially it's the number of characters that precede
2461 // the alignment in the fragment
2462 uint32_t fragoff = off - _rstarts[(elt*3)];
2464 fragoff = fraglen - fragoff - 1;
2465 fragoff -= (qlen-1);
2467 // Add the alignment's offset into the fragment
2468 // ('fragoff') to the fragment's offset within the text
2469 textoff = fragoff + _rstarts[(elt*3)+2];
2470 assert_lt(textoff, this->_plen[tidx]);
2471 break; // done with binary search
2473 // 'off' belongs somewhere in the region between elt and bot
2478 // 'off' belongs somewhere in the region between top and elt
2482 // continue with binary search
2484 tlen = this->_plen[tidx];
2488 * Report a potential match at offset 'off' with pattern length
2489 * 'qlen'. Filter out spurious matches that span texts.
2491 template<typename TStr>
2492 inline bool Ebwt<TStr>::report(const String<Dna5>& query,
2493 String<char>* quals,
2498 const BitPairReference* ref,
2499 const std::vector<uint32_t>& mmui32,
2500 const std::vector<uint8_t>& refcs,
2510 const EbwtSearchParams<TStr>& params) const
2512 VMSG_NL("In report");
2513 assert_geq(cost, (uint32_t)(stratum << 14));
2514 assert_lt(off, this->_eh._len);
2518 joinedToTextOff(qlen, off, tidx, textoff, tlen);
2519 if(tidx == 0xffffffff) {
2522 return params.reportHit(
2523 query, // read sequence
2524 quals, // read quality values
2526 color, // true -> read is colorspace
2527 colExEnds, // true -> exclude nucleotides on ends
2528 snpPhred, // phred probability of SNP
2529 ref, // reference sequence
2530 rmap_, // map to another reference coordinate system
2531 _fw, // true = index is forward; false = mirror
2532 mmui32, // mismatch positions
2533 refcs, // reference characters for mms
2534 numMms, // # mismatches
2535 make_pair(tidx, textoff), // position
2536 make_pair(0, 0), // (bogus) mate position
2537 true, // (bogus) mate orientation
2538 0, // (bogus) mate length
2539 make_pair(top, bot), // arrows
2542 stratum, // alignment stratum
2543 cost, // cost, including stratum & quality penalty
2544 bot-top-1, // # other hits
2545 patid, // pattern id
2546 seed, // pseudo-random seed
2547 0); // mate (0 = unpaired)
2550 #include "row_chaser.h"
2553 * Report a result. Involves walking backwards along the original
2554 * string by way of the LF-mapping until we reach a marked SA row or
2555 * the row corresponding to the 0th suffix. A marked row's offset
2556 * into the original string can be read directly from the this->_offs[] array.
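 *
 * The walk amounts to the following (a sketch of the logic, not a
 * separate implementation): starting from SA row i,
 *
 *   uint32_t jumps = 0;
 *   while((i & _offMask) != i && i != _zOff) { // i is not a marked row
 *       i = LF(i); // one step left in the text (mapLF on i's locus)
 *       jumps++;
 *   }
 *   off = (i == _zOff) ? jumps : _offs[i >> _offRate] + jumps;
 *
 * i.e. the text offset is the stored offset of the first marked row
 * reached, plus the number of LF steps taken to reach it.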
2559 template<typename TStr>
2560 inline bool Ebwt<TStr>::reportChaseOne(const String<Dna5>& query,
2561 String<char>* quals,
2566 const BitPairReference* ref,
2567 const std::vector<uint32_t>& mmui32,
2568 const std::vector<uint8_t>& refcs,
2578 const EbwtSearchParams<TStr>& params,
2581 VMSG_NL("In reportChaseOne");
2584 ASSERT_ONLY(uint32_t origi = i);
2586 const uint32_t offMask = this->_eh._offMask;
2587 const uint32_t offRate = this->_eh._offRate;
2588 const uint32_t* offs = this->_offs;
2589 // If the caller didn't give us a pre-calculated (and prefetched)
2590 // locus, then we have to do that now
2593 l->initFromRow(i, this->_eh, this->_ebwt);
2597 // Walk along until we reach the next marked row to the left
2598 while(((i & offMask) != i) && i != _zOff) {
2599 // Not a marked row; walk left one more char
2600 uint32_t newi = mapLF(*l); // calc next row
2601 assert_neq(newi, i);
2602 i = newi; // update row
2603 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2606 // This is a marked row
2608 // Special case: it's the row corresponding to the
2609 // lexicographically smallest suffix, which is implicitly
2612 VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
2614 // Normal marked row, calculate offset of row i
2615 off = offs[i >> offRate] + jumps;
2616 VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
2620 uint32_t rcoff = RowChaser<TStr>::toFlatRefOff(this, qlen, origi);
2621 assert_eq(rcoff, off);
2624 return report(query, quals, name, color, colExEnds, snpPhred, ref,
2625 mmui32, refcs, numMms, off, top, bot, qlen, stratum,
2626 cost, patid, seed, params);
2630 * Report a result, reconstructing the reference sequence surrounding the
2631 * alignment. Involves walking backwards along the original string by way
2632 * of the LF-mapping until we reach a marked SA row or the row corresponding
2633 * to the 0th suffix, then using the inverse-suffix-array (ISA) sample to recover the context to the right of the hit.
2636 template<typename TStr>
2637 inline bool Ebwt<TStr>::reportReconstruct(const String<Dna5>& query,
2638 String<char>* quals,
2642 const uint32_t *mmui32,
2650 const EbwtSearchParams<TStr>& params,
2653 VMSG_NL("In reportReconstruct");
2654 assert_gt(_eh._isaLen, 0); // Must have inverse suffix array to reconstruct
2658 const uint32_t offMask = this->_eh._offMask;
2659 const uint32_t offRate = this->_eh._offRate;
2660 const uint32_t* offs = this->_offs;
2661 const uint32_t* isa = this->_isa;
2662 assert(isa != NULL);
2665 myl.initFromRow(i, this->_eh, this->_ebwt);
2668 clear(lbuf); clear(rbuf);
2669 // Walk along until we reach the next marked row to the left
2670 while(((i & offMask) != i) && i != _zOff) {
2671 // Not a marked row; walk left one more char
2673 appendValue(lbuf, (Dna5)c);
2677 if(l->_fw) newi = countFwSide(*l, c); // Forward side
2678 else newi = countBwSide(*l, c); // Backward side
2679 assert_lt(newi, this->_eh._bwtLen);
2680 assert_neq(newi, i);
2681 i = newi; // update row
2682 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2685 // This is a marked row
2687 // Special case: it's the row corresponding to the
2688 // lexicographically smallest suffix, which is implicitly
2691 VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
2693 // Normal marked row, calculate offset of row i
2694 off = offs[i >> offRate] + jumps;
2695 VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
2697 // 'off' now holds the text offset of the first (leftmost) position
2698 // involved in the alignment. Next we call joinedToTextOff to
2699 // check whether the seed is valid (i.e., does not straddle a
2700 // boundary between two reference sequences) and to obtain its coordinates.
2702 uint32_t tidx; // the index (id) of the reference we hit in
2703 uint32_t textoff; // the offset of the alignment within the reference
2704 uint32_t tlen; // length of reference seed hit in
2705 joinedToTextOff(qlen, off, tidx, textoff, tlen);
2706 if(tidx == 0xffffffff) {
2707 // The seed straddled a reference boundary, and so is spurious.
2708 // Return false, indicating that we shouldn't stop.
2711 if(jumps > textoff) {
2712 // In our progress toward a marked row, we passed the boundary
2713 // between the reference sequence containing the seed and the
2714 // reference sequence to the left of it. That's OK, we just
2715 // need to knock off the extra characters we added to 'lbuf'.
2716 assert_eq(jumps, length(lbuf));
2717 _setLength(lbuf, textoff);
2719 assert_eq(textoff, length(lbuf));
2720 } else if(jumps < textoff) {
2721 // Keep walking until we reach the end of the reference
2722 assert_neq(i, _zOff);
2723 uint32_t diff = textoff-jumps;
2724 for(size_t j = 0; j < diff; j++) {
2725 // Not a marked row; walk left one more char
2727 appendValue(lbuf, (Dna5)c);
2731 if(l->_fw) newi = countFwSide(*l, c); // Forward side
2732 else newi = countBwSide(*l, c); // Backward side
2733 assert_lt(newi, this->_eh._bwtLen);
2734 assert_neq(newi, i);
2735 i = newi; // update row
2736 assert_neq(i, _zOff);
2737 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2740 assert_eq(textoff, jumps);
2741 assert_eq(textoff, length(lbuf));
2743 assert_eq(textoff, jumps);
2744 assert_eq(textoff, length(lbuf));
2745 // Calculate the right-hand extent of the reference
2746 uint32_t ref_right = off - textoff + tlen;
2747 // Round the right-hand extent to the nearest ISA element that maps
2748 // to it or a character to its right
2749 uint32_t ref_right_rounded = ref_right;
2750 if((ref_right_rounded & _eh._isaMask) != ref_right_rounded) {
2751 ref_right_rounded = ((ref_right_rounded >> _eh._isaRate)+1) << _eh._isaRate;
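// Worked example of the rounding above (hypothetical numbers): with
// _isaRate == 5 the ISA is sampled every 32 positions and _isaMask ==
// 0xffffffe0, so for ref_right == 1000 (not a multiple of 32) the
// rounded value is ((1000 >> 5) + 1) << 5 == 1024, the first sampled
// position at or to the right of ref_right.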
2753 // TODO: handle case where ref_right_rounded is off the end of _isa
2754 // Let the current suffix-array elt be determined by the ISA
2755 if((ref_right_rounded >> _eh._isaRate) >= _eh._isaLen) {
2757 ref_right_rounded = _eh._len;
2759 i = isa[ref_right_rounded >> _eh._isaRate];
2761 uint32_t right_steps_rounded = ref_right_rounded - (off + qlen);
2762 uint32_t right_steps = ref_right - (off + qlen);
2763 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2764 for(size_t j = 0; j < right_steps_rounded; j++) {
2765 // Not a marked row; walk left one more char
2767 appendValue(rbuf, (Dna5)c);
2769 assert_lt(c, 4); assert_geq(c, 0);
2770 if(l->_fw) newi = countFwSide(*l, c); // Forward side
2771 else newi = countBwSide(*l, c); // Backward side
2772 assert_lt(newi, this->_eh._bwtLen);
2773 assert_neq(newi, i);
2774 i = newi; // update row
2775 assert_neq(i, _zOff);
2776 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2779 if(right_steps_rounded > right_steps) {
2780 jumps -= (right_steps_rounded - right_steps);
2781 _setLength(rbuf, right_steps);
2783 assert_eq(right_steps, length(rbuf));
2784 assert_eq(tlen, jumps + qlen);
2785 ::reverseInPlace(lbuf);
2786 ::reverseInPlace(rbuf);
2788 cout << "reportReconstruct:" << endl
2789 << " " << lbuf << query << rbuf << endl;
2791 for(size_t i = 0; i < length(lbuf); i++) cout << " ";
2792 cout << query << endl;
2794 // Now we've reconstructed the reference region surrounding the alignment
2799 * Transform this Ebwt into the original string in linear time by using
2800 * the LF mapping to walk backwards starting at the row corresponding
2801 * to the end of the string. The result is written to s. The Ebwt
2802 * must be in memory.
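 *
 * Usage sketch (hypothetical caller, for illustration only):
 *
 *   TStr orig;
 *   ebwt.restore(orig);
 *   // orig now has length _eh._len and holds the string the index was
 *   // built from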
2804 template<typename TStr>
2805 void Ebwt<TStr>::restore(TStr& s) const {
2806 assert(isInMemory());
2807 resize(s, this->_eh._len, Exact());
2809 uint32_t i = this->_eh._len; // should point to final SA elt (starting with '$')
2810 SideLocus l(i, this->_eh, this->_ebwt);
2812 assert_lt(jumps, this->_eh._len);
2813 //if(_verbose) cout << "restore: i: " << i << endl;
2814 // Not a marked row; go back a char in the original string
2815 uint32_t newi = mapLF(l);
2816 assert_neq(newi, i);
2817 s[this->_eh._len - jumps - 1] = rowL(l);
2819 l.initFromRow(i, this->_eh, this->_ebwt);
2822 assert_eq(jumps, this->_eh._len);
2826 * Check that this Ebwt, when restored via restore(), matches up with
2827 * the given array of reference sequences. For sanity checking.
2829 template <typename TStr>
2830 void Ebwt<TStr>::checkOrigs(const vector<String<Dna5> >& os,
2831 bool color, bool mirror) const
2835 uint32_t restOff = 0;
2836 size_t i = 0, j = 0;
2841 while(i < os.size()) {
2842 size_t olen = length(os[i]);
2844 for(; j < olen; j++) {
2846 if(mirror) joff = olen - j - 1;
2847 if((int)os[i][joff] == 4) {
2851 while(j < olen && (int)os[i][j] == 4) j++;
2853 while(j < olen && (int)os[i][olen-j-1] == 4) j++;
2858 if(lastorig == -1 && color) {
2859 lastorig = os[i][joff];
2863 assert_neq(-1, lastorig);
2864 assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]);
2866 assert_eq(os[i][joff], rest[restOff]);
2868 lastorig = (int)os[i][joff];
2871 if(j == length(os[i])) {
2872 // Moved to next sequence
2876 // Just jumped over a gap
2881 ///////////////////////////////////////////////////////////////////////
2883 // Functions for reading and writing Ebwts
2885 ///////////////////////////////////////////////////////////////////////
2888 * Read an Ebwt from file with given filename.
2890 template<typename TStr>
2891 void Ebwt<TStr>::readIntoMemory(
2900 bool switchEndian; // dummy; caller doesn't care
2902 char *mmFile[] = { NULL, NULL };
2904 if(_in1Str.length() > 0) {
2905 if(_verbose || startVerbose) {
2906 cerr << " About to open input files: ";
2910 // Initialize our primary and secondary input-stream fields
2911 if(_in1 != -1) close(_in1);
2912 if(_verbose || startVerbose) {
2913 cerr << "Opening \"" << _in1Str << "\"" << endl;
2915 if((_in1 = open(_in1Str.c_str(), O_RDONLY)) < 0) {
2916 cerr << "Could not open index file " << _in1Str << endl;
2918 if(_in2 != -1) close(_in2);
2919 if(_verbose || startVerbose) {
2920 cerr << "Opening \"" << _in2Str << "\"" << endl;
2922 if((_in2 = open(_in2Str.c_str(), O_RDONLY)) < 0) {
2923 cerr << "Could not open index file " << _in2Str << endl;
2926 // Initialize our primary and secondary input-stream fields
2927 if(_in1 != NULL) fclose(_in1);
2928 if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str << "\"" << endl;
2929 if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
2930 cerr << "Could not open index file " << _in1Str << endl;
2932 if(_in2 != NULL) fclose(_in2);
2933 if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str << "\"" << endl;
2934 if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
2935 cerr << "Could not open index file " << _in2Str << endl;
2938 if(_verbose || startVerbose) {
2939 cerr << " Finished opening input files: ";
2944 if(_useMm && !justHeader) {
2945 const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
2946 int fds[] = { _in1, _in2 };
2947 for(int i = 0; i < 2; i++) {
2948 if(_verbose || startVerbose) {
2949 cerr << " Memory-mapping input file " << (i+1) << ": ";
2953 if (stat(names[i], &sbuf) == -1) {
2955 cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
2958 mmFile[i] = (char*)mmap((void *)0, sbuf.st_size,
2959 PROT_READ, MAP_SHARED, fds[i], 0);
2960 if(mmFile[i] == (void *)(-1)) {
2962 cerr << "Error: Could not memory-map the index file " << names[i] << endl;
2967 for(off_t j = 0; j < sbuf.st_size; j += 1024) {
2968 sum += (int) mmFile[i][j];
2971 cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
2976 mmFile1_ = mmFile[0];
2977 mmFile2_ = mmFile[1];
2982 else if(_useMm && !justHeader) {
2983 mmFile[0] = mmFile1_;
2984 mmFile[1] = mmFile2_;
2986 if(_useMm && !justHeader) {
2987 assert(mmFile[0] == mmFile1_);
2988 assert(mmFile[1] == mmFile2_);
2992 if(_verbose || startVerbose) {
2993 cerr << " Reading header: ";
2997 // Read endianness hints from both streams
2998 size_t bytesRead = 0;
2999 switchEndian = false;
3000 uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream
3003 assert_eq(one, readU32(_in2, switchEndian)); // should match!
3005 readU32(_in2, switchEndian);
3008 assert_eq((1u<<24), one);
3009 assert_eq(1, endianSwapU32(one));
3010 switchEndian = true;
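// The check above works because the first word of the primary file is
// written as the integer 1 in the builder's byte order: read on a
// like-endian machine it comes back as 1, while on an opposite-endian
// machine the four bytes are reversed and it comes back as 1 << 24
// (0x01000000), in which case every subsequent word is byte-swapped as
// it is read.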
3013 // Can't switch endianness and use memory-mapped files; in order to
3014 // support this, someone has to modify the file to switch
3015 // endiannesses appropriately, and we can't do this inside Bowtie
3016 // or we might be setting up a race condition with other processes.
3017 if(switchEndian && _useMm) {
3018 cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
3022 // Reads header entries one by one from primary stream
3023 uint32_t len = readU32(_in1, switchEndian);
3025 int32_t lineRate = readI32(_in1, switchEndian);
3027 int32_t linesPerSide = readI32(_in1, switchEndian);
3029 int32_t offRate = readI32(_in1, switchEndian);
3031 // TODO: add isaRate to the actual file format (right now, the
3032 // user has to tell us whether there's an ISA sample and what the
3033 // sampling rate is.)
3034 int32_t isaRate = _overrideIsaRate;
3035 int32_t ftabChars = readI32(_in1, switchEndian);
3037 // chunkRate was deprecated in an earlier version of Bowtie; now
3038 // we use it to hold flags.
3039 int32_t flags = readI32(_in1, switchEndian);
3040 bool entireRev = false;
3041 if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
3042 if(color != -1 && !color) {
3043 cerr << "Error: -C was not specified when running bowtie, but index is in colorspace. If" << endl
3044 << "your reads are in colorspace, please use the -C option. If your reads are not" << endl
3045 << "in colorspace, please use a normal index (one built without specifying -C to" << endl
3046 << "bowtie-build)." << endl;
3050 } else if(flags < 0) {
3051 if(color != -1 && color) {
3052 cerr << "Error: -C was specified when running bowtie, but index is not in colorspace. If" << endl
3053 << "your reads are in colorspace, please use a colorspace index (one built using" << endl
3054 << "bowtie-build -C). If your reads are not in colorspace, don't specify -C when" << endl
3055 << "running bowtie." << endl;
3060 if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) {
3061 if(needEntireRev != -1 && needEntireRev != 0) {
3062 cerr << "Error: This index is not compatible with this version of bowtie. Please use a" << endl
3063 << "current version of bowtie-build." << endl;
3066 } else entireRev = true;
3069 // Create a new EbwtParams from the entries read from primary stream
3071 bool deleteEh = false;
3072 if(params != NULL) {
3073 params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev);
3074 if(_verbose || startVerbose) params->print(cerr);
3077 eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev);
3081 // Set up overridden suffix-array-sample parameters
3082 uint32_t offsLen = eh->_offsLen;
3083 uint32_t offRateDiff = 0;
3084 uint32_t offsLenSampled = offsLen;
3085 if(_overrideOffRate > offRate) {
3086 offRateDiff = _overrideOffRate - offRate;
3088 if(offRateDiff > 0) {
3089 offsLenSampled >>= offRateDiff;
3090 if((offsLen & ~(0xffffffff << offRateDiff)) != 0) {
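// This check turns the shift above into a ceiling division by
// 2^offRateDiff (worked example, hypothetical numbers): keeping every
// 4th sample (offRateDiff == 2) of offsLen == 1001 entries gives
// 1001 >> 2 == 250, and because the low two bits of 1001 are nonzero
// one extra entry is kept, for ceil(1001/4) == 251 in total.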
3095 // Set up overridden inverted-suffix-array-sample parameters
3096 uint32_t isaLen = eh->_isaLen;
3097 uint32_t isaRateDiff = 0;