Commit patch to not break on spaces.
[bowtie.git] / ebwt.h
1 #ifndef EBWT_H_
2 #define EBWT_H_
3
4 #include <stdint.h>
5 #include <string.h>
6 #include <algorithm>
7 #include <iostream>
8 #include <fstream>
9 #include <vector>
10 #include <set>
11 #include <sstream>
12 #include <fcntl.h>
13 #include <errno.h>
14 #include <stdexcept>
15 #include <seqan/sequence.h>
16 #include <seqan/index.h>
17 #include <sys/stat.h>
18 #ifdef BOWTIE_MM
19 #include <sys/mman.h>
20 #include <sys/shm.h>
21 #endif
22 #include "auto_array.h"
23 #include "shmem.h"
24 #include "alphabet.h"
25 #include "assert_helpers.h"
26 #include "bitpack.h"
27 #include "blockwise_sa.h"
28 #include "endian_swap.h"
29 #include "word_io.h"
30 #include "random_source.h"
31 #include "hit.h"
32 #include "ref_read.h"
33 #include "threading.h"
34 #include "bitset.h"
35 #include "str_util.h"
36 #include "mm.h"
37 #include "timer.h"
38 #include "refmap.h"
39 #include "color_dec.h"
40 #include "reference.h"
41
42 using namespace std;
43 using namespace seqan;
44
45 #ifndef PREFETCH_LOCALITY
46 // No locality by default
47 #define PREFETCH_LOCALITY 2
48 #endif
49
50 // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
51 extern uint8_t cCntLUT_4[4][4][256];
52
53 #ifndef VMSG_NL
54 #define VMSG_NL(args...) \
55 if(this->verbose()) { \
56         stringstream tmp; \
57         tmp << args << endl; \
58         this->verbose(tmp.str()); \
59 }
60 #endif
61
62 #ifndef VMSG
63 #define VMSG(args...) \
64 if(this->verbose()) { \
65         stringstream tmp; \
66         tmp << args; \
67         this->verbose(tmp.str()); \
68 }
69 #endif
70
71 /**
72  * Flags describing type of Ebwt.
73  */
74 enum EBWT_FLAGS {
75         EBWT_COLOR = 2,     // true -> Ebwt is colorspace
76         EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
77                             // concatenated string reversed, rather than
78                             // each stretch reversed
79 };
80
81 /**
82  * Extended Burrows-Wheeler transform header.  This together with the
83  * actual data arrays and other text-specific parameters defined in
84  * class Ebwt constitute the entire Ebwt.
85  */
86 class EbwtParams {
87
88 public:
89         EbwtParams() { }
90
91         EbwtParams(uint32_t len,
92                    int32_t lineRate,
93                    int32_t linesPerSide,
94                    int32_t offRate,
95                    int32_t isaRate,
96                    int32_t ftabChars,
97                    bool color,
98                    bool entireReverse)
99         {
100                 init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireReverse);
101         }
102
103         EbwtParams(const EbwtParams& eh) {
104                 init(eh._len, eh._lineRate, eh._linesPerSide, eh._offRate,
105                      eh._isaRate, eh._ftabChars, eh._color, eh._entireReverse);
106         }
107
108         void init(uint32_t len, int32_t lineRate, int32_t linesPerSide,
109                   int32_t offRate, int32_t isaRate, int32_t ftabChars,
110                   bool color, bool entireReverse)
111         {
112                 _color = color;
113                 _entireReverse = entireReverse;
114                 _len = len;
115                 _bwtLen = _len + 1;
116                 _sz = (len+3)/4;
117                 _bwtSz = (len/4 + 1);
118                 _lineRate = lineRate;
119                 _linesPerSide = linesPerSide;
120                 _origOffRate = offRate;
121                 _offRate = offRate;
122                 _offMask = 0xffffffff << _offRate;
123                 _isaRate = isaRate;
124                 _isaMask = 0xffffffff << ((_isaRate >= 0) ? _isaRate : 0);
125                 _ftabChars = ftabChars;
126                 _eftabLen = _ftabChars*2;
127                 _eftabSz = _eftabLen*4;
128                 _ftabLen = (1 << (_ftabChars*2))+1;
129                 _ftabSz = _ftabLen*4;
130                 _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
131                 _offsSz = _offsLen*4;
132                 _isaLen = (_isaRate == -1)? 0 : ((_bwtLen + (1 << _isaRate) - 1) >> _isaRate);
133                 _isaSz = _isaLen*4;
134                 _lineSz = 1 << _lineRate;
135                 _sideSz = _lineSz * _linesPerSide;
136                 _sideBwtSz = _sideSz - 8;
137                 _sideBwtLen = _sideBwtSz*4;
138                 _numSidePairs = (_bwtSz+(2*_sideBwtSz)-1)/(2*_sideBwtSz);
139                 _numSides = _numSidePairs*2;
140                 _numLines = _numSides * _linesPerSide;
141                 _ebwtTotLen = _numSidePairs * (2*_sideSz);
142                 _ebwtTotSz = _ebwtTotLen;
143                 assert(repOk());
144         }
145
146         uint32_t len() const           { return _len; }
147         uint32_t bwtLen() const        { return _bwtLen; }
148         uint32_t sz() const            { return _sz; }
149         uint32_t bwtSz() const         { return _bwtSz; }
150         int32_t  lineRate() const      { return _lineRate; }
151         int32_t  linesPerSide() const  { return _linesPerSide; }
152         int32_t  origOffRate() const   { return _origOffRate; }
153         int32_t  offRate() const       { return _offRate; }
154         uint32_t offMask() const       { return _offMask; }
155         int32_t  isaRate() const       { return _isaRate; }
156         uint32_t isaMask() const       { return _isaMask; }
157         int32_t  ftabChars() const     { return _ftabChars; }
158         uint32_t eftabLen() const      { return _eftabLen; }
159         uint32_t eftabSz() const       { return _eftabSz; }
160         uint32_t ftabLen() const       { return _ftabLen; }
161         uint32_t ftabSz() const        { return _ftabSz; }
162         uint32_t offsLen() const       { return _offsLen; }
163         uint32_t offsSz() const        { return _offsSz; }
164         uint32_t isaLen() const        { return _isaLen; }
165         uint32_t isaSz() const         { return _isaSz; }
166         uint32_t lineSz() const        { return _lineSz; }
167         uint32_t sideSz() const        { return _sideSz; }
168         uint32_t sideBwtSz() const     { return _sideBwtSz; }
169         uint32_t sideBwtLen() const    { return _sideBwtLen; }
170         uint32_t numSidePairs() const  { return _numSidePairs; }
171         uint32_t numSides() const      { return _numSides; }
172         uint32_t numLines() const      { return _numLines; }
173         uint32_t ebwtTotLen() const    { return _ebwtTotLen; }
174         uint32_t ebwtTotSz() const     { return _ebwtTotSz; }
175         bool color() const             { return _color; }
176         bool entireReverse() const     { return _entireReverse; }
177
178         /**
179          * Set a new suffix-array sampling rate, which involves updating
180          * rate, mask, sample length, and sample size.
181          */
182         void setOffRate(int __offRate) {
183                 _offRate = __offRate;
184                 _offMask = 0xffffffff << _offRate;
185                 _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
186                 _offsSz = _offsLen*4;
187         }
188
189         /**
190          * Set a new inverse suffix-array sampling rate, which involves
191          * updating rate, mask, sample length, and sample size.
192          */
193         void setIsaRate(int __isaRate) {
194                 _isaRate = __isaRate;
195                 _isaMask = 0xffffffff << _isaRate;
196                 _isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate;
197                 _isaSz = _isaLen*4;
198         }
199
200         /// Check that this EbwtParams is internally consistent
201         bool repOk() const {
202                 assert_gt(_len, 0);
203                 assert_gt(_lineRate, 3);
204                 assert_geq(_offRate, 0);
205                 assert_leq(_ftabChars, 16);
206                 assert_geq(_ftabChars, 1);
207                 assert_lt(_lineRate, 32);
208                 assert_lt(_linesPerSide, 32);
209                 assert_lt(_ftabChars, 32);
210                 assert_eq(0, _ebwtTotSz % (2*_lineSz));
211                 return true;
212         }
213
214         /**
215          * Pretty-print the header contents to the given output stream.
216          */
217         void print(ostream& out) const {
218                 out << "Headers:" << endl
219                     << "    len: "          << _len << endl
220                     << "    bwtLen: "       << _bwtLen << endl
221                     << "    sz: "           << _sz << endl
222                     << "    bwtSz: "        << _bwtSz << endl
223                     << "    lineRate: "     << _lineRate << endl
224                     << "    linesPerSide: " << _linesPerSide << endl
225                     << "    offRate: "      << _offRate << endl
226                     << "    offMask: 0x"    << hex << _offMask << dec << endl
227                     << "    isaRate: "      << _isaRate << endl
228                     << "    isaMask: 0x"    << hex << _isaMask << dec << endl
229                     << "    ftabChars: "    << _ftabChars << endl
230                     << "    eftabLen: "     << _eftabLen << endl
231                     << "    eftabSz: "      << _eftabSz << endl
232                     << "    ftabLen: "      << _ftabLen << endl
233                     << "    ftabSz: "       << _ftabSz << endl
234                     << "    offsLen: "      << _offsLen << endl
235                     << "    offsSz: "       << _offsSz << endl
236                     << "    isaLen: "       << _isaLen << endl
237                     << "    isaSz: "        << _isaSz << endl
238                     << "    lineSz: "       << _lineSz << endl
239                     << "    sideSz: "       << _sideSz << endl
240                     << "    sideBwtSz: "    << _sideBwtSz << endl
241                     << "    sideBwtLen: "   << _sideBwtLen << endl
242                     << "    numSidePairs: " << _numSidePairs << endl
243                     << "    numSides: "     << _numSides << endl
244                     << "    numLines: "     << _numLines << endl
245                     << "    ebwtTotLen: "   << _ebwtTotLen << endl
246                     << "    ebwtTotSz: "    << _ebwtTotSz << endl
247                     << "    reverse: "      << _entireReverse << endl;
248         }
249
250         uint32_t _len;
251         uint32_t _bwtLen;
252         uint32_t _sz;
253         uint32_t _bwtSz;
254         int32_t  _lineRate;
255         int32_t  _linesPerSide;
256         int32_t  _origOffRate;
257         int32_t  _offRate;
258         uint32_t _offMask;
259         int32_t  _isaRate;
260         uint32_t _isaMask;
261         int32_t  _ftabChars;
262         uint32_t _eftabLen;
263         uint32_t _eftabSz;
264         uint32_t _ftabLen;
265         uint32_t _ftabSz;
266         uint32_t _offsLen;
267         uint32_t _offsSz;
268         uint32_t _isaLen;
269         uint32_t _isaSz;
270         uint32_t _lineSz;
271         uint32_t _sideSz;
272         uint32_t _sideBwtSz;
273         uint32_t _sideBwtLen;
274         uint32_t _numSidePairs;
275         uint32_t _numSides;
276         uint32_t _numLines;
277         uint32_t _ebwtTotLen;
278         uint32_t _ebwtTotSz;
279         bool     _color;
280         bool     _entireReverse;
281 };
282
283 /**
284  * Exception to throw when a file-realted error occurs.
285  */
286 class EbwtFileOpenException : public std::runtime_error {
287 public:
288         EbwtFileOpenException(const std::string& msg = "") :
289                 std::runtime_error(msg) { }
290 };
291
292 /**
293  * Calculate size of file with given name.
294  */
295 static inline int64_t fileSize(const char* name) {
296         std::ifstream f;
297         f.open(name, std::ios_base::binary | std::ios_base::in);
298         if (!f.good() || f.eof() || !f.is_open()) { return 0; }
299         f.seekg(0, std::ios_base::beg);
300         std::ifstream::pos_type begin_pos = f.tellg();
301         f.seekg(0, std::ios_base::end);
302         return static_cast<int64_t>(f.tellg() - begin_pos);
303 }
304
305 // Forward declarations for Ebwt class
306 class SideLocus;
307 template<typename TStr> class EbwtSearchParams;
308
309 /**
310  * Extended Burrows-Wheeler transform data.
311  *
312  * An Ebwt may be transferred to and from RAM with calls to
313  * evictFromMemory() and loadIntoMemory().  By default, a newly-created
314  * Ebwt is not loaded into memory; if the user would like to use a
315  * newly-created Ebwt to answer queries, they must first call
316  * loadIntoMemory().
317  */
318 template <typename TStr>
319 class Ebwt {
320 public:
321         typedef typename Value<TStr>::Type TAlphabet;
322
323         #define Ebwt_INITS \
324             _toBigEndian(currentlyBigEndian()), \
325             _overrideOffRate(__overrideOffRate), \
326             _overrideIsaRate(__overrideIsaRate), \
327             _verbose(verbose), \
328             _passMemExc(passMemExc), \
329             _sanity(sanityCheck), \
330             _fw(__fw), \
331             _in1(MM_FILE_INIT), \
332             _in2(MM_FILE_INIT), \
333             _zOff(0xffffffff), \
334             _zEbwtByteOff(0xffffffff), \
335             _zEbwtBpOff(-1), \
336             _nPat(0), \
337             _nFrag(0), \
338             _plen(NULL), \
339             _rstarts(NULL), \
340             _fchr(NULL), \
341             _ftab(NULL), \
342             _eftab(NULL), \
343             _offs(NULL), \
344             _isa(NULL), \
345             _ebwt(NULL), \
346             _useMm(false), \
347             useShmem_(false), \
348             _refnames(), \
349             rmap_(NULL), \
350             mmFile1_(NULL), \
351             mmFile2_(NULL)
352
353 #ifdef EBWT_STATS
354 #define Ebwt_STAT_INITS \
355         ,mapLFExs_(0llu), \
356         mapLFs_(0llu), \
357         mapLFcs_(0llu), \
358         mapLF1cs_(0llu), \
359         mapLF1s_(0llu)
360 #else
361 #define Ebwt_STAT_INITS
362 #endif
363
364         /// Construct an Ebwt from the given input file
365         Ebwt(const string& in,
366              int color,
367              int needEntireReverse,
368              bool __fw,
369              int32_t __overrideOffRate = -1,
370              int32_t __overrideIsaRate = -1,
371              bool useMm = false,
372              bool useShmem = false,
373              bool mmSweep = false,
374              bool loadNames = false,
375              const ReferenceMap* rmap = NULL,
376              bool verbose = false,
377              bool startVerbose = false,
378              bool passMemExc = false,
379              bool sanityCheck = false) :
380              Ebwt_INITS
381              Ebwt_STAT_INITS
382         {
383                 assert(!useMm || !useShmem);
384                 rmap_ = rmap;
385                 _useMm = useMm;
386                 useShmem_ = useShmem;
387                 _in1Str = in + ".1.ebwt";
388                 _in2Str = in + ".2.ebwt";
389                 readIntoMemory(
390                         color,         // expect colorspace reference?
391                         __fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
392                         true,          // stop after loading the header portion?
393                         &_eh,          // params structure to fill in
394                         mmSweep,       // mmSweep
395                         loadNames,     // loadNames
396                         startVerbose); // startVerbose
397                 // If the offRate has been overridden, reflect that in the
398                 // _eh._offRate field
399                 if(_overrideOffRate > _eh._offRate) {
400                         _eh.setOffRate(_overrideOffRate);
401                         assert_eq(_overrideOffRate, _eh._offRate);
402                 }
403                 // Same with isaRate
404                 if(_overrideIsaRate > _eh._isaRate) {
405                         _eh.setIsaRate(_overrideIsaRate);
406                         assert_eq(_overrideIsaRate, _eh._isaRate);
407                 }
408                 assert(repOk());
409         }
410
411         /// Construct an Ebwt from the given header parameters and string
412         /// vector, optionally using a blockwise suffix sorter with the
413         /// given 'bmax' and 'dcv' parameters.  The string vector is
414         /// ultimately joined and the joined string is passed to buildToDisk().
415         Ebwt(int color,
416              int32_t lineRate,
417              int32_t linesPerSide,
418              int32_t offRate,
419              int32_t isaRate,
420              int32_t ftabChars,
421              const string& file,   // base filename for EBWT files
422              bool __fw,
423              bool useBlockwise,
424              uint32_t bmax,
425              uint32_t bmaxSqrtMult,
426              uint32_t bmaxDivN,
427              int dcv,
428              vector<FileBuf*>& is,
429              vector<RefRecord>& szs,
430              vector<uint32_t>& plens,
431              uint32_t sztot,
432              const RefReadInParams& refparams,
433              uint32_t seed,
434              int32_t __overrideOffRate = -1,
435              int32_t __overrideIsaRate = -1,
436              bool verbose = false,
437              bool passMemExc = false,
438              bool sanityCheck = false) :
439              Ebwt_INITS
440              Ebwt_STAT_INITS,
441              _eh(joinedLen(szs),
442                  lineRate,
443                  linesPerSide,
444                  offRate,
445                  isaRate,
446                  ftabChars,
447                  color,
448                  refparams.reverse == REF_READ_REVERSE)
449         {
450                 _in1Str = file + ".1.ebwt";
451                 _in2Str = file + ".2.ebwt";
452                 // Open output files
453                 ofstream fout1(_in1Str.c_str(), ios::binary);
454                 if(!fout1.good()) {
455                         cerr << "Could not open index file for writing: \"" << _in1Str << "\"" << endl
456                              << "Please make sure the directory exists and that permissions allow writing by" << endl
457                              << "Bowtie." << endl;
458                         throw 1;
459                 }
460                 ofstream fout2(_in2Str.c_str(), ios::binary);
461                 if(!fout2.good()) {
462                         cerr << "Could not open index file for writing: \"" << _in2Str << "\"" << endl
463                              << "Please make sure the directory exists and that permissions allow writing by" << endl
464                              << "Bowtie." << endl;
465                         throw 1;
466                 }
467                 // Build
468                 initFromVector(
469                         is,
470                         szs,
471                         plens,
472                         sztot,
473                         refparams,
474                         fout1,
475                         fout2,
476                         useBlockwise,
477                         bmax,
478                         bmaxSqrtMult,
479                         bmaxDivN,
480                         dcv,
481                         seed);
482                 // Close output files
483                 fout1.flush();
484                 int64_t tellpSz1 = (int64_t)fout1.tellp();
485                 VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str);
486                 fout1.close();
487                 bool err = false;
488                 if(tellpSz1 > fileSize(_in1Str.c_str())) {
489                         err = true;
490                         cerr << "Index is corrupt: File size for " << _in1Str << " should have been " << tellpSz1
491                              << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
492                 }
493                 fout2.flush();
494                 int64_t tellpSz2 = (int64_t)fout2.tellp();
495                 VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str);
496                 fout2.close();
497                 if(tellpSz2 > fileSize(_in2Str.c_str())) {
498                         err = true;
499                         cerr << "Index is corrupt: File size for " << _in2Str << " should have been " << tellpSz2
500                              << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
501                 }
502                 if(err) {
503                         cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
504                         throw 1;
505                 }
506                 // Reopen as input streams
507                 VMSG_NL("Re-opening _in1 and _in2 as input streams");
508                 if(_sanity) {
509                         VMSG_NL("Sanity-checking Ebwt");
510                         assert(!isInMemory());
511                         readIntoMemory(
512                                 color,
513                                 __fw ? -1 : refparams.reverse == REF_READ_REVERSE,
514                                 false,
515                                 NULL,
516                                 false,
517                                 true,
518                                 false);
519                         sanityCheckAll(refparams.reverse);
520                         evictFromMemory();
521                         assert(!isInMemory());
522                 }
523                 VMSG_NL("Returning from Ebwt constructor");
524         }
525
526         bool isPacked();
527
528         /**
529          * Write the rstarts array given the szs array for the reference.
530          */
531         void szsToDisk(const vector<RefRecord>& szs, ostream& os, int reverse) {
532                 size_t seq = 0;
533                 uint32_t off = 0;
534                 uint32_t totlen = 0;
535                 for(unsigned int i = 0; i < szs.size(); i++) {
536                         if(szs[i].len == 0) continue;
537                         if(szs[i].first) off = 0;
538                         off += szs[i].off;
539 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
540                         if(szs[i].first && szs[i].len > 0) seq++;
541 #else
542                         if(szs[i].first) seq++;
543 #endif
544                         size_t seqm1 = seq-1;
545                         assert_lt(seqm1, _nPat);
546                         size_t fwoff = off;
547                         if(reverse == REF_READ_REVERSE) {
548                                 // Invert pattern idxs
549                                 seqm1 = _nPat - seqm1 - 1;
550                                 // Invert pattern idxs
551                                 assert_leq(off + szs[i].len, _plen[seqm1]);
552                                 fwoff = _plen[seqm1] - (off + szs[i].len);
553                         }
554                         writeU32(os, totlen, this->toBe()); // offset from beginning of joined string
555                         writeU32(os, seqm1,  this->toBe()); // sequence id
556                         writeU32(os, fwoff,  this->toBe()); // offset into sequence
557                         totlen += szs[i].len;
558                         off += szs[i].len;
559                 }
560         }
561
562         /**
563          * Helper for the constructors above.  Takes a vector of text
564          * strings and joins them into a single string with a call to
565          * joinToDisk, which does a join (with padding) and writes some of
566          * the resulting data directly to disk rather than keep it in
567          * memory.  It then constructs a suffix-array producer (what kind
568          * depends on 'useBlockwise') for the resulting sequence.  The
569          * suffix-array producer can then be used to obtain chunks of the
570          * joined string's suffix array.
571          */
572         void initFromVector(
573                 vector<FileBuf*>& is,
574                 vector<RefRecord>& szs,
575                 vector<uint32_t>& plens,
576                 uint32_t sztot,
577                 const RefReadInParams& refparams,
578                 ofstream& out1,
579                 ofstream& out2,
580                 bool useBlockwise,
581                 uint32_t bmax,
582                 uint32_t bmaxSqrtMult,
583                 uint32_t bmaxDivN,
584                 int dcv,
585                 uint32_t seed)
586         {
587                 // Compose text strings into single string
588                 VMSG_NL("Calculating joined length");
589                 TStr s; // holds the entire joined reference after call to joinToDisk
590                 uint32_t jlen;
591                 jlen = joinedLen(szs);
592                 assert_geq(jlen, sztot);
593                 VMSG_NL("Writing header");
594                 writeFromMemory(true, out1, out2);
595                 try {
596                         VMSG_NL("Reserving space for joined string");
597                         seqan::reserve(s, jlen, Exact());
598                         VMSG_NL("Joining reference sequences");
599                         if(refparams.reverse == REF_READ_REVERSE) {
600                                 {
601                                         Timer timer(cout, "  Time to join reference sequences: ", _verbose);
602                                         joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
603                                 } {
604                                         Timer timer(cout, "  Time to reverse reference sequence: ", _verbose);
605                                         vector<RefRecord> tmp;
606                                         reverseInPlace(s);
607                                         reverseRefRecords(szs, tmp, false, false);
608                                         szsToDisk(tmp, out1, refparams.reverse);
609                                 }
610                         } else {
611                                 Timer timer(cout, "  Time to join reference sequences: ", _verbose);
612                                 joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
613                                 szsToDisk(szs, out1, refparams.reverse);
614                         }
615                         // Joined reference sequence now in 's'
616                 } catch(bad_alloc& e) {
617                         // If we throw an allocation exception in the try block,
618                         // that means that the joined version of the reference
619                         // string itself is too larger to fit in memory.  The only
620                         // alternatives are to tell the user to give us more memory
621                         // or to try again with a packed representation of the
622                         // reference (if we haven't tried that already).
623                         cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
624                         if(!isPacked() && _passMemExc) {
625                                 // Pass the exception up so that we can retry using a
626                                 // packed string representation
627                                 throw e;
628                         }
629                         // There's no point passing this exception on.  The fact
630                         // that we couldn't allocate the joined string means that
631                         // --bmax is irrelevant - the user should re-run with
632                         // ebwt-build-packed
633                         if(isPacked()) {
634                                 cerr << "Please try running bowtie-build on a computer with more memory." << endl;
635                         } else {
636                                 cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
637                                      << "mode (-a/--auto), or try again on a computer with more memory." << endl;
638                         }
639                         if(sizeof(void*) == 4) {
640                                 cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
641                                      << "this executable is 32-bit." << endl;
642                         }
643                         throw 1;
644                 }
645                 // Succesfully obtained joined reference string
646                 assert_geq(length(s), jlen);
647                 if(bmax != 0xffffffff) {
648                         VMSG_NL("bmax according to bmax setting: " << bmax);
649                 }
650                 else if(bmaxSqrtMult != 0xffffffff) {
651                         bmax *= bmaxSqrtMult;
652                         VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
653                 }
654                 else if(bmaxDivN != 0xffffffff) {
655                         bmax = max<uint32_t>(jlen / bmaxDivN, 1);
656                         VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
657                 }
658                 else {
659                         bmax = (uint32_t)sqrt(length(s));
660                         VMSG_NL("bmax defaulted to: " << bmax);
661                 }
662                 int iter = 0;
663                 bool first = true;
664                 // Look for bmax/dcv parameters that work.
665                 while(true) {
666                         if(!first && bmax < 40 && _passMemExc) {
667                                 cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
668                                 if(!isPacked()) {
669                                         // Throw an exception exception so that we can
670                                         // retry using a packed string representation
671                                         throw bad_alloc();
672                                 } else {
673                                         cerr << "Already tried a packed string representation." << endl;
674                                 }
675                                 cerr << "Please try indexing this reference on a computer with more memory." << endl;
676                                 if(sizeof(void*) == 4) {
677                                         cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
678                                                  << "this executable is 32-bit." << endl;
679                                 }
680                                 throw 1;
681                         }
682                         if(dcv > 4096) dcv = 4096;
683                         if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
684                                 dcv <<= 1; // double difference-cover period
685                         } else {
686                                 bmax -= (bmax >> 2); // reduce by 25%
687                         }
688                         VMSG("Using parameters --bmax " << bmax);
689                         if(dcv == 0) {
690                                 VMSG_NL(" and *no difference cover*");
691                         } else {
692                                 VMSG_NL(" --dcv " << dcv);
693                         }
694                         iter++;
695                         try {
696                                 {
697                                         VMSG_NL("  Doing ahead-of-time memory usage test");
698                                         // Make a quick-and-dirty attempt to force a bad_alloc iff
699                                         // we would have thrown one eventually as part of
700                                         // constructing the DifferenceCoverSample
701                                         dcv <<= 1;
702                                         size_t sz = DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
703                                         AutoArray<uint8_t> tmp(sz);
704                                         dcv >>= 1;
705                                         // Likewise with the KarkkainenBlockwiseSA
706                                         sz = KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
707                                         AutoArray<uint8_t> tmp2(sz);
708                                         // Now throw in the 'ftab' and 'isaSample' structures
709                                         // that we'll eventually allocate in buildToDisk
710                                         AutoArray<uint32_t> ftab(_eh._ftabLen * 2);
711                                         AutoArray<uint8_t> side(_eh._sideSz);
712                                         // Grab another 20 MB out of caution
713                                         AutoArray<uint32_t> extra(20*1024*1024);
714                                         // If we made it here without throwing bad_alloc, then we
715                                         // passed the memory-usage stress test
716                                         VMSG("  Passed!  Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
717                                         if(isPacked()) {
718                                                 VMSG(" --packed");
719                                         }
720                                         VMSG_NL("");
721                                 }
722                                 VMSG_NL("Constructing suffix-array element generator");
723                                 KarkkainenBlockwiseSA<TStr> bsa(s, bmax, dcv, seed, _sanity, _passMemExc, _verbose);
724                                 assert(bsa.suffixItrIsReset());
725                                 assert_eq(bsa.size(), length(s)+1);
726                                 VMSG_NL("Converting suffix-array elements to index image");
727                                 buildToDisk(bsa, s, out1, out2);
728                                 out1.flush(); out2.flush();
729                                 if(out1.fail() || out2.fail()) {
730                                         cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
731                                         throw 1;
732                                 }
733                                 break;
734                         } catch(bad_alloc& e) {
735                                 if(_passMemExc) {
736                                         VMSG_NL("  Ran out of memory; automatically trying more memory-economical parameters.");
737                                 } else {
738                                         cerr << "Out of memory while constructing suffix array.  Please try using a smaller" << endl
739                                                  << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
740                                         throw 1;
741                                 }
742                         }
743                         first = false;
744                 }
745                 assert(repOk());
746                 // Now write reference sequence names on the end
747 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
748                 assert_geq(this->_refnames.size(), this->_nPat);
749 #else
750                 assert_eq(this->_refnames.size(), this->_nPat);
751 #endif
752                 for(size_t i = 0; i < this->_refnames.size(); i++) {
753                         out1 << this->_refnames[i] << endl;
754                 }
755                 out1 << '\0';
756                 out1.flush(); out2.flush();
757                 if(out1.fail() || out2.fail()) {
758                         cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
759                         throw 1;
760                 }
761                 VMSG_NL("Returning from initFromVector");
762         }
763
764         /**
765          * Return the length that the joined string of the given string
766          * list will have.  Note that this is indifferent to how the text
767          * fragments correspond to input sequences - it just cares about
768          * the lengths of the fragments.
769          */
770         uint32_t joinedLen(vector<RefRecord>& szs) {
771                 uint32_t ret = 0;
772                 for(unsigned int i = 0; i < szs.size(); i++) {
773                         ret += szs[i].len;
774                 }
775                 return ret;
776         }
777
778         /// Destruct an Ebwt
779         ~Ebwt() {
780                 // Only free buffers if we're *not* using memory-mapped files
781                 if(!_useMm) {
782                         // Delete everything that was allocated in read(false, ...)
783                         if(_fchr    != NULL) delete[] _fchr;    _fchr    = NULL;
784                         if(_ftab    != NULL) delete[] _ftab;    _ftab    = NULL;
785                         if(_eftab   != NULL) delete[] _eftab;   _eftab   = NULL;
786                         if(_offs != NULL && !useShmem_) {
787                                 delete[] _offs; _offs = NULL;
788                         } else if(_offs != NULL && useShmem_) {
789                                 FREE_SHARED(_offs);
790                         }
791                         if(_isa     != NULL) delete[] _isa;     _isa     = NULL;
792                         if(_plen    != NULL) delete[] _plen;    _plen    = NULL;
793                         if(_rstarts != NULL) delete[] _rstarts; _rstarts = NULL;
794                         if(_ebwt != NULL && !useShmem_) {
795                                 delete[] _ebwt; _ebwt = NULL;
796                         } else if(_ebwt != NULL && useShmem_) {
797                                 FREE_SHARED(_ebwt);
798                         }
799                 }
800                 MM_FILE_CLOSE(_in1);
801                 MM_FILE_CLOSE(_in2);
802 #ifdef EBWT_STATS
803                 cout << (_fw ? "Forward index:" : "Mirror index:") << endl;
804                 cout << "  mapLFEx:   " << mapLFExs_ << endl;
805                 cout << "  mapLF:     " << mapLFs_   << endl;
806                 cout << "  mapLF(c):  " << mapLFcs_  << endl;
807                 cout << "  mapLF1(c): " << mapLF1cs_ << endl;
808                 cout << "  mapLF(c):  " << mapLF1s_  << endl;
809 #endif
810         }
811
812         /// Accessors
813         const EbwtParams& eh() const     { return _eh; }
814         uint32_t    zOff() const         { return _zOff; }
815         uint32_t    zEbwtByteOff() const { return _zEbwtByteOff; }
816         int         zEbwtBpOff() const   { return _zEbwtBpOff; }
817         uint32_t    nPat() const         { return _nPat; }
818         uint32_t    nFrag() const        { return _nFrag; }
819         uint32_t*   fchr() const         { return _fchr; }
820         uint32_t*   ftab() const         { return _ftab; }
821         uint32_t*   eftab() const        { return _eftab; }
822         uint32_t*   offs() const         { return _offs; }
823         uint32_t*   isa() const          { return _isa; }
824         uint32_t*   plen() const         { return _plen; }
825         uint32_t*   rstarts() const      { return _rstarts; }
826         uint8_t*    ebwt() const         { return _ebwt; }
827         const ReferenceMap* rmap() const { return rmap_; }
828         bool        toBe() const         { return _toBigEndian; }
829         bool        verbose() const      { return _verbose; }
830         bool        sanityCheck() const  { return _sanity; }
831         vector<string>& refnames()       { return _refnames; }
832         bool        fw() const           { return _fw; }
833
834         /// Return true iff the Ebwt is currently in memory
835         bool isInMemory() const {
836                 if(_ebwt != NULL) {
837                         assert(_eh.repOk());
838                         assert(_ftab != NULL);
839                         assert(_eftab != NULL);
840                         assert(_fchr != NULL);
841                         assert(_offs != NULL);
842                         assert(_isa != NULL);
843                         assert(_rstarts != NULL);
844                         assert_neq(_zEbwtByteOff, 0xffffffff);
845                         assert_neq(_zEbwtBpOff, -1);
846                         return true;
847                 } else {
848                         assert(_ftab == NULL);
849                         assert(_eftab == NULL);
850                         assert(_fchr == NULL);
851                         assert(_offs == NULL);
852                         assert(_rstarts == NULL);
853                         assert_eq(_zEbwtByteOff, 0xffffffff);
854                         assert_eq(_zEbwtBpOff, -1);
855                         return false;
856                 }
857         }
858
859         /// Return true iff the Ebwt is currently stored on disk
860         bool isEvicted() const {
861                 return !isInMemory();
862         }
863
864         /**
865          * Load this Ebwt into memory by reading it in from the _in1 and
866          * _in2 streams.
867          */
868         void loadIntoMemory(
869                 int color,
870                 int needEntireReverse,
871                 bool loadNames,
872                 bool verbose)
873         {
874                 readIntoMemory(
875                         color,      // expect index to be colorspace?
876                         needEntireReverse, // require reverse index to be concatenated reference reversed
877                         false,      // stop after loading the header portion?
878                         NULL,       // params
879                         false,      // mmSweep
880                         loadNames,  // loadNames
881                         verbose);   // startVerbose
882         }
883
884         /**
885          * Frees memory associated with the Ebwt.
886          */
887         void evictFromMemory() {
888                 assert(isInMemory());
889                 if(!_useMm) {
890                         delete[] _fchr;
891                         delete[] _ftab;
892                         delete[] _eftab;
893                         if(!useShmem_) delete[] _offs;
894                         delete[] _isa;
895                         // Keep plen; it's small and the client may want to query it
896                         // even when the others are evicted.
897                         //delete[] _plen;
898                         delete[] _rstarts;
899                         if(!useShmem_) delete[] _ebwt;
900                 }
901                 _fchr  = NULL;
902                 _ftab  = NULL;
903                 _eftab = NULL;
904                 _offs  = NULL;
905                 _isa   = NULL;
906                 // Keep plen; it's small and the client may want to query it
907                 // even when the others are evicted.
908                 //_plen  = NULL;
909                 _rstarts = NULL;
910                 _ebwt    = NULL;
911                 _zEbwtByteOff = 0xffffffff;
912                 _zEbwtBpOff = -1;
913         }
914
915         /**
916          * Non-static facade for static function ftabHi.
917          */
918         uint32_t ftabHi(uint32_t i) const {
919                 return Ebwt::ftabHi(_ftab, _eftab, _eh._len, _eh._ftabLen,
920                                     _eh._eftabLen, i);
921         }
922
923         /**
924          * Get "high interpretation" of ftab entry at index i.  The high
925          * interpretation of a regular ftab entry is just the entry
926          * itself.  The high interpretation of an extended entry is the
927          * second correpsonding ui32 in the eftab.
928          *
929          * It's a static member because it's convenient to ask this
930          * question before the Ebwt is fully initialized.
931          */
932         static uint32_t ftabHi(uint32_t *ftab,
933                                uint32_t *eftab,
934                                uint32_t len,
935                                uint32_t ftabLen,
936                                uint32_t eftabLen,
937                                uint32_t i)
938         {
939                 assert_lt(i, ftabLen);
940                 if(ftab[i] <= len) {
941                         return ftab[i];
942                 } else {
943                         uint32_t efIdx = ftab[i] ^ 0xffffffff;
944                         assert_lt(efIdx*2+1, eftabLen);
945                         return eftab[efIdx*2+1];
946                 }
947         }
948
949         /**
950          * Non-static facade for static function ftabLo.
951          */
952         uint32_t ftabLo(uint32_t i) const {
953                 return Ebwt::ftabLo(_ftab, _eftab, _eh._len, _eh._ftabLen,
954                                     _eh._eftabLen, i);
955         }
956
957         /**
958          * Get "low interpretation" of ftab entry at index i.  The low
959          * interpretation of a regular ftab entry is just the entry
960          * itself.  The low interpretation of an extended entry is the
961          * first correpsonding ui32 in the eftab.
962          *
963          * It's a static member because it's convenient to ask this
964          * question before the Ebwt is fully initialized.
965          */
966         static uint32_t ftabLo(uint32_t *ftab,
967                                uint32_t *eftab,
968                                uint32_t len,
969                                uint32_t ftabLen,
970                                uint32_t eftabLen,
971                                uint32_t i)
972         {
973                 assert_lt(i, ftabLen);
974                 if(ftab[i] <= len) {
975                         return ftab[i];
976                 } else {
977                         uint32_t efIdx = ftab[i] ^ 0xffffffff;
978                         assert_lt(efIdx*2+1, eftabLen);
979                         return eftab[efIdx*2];
980                 }
981         }
982
983         /**
984          * When using read() to create an Ebwt, we have to set a couple of
985          * additional fields in the Ebwt object that aren't part of the
986          * parameter list and are not stored explicitly in the file.  Right
987          * now, this just involves initializing _zEbwtByteOff and
988          * _zEbwtBpOff from _zOff.
989          */
990         void postReadInit(EbwtParams& eh) {
991                 uint32_t sideNum     = _zOff / eh._sideBwtLen;
992                 uint32_t sideCharOff = _zOff % eh._sideBwtLen;
993                 uint32_t sideByteOff = sideNum * eh._sideSz;
994                 _zEbwtByteOff = sideCharOff >> 2;
995                 assert_lt(_zEbwtByteOff, eh._sideBwtSz);
996                 _zEbwtBpOff = sideCharOff & 3;
997                 assert_lt(_zEbwtBpOff, 4);
998                 if((sideNum & 1) == 0) {
999                         // This is an even (backward) side
1000                         _zEbwtByteOff = eh._sideBwtSz - _zEbwtByteOff - 1;
1001                         _zEbwtBpOff = 3 - _zEbwtBpOff;
1002                         assert_lt(_zEbwtBpOff, 4);
1003                 }
1004                 _zEbwtByteOff += sideByteOff;
1005                 assert(repOk(eh)); // Ebwt should be fully initialized now
1006         }
1007
1008         /**
1009          * Pretty-print the Ebwt to the given output stream.
1010          */
1011         void print(ostream& out) const {
1012                 print(out, _eh);
1013         }
1014
1015         /**
1016          * Pretty-print the Ebwt and given EbwtParams to the given output
1017          * stream.
1018          */
1019         void print(ostream& out, const EbwtParams& eh) const {
1020                 eh.print(out); // print params
1021                 out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl
1022                     << "    zOff: "         << _zOff << endl
1023                     << "    zEbwtByteOff: " << _zEbwtByteOff << endl
1024                     << "    zEbwtBpOff: "   << _zEbwtBpOff << endl
1025                     << "    nPat: "  << _nPat << endl
1026                     << "    plen: ";
1027                 if(_plen == NULL) {
1028                         out << "NULL" << endl;
1029                 } else {
1030                         out << "non-NULL, [0] = " << _plen[0] << endl;
1031                 }
1032                 out << "    rstarts: ";
1033                 if(_rstarts == NULL) {
1034                         out << "NULL" << endl;
1035                 } else {
1036                         out << "non-NULL, [0] = " << _rstarts[0] << endl;
1037                 }
1038                 out << "    ebwt: ";
1039                 if(_ebwt == NULL) {
1040                         out << "NULL" << endl;
1041                 } else {
1042                         out << "non-NULL, [0] = " << _ebwt[0] << endl;
1043                 }
1044                 out << "    fchr: ";
1045                 if(_fchr == NULL) {
1046                         out << "NULL" << endl;
1047                 } else {
1048                         out << "non-NULL, [0] = " << _fchr[0] << endl;
1049                 }
1050                 out << "    ftab: ";
1051                 if(_ftab == NULL) {
1052                         out << "NULL" << endl;
1053                 } else {
1054                         out << "non-NULL, [0] = " << _ftab[0] << endl;
1055                 }
1056                 out << "    eftab: ";
1057                 if(_eftab == NULL) {
1058                         out << "NULL" << endl;
1059                 } else {
1060                         out << "non-NULL, [0] = " << _eftab[0] << endl;
1061                 }
1062                 out << "    offs: ";
1063                 if(_offs == NULL) {
1064                         out << "NULL" << endl;
1065                 } else {
1066                         out << "non-NULL, [0] = " << _offs[0] << endl;
1067                 }
1068         }
1069
1070         // Building
1071         static TStr join(vector<TStr>& l, uint32_t seed);
1072         static TStr join(vector<FileBuf*>& l, vector<RefRecord>& szs, uint32_t sztot, const RefReadInParams& refparams, uint32_t seed);
1073         void joinToDisk(vector<FileBuf*>& l, vector<RefRecord>& szs, vector<uint32_t>& plens, uint32_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed);
1074         void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2);
1075
1076         // I/O
1077         void readIntoMemory(int color, int needEntireReverse, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
1078         void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
1079         void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
1080
1081         // Sanity checking
1082         void printRangeFw(uint32_t begin, uint32_t end) const;
1083         void printRangeBw(uint32_t begin, uint32_t end) const;
1084         void sanityCheckUpToSide(int upToSide) const;
1085         void sanityCheckAll(int reverse) const;
1086         void restore(TStr& s) const;
1087         void checkOrigs(const vector<String<Dna5> >& os, bool color, bool mirror) const;
1088
1089         // Searching and reporting
1090         void joinedToTextOff(uint32_t qlen, uint32_t off, uint32_t& tidx, uint32_t& textoff, uint32_t& tlen) const;
1091         inline bool report(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<uint32_t>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, uint32_t off, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params) const;
1092         inline bool reportChaseOne(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<uint32_t>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
1093         inline bool reportReconstruct(const String<Dna5>& query, String<char>* quals, String<char>* name, String<Dna5>& lbuf, String<Dna5>& rbuf, const uint32_t *mmui32, const char* refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
1094         inline int rowL(const SideLocus& l) const;
1095         inline uint32_t countUpTo(const SideLocus& l, int c) const;
1096         inline void countUpToEx(const SideLocus& l, uint32_t* pairs) const;
1097         inline uint32_t countFwSide(const SideLocus& l, int c) const;
1098         inline void countFwSideEx(const SideLocus& l, uint32_t *pairs) const;
1099         inline uint32_t countBwSide(const SideLocus& l, int c) const;
1100         inline void countBwSideEx(const SideLocus& l, uint32_t *pairs) const;
1101         inline uint32_t mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
1102         inline void mapLFEx(const SideLocus& l, uint32_t *pairs ASSERT_ONLY(, bool overrideSanity = false)) const;
1103         inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, uint32_t *tops, uint32_t *bots ASSERT_ONLY(, bool overrideSanity = false)) const;
1104         inline uint32_t mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
1105         inline uint32_t mapLF1(uint32_t row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
1106         inline int mapLF1(uint32_t& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
1107         /// Check that in-memory Ebwt is internally consistent with respect
1108         /// to given EbwtParams; assert if not
1109         bool inMemoryRepOk(const EbwtParams& eh) const {
1110                 assert_leq(ValueSize<TAlphabet>::VALUE, 4);
1111                 assert_geq(_zEbwtBpOff, 0);
1112                 assert_lt(_zEbwtBpOff, 4);
1113                 assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
1114                 assert_lt(_zOff, eh._bwtLen);
1115                 assert(_rstarts != NULL);
1116                 assert_geq(_nFrag, _nPat);
1117                 return true;
1118         }
1119
1120         /// Check that in-memory Ebwt is internally consistent; assert if
1121         /// not
1122         bool inMemoryRepOk() const {
1123                 return repOk(_eh);
1124         }
1125
1126         /// Check that Ebwt is internally consistent with respect to given
1127         /// EbwtParams; assert if not
1128         bool repOk(const EbwtParams& eh) const {
1129                 assert(_eh.repOk());
1130                 if(isInMemory()) {
1131                         return inMemoryRepOk(eh);
1132                 }
1133                 return true;
1134         }
1135
1136         /// Check that Ebwt is internally consistent; assert if not
1137         bool repOk() const {
1138                 return repOk(_eh);
1139         }
1140
1141         bool       _toBigEndian;
1142         int32_t    _overrideOffRate;
1143         int32_t    _overrideIsaRate;
1144         bool       _verbose;
1145         bool       _passMemExc;
1146         bool       _sanity;
1147         bool       _fw;     // true iff this is a forward index
1148         MM_FILE  _in1;    // input fd for primary index file
1149         MM_FILE  _in2;    // input fd for secondary index file
1150         string     _in1Str; // filename for primary index file
1151         string     _in2Str; // filename for secondary index file
1152         uint32_t   _zOff;
1153         uint32_t   _zEbwtByteOff;
1154         int        _zEbwtBpOff;
1155         uint32_t   _nPat;  /// number of reference texts
1156         uint32_t   _nFrag; /// number of fragments
1157         uint32_t*  _plen;
1158         uint32_t*  _rstarts; // starting offset of fragments / text indexes
1159         // _fchr, _ftab and _eftab are expected to be relatively small
1160         // (usually < 1MB, perhaps a few MB if _fchr is particularly large
1161         // - like, say, 11).  For this reason, we don't bother with writing
1162         // them to disk through separate output streams; we
1163         uint32_t*  _fchr;
1164         uint32_t*  _ftab;
1165         uint32_t*  _eftab; // "extended" entries for _ftab
1166         // _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
1167         // offset every 16 rows), the total size of _offs is the same as
1168         // the total size of the input sequence
1169         uint32_t*  _offs;
1170         uint32_t*  _isa;
1171         // _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
1172         // is at least as large as the input sequence.
1173         uint8_t*   _ebwt;
1174         bool       _useMm;        /// use memory-mapped files to hold the index
1175         bool       useShmem_;     /// use shared memory to hold large parts of the index
1176         vector<string> _refnames; /// names of the reference sequences
1177         const ReferenceMap* rmap_; /// mapping into another reference coordinate space
1178         char *mmFile1_;
1179         char *mmFile2_;
1180         EbwtParams _eh;
1181
1182 #ifdef EBWT_STATS
1183         uint64_t   mapLFExs_;
1184         uint64_t   mapLFs_;
1185         uint64_t   mapLFcs_;
1186 #endif
1187
1188 private:
1189
1190         ostream& log() const {
1191                 return cout; // TODO: turn this into a parameter
1192         }
1193
1194         /// Print a verbose message and flush (flushing is helpful for
1195         /// debugging)
1196         void verbose(const string& s) const {
1197                 if(this->verbose()) {
1198                         this->log() << s;
1199                         this->log().flush();
1200                 }
1201         }
1202 };
1203
1204 /// Specialization for packed Ebwts - return true
1205 template<>
1206 bool Ebwt<String<Dna, Packed<> > >::isPacked() {
1207         return true;
1208 }
1209
1210 /// By default, Ebwts are not packed
1211 template<typename TStr>
1212 bool Ebwt<TStr>::isPacked() {
1213         return false;
1214 }
1215
1216 /**
1217  * Structure encapsulating search parameters, such as whether and how
1218  * to backtrack and how to deal with multiple equally-good hits.
1219  */
1220 template<typename TStr>
1221 class EbwtSearchParams {
1222 public:
1223         EbwtSearchParams(HitSinkPerThread& sink,
1224                          const vector<String<Dna5> >& texts,
1225                          bool fw = true,
1226                          bool ebwtFw = true) :
1227                 _sink(sink),
1228                 _texts(texts),
1229                 _patid(0xffffffff),
1230                 _fw(fw) { }
1231
1232         HitSinkPerThread& sink() const { return _sink; }
1233         void setPatId(uint32_t patid)  { _patid = patid; }
1234         uint32_t patId() const         { return _patid; }
1235         void setFw(bool fw)            { _fw = fw; }
1236         bool fw() const                { return _fw; }
1237         /**
1238          * Report a hit.  Returns true iff caller can call off the search.
1239          */
1240         bool reportHit(const String<Dna5>& query, // read sequence
1241                        String<char>* quals, // read quality values
1242                        String<char>* name,  // read name
1243                        bool color,          // true -> read is colorspace
1244                        bool colExEnds,      // true -> exclude nucleotides at extreme ends after decoding
1245                        int snpPhred,        // penalty for a SNP
1246                        const BitPairReference* ref, // reference (= NULL if not necessary)
1247                        const ReferenceMap* rmap, // map to another reference coordinate system
1248                        bool ebwtFw,         // whether index is forward (true) or mirror (false)
1249                        const std::vector<uint32_t>& mmui32, // mismatch list
1250                        const std::vector<uint8_t>& refcs,  // reference characters
1251                        size_t numMms,      // # mismatches
1252                        U32Pair h,          // ref coords
1253                        U32Pair mh,         // mate's ref coords
1254                        bool mfw,           // mate's orientation
1255                        uint16_t mlen,      // mate length
1256                        U32Pair a,          // arrow pair
1257                        uint32_t tlen,      // length of text
1258                        uint32_t qlen,      // length of query
1259                        int stratum,        // alignment stratum
1260                        uint16_t cost,      // cost of alignment
1261                        uint32_t oms,       // approx. # other valid alignments
1262                        uint32_t patid,
1263                        uint32_t seed,
1264                        uint8_t mate) const
1265         {
1266 #ifndef NDEBUG
1267                 // Check that no two elements of the mms array are the same
1268                 for(size_t i = 0; i < numMms; i++) {
1269                         for(size_t j = i+1; j < numMms; j++) {
1270                                 assert_neq(mmui32[i], mmui32[j]);
1271                         }
1272                 }
1273 #endif
1274                 // If ebwtFw is true, then 'query' and 'quals' are reversed
1275                 // If _fw is false, then 'query' and 'quals' are reverse complemented
1276                 assert(!color || ref != NULL);
1277                 assert(quals != NULL);
1278                 assert(name != NULL);
1279                 assert_eq(mmui32.size(), refcs.size());
1280                 assert_leq(numMms, mmui32.size());
1281                 assert_gt(qlen, 0);
1282                 Hit hit;
1283                 hit.stratum = stratum;
1284                 hit.cost = cost;
1285                 hit.patSeq = query;
1286                 hit.quals = *quals;
1287                 if(!ebwtFw) {
1288                         // Re-reverse the pattern and the quals back to how they
1289                         // appeared in the read file
1290                         ::reverseInPlace(hit.patSeq);
1291                         ::reverseInPlace(hit.quals);
1292                 }
1293                 if(color) {
1294                         hit.colSeq = hit.patSeq;
1295                         hit.colQuals = hit.quals;
1296                         hit.crefcs.resize(qlen, 0);
1297                         // Turn the mmui32 and refcs arrays into the mm FixedBitset and
1298                         // the refc vector
1299                         for(size_t i = 0; i < numMms; i++) {
1300                                 if (ebwtFw != _fw) {
1301                                         // The 3' end is on the left but the mm vector encodes
1302                                         // mismatches w/r/t the 5' end, so we flip
1303                                         uint32_t off = qlen - mmui32[i] - 1;
1304                                         hit.cmms.set(off);
1305                                         hit.crefcs[off] = refcs[i];
1306                                 } else {
1307                                         hit.cmms.set(mmui32[i]);
1308                                         hit.crefcs[i] = refcs[i];
1309                                 }
1310                         }
1311                         assert(ref != NULL);
1312                         char read[1024];
1313                         uint32_t rfbuf[(1024+16)/4];
1314                         ASSERT_ONLY(char rfbuf2[1024]);
1315                         char qual[1024];
1316                         char ns[1024];
1317                         char cmm[1024];
1318                         char nmm[1024];
1319                         int cmms = 0;
1320                         int nmms = 0;
1321                         // TODO: account for indels when calculating these bounds
1322                         size_t readi = 0;
1323                         size_t readf = seqan::length(hit.patSeq);
1324                         size_t refi = 0;
1325                         size_t reff = readf + 1;
1326                         bool maqRound = false;
1327                         for(size_t i = 0; i < qlen + 1; i++) {
1328                                 if(i < qlen) {
1329                                         read[i] = (int)hit.patSeq[i];
1330                                         qual[i] = mmPenalty(maqRound, phredCharToPhredQual(hit.quals[i]));
1331                                 }
1332                                 ASSERT_ONLY(rfbuf2[i] = ref->getBase(h.first, h.second + i));
1333                         }
1334                         int offset = ref->getStretch(rfbuf, h.first, h.second, qlen + 1);
1335                         char *rf = (char*)rfbuf + offset;
1336                         for(size_t i = 0; i < qlen + 1; i++) {
1337                                 assert_eq(rf[i], rfbuf2[i]);
1338                                 rf[i] = (1 << rf[i]);
1339                         }
1340                         decodeHit(
1341                                 read,  // ASCII colors, '0', '1', '2', '3', '.'
1342                                 qual,  // ASCII quals, Phred+33 encoded
1343                                 readi, // offset of first character within 'read' to consider
1344                                 readf, // offset of last char (exclusive) in 'read' to consider
1345                                 rf,    // reference sequence, as masks
1346                                 refi, // offset of first character within 'ref' to consider
1347                                 reff, // offset of last char (exclusive) in 'ref' to consider
1348                                 snpPhred, // penalty incurred by a SNP
1349                                 ns,  // decoded nucleotides are appended here
1350                                 cmm, // where the color mismatches are in the string
1351                                 nmm, // where nucleotide mismatches are in the string
1352                                 cmms, // number of color mismatches
1353                                 nmms);// number of nucleotide mismatches
1354                         size_t nqlen = qlen + (colExEnds ? -1 : 1);
1355                         seqan::resize(hit.patSeq, nqlen);
1356                         seqan::resize(hit.quals, nqlen);
1357                         hit.refcs.resize(nqlen);
1358                         size_t lo = colExEnds ? 1 : 0;
1359                         size_t hi = colExEnds ? qlen : qlen+1;
1360                         size_t destpos = 0;
1361                         for(size_t i = lo; i < hi; i++, destpos++) {
1362                                 // Set sequence character
1363                                 assert_leq(ns[i], 4);
1364                                 assert_geq(ns[i], 0);
1365                                 hit.patSeq[destpos] = (Dna5)(int)ns[i];
1366                                 // Set initial quality
1367                                 hit.quals[destpos] = '!';
1368                                 // Color mismatches penalize quality
1369                                 if(i > 0) {
1370                                         if(cmm[i-1] == 'M') {
1371                                                 if((int)hit.quals[destpos] + (int)qual[i-1] > 126) {
1372                                                         hit.quals[destpos] = 126;
1373                                                 } else {
1374                                                         hit.quals[destpos] += qual[i-1];
1375                                                 }
1376                                         } else if((int)hit.colSeq[i-1] != 4) {
1377                                                 hit.quals[destpos] -= qual[i-1];
1378                                         }
1379                                 }
1380                                 if(i < qlen) {
1381                                         if(cmm[i] == 'M') {
1382                                                 if((int)hit.quals[destpos] + (int)qual[i] > 126) {
1383                                                         hit.quals[destpos] = 126;
1384                                                 } else {
1385                                                         hit.quals[destpos] += qual[i];
1386                                                 }
1387                                         } else if((int)hit.patSeq[i] != 4) {
1388                                                 hit.quals[destpos] -= qual[i];
1389                                         }
1390                                 }
1391                                 if(hit.quals[destpos] < '!') {
1392                                         hit.quals[destpos] = '!';
1393                                 }
1394                                 if(nmm[i] != 'M') {
1395                                         uint32_t off = i - (colExEnds? 1:0);
1396                                         if(!_fw) off = nqlen - off - 1;
1397                                         assert_lt(off, nqlen);
1398                                         hit.mms.set(off);
1399                                         hit.refcs[off] = "ACGT"[ref->getBase(h.first, h.second+i)];
1400                                 }
1401                         }
1402                         if(colExEnds) {
1403                                 // Extreme bases have been removed; that makes the
1404                                 // nucleotide alignment one character shorter than the
1405                                 // color alignment
1406                                 qlen--; mlen--;
1407                                 // It also shifts the alignment's offset up by 1
1408                                 h.second++;
1409                         } else {
1410                                 // Extreme bases are included; that makes the
1411                                 // nucleotide alignment one character longer than the
1412                                 // color alignment
1413                                 qlen++; mlen++;
1414                         }
1415                 } else {
1416                         // Turn the mmui32 and refcs arrays into the mm FixedBitset and
1417                         // the refc vector
1418                         hit.refcs.resize(qlen, 0);
1419                         for(size_t i = 0; i < numMms; i++) {
1420                                 if (ebwtFw != _fw) {
1421                                         // The 3' end is on the left but the mm vector encodes
1422                                         // mismatches w/r/t the 5' end, so we flip
1423                                         uint32_t off = qlen - mmui32[i] - 1;
1424                                         hit.mms.set(off);
1425                                         hit.refcs[off] = refcs[i];
1426                                 } else {
1427                                         hit.mms.set(mmui32[i]);
1428                                         hit.refcs[mmui32[i]] = refcs[i];
1429                                 }
1430                         }
1431                 }
1432                 // Check the hit against the original text, if it's available
1433                 if(_texts.size() > 0) {
1434                         assert_lt(h.first, _texts.size());
1435                         FixedBitset<1024> diffs;
1436                         // This type of check assumes that only mismatches are
1437                         // possible.  If indels are possible, then we either need
1438                         // the caller to provide information about indel locations,
1439                         // or we need to extend this to a more complicated check.
1440                         assert_leq(h.second + qlen, length(_texts[h.first]));
1441                         for(size_t i = 0; i < qlen; i++) {
1442                                 assert_neq(4, (int)_texts[h.first][h.second + i]);
1443                                 // Forward pattern appears at h
1444                                 if((int)hit.patSeq[i] != (int)_texts[h.first][h.second + i]) {
1445                                         uint32_t qoff = i;
1446                                         // if ebwtFw != _fw the 3' end is on on the
1447                                         // left end of the pattern, but the diff vector
1448                                         // should encode mismatches w/r/t the 5' end,
1449                                         // so we flip
1450                                         if (_fw) diffs.set(qoff);
1451                                         else     diffs.set(qlen - qoff - 1);
1452                                 }
1453                         }
1454                         if(diffs != hit.mms) {
1455                                 // Oops, mismatches were not where we expected them;
1456                                 // print a diagnostic message before asserting
1457                                 cerr << "Expected " << hit.mms.str() << " mismatches, got " << diffs.str() << endl;
1458                                 cerr << "  Pat:  " << hit.patSeq << endl;
1459                                 cerr << "  Tseg: ";
1460                                 for(size_t i = 0; i < qlen; i++) {
1461                                         cerr << _texts[h.first][h.second + i];
1462                                 }
1463                                 cerr << endl;
1464                                 cerr << "  mmui32: ";
1465                                 for(size_t i = 0; i < numMms; i++) {
1466                                         cerr << mmui32[i] << " ";
1467                                 }
1468                                 cerr << endl;
1469                                 cerr << "  FW: " << _fw << endl;
1470                                 cerr << "  Ebwt FW: " << ebwtFw << endl;
1471                         }
1472                         if(diffs != hit.mms) assert(false);
1473                 }
1474                 hit.h = h;
1475                 if(rmap != NULL) rmap->map(hit.h);
1476                 hit.patId = ((patid == 0xffffffff) ? _patid : patid);
1477                 hit.patName = *name;
1478                 hit.mh = mh;
1479                 hit.fw = _fw;
1480                 hit.mfw = mfw;
1481                 hit.mlen = mlen;
1482                 hit.oms = oms;
1483                 hit.mate = mate;
1484                 hit.color = color;
1485                 hit.seed = seed;
1486                 assert(hit.repOk());
1487                 return sink().reportHit(hit, stratum);
1488         }
1489 private:
1490         HitSinkPerThread& _sink;
1491         const vector<String<Dna5> >& _texts; // original texts, if available (if not
1492                                     // available, _texts.size() == 0)
1493         uint32_t _patid;      // id of current read
1494         bool _fw;             // current read is forward-oriented
1495 };
1496
1497 /**
1498  * Encapsulates a location in the bwt text in terms of the side it
1499  * occurs in and its offset within the side.
1500  */
1501 struct SideLocus {
1502         SideLocus() :
1503         _sideByteOff(0),
1504         _sideNum(0),
1505         _charOff(0),
1506         _fw(true),
1507         _by(-1),
1508         _bp(-1) { }
1509
1510         /**
1511          * Construct from row and other relevant information about the Ebwt.
1512          */
1513         SideLocus(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) {
1514                 initFromRow(row, ep, ebwt);
1515         }
1516
1517         /**
1518          * Init two SideLocus objects from a top/bot pair, using the result
1519          * from one call to initFromRow to possibly avoid a second call.
1520          */
1521         static void initFromTopBot(uint32_t top,
1522                                    uint32_t bot,
1523                                    const EbwtParams& ep,
1524                                    const uint8_t* ebwt,
1525                                    SideLocus& ltop,
1526                                    SideLocus& lbot)
1527         {
1528                 const uint32_t sideBwtLen = ep._sideBwtLen;
1529                 const uint32_t sideBwtSz  = ep._sideBwtSz;
1530                 assert_gt(bot, top);
1531                 ltop.initFromRow(top, ep, ebwt);
1532                 uint32_t spread = bot - top;
1533                 if(ltop._charOff + spread < sideBwtLen) {
1534                         lbot._charOff = ltop._charOff + spread;
1535                         lbot._sideNum = ltop._sideNum;
1536                         lbot._sideByteOff = ltop._sideByteOff;
1537                         lbot._fw = ltop._fw;
1538                         lbot._by = lbot._charOff >> 2;
1539                         assert_lt(lbot._by, (int)sideBwtSz);
1540                         if(!lbot._fw) lbot._by = sideBwtSz - lbot._by - 1;
1541                         lbot._bp = lbot._charOff & 3;
1542                         if(!lbot._fw) lbot._bp ^= 3;
1543                 } else {
1544                         lbot.initFromRow(bot, ep, ebwt);
1545                 }
1546         }
1547
1548         /**
1549          * Calculate SideLocus based on a row and other relevant
1550          * information about the shape of the Ebwt.
1551          */
1552         void initFromRow(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) {
1553                 const uint32_t sideSz     = ep._sideSz;
1554                 // Side length is hard-coded for now; this allows the compiler
1555                 // to do clever things to accelerate / and %.
1556                 _sideNum                  = row / 224;
1557                 _charOff                  = row % 224;
1558                 _sideByteOff              = _sideNum * sideSz;
1559                 assert_leq(row, ep._len);
1560                 assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
1561 #ifndef NO_PREFETCH
1562                 __builtin_prefetch((const void *)(ebwt + _sideByteOff),
1563                                    0 /* prepare for read */,
1564                                    PREFETCH_LOCALITY);
1565 #endif
1566                 // prefetch tjside too
1567                 _fw = (_sideNum & 1) != 0; // odd-numbered sides are forward
1568                 _by = _charOff >> 2; // byte within side
1569                 assert_lt(_by, (int)ep._sideBwtSz);
1570                 _bp = _charOff & 3;  // bit-pair within byte
1571                 if(!_fw) {
1572                         _by = ep._sideBwtSz - _by - 1;
1573                         _bp ^= 3;
1574                 }
1575         }
1576
1577         /// Return true iff this is an initialized SideLocus
1578         bool valid() {
1579                 return _bp != -1;
1580         }
1581
1582         /// Make this look like an invalid SideLocus
1583         void invalidate() {
1584                 _bp = -1;
1585         }
1586
1587         const uint8_t *side(const uint8_t* ebwt) const {
1588                 return ebwt + _sideByteOff;
1589         }
1590
1591         const uint8_t *oside(const uint8_t* ebwt) const {
1592                 return ebwt + _sideByteOff + (_fw? (-128) : (128));
1593         }
1594
1595         uint32_t _sideByteOff; // offset of top side within ebwt[]
1596         uint32_t _sideNum;     // index of side
1597         uint16_t _charOff;     // character offset within side
1598         bool _fw;              // side is forward or backward?
1599         int16_t _by;           // byte within side (not adjusted for bw sides)
1600         int8_t _bp;            // bitpair within byte (not adjusted for bw sides)
1601 };
1602
1603 #include "ebwt_search_backtrack.h"
1604
1605 ///////////////////////////////////////////////////////////////////////
1606 //
1607 // Functions for printing and sanity-checking Ebwts
1608 //
1609 ///////////////////////////////////////////////////////////////////////
1610
1611 /**
1612  * Given a range of positions in the EBWT array within the BWT portion
1613  * of a forward side, print the characters at those positions along
1614  * with a summary occ[] array.
1615  */
1616 template<typename TStr>
1617 void Ebwt<TStr>::printRangeFw(uint32_t begin, uint32_t end) const {
1618         assert(isInMemory());
1619         uint32_t occ[] = {0, 0, 0, 0};
1620         assert_gt(end, begin);
1621         for(uint32_t i = begin; i < end; i++) {
1622                 uint8_t by = this->_ebwt[i];
1623                 for(int j = 0; j < 4; j++) {
1624                         // Unpack from lowest to highest bit pair
1625                         int twoBit = unpack_2b_from_8b(by, j);
1626                         occ[twoBit]++;
1627                         cout << "ACGT"[twoBit];
1628                 }
1629                 assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
1630         }
1631         cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
1632 }
1633
1634 /**
1635  * Given a range of positions in the EBWT array within the BWT portion
1636  * of a backward side, print the characters at those positions along
1637  * with a summary occ[] array.
1638  */
1639 template<typename TStr>
1640 void Ebwt<TStr>::printRangeBw(uint32_t begin, uint32_t end) const {
1641         assert(isInMemory());
1642         uint32_t occ[] = {0, 0, 0, 0};
1643         assert_gt(end, begin);
1644         for(uint32_t i = end-1; i >= begin; i--) {
1645                 uint8_t by = this->_ebwt[i];
1646                 for(int j = 3; j >= 0; j--) {
1647                         // Unpack from lowest to highest bit pair
1648                         int twoBit = unpack_2b_from_8b(by, j);
1649                         occ[twoBit]++;
1650                         cout << "ACGT"[twoBit];
1651                 }
1652                 assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
1653                 if(i == 0) break;
1654         }
1655         cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
1656 }
1657
1658 /**
1659  * Check that the ebwt array is internally consistent up to (and not
1660  * including) the given side index by re-counting the chars and
1661  * comparing against the embedded occ[] arrays.
1662  */
1663 template<typename TStr>
1664 void Ebwt<TStr>::sanityCheckUpToSide(int upToSide) const {
1665         assert(isInMemory());
1666         uint32_t occ[] = {0, 0, 0, 0};
1667         uint32_t occ_save[] = {0, 0};
1668         uint32_t cur = 0; // byte pointer
1669         const EbwtParams& eh = this->_eh;
1670         bool fw = false;
1671         while(cur < (upToSide * eh._sideSz)) {
1672                 assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
1673                 for(uint32_t i = 0; i < eh._sideBwtSz; i++) {
1674                         uint8_t by = this->_ebwt[cur + (fw ? i : eh._sideBwtSz-i-1)];
1675                         for(int j = 0; j < 4; j++) {
1676                                 // Unpack from lowest to highest bit pair
1677                                 int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
1678                                 occ[twoBit]++;
1679                                 //if(_verbose) cout << "ACGT"[twoBit];
1680                         }
1681                         assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
1682                 }
1683                 assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
1684                 if(fw) {
1685                         // Finished forward bucket; check saved [G] and [T]
1686                         // against the two uint32_ts encoded here
1687                         ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast<uint32_t*>(&this->_ebwt[cur + eh._sideBwtSz]));
1688                         ASSERT_ONLY(uint32_t gs = u32ebwt[0]);
1689                         ASSERT_ONLY(uint32_t ts = u32ebwt[1]);
1690                         assert_eq(gs, occ_save[0]);
1691                         assert_eq(ts, occ_save[1]);
1692                         fw = false;
1693                 } else {
1694                         // Finished backward bucket; check current [A] and [C]
1695                         // against the two uint32_ts encoded here
1696                         ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast<uint32_t*>(&this->_ebwt[cur + eh._sideBwtSz]));
1697                         ASSERT_ONLY(uint32_t as = u32ebwt[0]);
1698                         ASSERT_ONLY(uint32_t cs = u32ebwt[1]);
1699                         assert(as == occ[0] || as == occ[0]-1); // one 'a' is a skipped '$' and doesn't count toward occ[]
1700                         assert_eq(cs, occ[1]);
1701                         occ_save[0] = occ[2]; // save gs
1702                         occ_save[1] = occ[3]; // save ts
1703                         fw = true;
1704                 }
1705                 cur += eh._sideSz;
1706         }
1707 }
1708
1709 /**
1710  * Sanity-check various pieces of the Ebwt
1711  */
1712 template<typename TStr>
1713 void Ebwt<TStr>::sanityCheckAll(int reverse) const {
1714         const EbwtParams& eh = this->_eh;
1715         assert(isInMemory());
1716         // Check ftab
1717         for(uint32_t i = 1; i < eh._ftabLen; i++) {
1718                 assert_geq(this->ftabHi(i), this->ftabLo(i-1));
1719                 assert_geq(this->ftabLo(i), this->ftabHi(i-1));
1720                 assert_leq(this->ftabHi(i), eh._bwtLen+1);
1721         }
1722         assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);
1723
1724         // Check offs
1725         int seenLen = (eh._bwtLen + 31) >> 5;
1726         uint32_t *seen;
1727         try {
1728                 seen = new uint32_t[seenLen]; // bitvector marking seen offsets
1729         } catch(bad_alloc& e) {
1730                 cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
1731                 throw e;
1732         }
1733         memset(seen, 0, 4 * seenLen);
1734         uint32_t offsLen = eh._offsLen;
1735         for(uint32_t i = 0; i < offsLen; i++) {
1736                 assert_lt(this->_offs[i], eh._bwtLen);
1737                 int w = this->_offs[i] >> 5;
1738                 int r = this->_offs[i] & 31;
1739                 assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
1740                 seen[w] |= (1 << r);
1741         }
1742         delete[] seen;
1743
1744         // Check nPat
1745         assert_gt(this->_nPat, 0);
1746
1747         // Check plen, flen
1748         for(uint32_t i = 0; i < this->_nPat; i++) {
1749                 assert_geq(this->_plen[i], 0);
1750         }
1751
1752         // Check rstarts
1753         for(uint32_t i = 0; i < this->_nFrag-1; i++) {
1754                 assert_gt(this->_rstarts[(i+1)*3], this->_rstarts[i*3]);
1755                 if(reverse == REF_READ_REVERSE) {
1756                         assert(this->_rstarts[(i*3)+1] >= this->_rstarts[((i+1)*3)+1]);
1757                 } else {
1758                         assert(this->_rstarts[(i*3)+1] <= this->_rstarts[((i+1)*3)+1]);
1759                 }
1760         }
1761
1762         // Check ebwt
1763         sanityCheckUpToSide(eh._numSides);
1764         VMSG_NL("Ebwt::sanityCheck passed");
1765 }
1766
1767 ///////////////////////////////////////////////////////////////////////
1768 //
1769 // Functions for searching Ebwts
1770 //
1771 ///////////////////////////////////////////////////////////////////////
1772
1773 /**
1774  * Return the final character in row i (i.e. the i'th character in the
1775  * BWT transform).  Note that the 'L' in the name of the function
1776  * stands for 'last', as in the literature.
1777  */
1778 template<typename TStr>
1779 inline int Ebwt<TStr>::rowL(const SideLocus& l) const {
1780         // Extract and return appropriate bit-pair
1781 #ifdef SIXTY4_FORMAT
1782         return (((uint64_t*)l.side(this->_ebwt))[l._by >> 3] >> ((((l._by & 7) << 2) + l._bp) << 1)) & 3;
1783 #else
1784         return unpack_2b_from_8b(l.side(this->_ebwt)[l._by], l._bp);
1785 #endif
1786 }
1787
1788 /**
1789  * Inline-function version of the above.  This does not always seem to
1790  * be inlined
1791  */
1792 #if 0
1793 // Use gcc's intrinsic popcountll.  I don't recommend it because it
1794 // seems to be somewhat slower than the bit-bashing pop64 routine both
1795 // on an AMD server and on an Intel workstation.  On the other hand,
1796 // perhaps when the builtin is used GCC is smart enough to insert a
1797 // pop-count instruction on architectures that have one (e.g. Itanium).
1798 // For now, it's disabled.
1799 #define pop64(x) __builtin_popcountll(x)
1800 #elif 0
1801 __declspec naked int __stdcall pop64
1802 (uint64_t v)
1803 {
1804 static const uint64_t C55 = 0x5555555555555555ll;
1805 static const uint64_t C33 = 0x3333333333333333ll;
1806 static const uint64_t C0F = 0x0F0F0F0F0F0F0F0Fll;
1807 __asm {
1808    MOVD      MM0, [ESP+4] ;v_low
1809    PUNPCKLDQ MM0, [ESP+8] ;v
1810    MOVQ      MM1, MM0     ;v
1811    PSRLD     MM0, 1       ;v >> 1
1812    PAND      MM0, [C55]   ;(v >> 1) & 0x55555555
1813    PSUBD     MM1, MM0     ;w = v - ((v >> 1) & 0x55555555)
1814    MOVQ      MM0, MM1     ;w
1815    PSRLD     MM1, 2       ;w >> 2
1816    PAND      MM0, [C33]   ;w & 0x33333333
1817    PAND      MM1, [C33]   ;(w >> 2)  & 0x33333333
1818    PADDD     MM0, MM1     ;x = (w & 0x33333333) +
1819                           ; ((w >> 2) & 0x33333333)
1820    MOVQ      MM1, MM0     ;x
1821    PSRLD     MM0, 4       ;x >> 4
1822    PADDD     MM0, MM1     ;x + (x >> 4)
1823    PAND      MM0, [C0F]   ;y = (x + (x >> 4) & 0x0F0F0F0F)
1824    PXOR      MM1, MM1     ;0
1825    PSADBW    (MM0, MM1)   ;sum across all 8 bytes
1826    MOVD      EAX, MM0     ;result in EAX per calling
1827                           ; convention
1828    EMMS                   ;clear MMX state
1829    RET  8                 ;pop 8-byte argument off stack
1830                           ; and return
1831    }
1832 }
1833 #elif 0
1834 // Use a bytewise LUT version of popcount.  This is slower than the
1835 // bit-bashing pop64 routine both on an AMD server and on an Intel
1836 // workstation.  It seems to be about the same speed as the GCC builtin
1837 // on Intel, and a bit faster than it on AMD.  For now, it's disabled.
1838 const int popcntU8Table[256] = {
1839     0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
1840     1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
1841     1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
1842     2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
1843     1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
1844     2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
1845     2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
1846     3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
1847 };
1848
1849 // Use this bytewise population count table
1850 inline static int pop64(uint64_t x) {
1851         const unsigned char * p = (const unsigned char *) &x;
1852         return popcntU8Table[p[0]] +
1853                popcntU8Table[p[1]] +
1854                popcntU8Table[p[2]] +
1855                popcntU8Table[p[3]] +
1856                popcntU8Table[p[4]] +
1857                popcntU8Table[p[5]] +
1858                popcntU8Table[p[6]] +
1859                popcntU8Table[p[7]];
1860 }
1861 #else
1862 // Use this standard bit-bashing population count
1863 inline static int pop64(uint64_t x) {
1864    x = x - ((x >> 1) & 0x5555555555555555llu);
1865    x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
1866    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
1867    x = x + (x >> 8);
1868    x = x + (x >> 16);
1869    x = x + (x >> 32);
1870    return x & 0x3F;
1871 }
1872 #endif
1873
1874 /**
1875  * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
1876  * within a 64-bit argument.
1877  */
1878 inline static int countInU64(int c, uint64_t dw) {
1879         uint64_t dwA  = dw &  0xAAAAAAAAAAAAAAAAllu;
1880         uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
1881         uint64_t tmp;
1882         switch(c) {
1883         case 0:
1884                 tmp = (dwA >> 1) | dwNA;
1885                 break;
1886         case 1:
1887                 tmp = ~(dwA >> 1) & dwNA;
1888                 break;
1889         case 2:
1890                 tmp = (dwA >> 1) & ~dwNA;
1891                 break;
1892         case 3:
1893                 tmp = (dwA >> 1) & dwNA;
1894                 break;
1895         default:
1896                 throw;
1897         }
1898         tmp = pop64(tmp); // Gets 7.62% in profile
1899         if(c == 0) {
1900                 tmp = 32 - tmp;
1901         }
1902         assert_leq(tmp, 32);
1903         assert_geq(tmp, 0);
1904         return (int)tmp;
1905 }
1906
1907 /**
1908  * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
1909  * within a 64-bit argument.
1910  *
1911  * Function gets 2.32% in profile
1912  */
1913 inline static void countInU64Ex(uint64_t dw, uint32_t* arrs) {
1914         uint64_t dwA  = dw &  0xAAAAAAAAAAAAAAAAllu;
1915         uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
1916         arrs[0] += (32 - pop64((dwA >> 1) | dwNA));
1917         arrs[1] += pop64(~(dwA >> 1) & dwNA);
1918         arrs[2] += pop64((dwA >> 1) & ~dwNA);
1919         arrs[3] += pop64((dwA >> 1) & dwNA);
1920 }
1921
1922 /**
1923  * Counts the number of occurrences of character 'c' in the given Ebwt
1924  * side up to (but not including) the given byte/bitpair (by/bp).
1925  *
1926  * This is a performance-critical function.  This is the top search-
1927  * related hit in the time profile.
1928  *
1929  * Function gets 11.09% in profile
1930  */
1931 template<typename TStr>
1932 inline uint32_t Ebwt<TStr>::countUpTo(const SideLocus& l, int c) const {
1933         // Count occurrences of c in each 64-bit (using bit trickery);
1934         // Someday countInU64() and pop() functions should be
1935         // vectorized/SSE-ized in case that helps.
1936         uint32_t cCnt = 0;
1937         const uint8_t *side = l.side(this->_ebwt);
1938         int i = 0;
1939 #if 1
1940         for(; i + 7 < l._by; i += 8) {
1941                 cCnt += countInU64(c, *(uint64_t*)&side[i]);
1942         }
1943 #else
1944         for(; i + 2 < l._by; i += 2) {
1945                 cCnt += cCntLUT_16b_4[c][*(uint16_t*)&side[i]];
1946         }
1947 #endif
1948 #ifdef SIXTY4_FORMAT
1949         // Calculate number of bit pairs to shift off the end
1950         const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
1951         if(bpShiftoff < 32) {
1952                 assert_lt(bpShiftoff, 32);
1953                 const uint64_t sw = (*(uint64_t*)&side[i]) << (bpShiftoff << 1);
1954                 cCnt += countInU64(c, sw);
1955                 if(c == 0) cCnt -= bpShiftoff; // we turned these into As
1956         }
1957 #else
1958         // Count occurences of c in the rest of the side (using LUT)
1959         for(; i < l._by; i++) {
1960                 cCnt += cCntLUT_4[0][c][side[i]];
1961         }
1962         // Count occurences of c in the rest of the byte
1963         if(l._bp > 0) {
1964                 cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
1965         }
1966 #endif
1967         return cCnt;
1968 }
1969
1970 /**
1971  * Counts the number of occurrences of character 'c' in the given Ebwt
1972  * side up to (but not including) the given byte/bitpair (by/bp).
1973  */
1974 template<typename TStr>
1975 inline void Ebwt<TStr>::countUpToEx(const SideLocus& l, uint32_t* arrs) const {
1976         int i = 0;
1977         // Count occurrences of c in each 64-bit (using bit trickery);
1978         // note: this seems does not seem to lend a significant boost to
1979         // performance.  If you comment out this whole loop (which won't
1980         // affect correctness - it will just cause the following loop to
1981         // take up the slack) then runtime does not change noticeably.
1982         // Someday the countInU64() and pop() functions should be
1983         // vectorized/SSE-ized in case that helps.
1984         const uint8_t *side = l.side(this->_ebwt);
1985         for(; i+7 < l._by; i += 8) {
1986                 countInU64Ex(*(uint64_t*)&side[i], arrs);
1987         }
1988 #ifdef SIXTY4_FORMAT
1989         // Calculate number of bit pairs to shift off the end
1990         const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
1991         assert_leq(bpShiftoff, 32);
1992         if(bpShiftoff < 32) {
1993                 const uint64_t sw = (*(uint64_t*)&l.side(this->_ebwt)[i]) << (bpShiftoff << 1);
1994                 countInU64Ex(sw, arrs);
1995                 arrs[0] -= bpShiftoff;
1996         }
1997 #else
1998         // Count occurences of c in the rest of the side (using LUT)
1999         for(; i < l._by; i++) {
2000                 arrs[0] += cCntLUT_4[0][0][side[i]];
2001                 arrs[1] += cCntLUT_4[0][1][side[i]];
2002                 arrs[2] += cCntLUT_4[0][2][side[i]];
2003                 arrs[3] += cCntLUT_4[0][3][side[i]];
2004         }
2005         // Count occurences of c in the rest of the byte
2006         if(l._bp > 0) {
2007                 arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
2008                 arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
2009                 arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
2010                 arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
2011         }
2012 #endif
2013 }
2014
2015 /**
2016  * Count all occurrences of character c from the beginning of the
2017  * forward side to <by,bp> and add in the occ[] count up to the side
2018  * break just prior to the side.
2019  */
2020 template<typename TStr>
2021 inline uint32_t Ebwt<TStr>::countFwSide(const SideLocus& l, int c) const {
2022         assert_lt(c, 4);
2023         assert_geq(c, 0);
2024         assert_lt(l._by, (int)this->_eh._sideBwtSz);
2025         assert_geq(l._by, 0);
2026         assert_lt(l._bp, 4);
2027         assert_geq(l._bp, 0);
2028         const uint8_t *side = l.side(this->_ebwt);
2029         uint32_t cCnt = countUpTo(l, c);
2030         assert_leq(cCnt, this->_eh._sideBwtLen);
2031         if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2032                 // Adjust for the fact that we represented $ with an 'A', but
2033                 // shouldn't count it as an 'A' here
2034                 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2035                    (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
2036                 {
2037                         cCnt--; // Adjust for '$' looking like an 'A'
2038                 }
2039         }
2040         uint32_t ret;
2041         // Now factor in the occ[] count at the side break
2042         if(c < 2) {
2043                 const uint32_t *ac = reinterpret_cast<const uint32_t*>(side - 8);
2044                 assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
2045                 assert_leq(ac[1], this->_eh._len);
2046                 ret = ac[c] + cCnt + this->_fchr[c];
2047         } else {
2048                 const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8); // next
2049                 assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2050                 ret = gt[c-2] + cCnt + this->_fchr[c];
2051         }
2052 #ifndef NDEBUG
2053         assert_leq(ret, this->_fchr[c+1]); // can't have jumpded into next char's section
2054         if(c == 0) {
2055                 assert_leq(cCnt, this->_eh._sideBwtLen);
2056         } else {
2057                 assert_leq(ret, this->_eh._bwtLen);
2058         }
2059 #endif
2060         return ret;
2061 }
2062
2063 /**
2064  * Count all occurrences of character c from the beginning of the
2065  * forward side to <by,bp> and add in the occ[] count up to the side
2066  * break just prior to the side.
2067  */
2068 template<typename TStr>
2069 inline void Ebwt<TStr>::countFwSideEx(const SideLocus& l, uint32_t* arrs) const
2070 {
2071         assert_lt(l._by, (int)this->_eh._sideBwtSz);
2072         assert_geq(l._by, 0);
2073         assert_lt(l._bp, 4);
2074         assert_geq(l._bp, 0);
2075         countUpToEx(l, arrs);
2076 #ifndef NDEBUG
2077         assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
2078         assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
2079         assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
2080         assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
2081 #endif
2082         assert_leq(arrs[0], this->_eh._sideBwtLen);
2083         assert_leq(arrs[1], this->_eh._sideBwtLen);
2084         assert_leq(arrs[2], this->_eh._sideBwtLen);
2085         assert_leq(arrs[3], this->_eh._sideBwtLen);
2086         const uint8_t *side = l.side(this->_ebwt);
2087         if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2088                 // Adjust for the fact that we represented $ with an 'A', but
2089                 // shouldn't count it as an 'A' here
2090                 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2091                    (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
2092                 {
2093                         arrs[0]--; // Adjust for '$' looking like an 'A'
2094                 }
2095         }
2096         // Now factor in the occ[] count at the side break
2097         const uint32_t *ac = reinterpret_cast<const uint32_t*>(side - 8);
2098         const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8);
2099 #ifndef NDEBUG
2100         assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
2101         assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
2102         assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
2103         assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
2104 #endif
2105         assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
2106         assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2107         arrs[0] += (ac[0] + this->_fchr[0]);
2108         arrs[1] += (ac[1] + this->_fchr[1]);
2109         arrs[2] += (gt[0] + this->_fchr[2]);
2110         arrs[3] += (gt[1] + this->_fchr[3]);
2111 #ifndef NDEBUG
2112         assert_leq(arrs[0], this->_fchr[1]); // can't have jumpded into next char's section
2113         assert_leq(arrs[1], this->_fchr[2]); // can't have jumpded into next char's section
2114         assert_leq(arrs[2], this->_fchr[3]); // can't have jumpded into next char's section
2115         assert_leq(arrs[3], this->_fchr[4]); // can't have jumpded into next char's section
2116 #endif
2117 }
2118
2119 /**
2120  * Count all instances of character c from <by,bp> to the logical end
2121  * (actual beginning) of the backward side, and subtract that from the
2122  * occ[] count up to the side break.
2123  */
2124 template<typename TStr>
2125 inline uint32_t Ebwt<TStr>::countBwSide(const SideLocus& l, int c) const {
2126         assert_lt(c, 4);
2127         assert_geq(c, 0);
2128         assert_lt(l._by, (int)this->_eh._sideBwtSz);
2129         assert_geq(l._by, 0);
2130         assert_lt(l._bp, 4);
2131         assert_geq(l._bp, 0);
2132         const uint8_t *side = l.side(this->_ebwt);
2133         uint32_t cCnt = countUpTo(l, c);
2134         if(rowL(l) == c) cCnt++;
2135         assert_leq(cCnt, this->_eh._sideBwtLen);
2136         if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2137                 // Adjust for the fact that we represented $ with an 'A', but
2138                 // shouldn't count it as an 'A' here
2139                 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2140                    (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
2141                 {
2142                         cCnt--;
2143                 }
2144         }
2145         uint32_t ret;
2146         // Now factor in the occ[] count at the side break
2147         if(c < 2) {
2148                 const uint32_t *ac = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8);
2149                 assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
2150                 assert_leq(ac[1], this->_eh._len);
2151                 ret = ac[c] - cCnt + this->_fchr[c];
2152         } else {
2153                 const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + (2*this->_eh._sideSz) - 8); // next
2154                 assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2155                 ret = gt[c-2] - cCnt + this->_fchr[c];
2156         }
2157 #ifndef NDEBUG
2158         assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section
2159         if(c == 0) {
2160                 assert_leq(cCnt, this->_eh._sideBwtLen);
2161         } else {
2162                 assert_lt(ret, this->_eh._bwtLen);
2163         }
2164 #endif
2165         return ret;
2166 }
2167
2168 /**
2169  * Count all instances of character c from <by,bp> to the logical end
2170  * (actual beginning) of the backward side, and subtract that from the
2171  * occ[] count up to the side break.
2172  */
2173 template<typename TStr>
2174 inline void Ebwt<TStr>::countBwSideEx(const SideLocus& l, uint32_t* arrs) const {
2175         assert_lt(l._by, (int)this->_eh._sideBwtSz);
2176         assert_geq(l._by, 0);
2177         assert_lt(l._bp, 4);
2178         assert_geq(l._bp, 0);
2179         const uint8_t *side = l.side(this->_ebwt);
2180         countUpToEx(l, arrs);
2181         arrs[rowL(l)]++;
2182         assert_leq(arrs[0], this->_eh._sideBwtLen);
2183         assert_leq(arrs[1], this->_eh._sideBwtLen);
2184         assert_leq(arrs[2], this->_eh._sideBwtLen);
2185         assert_leq(arrs[3], this->_eh._sideBwtLen);
2186         if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
2187                 // Adjust for the fact that we represented $ with an 'A', but
2188                 // shouldn't count it as an 'A' here
2189                 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
2190                    (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
2191                 {
2192                         arrs[0]--; // Adjust for '$' looking like an 'A'
2193                 }
2194         }
2195         // Now factor in the occ[] count at the side break
2196         const uint32_t *ac = reinterpret_cast<const uint32_t*>(side + this->_eh._sideSz - 8);
2197         const uint32_t *gt = reinterpret_cast<const uint32_t*>(side + (2*this->_eh._sideSz) - 8);
2198 #ifndef NDEBUG
2199         assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
2200         assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
2201         assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
2202         assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
2203 #endif
2204         assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
2205         assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
2206         arrs[0] = (ac[0] - arrs[0] + this->_fchr[0]);
2207         arrs[1] = (ac[1] - arrs[1] + this->_fchr[1]);
2208         arrs[2] = (gt[0] - arrs[2] + this->_fchr[2]);
2209         arrs[3] = (gt[1] - arrs[3] + this->_fchr[3]);
2210 #ifndef NDEBUG
2211         assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
2212         assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
2213         assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
2214         assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
2215 #endif
2216 }
2217
2218 /**
2219  * Given top and bot loci, calculate counts of all four DNA chars up to
2220  * those loci.  Used for more advanced backtracking-search.
2221  */
2222 template<typename TStr>
2223 inline void Ebwt<TStr>::mapLFEx(const SideLocus& ltop,
2224                                 const SideLocus& lbot,
2225                                 uint32_t *tops,
2226                                 uint32_t *bots
2227                                 ASSERT_ONLY(, bool overrideSanity)
2228                                 ) const
2229 {
2230         // TODO: Where there's overlap, reuse the count for the overlapping
2231         // portion
2232 #ifdef EBWT_STATS
2233         const_cast<Ebwt<TStr>*>(this)->mapLFExs_++;
2234 #endif
2235         assert_eq(0, tops[0]); assert_eq(0, bots[0]);
2236         assert_eq(0, tops[1]); assert_eq(0, bots[1]);
2237         assert_eq(0, tops[2]); assert_eq(0, bots[2]);
2238         assert_eq(0, tops[3]); assert_eq(0, bots[3]);
2239         if(ltop._fw) countFwSideEx(ltop, tops); // Forward side
2240         else         countBwSideEx(ltop, tops); // Backward side
2241         if(lbot._fw) countFwSideEx(lbot, bots); // Forward side
2242         else         countBwSideEx(lbot, bots); // Backward side
2243 #ifndef NDEBUG
2244         if(_sanity && !overrideSanity) {
2245                 // Make sure results match up with individual calls to mapLF;
2246                 // be sure to override sanity-checking in the callee, or we'll
2247                 // have infinite recursion
2248                 assert_eq(mapLF(ltop, 0, true), tops[0]);
2249                 assert_eq(mapLF(ltop, 1, true), tops[1]);
2250                 assert_eq(mapLF(ltop, 2, true), tops[2]);
2251                 assert_eq(mapLF(ltop, 3, true), tops[3]);
2252                 assert_eq(mapLF(lbot, 0, true), bots[0]);
2253                 assert_eq(mapLF(lbot, 1, true), bots[1]);
2254                 assert_eq(mapLF(lbot, 2, true), bots[2]);
2255                 assert_eq(mapLF(lbot, 3, true), bots[3]);
2256         }
2257 #endif
2258 }
2259
2260 #ifndef NDEBUG
2261 /**
2262  * Given top and bot loci, calculate counts of all four DNA chars up to
2263  * those loci.  Used for more advanced backtracking-search.
2264  */
2265 template<typename TStr>
2266 inline void Ebwt<TStr>::mapLFEx(const SideLocus& l,
2267                                 uint32_t *arrs
2268                                 ASSERT_ONLY(, bool overrideSanity)
2269                                 ) const
2270 {
2271         assert_eq(0, arrs[0]);
2272         assert_eq(0, arrs[1]);
2273         assert_eq(0, arrs[2]);
2274         assert_eq(0, arrs[3]);
2275         if(l._fw) countFwSideEx(l, arrs); // Forward side
2276         else      countBwSideEx(l, arrs); // Backward side
2277 #ifndef NDEBUG
2278         if(_sanity && !overrideSanity) {
2279                 // Make sure results match up with individual calls to mapLF;
2280                 // be sure to override sanity-checking in the callee, or we'll
2281                 // have infinite recursion
2282                 assert_eq(mapLF(l, 0, true), arrs[0]);
2283                 assert_eq(mapLF(l, 1, true), arrs[1]);
2284                 assert_eq(mapLF(l, 2, true), arrs[2]);
2285                 assert_eq(mapLF(l, 3, true), arrs[3]);
2286         }
2287 #endif
2288 }
2289 #endif
2290
2291 /**
2292  * Given row i, return the row that the LF mapping maps i to.
2293  */
2294 template<typename TStr>
2295 inline uint32_t Ebwt<TStr>::mapLF(const SideLocus& l
2296                                   ASSERT_ONLY(, bool overrideSanity)
2297                                   ) const
2298 {
2299 #ifdef EBWT_STATS
2300         const_cast<Ebwt<TStr>*>(this)->mapLFs_++;
2301 #endif
2302         uint32_t ret;
2303         assert(l.side(this->_ebwt) != NULL);
2304         int c = rowL(l);
2305         assert_lt(c, 4);
2306         assert_geq(c, 0);
2307         if(l._fw) ret = countFwSide(l, c); // Forward side
2308         else      ret = countBwSide(l, c); // Backward side
2309         assert_lt(ret, this->_eh._bwtLen);
2310 #ifndef NDEBUG
2311         if(_sanity && !overrideSanity) {
2312                 // Make sure results match up with results from mapLFEx;
2313                 // be sure to override sanity-checking in the callee, or we'll
2314                 // have infinite recursion
2315                 uint32_t arrs[] = { 0, 0, 0, 0 };
2316                 mapLFEx(l, arrs, true);
2317                 assert_eq(arrs[c], ret);
2318         }
2319 #endif
2320         return ret;
2321 }
2322
2323 /**
2324  * Given row i and character c, return the row that the LF mapping maps
2325  * i to on character c.
2326  */
2327 template<typename TStr>
2328 inline uint32_t Ebwt<TStr>::mapLF(const SideLocus& l, int c
2329                                   ASSERT_ONLY(, bool overrideSanity)
2330                                   ) const
2331 {
2332 #ifdef EBWT_STATS
2333         const_cast<Ebwt<TStr>*>(this)->mapLFcs_++;
2334 #endif
2335         uint32_t ret;
2336         assert_lt(c, 4);
2337         assert_geq(c, 0);
2338         if(l._fw) ret = countFwSide(l, c); // Forward side
2339         else      ret = countBwSide(l, c); // Backward side
2340         assert_lt(ret, this->_eh._bwtLen);
2341 #ifndef NDEBUG
2342         if(_sanity && !overrideSanity) {
2343                 // Make sure results match up with results from mapLFEx;
2344                 // be sure to override sanity-checking in the callee, or we'll
2345                 // have infinite recursion
2346                 uint32_t arrs[] = { 0, 0, 0, 0 };
2347                 mapLFEx(l, arrs, true);
2348                 assert_eq(arrs[c], ret);
2349         }
2350 #endif
2351         return ret;
2352 }
2353
2354 /**
2355  * Given row i and character c, return the row that the LF mapping maps
2356  * i to on character c.
2357  */
2358 template<typename TStr>
2359 inline uint32_t Ebwt<TStr>::mapLF1(uint32_t row, const SideLocus& l, int c
2360                                    ASSERT_ONLY(, bool overrideSanity)
2361                                    ) const
2362 {
2363 #ifdef EBWT_STATS
2364         const_cast<Ebwt<TStr>*>(this)->mapLF1cs_++;
2365 #endif
2366         if(rowL(l) != c || row == _zOff) return 0xffffffff;
2367         uint32_t ret;
2368         assert_lt(c, 4);
2369         assert_geq(c, 0);
2370         if(l._fw) ret = countFwSide(l, c); // Forward side
2371         else      ret = countBwSide(l, c); // Backward side
2372         assert_lt(ret, this->_eh._bwtLen);
2373 #ifndef NDEBUG
2374         if(_sanity && !overrideSanity) {
2375                 // Make sure results match up with results from mapLFEx;
2376                 // be sure to override sanity-checking in the callee, or we'll
2377                 // have infinite recursion
2378                 uint32_t arrs[] = { 0, 0, 0, 0 };
2379                 mapLFEx(l, arrs, true);
2380                 assert_eq(arrs[c], ret);
2381         }
2382 #endif
2383         return ret;
2384 }
2385
2386 /**
2387  * Given row i and character c, return the row that the LF mapping maps
2388  * i to on character c.
2389  */
2390 template<typename TStr>
2391 inline int Ebwt<TStr>::mapLF1(uint32_t& row, const SideLocus& l
2392                               ASSERT_ONLY(, bool overrideSanity)
2393                               ) const
2394 {
2395 #ifdef EBWT_STATS
2396         const_cast<Ebwt<TStr>*>(this)->mapLF1s_++;
2397 #endif
2398         if(row == _zOff) return -1;
2399         int c = rowL(l);
2400         assert_lt(c, 4);
2401         assert_geq(c, 0);
2402         if(l._fw) row = countFwSide(l, c); // Forward side
2403         else      row = countBwSide(l, c); // Backward side
2404         assert_lt(row, this->_eh._bwtLen);
2405 #ifndef NDEBUG
2406         if(_sanity && !overrideSanity) {
2407                 // Make sure results match up with results from mapLFEx;
2408                 // be sure to override sanity-checking in the callee, or we'll
2409                 // have infinite recursion
2410                 uint32_t arrs[] = { 0, 0, 0, 0 };
2411                 mapLFEx(l, arrs, true);
2412                 assert_eq(arrs[c], row);
2413         }
2414 #endif
2415         return c;
2416 }
2417
2418 /**
2419  * Take an offset into the joined text and translate it into the
2420  * reference of the index it falls on, the offset into the reference,
2421  * and the length of the reference.  Use a binary search through the
2422  * sorted list of reference fragment ranges t
2423  */
2424 template<typename TStr>
2425 void Ebwt<TStr>::joinedToTextOff(uint32_t qlen, uint32_t off,
2426                                  uint32_t& tidx,
2427                                  uint32_t& textoff,
2428                                  uint32_t& tlen) const
2429 {
2430         uint32_t top = 0;
2431         uint32_t bot = _nFrag; // 1 greater than largest addressable element
2432         uint32_t elt = 0xffffffff;
2433         // Begin binary search
2434         while(true) {
2435                 ASSERT_ONLY(uint32_t oldelt = elt);
2436                 elt = top + ((bot - top) >> 1);
2437                 assert_neq(oldelt, elt); // must have made progress
2438                 uint32_t lower = _rstarts[elt*3];
2439                 uint32_t upper;
2440                 if(elt == _nFrag-1) {
2441                         upper = _eh._len;
2442                 } else {
2443                         upper = _rstarts[((elt+1)*3)];
2444                 }
2445                 assert_gt(upper, lower);
2446                 uint32_t fraglen = upper - lower;
2447                 if(lower <= off) {
2448                         if(upper > off) { // not last element, but it's within
2449                                 // off is in this range; check if it falls off
2450                                 if(off + qlen > upper) {
2451                                         // it falls off; signal no-go and return
2452                                         tidx = 0xffffffff;
2453                                         assert_lt(elt, _nFrag-1);
2454                                         return;
2455                                 }
2456                                 tidx = _rstarts[(elt*3)+1];
2457                                 assert_lt(tidx, this->_nPat);
2458                                 assert_leq(fraglen, this->_plen[tidx]);
2459                                 // it doesn't fall off; now calculate textoff.
2460                                 // Initially it's the number of characters that precede
2461                                 // the alignment in the fragment
2462                                 uint32_t fragoff = off - _rstarts[(elt*3)];
2463                                 if(!this->_fw) {
2464                                         fragoff = fraglen - fragoff - 1;
2465                                         fragoff -= (qlen-1);
2466                                 }
2467                                 // Add the alignment's offset into the fragment
2468                                 // ('fragoff') to the fragment's offset within the text
2469                                 textoff = fragoff + _rstarts[(elt*3)+2];
2470                                 assert_lt(textoff, this->_plen[tidx]);
2471                                 break; // done with binary search
2472                         } else {
2473                                 // 'off' belongs somewhere in the region between elt
2474                                 // and bot
2475                                 top = elt;
2476                         }
2477                 } else {
2478                         // 'off' belongs somewhere in the region between top and
2479                         // elt
2480                         bot = elt;
2481                 }
2482                 // continue with binary search
2483         }
2484         tlen = this->_plen[tidx];
2485 }
2486
2487 /**
2488  * Report a potential match at offset 'off' with pattern length
2489  * 'qlen'.  Filter out spurious matches that span texts.
2490  */
2491 template<typename TStr>
2492 inline bool Ebwt<TStr>::report(const String<Dna5>& query,
2493                                String<char>* quals,
2494                                String<char>* name,
2495                                bool color,
2496                                bool colExEnds,
2497                                int snpPhred,
2498                                const BitPairReference* ref,
2499                                const std::vector<uint32_t>& mmui32,
2500                                const std::vector<uint8_t>& refcs,
2501                                size_t numMms,
2502                                uint32_t off,
2503                                uint32_t top,
2504                                uint32_t bot,
2505                                uint32_t qlen,
2506                                int stratum,
2507                                uint16_t cost,
2508                                uint32_t patid,
2509                                uint32_t seed,
2510                                const EbwtSearchParams<TStr>& params) const
2511 {
2512         VMSG_NL("In report");
2513         assert_geq(cost, (uint32_t)(stratum << 14));
2514         assert_lt(off, this->_eh._len);
2515         uint32_t tidx;
2516         uint32_t textoff;
2517         uint32_t tlen;
2518         joinedToTextOff(qlen, off, tidx, textoff, tlen);
2519         if(tidx == 0xffffffff) {
2520                 return false;
2521         }
2522         return params.reportHit(
2523                         query,                    // read sequence
2524                         quals,                    // read quality values
2525                         name,                     // read name
2526                         color,                    // true -> read is colorspace
2527                         colExEnds,                // true -> exclude nucleotides on ends
2528                         snpPhred,                 // phred probability of SNP
2529                         ref,                      // reference sequence
2530                         rmap_,                    // map to another reference coordinate system
2531                         _fw,                      // true = index is forward; false = mirror
2532                         mmui32,                   // mismatch positions
2533                         refcs,                    // reference characters for mms
2534                         numMms,                   // # mismatches
2535                         make_pair(tidx, textoff), // position
2536                         make_pair(0, 0),          // (bogus) mate position
2537                         true,                     // (bogus) mate orientation
2538                         0,                        // (bogus) mate length
2539                         make_pair(top, bot),      // arrows
2540                         tlen,                     // textlen
2541                         qlen,                     // qlen
2542                         stratum,                  // alignment stratum
2543                         cost,                     // cost, including stratum & quality penalty
2544                         bot-top-1,                // # other hits
2545                         patid,                    // pattern id
2546                         seed,                     // pseudo-random seed
2547                         0);                       // mate (0 = unpaired)
2548 }
2549
2550 #include "row_chaser.h"
2551
2552 /**
2553  * Report a result.  Involves walking backwards along the original
2554  * string by way of the LF-mapping until we reach a marked SA row or
2555  * the row corresponding to the 0th suffix.  A marked row's offset
2556  * into the original string can be read directly from the this->_offs[]
2557  * array.
2558  */
2559 template<typename TStr>
2560 inline bool Ebwt<TStr>::reportChaseOne(const String<Dna5>& query,
2561                                        String<char>* quals,
2562                                        String<char>* name,
2563                                        bool color,
2564                                        bool colExEnds,
2565                                        int snpPhred,
2566                                        const BitPairReference* ref,
2567                                        const std::vector<uint32_t>& mmui32,
2568                                        const std::vector<uint8_t>& refcs,
2569                                        size_t numMms,
2570                                        uint32_t i,
2571                                        uint32_t top,
2572                                        uint32_t bot,
2573                                        uint32_t qlen,
2574                                        int stratum,
2575                                        uint16_t cost,
2576                                        uint32_t patid,
2577                                        uint32_t seed,
2578                                        const EbwtSearchParams<TStr>& params,
2579                                        SideLocus *l) const
2580 {
2581         VMSG_NL("In reportChaseOne");
2582         uint32_t off;
2583         uint32_t jumps = 0;
2584         ASSERT_ONLY(uint32_t origi = i);
2585         SideLocus myl;
2586         const uint32_t offMask = this->_eh._offMask;
2587         const uint32_t offRate = this->_eh._offRate;
2588         const uint32_t* offs = this->_offs;
2589         // If the caller didn't give us a pre-calculated (and prefetched)
2590         // locus, then we have to do that now
2591         if(l == NULL) {
2592                 l = &myl;
2593                 l->initFromRow(i, this->_eh, this->_ebwt);
2594         }
2595         assert(l != NULL);
2596         assert(l->valid());
2597         // Walk along until we reach the next marked row to the left
2598         while(((i & offMask) != i) && i != _zOff) {
2599                 // Not a marked row; walk left one more char
2600                 uint32_t newi = mapLF(*l); // calc next row
2601                 assert_neq(newi, i);
2602                 i = newi;                                  // update row
2603                 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2604                 jumps++;
2605         }
2606         // This is a marked row
2607         if(i == _zOff) {
2608                 // Special case: it's the row corresponding to the
2609                 // lexicographically smallest suffix, which is implicitly
2610                 // marked 0
2611                 off = jumps;
2612                 VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
2613         } else {
2614                 // Normal marked row, calculate offset of row i
2615                 off = offs[i >> offRate] + jumps;
2616                 VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
2617         }
2618 #ifndef NDEBUG
2619         {
2620                 uint32_t rcoff = RowChaser<TStr>::toFlatRefOff(this, qlen, origi);
2621                 assert_eq(rcoff, off);
2622         }
2623 #endif
2624         return report(query, quals, name, color, colExEnds, snpPhred, ref,
2625                       mmui32, refcs, numMms, off, top, bot, qlen, stratum,
2626                       cost, patid, seed, params);
2627 }
2628
2629 /**
2630  * Report a result.  Involves walking backwards along the original
2631  * string by way of the LF-mapping until we reach a marked SA row or
2632  * the row corresponding to the 0th suffix.  A marked row's offset
2633  * into the original string can be read directly from the this->_offs[]
2634  * array.
2635  */
2636 template<typename TStr>
2637 inline bool Ebwt<TStr>::reportReconstruct(const String<Dna5>& query,
2638                                           String<char>* quals,
2639                                           String<char>* name,
2640                                           String<Dna5>& lbuf,
2641                                           String<Dna5>& rbuf,
2642                                           const uint32_t *mmui32,
2643                                           const char* refcs,
2644                                           size_t numMms,
2645                                           uint32_t i,
2646                                           uint32_t top,
2647                                           uint32_t bot,
2648                                           uint32_t qlen,
2649                                           int stratum,
2650                                           const EbwtSearchParams<TStr>& params,
2651                                           SideLocus *l) const
2652 {
2653         VMSG_NL("In reportReconstruct");
2654         assert_gt(_eh._isaLen, 0); // Must have inverse suffix array to reconstruct
2655         uint32_t off;
2656         uint32_t jumps = 0;
2657         SideLocus myl;
2658         const uint32_t offMask = this->_eh._offMask;
2659         const uint32_t offRate = this->_eh._offRate;
2660         const uint32_t* offs = this->_offs;
2661         const uint32_t* isa = this->_isa;
2662         assert(isa != NULL);
2663         if(l == NULL) {
2664                 l = &myl;
2665                 myl.initFromRow(i, this->_eh, this->_ebwt);
2666         }
2667         assert(l != NULL);
2668         clear(lbuf); clear(rbuf);
2669         // Walk along until we reach the next marked row to the left
2670         while(((i & offMask) != i) && i != _zOff) {
2671                 // Not a marked row; walk left one more char
2672                 int c = rowL(*l);
2673                 appendValue(lbuf, (Dna5)c);
2674                 uint32_t newi;
2675                 assert_lt(c, 4);
2676                 assert_geq(c, 0);
2677                 if(l->_fw) newi = countFwSide(*l, c); // Forward side
2678                 else       newi = countBwSide(*l, c); // Backward side
2679                 assert_lt(newi, this->_eh._bwtLen);
2680                 assert_neq(newi, i);
2681                 i = newi;                                  // update row
2682                 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2683                 jumps++;
2684         }
2685         // This is a marked row
2686         if(i == _zOff) {
2687                 // Special case: it's the row corresponding to the
2688                 // lexicographically smallest suffix, which is implicitly
2689                 // marked 0
2690                 off = jumps;
2691                 VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
2692         } else {
2693                 // Normal marked row, calculate offset of row i
2694                 off = offs[i >> offRate] + jumps;
2695                 VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
2696         }
2697         // 'off' now holds the text offset of the first (leftmost) position
2698         // involved in the alignment.  Next we call joinedToTextOff to
2699         // check whether the seed is valid (i.e., does not straddle a
2700         // boundary between two reference seuqences) and to obtain its
2701         // extents
2702         uint32_t tidx;    // the index (id) of the reference we hit in
2703         uint32_t textoff; // the offset of the alignment within the reference
2704         uint32_t tlen;    // length of reference seed hit in
2705         joinedToTextOff(qlen, off, tidx, textoff, tlen);
2706         if(tidx == 0xffffffff) {
2707                 // The seed straddled a reference boundary, and so is spurious.
2708                 // Return false, indicating that we shouldn't stop.
2709                 return false;
2710         }
2711         if(jumps > textoff) {
2712                 // In our progress toward a marked row, we passed the boundary
2713                 // between the reference sequence containing the seed and the
2714                 // reference sequence to the left of it.  That's OK, we just
2715                 // need to knock off the extra characters we added to 'lbuf'.
2716                 assert_eq(jumps, length(lbuf));
2717                 _setLength(lbuf, textoff);
2718                 jumps = textoff;
2719                 assert_eq(textoff, length(lbuf));
2720         } else if(jumps < textoff) {
2721                 // Keep walking until we reach the end of the reference
2722                 assert_neq(i, _zOff);
2723                 uint32_t diff = textoff-jumps;
2724                 for(size_t j = 0; j < diff; j++) {
2725                         // Not a marked row; walk left one more char
2726                         int c = rowL(*l);
2727                         appendValue(lbuf, (Dna5)c);
2728                         uint32_t newi;
2729                         assert_lt(c, 4);
2730                         assert_geq(c, 0);
2731                         if(l->_fw) newi = countFwSide(*l, c); // Forward side
2732                         else       newi = countBwSide(*l, c); // Backward side
2733                         assert_lt(newi, this->_eh._bwtLen);
2734                         assert_neq(newi, i);
2735                         i = newi;                                  // update row
2736                         assert_neq(i, _zOff);
2737                         l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2738                         jumps++;
2739                 }
2740                 assert_eq(textoff, jumps);
2741                 assert_eq(textoff, length(lbuf));
2742         }
2743         assert_eq(textoff, jumps);
2744         assert_eq(textoff, length(lbuf));
2745         // Calculate the right-hand extent of the reference
2746         uint32_t ref_right = off - textoff + tlen;
2747         // Round the right-hand extent to the nearest ISA element that maps
2748         // to it or a character to its right
2749         uint32_t ref_right_rounded = ref_right;
2750         if((ref_right_rounded & _eh._isaMask) != ref_right_rounded) {
2751                 ref_right_rounded = ((ref_right_rounded >> _eh._isaRate)+1) << _eh._isaRate;
2752         }
2753         // TODO: handle case where ref_right_rounded is off the end of _isa
2754         // Let the current suffix-array elt be determined by the ISA
2755         if((ref_right_rounded >> _eh._isaRate) >= _eh._isaLen) {
2756                 i = _eh._len;
2757                 ref_right_rounded = _eh._len;
2758         } else {
2759                 i = isa[ref_right_rounded >> _eh._isaRate];
2760         }
2761         uint32_t right_steps_rounded = ref_right_rounded - (off + qlen);
2762         uint32_t right_steps = ref_right - (off + qlen);
2763         l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2764         for(size_t j = 0; j < right_steps_rounded; j++) {
2765                 // Not a marked row; walk left one more char
2766                 int c = rowL(*l);
2767                 appendValue(rbuf, (Dna5)c);
2768                 uint32_t newi;
2769                 assert_lt(c, 4); assert_geq(c, 0);
2770                 if(l->_fw) newi = countFwSide(*l, c); // Forward side
2771                 else       newi = countBwSide(*l, c); // Backward side
2772                 assert_lt(newi, this->_eh._bwtLen);
2773                 assert_neq(newi, i);
2774                 i = newi;                                  // update row
2775                 assert_neq(i, _zOff);
2776                 l->initFromRow(i, this->_eh, this->_ebwt); // update locus
2777                 jumps++;
2778         }
2779         if(right_steps_rounded > right_steps) {
2780                 jumps -= (right_steps_rounded - right_steps);
2781                 _setLength(rbuf, right_steps);
2782         }
2783         assert_eq(right_steps, length(rbuf));
2784         assert_eq(tlen, jumps + qlen);
2785         ::reverseInPlace(lbuf);
2786         ::reverseInPlace(rbuf);
2787         {
2788                 cout << "reportReconstruct:" << endl
2789                          << "  " << lbuf << query << rbuf << endl;
2790                 cout << "  ";
2791                 for(size_t i = 0; i < length(lbuf); i++) cout << " ";
2792                 cout << query << endl;
2793         }
2794         // Now we've reconstructed the
2795         return false;
2796 }
2797
2798 /**
2799  * Transform this Ebwt into the original string in linear time by using
2800  * the LF mapping to walk backwards starting at the row correpsonding
2801  * to the end of the string.  The result is written to s.  The Ebwt
2802  * must be in memory.
2803  */
2804 template<typename TStr>
2805 void Ebwt<TStr>::restore(TStr& s) const {
2806         assert(isInMemory());
2807         resize(s, this->_eh._len, Exact());
2808         uint32_t jumps = 0;
2809         uint32_t i = this->_eh._len; // should point to final SA elt (starting with '$')
2810         SideLocus l(i, this->_eh, this->_ebwt);
2811         while(i != _zOff) {
2812                 assert_lt(jumps, this->_eh._len);
2813                 //if(_verbose) cout << "restore: i: " << i << endl;
2814                 // Not a marked row; go back a char in the original string
2815                 uint32_t newi = mapLF(l);
2816                 assert_neq(newi, i);
2817                 s[this->_eh._len - jumps - 1] = rowL(l);
2818                 i = newi;
2819                 l.initFromRow(i, this->_eh, this->_ebwt);
2820                 jumps++;
2821         }
2822         assert_eq(jumps, this->_eh._len);
2823 }
2824
2825 /**
2826  * Check that this Ebwt, when restored via restore(), matches up with
2827  * the given array of reference sequences.  For sanity checking.
2828  */
2829 template <typename TStr>
2830 void Ebwt<TStr>::checkOrigs(const vector<String<Dna5> >& os,
2831                             bool color, bool mirror) const
2832 {
2833         TStr rest;
2834         restore(rest);
2835         uint32_t restOff = 0;
2836         size_t i = 0, j = 0;
2837         if(mirror) {
2838                 // TODO: FIXME
2839                 return;
2840         }
2841         while(i < os.size()) {
2842                 size_t olen = length(os[i]);
2843                 int lastorig = -1;
2844                 for(; j < olen; j++) {
2845                         size_t joff = j;
2846                         if(mirror) joff = olen - j - 1;
2847                         if((int)os[i][joff] == 4) {
2848                                 // Skip over Ns
2849                                 lastorig = -1;
2850                                 if(!mirror) {
2851                                         while(j < olen && (int)os[i][j] == 4) j++;
2852                                 } else {
2853                                         while(j < olen && (int)os[i][olen-j-1] == 4) j++;
2854                                 }
2855                                 j--;
2856                                 continue;
2857                         }
2858                         if(lastorig == -1 && color) {
2859                                 lastorig = os[i][joff];
2860                                 continue;
2861                         }
2862                         if(color) {
2863                                 assert_neq(-1, lastorig);
2864                                 assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]);
2865                         } else {
2866                                 assert_eq(os[i][joff], rest[restOff]);
2867                         }
2868                         lastorig = (int)os[i][joff];
2869                         restOff++;
2870                 }
2871                 if(j == length(os[i])) {
2872                         // Moved to next sequence
2873                         i++;
2874                         j = 0;
2875                 } else {
2876                         // Just jumped over a gap
2877                 }
2878         }
2879 }
2880
2881 ///////////////////////////////////////////////////////////////////////
2882 //
2883 // Functions for reading and writing Ebwts
2884 //
2885 ///////////////////////////////////////////////////////////////////////
2886
2887 /**
2888  * Read an Ebwt from file with given filename.
2889  */
2890 template<typename TStr>
2891 void Ebwt<TStr>::readIntoMemory(
2892         int color,
2893         int needEntireRev,
2894         bool justHeader,
2895         EbwtParams *params,
2896         bool mmSweep,
2897         bool loadNames,
2898         bool startVerbose)
2899 {
2900         bool switchEndian; // dummy; caller doesn't care
2901 #ifdef BOWTIE_MM
2902         char *mmFile[] = { NULL, NULL };
2903 #endif
2904         if(_in1Str.length() > 0) {
2905                 if(_verbose || startVerbose) {
2906                         cerr << "  About to open input files: ";
2907                         logTime(cerr);
2908                 }
2909 #ifdef BOWTIE_MM
2910                 // Initialize our primary and secondary input-stream fields
2911                 if(_in1 != -1) close(_in1);
2912                 if(_verbose || startVerbose) {
2913                         cerr << "Opening \"" << _in1Str << "\"" << endl;
2914                 }
2915                 if((_in1 = open(_in1Str.c_str(), O_RDONLY)) < 0) {
2916                         cerr << "Could not open index file " << _in1Str << endl;
2917                 }
2918                 if(_in2 != -1) close(_in2);
2919                 if(_verbose || startVerbose) {
2920                         cerr << "Opening \"" << _in2Str << "\"" << endl;
2921                 }
2922                 if((_in2 = open(_in2Str.c_str(), O_RDONLY)) < 0) {
2923                         cerr << "Could not open index file " << _in2Str << endl;
2924                 }
2925 #else
2926                 // Initialize our primary and secondary input-stream fields
2927                 if(_in1 != NULL) fclose(_in1);
2928                 if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str << "\"" << endl;
2929                 if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
2930                         cerr << "Could not open index file " << _in1Str << endl;
2931                 }
2932                 if(_in2 != NULL) fclose(_in2);
2933                 if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str << "\"" << endl;
2934                 if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
2935                         cerr << "Could not open index file " << _in2Str << endl;
2936                 }
2937 #endif
2938                 if(_verbose || startVerbose) {
2939                         cerr << "  Finished opening input files: ";
2940                         logTime(cerr);
2941                 }
2942
2943 #ifdef BOWTIE_MM
2944                 if(_useMm && !justHeader) {
2945                         const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
2946                         int fds[] = { _in1, _in2 };
2947                         for(int i = 0; i < 2; i++) {
2948                                 if(_verbose || startVerbose) {
2949                                         cerr << "  Memory-mapping input file " << (i+1) << ": ";
2950                                         logTime(cerr);
2951                                 }
2952                                 struct stat sbuf;
2953                                 if (stat(names[i], &sbuf) == -1) {
2954                                         perror("stat");
2955                                         cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
2956                                         throw 1;
2957                                 }
2958                                 mmFile[i] = (char*)mmap((void *)0, sbuf.st_size,
2959                                                                                 PROT_READ, MAP_SHARED, fds[i], 0);
2960                                 if(mmFile == (void *)(-1)) {
2961                                         perror("mmap");
2962                                         cerr << "Error: Could not memory-map the index file " << names[i] << endl;
2963                                         throw 1;
2964                                 }
2965                                 if(mmSweep) {
2966                                         int sum = 0;
2967                                         for(off_t j = 0; j < sbuf.st_size; j += 1024) {
2968                                                 sum += (int) mmFile[i][j];
2969                                         }
2970                                         if(startVerbose) {
2971                                                 cerr << "  Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
2972                                                 logTime(cerr);
2973                                         }
2974                                 }
2975                         }
2976                         mmFile1_ = mmFile[0];
2977                         mmFile2_ = mmFile[1];
2978                 }
2979 #endif
2980         }
2981 #ifdef BOWTIE_MM
2982         else if(_useMm && !justHeader) {
2983                 mmFile[0] = mmFile1_;
2984                 mmFile[1] = mmFile2_;
2985         }
2986         if(_useMm && !justHeader) {
2987                 assert(mmFile[0] == mmFile1_);
2988                 assert(mmFile[1] == mmFile2_);
2989         }
2990 #endif
2991
2992         if(_verbose || startVerbose) {
2993                 cerr << "  Reading header: ";
2994                 logTime(cerr);
2995         }
2996
2997         // Read endianness hints from both streams
2998         size_t bytesRead = 0;
2999         switchEndian = false;
3000         uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream
3001         bytesRead += 4;
3002         #ifndef NDEBUG
3003         assert_eq(one, readU32(_in2, switchEndian)); // should match!
3004         #else
3005         readU32(_in2, switchEndian);
3006         #endif
3007         if(one != 1) {
3008                 assert_eq((1u<<24), one);
3009                 assert_eq(1, endianSwapU32(one));
3010                 switchEndian = true;
3011         }
3012
3013         // Can't switch endianness and use memory-mapped files; in order to
3014         // support this, someone has to modify the file to switch
3015         // endiannesses appropriately, and we can't do this inside Bowtie
3016         // or we might be setting up a race condition with other processes.
3017         if(switchEndian && _useMm) {
3018                 cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
3019                 throw 1;
3020         }
3021
3022         // Reads header entries one by one from primary stream
3023         uint32_t len          = readU32(_in1, switchEndian);
3024         bytesRead += 4;
3025         int32_t  lineRate     = readI32(_in1, switchEndian);
3026         bytesRead += 4;
3027         int32_t  linesPerSide = readI32(_in1, switchEndian);
3028         bytesRead += 4;
3029         int32_t  offRate      = readI32(_in1, switchEndian);
3030         bytesRead += 4;
3031         // TODO: add isaRate to the actual file format (right now, the
3032         // user has to tell us whether there's an ISA sample and what the
3033         // sampling rate is.
3034         int32_t  isaRate      = _overrideIsaRate;
3035         int32_t  ftabChars    = readI32(_in1, switchEndian);
3036         bytesRead += 4;
3037         // chunkRate was deprecated in an earlier version of Bowtie; now
3038         // we use it to hold flags.
3039         int32_t flags = readI32(_in1, switchEndian);
3040         bool entireRev = false;
3041         if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
3042                 if(color != -1 && !color) {
3043                         cerr << "Error: -C was not specified when running bowtie, but index is in colorspace.  If" << endl
3044                              << "your reads are in colorspace, please use the -C option.  If your reads are not" << endl
3045                              << "in colorspace, please use a normal index (one built without specifying -C to" << endl
3046                              << "bowtie-build)." << endl;
3047                         throw 1;
3048                 }
3049                 color = 1;
3050         } else if(flags < 0) {
3051                 if(color != -1 && color) {
3052                         cerr << "Error: -C was specified when running bowtie, but index is not in colorspace.  If" << endl
3053                              << "your reads are in colorspace, please use a colorspace index (one built using" << endl
3054                              << "bowtie-build -C).  If your reads are not in colorspace, don't specify -C when" << endl
3055                              << "running bowtie." << endl;
3056                         throw 1;
3057                 }
3058                 color = 0;
3059         }
3060         if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) {
3061                 if(needEntireRev != -1 && needEntireRev != 0) {
3062                         cerr << "Error: This index is not compatible with this version of bowtie.  Please use a" << endl
3063                              << "current version of bowtie-build." << endl;
3064                         throw 1;
3065                 }
3066         } else entireRev = true;
3067         bytesRead += 4;
3068
3069         // Create a new EbwtParams from the entries read from primary stream
3070         EbwtParams *eh;
3071         bool deleteEh = false;
3072         if(params != NULL) {
3073                 params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev);
3074                 if(_verbose || startVerbose) params->print(cerr);
3075                 eh = params;
3076         } else {
3077                 eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev);
3078                 deleteEh = true;
3079         }
3080
3081         // Set up overridden suffix-array-sample parameters
3082         uint32_t offsLen = eh->_offsLen;
3083         uint32_t offRateDiff = 0;
3084         uint32_t offsLenSampled = offsLen;
3085         if(_overrideOffRate > offRate) {
3086                 offRateDiff = _overrideOffRate - offRate;
3087         }
3088         if(offRateDiff > 0) {
3089                 offsLenSampled >>= offRateDiff;
3090                 if((offsLen & ~(0xffffffff << offRateDiff)) != 0) {
3091                         offsLenSampled++;
3092                 }
3093         }
3094
3095         // Set up overridden inverted-suffix-array-sample parameters
3096         uint32_t isaLen = eh->_isaLen;
3097         uint32_t isaRateDiff = 0;