Commit patch to not break on spaces.
[bowtie.git] / ebwt_search_backtrack.h
1 #ifndef EBWT_SEARCH_BACKTRACK_H_
2 #define EBWT_SEARCH_BACKTRACK_H_
3
4 #include <stdexcept>
5 #include <seqan/sequence.h>
6 #include "pat.h"
7 #include "qual.h"
8 #include "ebwt_search_util.h"
9 #include "range.h"
10 #include "range_source.h"
11 #include "aligner_metrics.h"
12 #include "search_globals.h"
13
14 /**
15  * Class that coordinates quality- and quantity-aware backtracking over
16  * some range of a read sequence.
17  *
18  * The creator can configure the BacktrackManager to treat different
19  * stretches of the read differently.
20  */
21 class GreedyDFSRangeSource {
22
23         typedef std::pair<int, int> TIntPair;
24         typedef seqan::String<seqan::Dna> DnaString;
25
26 public:
27         GreedyDFSRangeSource(
28                         const Ebwt<DnaString>* ebwt,
29                         const EbwtSearchParams<DnaString>& params,
30                         const BitPairReference* refs,
31                         uint32_t qualThresh,  /// max acceptable q-distance
32                         const int maxBts, /// maximum # backtracks allowed
33                         uint32_t reportPartials = 0,
34                         bool reportExacts = true,
35                         bool reportRanges = false,
36                         PartialAlignmentManager* partials = NULL,
37                         String<QueryMutation>* muts = NULL,
38                         bool verbose = true,
39                         vector<String<Dna5> >* os = NULL,
40                         bool considerQuals = true,  // whether to consider quality values when making backtracking decisions
41                         bool halfAndHalf = false, // hacky way of supporting separate revisitable regions
42                         bool maqPenalty = true) :
43                 _refs(refs),
44                 _qry(NULL),
45                 _qlen(0),
46                 _qual(NULL),
47                 _name(NULL),
48                 _ebwt(ebwt),
49                 _params(params),
50                 _unrevOff(0),
51                 _1revOff(0),
52                 _2revOff(0),
53                 _3revOff(0),
54                 _maqPenalty(maqPenalty),
55                 _qualThresh(qualThresh),
56                 _pairs(NULL),
57                 _elims(NULL),
58                 _mms(),
59                 _refcs(),
60                 _chars(NULL),
61                 _reportPartials(reportPartials),
62                 _reportExacts(reportExacts),
63                 _reportRanges(reportRanges),
64                 _partials(partials),
65                 _muts(muts),
66                 _os(os),
67                 _sanity(_os != NULL && _os->size() > 0),
68                 _considerQuals(considerQuals),
69                 _halfAndHalf(halfAndHalf),
70                 _5depth(0),
71                 _3depth(0),
72                 _numBts(0),
73                 _totNumBts(0),
74                 _maxBts(maxBts),
75                 _precalcedSideLocus(false),
76                 _preLtop(),
77                 _preLbot(),
78                 _verbose(verbose),
79                 _ihits(0llu)
80         { }
81
82         ~GreedyDFSRangeSource() {
83                 if(_pairs != NULL) delete[] _pairs;
84                 if(_elims != NULL) delete[] _elims;
85                 if(_chars != NULL) delete[] _chars;
86         }
87
88         /**
89          * Set a new query read.
90          */
91         void setQuery(ReadBuf& r) {
92                 const bool fw = _params.fw();
93                 const bool ebwtFw = _ebwt->fw();
94                 if(ebwtFw) {
95                         _qry  = fw ? &r.patFw : &r.patRc;
96                         _qual = fw ? &r.qual  : &r.qualRev;
97                 } else {
98                         _qry  = fw ? &r.patFwRev : &r.patRcRev;
99                         _qual = fw ? &r.qualRev  : &r.qual;
100                 }
101                 _name = &r.name;
102                 // Reset _qlen
103                 if(length(*_qry) > _qlen) {
104                         try {
105                                 _qlen = length(*_qry);
106                                 // Resize _pairs
107                                 if(_pairs != NULL) { delete[] _pairs; }
108                                 _pairs = new uint32_t[_qlen*_qlen*8];
109                                 // Resize _elims
110                                 if(_elims != NULL) { delete[] _elims; }
111                                 _elims = new uint8_t[_qlen*_qlen];
112                                 memset(_elims, 0, _qlen*_qlen);
113                                 // Resize _chars
114                                 if(_chars != NULL) { delete[] _chars; }
115                                 _chars = new char[_qlen];
116                                 assert(_pairs != NULL && _elims != NULL && _chars != NULL);
117                         } catch(std::bad_alloc& e) {
118                                 ThreadSafe _ts(&gLock);
119                                 cerr << "Unable to allocate memory for depth-first "
120                                      << "backtracking search; new length = " << length(*_qry)
121                                      << endl;
122                                 throw 1;
123                         }
124                 } else {
125                         // New length is less than old length, so there's no need
126                         // to resize any data structures.
127                         assert(_pairs != NULL && _elims != NULL && _chars != NULL);
128                         _qlen = length(*_qry);
129                 }
130                 _mms.clear();
131                 _refcs.clear();
132                 assert_geq(length(*_qual), _qlen);
133                 if(_verbose) {
134                         cout << "setQuery(_qry=" << (*_qry) << ", _qual=" << (*_qual) << ")" << endl;
135                 }
136                 // Initialize the random source using new read as part of the
137                 // seed.
138                 _color = r.color;
139                 _seed = r.seed;
140                 _patid = r.patid;
141                 _rand.init(r.seed);
142         }
143
144         /**
145          * Apply a batch of mutations to this read, possibly displacing a
146          * previous batch of mutations.
147          */
148         void setMuts(String<QueryMutation>* muts) {
149                 if(_muts != NULL) {
150                         // Undo previous mutations
151                         assert_gt(length(*_muts), 0);
152                         undoPartialMutations();
153                 }
154                 _muts = muts;
155                 if(_muts != NULL) {
156                         assert_gt(length(*_muts), 0);
157                         applyPartialMutations();
158                 }
159         }
160
161         /**
162          * Set backtracking constraints.
163          */
164         void setOffs(uint32_t depth5,   // depth of far edge of hi-half
165                      uint32_t depth3,   // depth of far edge of lo-half
166                      uint32_t unrevOff, // depth above which we cannot backtrack
167                      uint32_t revOff1,  // depth above which we may backtrack just once
168                      uint32_t revOff2,  // depth above which we may backtrack just twice
169                      uint32_t revOff3)  // depth above which we may backtrack just three times
170         {
171                 _5depth   = depth5;
172                 _3depth   = depth3;
173                 assert_geq(depth3, depth5);
174                 _unrevOff = unrevOff;
175                 _1revOff  = revOff1;
176                 _2revOff  = revOff2;
177                 _3revOff  = revOff3;
178         }
179
180         /**
181          * Reset number of backtracks to 0.
182          */
183         void resetNumBacktracks() {
184                 _totNumBts = 0;
185         }
186
187         /**
188          * Return number of backtracks since the last time the count was
189          * reset.
190          */
191         uint32_t numBacktracks() {
192                 return _totNumBts;
193         }
194
195         /**
196          * Set whether to report exact hits.
197          */
198         void setReportExacts(int stratum) {
199                 _reportExacts = stratum;
200         }
201
202         /**
203          * Set the Bowtie index to search against.
204          */
205         void setEbwt(const Ebwt<String<Dna> >* ebwt) {
206                 _ebwt = ebwt;
207         }
208
209         /**
210          * Return the current range
211          */
212         Range& range() {
213                 return _curRange;
214         }
215
216         /**
217          * Set _qlen.  Don't let it exceed length of query.
218          */
219         void setQlen(uint32_t qlen) {
220                 assert(_qry != NULL);
221                 _qlen = min<uint32_t>(length(*_qry), qlen);
222         }
223
224         /// Return the maximum number of allowed backtracks in a given call
225         /// to backtrack()
226         uint32_t maxBacktracks() {
227                 return _maxBts;
228         }
229
230         /**
231          * Initiate the recursive backtracking routine starting at the
232          * extreme right-hand side of the pattern.  Use the ftab to match
233          * the first several characters in one chomp, as long as doing so
234          * does not "jump over" any legal backtracking targets.
235          *
236          * Return true iff the HitSink has indicated that we're done with
237          * this read.
238          */
239         bool backtrack(uint32_t ham = 0) {
240                 assert_gt(length(*_qry), 0);
241                 assert_leq(_qlen, length(*_qry));
242                 assert_geq(length(*_qual), length(*_qry));
243                 const Ebwt<String<Dna> >& ebwt = *_ebwt;
244                 int ftabChars = ebwt._eh._ftabChars;
245                 int nsInSeed = 0; int nsInFtab = 0;
246                 if(!tallyNs(nsInSeed, nsInFtab)) {
247                         // No alignments are possible because of the distribution
248                         // of Ns in the read in combination with the backtracking
249                         // constraints.
250                         return false;
251                 }
252                 bool ret;
253                 // m = depth beyond which ftab must not extend or else we might
254                 // miss some legitimate paths
255                 uint32_t m = min<uint32_t>(_unrevOff, _qlen);
256                 if(nsInFtab == 0 && m >= (uint32_t)ftabChars) {
257                         uint32_t ftabOff = calcFtabOff();
258                         uint32_t top = ebwt.ftabHi(ftabOff);
259                         uint32_t bot = ebwt.ftabLo(ftabOff+1);
260                         if(_qlen == (uint32_t)ftabChars && bot > top) {
261                                 // We have a match!
262                                 if(_reportPartials > 0) {
263                                         // Oops - we're trying to find seedlings, so we've
264                                         // gone too far; start again
265                                         ret = backtrack(0,   // depth
266                                                         0,   // top
267                                                         0,   // bot
268                                                         ham,
269                                                         nsInFtab > 0);
270                                 } else {
271                                         // We have a match!
272                                         ret = reportAlignment(0, top, bot, ham);
273                                 }
274                         } else if (bot > top) {
275                                 // We have an arrow pair from which we can backtrack
276                                 ret = backtrack(ftabChars, // depth
277                                                 top,       // top
278                                                 bot,       // bot
279                                                 ham,
280                                                 nsInFtab > 0);
281                         } else {
282                                 // The arrows are already closed; give up
283                                 ret = false;
284                         }
285                 } else {
286                         // The ftab *does* extend past the unrevisitable portion;
287                         // we can't use it in this case, because we might jump past
288                         // a legitimate mismatch
289                         ret = backtrack(0,   // depth
290                                         0,   // top
291                                         0,   // bot
292                                         ham,
293                                         // disable ftab jumping if there is more
294                                         // than 1 N in it
295                                         nsInFtab > 0);
296                 }
297                 if(finalize()) ret = true;
298                 return ret;
299         }
300
301         /**
302          * If there are any buffered results that have yet to be committed,
303          * commit them.  This happens when looking for partial alignments.
304          */
305         bool finalize() {
306                 bool ret = false;
307                 if(_reportPartials > 0) {
308                         // We're in partial alignment mode; take elements of the
309                         // _partialBuf and install them in the _partials database
310                         assert(_partials != NULL);
311                         if(_partialsBuf.size() > 0) {
312 #ifndef NDEBUG
313                                 for(size_t i = 0; i < _partialsBuf.size(); i++) {
314                                         assert(_partialsBuf[i].repOk(_qualThresh, _qlen, (*_qual), _maqPenalty));
315                                 }
316 #endif
317                                 _partials->addPartials(_params.patId(), _partialsBuf);
318                                 _partialsBuf.clear();
319                                 ret = true;
320                         } else {
321                                 assert(!ret);
322                         }
323                 }
324                 assert_eq(0, _partialsBuf.size());
325                 return ret;
326         }
327
328         /**
329          * Starting at the given "depth" relative to the 5' end, and the
330          * given top and bot indexes (where top=0 and bot=0 means it's up
331          * to us to calculate the initial range), and initial weighted
332          * hamming distance iham, find a hit using randomized, quality-
333          * aware backtracking.
334          */
335         bool backtrack(uint32_t depth,
336                        uint32_t top,
337                        uint32_t bot,
338                        uint32_t iham = 0,
339                        bool disableFtab = false)
340         {
341                 HitSinkPerThread& sink = _params.sink();
342                 _ihits = sink.retainedHits().size();
343
344                 // Initiate the recursive, randomized quality-aware backtracker
345                 // with a stack depth of 0 (no backtracks so far)
346                 _bailedOnBacktracks = false;
347                 bool done = backtrack(0, depth, _unrevOff, _1revOff, _2revOff, _3revOff,
348                                       top, bot, iham, iham, _pairs, _elims, disableFtab);
349
350                 _totNumBts += _numBts;
351                 _numBts = 0;
352                 _precalcedSideLocus = false;
353                 _bailedOnBacktracks = false;
354                 return done;
355         }
356
357         /**
358          * Recursive routine for progressing to the next backtracking
359          * decision given some initial conditions.  If a hit is found, it
360          * is recorded and true is returned.  Otherwise, if there are more
361          * backtracking opportunities, the function will call itself
362          * recursively and return the result.  As soon as there is a
363          * mismatch and no backtracking opportunities, false is returned.
364          */
365         bool backtrack(uint32_t  stackDepth, // depth of the recursion stack; = # mismatches so far
366                        uint32_t  depth,    // next depth where a post-pair needs to be calculated
367                        uint32_t  unrevOff, // depths < unrevOff are unrevisitable
368                        uint32_t  oneRevOff,// depths < oneRevOff are 1-revisitable
369                        uint32_t  twoRevOff,// depths < twoRevOff are 2-revisitable
370                        uint32_t  threeRevOff,// depths < threeRevOff are 3-revisitable
371                        uint32_t  top,      // top arrow in pair prior to 'depth'
372                        uint32_t  bot,      // bottom arrow in pair prior to 'depth'
373                        uint32_t  ham,      // weighted hamming distance so far
374                        uint32_t  iham,     // initial weighted hamming distance
375                        uint32_t* pairs,    // portion of pairs array to be used for this backtrack frame
376                        uint8_t*  elims,    // portion of elims array to be used for this backtrack frame
377                        bool disableFtab = false)
378         {
379                 // Can't have already exceeded weighted hamming distance threshold
380                 assert_leq(stackDepth, depth);
381                 assert_gt(length(*_qry), 0);
382                 assert_leq(_qlen, length(*_qry));
383                 assert_geq(length(*_qual), length(*_qry));
384                 assert(_qry != NULL);
385                 assert(_qual != NULL);
386                 assert(_name != NULL);
387                 assert(_qlen != 0);
388                 assert_leq(ham, _qualThresh);
389                 assert_lt(depth, _qlen); // can't have run off the end of qry
390                 assert_geq(bot, top);    // could be that both are 0
391                 assert(pairs != NULL);
392                 assert(elims != NULL);
393                 assert_leq(stackDepth, _qlen);
394                 const Ebwt<String<Dna> >& ebwt = *_ebwt;
395                 HitSinkPerThread& sink = _params.sink();
396                 uint64_t prehits = sink.numValidHits();
397                 if(_halfAndHalf) {
398                         assert_eq(0, _reportPartials);
399                         assert_gt(_3depth, _5depth);
400                 }
401                 if(_reportPartials) {
402                         assert(!_halfAndHalf);
403                 }
404                 if(_verbose) {
405                         cout << "  backtrack(stackDepth=" << stackDepth << ", "
406                              << "depth=" << depth << ", "
407                              << "top=" << top << ", "
408                              << "bot=" << bot << ", "
409                              << "ham=" << ham << ", "
410                              << "iham=" << iham << ", "
411                              << "pairs=" << pairs << ", "
412                              << "elims=" << (void*)elims << "): \"";
413                         for(int i = (int)depth - 1; i >= 0; i--) {
414                                 cout << _chars[i];
415                         }
416                         cout << "\"" << endl;
417                 }
418                 // Do this early on so that we can clear _precalcedSideLocus
419                 // before we have too many opportunities to bail and leave it
420                 // 'true'
421                 SideLocus ltop, lbot;
422                 if(_precalcedSideLocus) {
423                         ltop = _preLtop;
424                         lbot = _preLbot;
425                         _precalcedSideLocus = false;
426                 } else if(top != 0 || bot != 0) {
427                         SideLocus::initFromTopBot(top, bot, ebwt._eh, ebwt._ebwt, ltop, lbot);
428                 }
429                 // Check whether we've exceeded any backtracking limit
430                 if(_halfAndHalf) {
431                         if(_maxBts > 0 && _numBts == _maxBts) {
432                                 _bailedOnBacktracks = true;
433                                 return false;
434                         }
435                         _numBts++;
436                 }
437                 // # positions with at least one legal outgoing path
438                 uint32_t altNum = 0;
439                 // # positions tied for "best" outgoing qual
440                 uint32_t eligibleNum = 0;
441                 // total range-size for all eligibles
442                 uint32_t eligibleSz = 0;
443                 // If there is just one eligible slot at the moment (a common
444                 // case), these are its parameters
445                 uint32_t eli = 0;
446                 bool     elignore = true; // ignore the el values because they didn't come from a recent override
447                 uint32_t eltop = 0;
448                 uint32_t elbot = 0;
449                 uint32_t elham = ham;
450                 char     elchar = 0;
451                 int      elcint = 0;
452                 // The lowest quality value associated with any alternative
453                 // ranges; all alternative ranges with this quality are
454                 // eligible
455                 uint8_t lowAltQual = 0xff;
456                 uint32_t d = depth;
457                 uint32_t cur = _qlen - d - 1; // current offset into _qry
458                 while(cur < _qlen) {
459                         // Try to advance further given that
460                         if(_verbose) {
461                                 cout << "    cur=" << cur << " \"";
462                                 for(int i = (int)d - 1; i >= 0; i--) {
463                                         cout << _chars[i];
464                                 }
465                                 cout << "\"";
466                         }
467
468                         // If we're searching for a half-and-half solution, then
469                         // enforce the boundary-crossing constraints here.
470                         if(_halfAndHalf && !hhCheckTop(stackDepth, d, iham, _mms, prehits)) {
471                                 return false;
472                         }
473
474                         bool curIsEligible = false;
475                         // Reset eligibleNum and eligibleSz if there are any
476                         // eligible pairs discovered at this spot
477                         bool curOverridesEligible = false;
478                         // Determine whether ranges at this location are
479                         // candidates for backtracking
480                         int c = (int)(*_qry)[cur];
481                         assert_leq(c, 4);
482                         uint8_t q = qualAt(cur);
483                         // The current query position is a legit alternative if it a) is
484                         // not in the unrevisitable region, and b) the quality ceiling (if
485                         // one exists) is not exceeded
486                         bool curIsAlternative =
487                                 (d >= unrevOff) &&
488                             (!_considerQuals ||
489                              (ham + mmPenalty(_maqPenalty, q) <= _qualThresh));
490                         if(curIsAlternative) {
491                                 if(_considerQuals) {
492                                         // Is it the best alternative?
493                                         if(q < lowAltQual) {
494                                                 // Ranges at this depth in this backtracking frame are
495                                                 // eligible, unless we learn otherwise.  Ranges previously
496                                                 // thought to be eligible are not any longer.
497                                                 curIsEligible = true;
498                                                 curOverridesEligible = true;
499                                         } else if(q == lowAltQual) {
500                                                 // Ranges at this depth in this backtracking frame
501                                                 // are eligible, unless we learn otherwise
502                                                 curIsEligible = true;
503                                         }
504                                 } else {
505                                         // When quality values are not considered, all positions
506                                         // are eligible
507                                         curIsEligible = true;
508                                 }
509                         }
510                         if(curIsEligible) assert(curIsAlternative);
511                         if(curOverridesEligible) assert(curIsEligible);
512                         if(curIsAlternative && !curIsEligible) {
513                                 assert_gt(eligibleSz, 0);
514                                 assert_gt(eligibleNum, 0);
515                         }
516                         if(_verbose) {
517                                 cout << " alternative: " << curIsAlternative;
518                                 cout << ", eligible: " << curIsEligible;
519                                 if(curOverridesEligible) cout << "(overrides)";
520                                 cout << endl;
521                         }
522
523                         // If c is 'N', then it's guaranteed to be a mismatch
524                         if(c == 4 && d > 0) {
525                                 // Force the 'else if(curIsAlternative)' branch below
526                                 top = bot = 1;
527                         } else if(c == 4) {
528                                 // We'll take the 'if(top == 0 && bot == 0)' branch below
529                                 assert_eq(0, top);
530                                 assert_eq(0, bot);
531                         }
532                         // Calculate the ranges for this position
533                         if(top == 0 && bot == 0) {
534                                 // Calculate first quartet of ranges using the _fchr[]
535                                 // array
536                                                pairs[0 + 0] = ebwt._fchr[0];
537                                 pairs[0 + 4] = pairs[1 + 0] = ebwt._fchr[1];
538                                 pairs[1 + 4] = pairs[2 + 0] = ebwt._fchr[2];
539                                 pairs[2 + 4] = pairs[3 + 0] = ebwt._fchr[3];
540                                 pairs[3 + 4]                = ebwt._fchr[4];
541                                 // Update top and bot
542                                 if(c < 4) {
543                                         top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c);
544                                         assert_geq(bot, top);
545                                 }
546                         } else if(curIsAlternative) {
547                                 // Clear pairs
548                                 memset(&pairs[d*8], 0, 8 * 4);
549                                 // Calculate next quartet of ranges
550                                 ebwt.mapLFEx(ltop, lbot, &pairs[d*8], &pairs[(d*8)+4]);
551                                 // Update top and bot
552                                 if(c < 4) {
553                                         top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c);
554                                         assert_geq(bot, top);
555                                 }
556                         } else {
557                                 // This query character is not even a legitimate
558                                 // alternative (because backtracking here would blow
559                                 // our mismatch quality budget), so no need to do the
560                                 // bookkeeping for the entire quartet, just do c
561                                 if(c < 4) {
562                                         if(top+1 == bot) {
563                                                 bot = top = ebwt.mapLF1(top, ltop, c);
564                                                 if(bot != 0xffffffff) bot++;
565                                         } else {
566                                                 top = ebwt.mapLF(ltop, c); bot = ebwt.mapLF(lbot, c);
567                                                 assert_geq(bot, top);
568                                         }
569                                 }
570                         }
571                         if(top != bot) {
572                                 // Calculate loci from row indices; do it now so that
573                                 // those prefetches are fired off as soon as possible.
574                                 // This eventually calls SideLocus.initfromRow().
575                                 SideLocus::initFromTopBot(top, bot, ebwt._eh, ebwt._ebwt, ltop, lbot);
576                         }
577                         // Update the elim array
578                         eliminate(elims, d, c);
579
580                         if(curIsAlternative) {
581                                 // Given the just-calculated range quartet, update
582                                 // elims, altNum, eligibleNum, eligibleSz
583                                 for(int i = 0; i < 4; i++) {
584                                         if(i == c) continue;
585                                         assert_leq(pairTop(pairs, d, i), pairBot(pairs, d, i));
586                                         uint32_t spread = pairSpread(pairs, d, i);
587                                         if(spread == 0) {
588                                                 // Indicate this char at this position is
589                                                 // eliminated as far as this backtracking frame is
590                                                 // concerned, since its range is empty
591                                                 elims[d] |= (1 << i);
592                                                 assert_lt(elims[d], 16);
593                                         }
594                                         if(spread > 0 && ((elims[d] & (1 << i)) == 0)) {
595                                                 // This char at this position is an alternative
596                                                 if(curIsEligible) {
597                                                         if(curOverridesEligible) {
598                                                                 // Only now that we know there is at least
599                                                                 // one potential backtrack target at this
600                                                                 // most-eligible position should we reset
601                                                                 // these eligibility parameters
602                                                                 lowAltQual = q;
603                                                                 eligibleNum = 0;
604                                                                 eligibleSz = 0;
605                                                                 curOverridesEligible = false;
606                                                                 // Remember these parameters in case
607                                                                 // this turns out to be the only
608                                                                 // eligible target
609                                                                 eli = d;
610                                                                 eltop = pairTop(pairs, d, i);
611                                                                 elbot = pairBot(pairs, d, i);
612                                                                 assert_eq(elbot-eltop, spread);
613                                                                 elham = mmPenalty(_maqPenalty, q);
614                                                                 elchar = "acgt"[i];
615                                                                 elcint = i;
616                                                                 elignore = false;
617                                                         }
618                                                         eligibleSz += spread;
619                                                         eligibleNum++;
620                                                 }
621                                                 assert_gt(eligibleSz, 0);
622                                                 assert_gt(eligibleNum, 0);
623                                                 altNum++;
624                                         }
625                                 }
626                         }
627                         if(altNum > 0) {
628                                 assert_gt(eligibleSz, 0);
629                                 assert_gt(eligibleNum, 0);
630                         }
631                         assert_leq(eligibleNum, eligibleSz);
632                         assert_leq(eligibleNum, altNum);
633                         assert_lt(elims[d], 16);
634                         assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
635
636                         // Achieved a match, but need to keep going
637                         bool backtrackDespiteMatch = false;
638                         bool reportedPartial = false;
639                         if(cur == 0 &&  // we've consumed the entire pattern
640                            top < bot && // there's a hit to report
641                            stackDepth < _reportPartials && // not yet used up our mismatches
642                            _reportPartials > 0)  // there are still legel backtracking targets
643                         {
644                                 assert(!_halfAndHalf);
645                                 if(altNum > 0) backtrackDespiteMatch = true;
646                                 if(stackDepth > 0) {
647                                         // This is a legit seedling; report it
648                                         reportPartial(stackDepth);
649                                         reportedPartial = true;
650                                 }
651                                 // Now continue on to find legitimate seedlings with
652                                 // more mismatches than this one
653                         }
654                         // Check whether we've obtained an exact alignment when
655                         // we've been instructed not to report exact alignments
656                         bool invalidExact = false;
657                         if(cur == 0 && stackDepth == 0 && bot > top && !_reportExacts) {
658                                 invalidExact = true;
659                                 backtrackDespiteMatch = true;
660                         }
661                         // Set this to true if the only way to make legal progress
662                         // is via one or more additional backtracks.  This is
663                         // helpful in half-and-half mode.
664                         bool mustBacktrack = false;
665                         bool invalidHalfAndHalf = false;
666                         if(_halfAndHalf) {
667                                 ASSERT_ONLY(uint32_t lim = (_3revOff == _2revOff)? 2 : 3);
668                                 if((d == (_5depth-1)) && top < bot) {
669                                         // We're crossing the boundary separating the hi-half
670                                         // from the non-seed portion of the read.
671                                         // We should induce a mismatch if we haven't mismatched
672                                         // yet, so that we don't waste time pursuing a match
673                                         // that was covered by a previous phase
674                                         assert_eq(0, _reportPartials);
675                                         assert_leq(stackDepth, lim-1);
676                                         invalidHalfAndHalf = (stackDepth == 0);
677                                         if(stackDepth == 0 && altNum > 0) {
678                                                 backtrackDespiteMatch = true;
679                                                 mustBacktrack = true;
680                                         } else if(stackDepth == 0) {
681                                                 // We're returning from the bottommost frame
682                                                 // without having found any hits; let's
683                                                 // sanity-check that there really aren't any
684                                                 return false;
685                                         }
686                                 }
687                                 else if((d == (_3depth-1)) && top < bot) {
688                                         // We're crossing the boundary separating the lo-half
689                                         // from the non-seed portion of the read
690                                         assert_eq(0, _reportPartials);
691                                         assert_leq(stackDepth, lim);
692                                         assert_gt(stackDepth, 0);
693                                         // Count the mismatches in the lo and hi halves
694                                         uint32_t loHalfMms = 0, hiHalfMms = 0;
695                                         assert_geq(_mms.size(), stackDepth);
696                                         for(size_t i = 0; i < stackDepth; i++) {
697                                                 uint32_t d = _qlen - _mms[i] - 1;
698                                                 if     (d < _5depth) hiHalfMms++;
699                                                 else if(d < _3depth) loHalfMms++;
700                                                 else assert(false);
701                                         }
702                                         assert_leq(loHalfMms + hiHalfMms, lim);
703                                         invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
704                                         if((stackDepth < 2 || invalidHalfAndHalf) && altNum > 0) {
705                                                 // We backtracked fewer times than necessary;
706                                                 // force a backtrack
707                                                 mustBacktrack = true;
708                                                 backtrackDespiteMatch = true;
709                                         } else if(stackDepth < 2) {
710                                                 return false;
711                                         }
712                                 }
713                                 if(d < _5depth-1) {
714                                         assert_leq(stackDepth, lim-1);
715                                 }
716                                 else if(d >= _5depth && d < _3depth-1) {
717                                         assert_gt(stackDepth, 0);
718                                         assert_leq(stackDepth, lim);
719                                 }
720                         }
721                         // This is necessary for the rare case where we're about
722                         // to declare success because bot > top and we've consumed
723                         // the final character, but all hits between top and bot
724                         // are spurious.  This check ensures that we keep looking
725                         // for non-spurious hits in that case.
726                         if(cur == 0 &&            // we made it to the left-hand-side of the read
727                            bot > top &&           // there are alignments to report
728                            !invalidHalfAndHalf && // alignment isn't disqualified by half-and-half requirement
729                            !invalidExact &&       // alignment isn't disqualified by no-exact-hits setting
730                            !reportedPartial)      // for when it's a partial alignment we've already reported
731                         {
732                                 bool ret = reportAlignment(stackDepth, top, bot, ham);
733                                 if(!ret) {
734                                         // reportAlignment returned false, so enter the
735                                         // backtrack loop and keep going
736                                         top = bot;
737                                 } else {
738                                         // reportAlignment returned true, so stop
739                                         return true;
740                                 }
741                         }
742                         //
743                         // Mismatch with alternatives
744                         //
745                         while((top == bot || backtrackDespiteMatch) && altNum > 0) {
746                                 if(_verbose) cout << "    top (" << top << "), bot ("
747                                                  << bot << ") with " << altNum
748                                                  << " alternatives, eligible: "
749                                                  << eligibleNum << ", " << eligibleSz
750                                                  << endl;
751                                 assert_gt(eligibleSz, 0);
752                                 assert_gt(eligibleNum, 0);
753                                 // Mismatch!  Must now choose where we are going to
754                                 // take our quality penalty.  We can only look as far
755                                 // back as our last decision point.
756                                 assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
757                                 // Pick out the arrow pair we selected and target it
758                                 // for backtracking
759                                 ASSERT_ONLY(uint32_t eligiblesVisited = 0);
760                                 size_t i = d, j = 0;
761                                 assert_geq(i, depth);
762                                 uint32_t bttop = 0;
763                                 uint32_t btbot = 0;
764                                 uint32_t btham = ham;
765                                 char     btchar = 0;
766                                 int      btcint = 0;
767                                 uint32_t icur = 0;
768                                 // The common case is that eligibleSz == 1
769                                 if(eligibleNum > 1 || elignore) {
770                                         bool foundTarget = false;
771                                         // Walk from left to right
772                                         for(; i >= depth; i--) {
773                                                 assert_geq(i, unrevOff);
774                                                 icur = _qlen - i - 1; // current offset into _qry
775                                                 uint8_t qi = qualAt(icur);
776                                                 assert_lt(elims[i], 16);
777                                                 if((qi == lowAltQual || !_considerQuals) && elims[i] != 15) {
778                                                         // This is the leftmost eligible position with at
779                                                         // least one remaining backtrack target
780                                                         uint32_t posSz = 0;
781                                                         // Add up the spreads for A, C, G, T
782                                                         for(j = 0; j < 4; j++) {
783                                                                 if((elims[i] & (1 << j)) == 0) {
784                                                                         assert_gt(pairSpread(pairs, i, j), 0);
785                                                                         posSz += pairSpread(pairs, i, j);
786                                                                 }
787                                                         }
788                                                         // Generate a random number
789                                                         assert_gt(posSz, 0);
790                                                         uint32_t r = _rand.nextU32() % posSz;
791                                                         for(j = 0; j < 4; j++) {
792                                                                 if((elims[i] & (1 << j)) == 0) {
793                                                                         // This range has not been eliminated
794                                                                         ASSERT_ONLY(eligiblesVisited++);
795                                                                         uint32_t spread = pairSpread(pairs, i, j);
796                                                                         if(r < spread) {
797                                                                                 // This is our randomly-selected
798                                                                                 // backtrack target
799                                                                                 foundTarget = true;
800                                                                                 bttop = pairTop(pairs, i, j);
801                                                                                 btbot = pairBot(pairs, i, j);
802                                                                                 btham += mmPenalty(_maqPenalty, qi);
803                                                                                 btcint = j;
804                                                                                 btchar = "acgt"[j];
805                                                                                 assert_leq(btham, _qualThresh);
806                                                                                 break; // found our target; we can stop
807                                                                         }
808                                                                         r -= spread;
809                                                                 }
810                                                         }
811                                                         assert(foundTarget);
812                                                         break; // escape left-to-right walk
813                                                 }
814                                         }
815                                         assert_leq(i, d);
816                                         assert_lt(j, 4);
817                                         assert_leq(eligiblesVisited, eligibleNum);
818                                         assert(foundTarget);
819                                         assert_neq(0, btchar);
820                                         assert_gt(btbot, bttop);
821                                         assert_leq(btbot-bttop, eligibleSz);
822                                 } else {
823                                         // There was only one eligible target; we can just
824                                         // copy its parameters
825                                         assert_eq(1, eligibleNum);
826                                         assert(!elignore);
827                                         i = eli;
828                                         bttop = eltop;
829                                         btbot = elbot;
830                                         btham += elham;
831                                         j = btcint = elcint;
832                                         btchar = elchar;
833                                         assert_neq(0, btchar);
834                                         assert_gt(btbot, bttop);
835                                         assert_leq(btbot-bttop, eligibleSz);
836                                 }
837                                 // This is the earliest that we know what the next top/
838                                 // bot combo is going to be
839                                 SideLocus::initFromTopBot(bttop, btbot,
840                                                           ebwt._eh, ebwt._ebwt,
841                                                           _preLtop, _preLbot);
842                                 icur = _qlen - i - 1; // current offset into _qry
843                                 // Slide over to the next backtacking frame within
844                                 // pairs and elims; won't interfere with our frame or
845                                 // any of our parents' frames
846                                 uint32_t *newPairs = pairs + (_qlen*8);
847                                 uint8_t  *newElims = elims + (_qlen);
848                                 // If we've selected a backtracking target that's in
849                                 // the 1-revisitable region, then we ask the recursive
850                                 // callee to consider the 1-revisitable region as also
851                                 // being unrevisitable (since we just "used up" all of
852                                 // our visits)
853                                 uint32_t btUnrevOff  = unrevOff;
854                                 uint32_t btOneRevOff = oneRevOff;
855                                 uint32_t btTwoRevOff = twoRevOff;
856                                 uint32_t btThreeRevOff = threeRevOff;
857                                 assert_geq(i, unrevOff);
858                                 assert_geq(oneRevOff, unrevOff);
859                                 assert_geq(twoRevOff, oneRevOff);
860                                 assert_geq(threeRevOff, twoRevOff);
861                                 if(i < oneRevOff) {
862                                         // Extend unrevisitable region to include former 1-
863                                         // revisitable region
864                                         btUnrevOff = oneRevOff;
865                                         // Extend 1-revisitable region to include former 2-
866                                         // revisitable region
867                                         btOneRevOff = twoRevOff;
868                                         // Extend 2-revisitable region to include former 3-
869                                         // revisitable region
870                                         btTwoRevOff = threeRevOff;
871                                 }
872                                 else if(i < twoRevOff) {
873                                         // Extend 1-revisitable region to include former 2-
874                                         // revisitable region
875                                         btOneRevOff = twoRevOff;
876                                         // Extend 2-revisitable region to include former 3-
877                                         // revisitable region
878                                         btTwoRevOff = threeRevOff;
879                                 }
880                                 else if(i < threeRevOff) {
881                                         // Extend 2-revisitable region to include former 3-
882                                         // revisitable region
883                                         btTwoRevOff = threeRevOff;
884                                 }
885                                 // Note the character that we're backtracking on in the
886                                 // mm array:
887                                 if(_mms.size() <= stackDepth) {
888                                         assert_eq(_mms.size(), stackDepth);
889                                         _mms.push_back(icur);
890                                 } else {
891                                         _mms[stackDepth] = icur;
892                                 }
893                                 assert_eq(1, dna4Cat[(int)btchar]);
894                                 if(_refcs.size() <= stackDepth) {
895                                         assert_eq(_refcs.size(), stackDepth);
896                                         _refcs.push_back(btchar);
897                                 } else {
898                                         _refcs[stackDepth] = btchar;
899                                 }
900 #ifndef NDEBUG
901                                 for(uint32_t j = 0; j < stackDepth; j++) {
902                                         assert_neq(_mms[j], icur);
903                                 }
904 #endif
905                                 _chars[i] = btchar;
906                                 assert_leq(i+1, _qlen);
907                                 bool ret;
908                                 if(i+1 == _qlen) {
909                                         ret = reportAlignment(stackDepth+1, bttop, btbot, btham);
910                                 } else if(_halfAndHalf &&
911                                           !disableFtab &&
912                                           _2revOff == _3revOff &&
913                                           i+1 < (uint32_t)ebwt._eh._ftabChars &&
914                                           (uint32_t)ebwt._eh._ftabChars <= _5depth)
915                                 {
916                                         // The ftab doesn't extend past the unrevisitable portion,
917                                         // so we can go ahead and use it
918                                         // Rightmost char gets least significant bit-pairs
919                                         int ftabChars = ebwt._eh._ftabChars;
920                                         uint32_t ftabOff = (*_qry)[_qlen - ftabChars];
921                                         assert_lt(ftabOff, 4);
922                                         assert_lt(ftabOff, ebwt._eh._ftabLen-1);
923                                         for(int j = ftabChars - 1; j > 0; j--) {
924                                                 ftabOff <<= 2;
925                                                 if(_qlen-j == icur) {
926                                                         ftabOff |= btcint;
927                                                 } else {
928                                                         assert_lt((uint32_t)(*_qry)[_qlen-j], 4);
929                                                         ftabOff |= (uint32_t)(*_qry)[_qlen-j];
930                                                 }
931                                                 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
932                                         }
933                                         assert_lt(ftabOff, ebwt._eh._ftabLen-1);
934                                         uint32_t ftabTop = ebwt.ftabHi(ftabOff);
935                                         uint32_t ftabBot = ebwt.ftabLo(ftabOff+1);
936                                         assert_geq(ftabBot, ftabTop);
937                                         if(ftabTop == ftabBot) {
938                                                 ret = false;
939                                         } else {
940                                                 assert(!_precalcedSideLocus);
941                                                 assert_leq(iham, _qualThresh);
942                                                 ret = backtrack(stackDepth+1,
943                                                                 ebwt._eh._ftabChars,
944                                                                 btUnrevOff,  // new unrevisitable boundary
945                                                                 btOneRevOff, // new 1-revisitable boundary
946                                                                 btTwoRevOff, // new 2-revisitable boundary
947                                                                 btThreeRevOff, // new 3-revisitable boundary
948                                                                 ftabTop,  // top arrow in range prior to 'depth'
949                                                                 ftabBot,  // bottom arrow in range prior to 'depth'
950                                                                 btham,  // weighted hamming distance so far
951                                                                 iham,   // initial weighted hamming distance
952                                                                 newPairs,
953                                                                 newElims);
954                                         }
955                                 } else {
956                                         // We already called initFromTopBot for the range
957                                         // we're going to continue from
958                                         _precalcedSideLocus = true;
959                                         assert_leq(iham, _qualThresh);
960                                         // Continue from selected alternative range
961                                         ret = backtrack(stackDepth+1,// added 1 mismatch to alignment
962                                                         i+1,         // start from next position after
963                                                         btUnrevOff,  // new unrevisitable boundary
964                                                         btOneRevOff, // new 1-revisitable boundary
965                                                         btTwoRevOff, // new 2-revisitable boundary
966                                                         btThreeRevOff, // new 3-revisitable boundary
967                                                         bttop,  // top arrow in range prior to 'depth'
968                                                         btbot,  // bottom arrow in range prior to 'depth'
969                                                         btham,  // weighted hamming distance so far
970                                                         iham,   // initial weighted hamming distance
971                                                         newPairs,
972                                                         newElims);
973                                 }
974                                 if(ret) {
975                                         assert_gt(sink.numValidHits(), prehits);
976                                         return true; // return, signaling that we're done
977                                 }
978                                 if(_bailedOnBacktracks ||
979                                   (_halfAndHalf && (_maxBts > 0) && (_numBts >= _maxBts)))
980                                 {
981                                         _bailedOnBacktracks = true;
982                                         return false;
983                                 }
984                                 // No hit was reported; update elims[], eligibleSz,
985                                 // eligibleNum, altNum
986                                 _chars[i] = (*_qry)[icur];
987                                 assert_neq(15, elims[i]);
988                                 ASSERT_ONLY(uint8_t oldElim = elims[i]);
989                                 elims[i] |= (1 << j);
990                                 assert_lt(elims[i], 16);
991                                 assert_gt(elims[i], oldElim);
992                                 eligibleSz -= (btbot-bttop);
993                                 eligibleNum--;
994                                 elignore = true;
995                                 assert_geq(eligibleNum, 0);
996                                 altNum--;
997                                 assert_geq(altNum, 0);
998                                 if(altNum == 0) {
999                                         // No alternative backtracking points; all legal
1000                                         // backtracking targets have been exhausted
1001                                         assert_eq(0, altNum);
1002                                         assert_eq(0, eligibleSz);
1003                                         assert_eq(0, eligibleNum);
1004                                         return false;
1005                                 }
1006                                 else if(eligibleNum == 0 && _considerQuals) {
1007                                         // Find the next set of eligible backtrack points
1008                                         // by re-scanning this backtracking frame (from
1009                                         // 'depth' up to 'd')
1010                                         lowAltQual = 0xff;
1011                                         for(size_t k = d; k >= depth && k <= _qlen; k--) {
1012                                                 uint32_t kcur = _qlen - k - 1; // current offset into _qry
1013                                                 uint8_t kq = qualAt(kcur);
1014                                                 if(k < unrevOff) break; // already visited all revisitable positions
1015                                                 bool kCurIsAlternative = (ham + mmPenalty(_maqPenalty, kq) <= _qualThresh);
1016                                                 bool kCurOverridesEligible = false;
1017                                                 if(kCurIsAlternative) {
1018                                                         if(kq < lowAltQual) {
1019                                                                 // This target is more eligible than
1020                                                                 // any targets that came before, so we
1021                                                                 // set it to supplant/override them
1022                                                                 kCurOverridesEligible = true;
1023                                                         }
1024                                                         if(kq <= lowAltQual) {
1025                                                                 // Position is eligible
1026                                                                 for(int l = 0; l < 4; l++) {
1027                                                                         if((elims[k] & (1 << l)) == 0) {
1028                                                                                 // Not yet eliminated
1029                                                                                 uint32_t spread = pairSpread(pairs, k, l);
1030                                                                                 if(kCurOverridesEligible) {
1031                                                                                         // Clear previous eligible results;
1032                                                                                         // this one's better
1033                                                                                         lowAltQual = kq;
1034                                                                                         kCurOverridesEligible = false;
1035                                                                                         // Keep these parameters in
1036                                                                                         // case this target turns
1037                                                                                         // out to be the only
1038                                                                                         // eligible target and we
1039                                                                                         // can avoid having to
1040                                                                                         // recalculate them
1041                                                                                         eligibleNum = 0;
1042                                                                                         eligibleSz = 0;
1043                                                                                         eli = k;
1044                                                                                         eltop = pairTop(pairs, k, l);
1045                                                                                         elbot = pairBot(pairs, k, l);
1046                                                                                         assert_eq(elbot-eltop, spread);
1047                                                                                         elham = mmPenalty(_maqPenalty, kq);
1048                                                                                         elchar = "acgt"[l];
1049                                                                                         elcint = l;
1050                                                                                         elignore = false;
1051                                                                                 }
1052                                                                                 eligibleNum++;
1053                                                                                 assert_gt(spread, 0);
1054                                                                                 eligibleSz += spread;
1055                                                                         }
1056                                                                 }
1057                                                         }
1058                                                 }
1059                                         }
1060                                 }
1061                                 assert_gt(eligibleNum, 0);
1062                                 assert_leq(eligibleNum, altNum);
1063                                 assert_gt(eligibleSz, 0);
1064                                 assert_geq(eligibleSz, eligibleNum);
1065                                 assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
1066                                 // Try again
1067                         } // while(top == bot && altNum > 0)
1068                         if(mustBacktrack || invalidHalfAndHalf || invalidExact) {
1069                                 return false;
1070                         }
1071                         // Mismatch with no alternatives
1072                         if(top == bot && altNum == 0) {
1073                                 assert_eq(0, altNum);
1074                                 assert_eq(0, eligibleSz);
1075                                 assert_eq(0, eligibleNum);
1076                                 return false;
1077                         }
1078                         // Match!
1079                         _chars[d] = (*_qry)[cur];
1080                         d++; cur--;
1081                 } // while(cur < _qlen)
1082                 assert_eq(0xffffffff, cur);
1083                 assert_gt(bot, top);
1084                 if(_reportPartials > 0) {
1085                         // Stack depth should not exceed given hamming distance
1086                         assert_leq(stackDepth, _reportPartials);
1087                 }
1088                 bool ret = false;
1089                 if(stackDepth >= _reportPartials) {
1090                         ret = reportAlignment(stackDepth, top, bot, ham);
1091                 }
1092                 return ret;
1093         }
1094
1095         /**
1096          * Pretty print a hit along with the backtracking constraints.
1097          */
1098         void printHit(const Hit& h) {
1099                 ::printHit(*_os, h, *_qry, _qlen, _unrevOff, _1revOff, _2revOff, _3revOff, _ebwt->fw());
1100         }
1101
1102         /**
1103          * Return true iff we're enforcing a half-and-half constraint
1104          * (forced edits in both seed halves).
1105          */
1106         bool halfAndHalf() const {
1107                 return _halfAndHalf;
1108         }
1109
1110 protected:
1111
1112         /**
1113          * Return true iff we're OK to continue after considering which
1114          * half-seed boundary we're passing through, together with the
1115          * number of mismatches accumulated so far.  Return false if we
1116          * should stop because a half-and-half constraint is violated.  If
1117          * we're not currently passing a half-seed boundary, just return
1118          * true.
1119          */
1120         bool hhCheck(uint32_t stackDepth, uint32_t depth,
1121                      const std::vector<uint32_t>& mms, bool empty)
1122         {
1123                 ASSERT_ONLY(uint32_t lim = (_3revOff == _2revOff)? 2 : 3);
1124                 if((depth == (_5depth-1)) && !empty) {
1125                         // We're crossing the boundary separating the hi-half
1126                         // from the non-seed portion of the read.
1127                         // We should induce a mismatch if we haven't mismatched
1128                         // yet, so that we don't waste time pursuing a match
1129                         // that was covered by a previous phase
1130                         assert_eq(0, _reportPartials);
1131                         assert_leq(stackDepth, lim-1);
1132                         return stackDepth > 0;
1133                 } else if((depth == (_3depth-1)) && !empty) {
1134                         // We're crossing the boundary separating the lo-half
1135                         // from the non-seed portion of the read
1136                         assert_eq(0, _reportPartials);
1137                         assert_leq(stackDepth, lim);
1138                         assert_gt(stackDepth, 0);
1139                         // Count the mismatches in the lo and hi halves
1140                         uint32_t loHalfMms = 0, hiHalfMms = 0;
1141                         for(size_t i = 0; i < stackDepth; i++) {
1142                                 uint32_t depth = _qlen - mms[i] - 1;
1143                                 if     (depth < _5depth) hiHalfMms++;
1144                                 else if(depth < _3depth) loHalfMms++;
1145                                 else assert(false);
1146                         }
1147                         assert_leq(loHalfMms + hiHalfMms, lim);
1148                         bool invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
1149                         return (stackDepth >= 2 && !invalidHalfAndHalf);
1150                 }
1151                 if(depth < _5depth-1) {
1152                         assert_leq(stackDepth, lim-1);
1153                 }
1154                 else if(depth >= _5depth && depth < _3depth-1) {
1155                         assert_gt(stackDepth, 0);
1156                         assert_leq(stackDepth, lim);
1157                 }
1158                 return true;
1159         }
1160
1161         /**
1162          * Calculate the stratum of the partial (or full) alignment
1163          * currently under consideration.  Stratum is equal to the number
1164          * of mismatches in the seed portion of the alignment.
1165          */
1166         int calcStratum(const std::vector<uint32_t>& mms, uint32_t stackDepth) {
1167                 int stratum = 0;
1168                 for(size_t i = 0; i < stackDepth; i++) {
1169                         if(mms[i] >= (_qlen - _3revOff)) {
1170                                 // This mismatch falls within the seed; count it
1171                                 // toward the stratum to report
1172                                 stratum++;
1173                                 // Don't currently support more than 3
1174                                 // mismatches in the seed
1175                                 assert_leq(stratum, 3);
1176                         }
1177                 }
1178                 return stratum;
1179         }
1180
1181         /**
1182          * Mark character c at depth d as being eliminated with respect to
1183          * future backtracks.
1184          */
1185         void eliminate(uint8_t *elims, uint32_t d, int c) {
1186                 if(c < 4) {
1187                         elims[d] = (1 << c);
1188                         assert_gt(elims[d], 0);
1189                         assert_lt(elims[d], 16);
1190                 } else {
1191                         elims[d] = 0;
1192                 }
1193                 assert_lt(elims[d], 16);
1194         }
1195
1196         /**
1197          * Return true iff the state of the backtracker as encoded by
1198          * stackDepth, d and iham is compatible with the current half-and-
1199          * half alignment mode.  prehits is for sanity checking when
1200          * bailing.
1201          */
1202         bool hhCheckTop(uint32_t stackDepth,
1203                         uint32_t d,
1204                         uint32_t iham,
1205                         const std::vector<uint32_t>& mms,
1206                         uint64_t prehits = 0xffffffffffffffffllu)
1207         {
1208                 assert_eq(0, _reportPartials);
1209                 // Crossing from the hi-half into the lo-half
1210                 if(d == _5depth) {
1211                         if(_3revOff == _2revOff) {
1212                                 // Total of 2 mismatches allowed: 1 hi, 1 lo
1213                                 // The backtracking logic should have prevented us from
1214                                 // backtracking more than once into this region
1215                                 assert_leq(stackDepth, 1);
1216                                 // Reject if we haven't encountered mismatch by this point
1217                                 if(stackDepth == 0) {
1218                                         return false;
1219                                 }
1220                         } else { // if(_3revOff != _2revOff)
1221                                 // Total of 3 mismatches allowed: 1 hi, 1 or 2 lo
1222                                 // The backtracking logic should have prevented us from
1223                                 // backtracking more than twice into this region
1224                                 assert_leq(stackDepth, 2);
1225                                 // Reject if we haven't encountered mismatch by this point
1226                                 if(stackDepth < 1) {
1227                                         return false;
1228                                 }
1229                         }
1230                 } else if(d == _3depth) {
1231                         // Crossing from lo-half to outside of the seed
1232                         if(_3revOff == _2revOff) {
1233                                 // Total of 2 mismatches allowed: 1 hi, 1 lo
1234                                 // The backtracking logic should have prevented us from
1235                                 // backtracking more than twice within this region
1236                                 assert_leq(stackDepth, 2);
1237                                 // Must have encountered two mismatches by this point
1238                                 if(stackDepth < 2) {
1239                                         // We're returning from the bottommost frame
1240                                         // without having found any hits; let's
1241                                         // sanity-check that there really aren't any
1242                                         return false;
1243                                 }
1244                         } else { // if(_3revOff != _2revOff)
1245                                 // Total of 3 mismatches allowed: 1 hi, 1 or 2 lo
1246                                 // Count the mismatches in the lo and hi halves
1247                                 int loHalfMms = 0, hiHalfMms = 0;
1248                                 assert_geq(mms.size(), stackDepth);
1249                                 for(size_t i = 0; i < stackDepth; i++) {
1250                                         uint32_t d = _qlen - mms[i] - 1;
1251                                         if     (d < _5depth) hiHalfMms++;
1252                                         else if(d < _3depth) loHalfMms++;
1253                                         else assert(false);
1254                                 }
1255                                 assert_leq(loHalfMms + hiHalfMms, 3);
1256                                 assert_gt(hiHalfMms, 0);
1257                                 if(loHalfMms == 0) {
1258                                         // We're returning from the bottommost frame
1259                                         // without having found any hits; let's
1260                                         // sanity-check that there really aren't any
1261                                         return false;
1262                                 }
1263                                 assert_geq(stackDepth, 2);
1264                                 // The backtracking logic should have prevented us from
1265                                 // backtracking more than twice within this region
1266                                 assert_leq(stackDepth, 3);
1267                         }
1268                 } else {
1269                         // We didn't just cross a boundary, so do an in-between check
1270                         if(d >= _5depth) {
1271                                 assert_geq(stackDepth, 1);
1272                         } else if(d >= _3depth) {
1273                                 assert_geq(stackDepth, 2);
1274                         }
1275                 }
1276                 return true;
1277         }
1278
1279         /**
1280          * Return the Phred quality value for the most likely base at
1281          * offset 'off' in the read.
1282          */
1283         inline uint8_t qualAt(size_t off) {
1284                 return phredCharToPhredQual((*_qual)[off]);
1285         }
1286
1287         /// Get the top offset for character c at depth d
1288         inline uint32_t pairTop(uint32_t* pairs, size_t d, size_t c) {
1289                 return pairs[d*8 + c + 0];
1290         }
1291
1292         /// Get the bot offset for character c at depth d
1293         inline uint32_t pairBot(uint32_t* pairs, size_t d, size_t c) {
1294                 return pairs[d*8 + c + 4];
1295         }
1296
1297         /// Get the spread between the bot and top offsets for character c
1298         /// at depth d
1299         inline uint32_t pairSpread(uint32_t* pairs, size_t d, size_t c) {
1300                 assert_geq(pairBot(pairs, d, c), pairTop(pairs, d, c));
1301                 return pairBot(pairs, d, c) - pairTop(pairs, d, c);
1302         }
1303
1304         /**
1305          * Tally how many Ns occur in the seed region and in the ftab-
1306          * jumpable region of the read.  Check whether the mismatches
1307          * induced by the Ns already violates the current policy.  Return
1308          * false if the policy is already violated, true otherwise.
1309          */
1310         bool tallyNs(int& nsInSeed, int& nsInFtab) {
1311                 const Ebwt<String<Dna> >& ebwt = *_ebwt;
1312                 int ftabChars = ebwt._eh._ftabChars;
1313                 // Count Ns in the seed region of the read and short-circuit if
1314                 // the configuration of Ns guarantees that there will be no
1315                 // valid alignments given the backtracking constraints.
1316                 for(size_t i = 0; i < _3revOff; i++) {
1317                         if((int)(*_qry)[_qlen-i-1] == 4) {
1318                                 nsInSeed++;
1319                                 if(nsInSeed == 1) {
1320                                         if(i < _unrevOff) {
1321                                                 return false; // Exceeded mm budget on Ns alone
1322                                         }
1323                                 } else if(nsInSeed == 2) {
1324                                         if(i < _1revOff) {
1325                                                 return false; // Exceeded mm budget on Ns alone
1326                                         }
1327                                 } else if(nsInSeed == 3) {
1328                                         if(i < _2revOff) {
1329                                                 return false; // Exceeded mm budget on Ns alone
1330                                         }
1331                                 } else {
1332                                         assert_gt(nsInSeed, 3);
1333                                         return false;     // Exceeded mm budget on Ns alone
1334                                 }
1335                         }
1336                 }
1337                 // Calculate the number of Ns there are in the region that
1338                 // would get jumped over if the ftab were used.
1339                 for(size_t i = 0; i < (size_t)ftabChars && i < _qlen; i++) {
1340                         if((int)(*_qry)[_qlen-i-1] == 4) nsInFtab++;
1341                 }
1342                 return true;
1343         }
1344
1345         /**
1346          * Calculate the offset into the ftab for the rightmost 'ftabChars'
1347          * characters of the current query. Rightmost char gets least
1348          * significant bit-pair.
1349          */
1350         uint32_t calcFtabOff() {
1351                 const Ebwt<String<Dna> >& ebwt = *_ebwt;
1352                 int ftabChars = ebwt._eh._ftabChars;
1353                 uint32_t ftabOff = (*_qry)[_qlen - ftabChars];
1354                 assert_lt(ftabOff, 4);
1355                 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
1356                 for(int i = ftabChars - 1; i > 0; i--) {
1357                         ftabOff <<= 2;
1358                         assert_lt((uint32_t)(*_qry)[_qlen-i], 4);
1359                         ftabOff |= (uint32_t)(*_qry)[_qlen-i];
1360                         assert_lt(ftabOff, ebwt._eh._ftabLen-1);
1361                 }
1362                 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
1363                 return ftabOff;
1364         }
1365
1366         /**
1367          * Mutate the _qry string according to the contents of the _muts
1368          * array, which represents a partial alignment.
1369          */
1370         void applyPartialMutations() {
1371                 if(_muts == NULL) {
1372                         // No mutations to apply
1373                         return;
1374                 }
1375                 for(size_t i = 0; i < length(*_muts); i++) {
1376                         const QueryMutation& m = (*_muts)[i];
1377                         assert_lt(m.pos, _qlen);
1378                         assert_leq(m.oldBase, 4);
1379                         assert_lt(m.newBase, 4);
1380                         assert_neq(m.oldBase, m.newBase);
1381                         assert_eq((uint32_t)((*_qry)[m.pos]), (uint32_t)m.oldBase);
1382                         (*_qry)[m.pos] = (Dna5)(int)m.newBase; // apply it
1383                 }
1384         }
1385
1386         /**
1387          * Take partial-alignment mutations present in the _muts list and
1388          * place them on the _mm list so that they become part of the
1389          * reported alignment.
1390          */
1391         void promotePartialMutations(int stackDepth) {
1392                 if(_muts == NULL) {
1393                         // No mutations to undo
1394                         return;
1395                 }
1396                 size_t numMuts = length(*_muts);
1397                 assert_leq(numMuts, _qlen);
1398                 for(size_t i = 0; i < numMuts; i++) {
1399                         // Entries in _mms[] are in terms of offset into
1400                         // _qry - not in terms of offset from 3' or 5' end
1401                         assert_lt(stackDepth + i, _qlen);
1402                         // All partial-alignment mutations should fall
1403                         // within bounds
1404                         assert_lt((*_muts)[i].pos, _qlen);
1405                         // All partial-alignment mutations should fall
1406                         // within unrevisitable region
1407                         assert_lt(_qlen - (*_muts)[i].pos - 1, _unrevOff);
1408 #ifndef NDEBUG
1409                         // Shouldn't be any overlap between mismatched positions
1410                         // and positions that mismatched in the partial alignment.
1411                         for(size_t j = 0; j < stackDepth + i; j++) {
1412                                 assert_neq(_mms[j], (uint32_t)(*_muts)[i].pos);
1413                         }
1414 #endif
1415                         if(_mms.size() <= stackDepth + i) {
1416                                 assert_eq(_mms.size(), stackDepth + i);
1417                                 _mms.push_back((*_muts)[i].pos);
1418                         } else {
1419                                 _mms[stackDepth + i] = (*_muts)[i].pos;
1420                         }
1421                         if(_refcs.size() <= stackDepth + i) {
1422                                 assert_eq(_refcs.size(), stackDepth + i);
1423                                 _refcs.push_back("ACGT"[(*_muts)[i].newBase]);
1424                         } else {
1425                                 _refcs[stackDepth + i] = "ACGT"[(*_muts)[i].newBase];
1426                         }
1427                 }
1428         }
1429
1430         /**
1431          * Undo mutations to the _qry string, returning it to the original
1432          * read.
1433          */
1434         void undoPartialMutations() {
1435                 if(_muts == NULL) {
1436                         // No mutations to undo
1437                         return;
1438                 }
1439                 for(size_t i = 0; i < length(*_muts); i++) {
1440                         const QueryMutation& m = (*_muts)[i];
1441                         assert_lt(m.pos, _qlen);
1442                         assert_leq(m.oldBase, 4);
1443                         assert_lt(m.newBase, 4);
1444                         assert_neq(m.oldBase, m.newBase);
1445                         assert_eq((uint32_t)((*_qry)[m.pos]), (uint32_t)m.newBase);
1446                         (*_qry)[m.pos] = (Dna5)(int)m.oldBase; // undo it
1447                 }
1448         }
1449
1450         /**
1451          * Report a range of alignments with # mismatches = stackDepth and
1452          * with the mutations (also mismatches) contained in _muts.  The
1453          * range is delimited by top and bot.  Returns true iff one or more
1454          * full alignments were successfully reported and the caller can
1455          * stop searching.
1456          */
1457         bool reportAlignment(uint32_t stackDepth, uint32_t top,
1458                              uint32_t bot, uint16_t cost)
1459         {
1460 #ifndef NDEBUG
1461                 // No two elements of _mms[] should be the same
1462                 assert_geq(_mms.size(), stackDepth);
1463                 for(size_t i = 0; i < stackDepth; i++) {
1464                         for(size_t j = i+1; j < stackDepth; j++) {
1465                                 assert_neq(_mms[j], _mms[i]);
1466                         }
1467                         // All elements of _mms[] should fall within bounds
1468                         assert_lt(_mms[i], _qlen);
1469                 }
1470 #endif
1471                 if(_reportPartials) {
1472                         assert_leq(stackDepth, _reportPartials);
1473                         if(stackDepth > 0) {
1474                                 // Report this partial alignment.  A partial alignment
1475                                 // is defined purely by its mismatches; top and bot are
1476                                 // ignored.
1477                                 reportPartial(stackDepth);
1478                         }
1479                         return false; // keep going - we want to find all partial alignments
1480                 }
1481                 int stratum = 0;
1482                 if(stackDepth > 0) {
1483                         stratum = calcStratum(_mms, stackDepth);
1484                 }
1485                 assert_lt(stratum, 4);
1486                 assert_geq(stratum, 0);
1487                 bool hit;
1488                 // If _muts != NULL then this alignment extends a partial
1489                 // alignment, so we have to account for the differences present
1490                 // in the partial.
1491                 if(_muts != NULL) {
1492                         // Undo partial-alignment mutations to get original _qry
1493                         ASSERT_ONLY(String<Dna5> tmp = (*_qry));
1494                         undoPartialMutations();
1495                         assert_neq(tmp, (*_qry));
1496                         // Add the partial-alignment mutations to the _mms[] array
1497                         promotePartialMutations(stackDepth);
1498                         // All muts are in the seed, so they count toward the stratum
1499                         size_t numMuts = length(*_muts);
1500                         stratum += numMuts;
1501                         cost |= (stratum << 14);
1502                         assert_geq(cost, (uint32_t)(stratum << 14));
1503                         // Report the range of full alignments
1504                         hit = reportFullAlignment(stackDepth + numMuts, top, bot, stratum, cost);
1505                         // Re-apply partial-alignment mutations
1506                         applyPartialMutations();
1507                         assert_eq(tmp, (*_qry));
1508                 } else {
1509                         // Report the range of full alignments
1510                         cost |= (stratum << 14);
1511                         assert_geq(cost, (uint32_t)(stratum << 14));
1512                         hit = reportFullAlignment(stackDepth, top, bot, stratum, cost);
1513                 }
1514                 return hit;
1515         }
1516
1517         /**
1518          * Report a range of full alignments with # mismatches = stackDepth.
1519          * The range is delimited by top and bot.  Returns true if one or
1520          * more alignments were successfully reported.  Returns true iff
1521          * one or more full alignments were successfully reported and the
1522          * caller can stop searching.
1523          */
1524         bool reportFullAlignment(uint32_t stackDepth,
1525                                  uint32_t top,
1526                                  uint32_t bot,
1527                                  int stratum,
1528                                  uint16_t cost)
1529         {
1530                 assert_gt(bot, top);
1531                 if(stackDepth == 0 && !_reportExacts) {
1532                         // We are not reporting exact hits (usually because we've
1533                         // already reported them as part of a previous invocation
1534                         // of the backtracker)
1535                         return false;
1536                 }
1537                 assert(!_reportRanges);
1538                 uint32_t spread = bot - top;
1539                 // Pick a random spot in the range to begin report
1540                 uint32_t r = top + (_rand.nextU32() % spread);
1541                 for(uint32_t i = 0; i < spread; i++) {
1542                         uint32_t ri = r + i;
1543                         if(ri >= bot) ri -= spread;
1544                         // reportChaseOne takes the _mms[] list in terms of
1545                         // their indices into the query string; not in terms
1546                         // of their offset from the 3' or 5' end.
1547                         assert_geq(cost, (uint32_t)(stratum << 14));
1548                         if(_ebwt->reportChaseOne((*_qry), _qual, _name,
1549                                                  _color, colorExEnds, snpPhred, _refs, _mms,
1550                                                  _refcs, stackDepth, ri, top, bot,
1551                                                  _qlen, stratum, cost, _patid, _seed,
1552                                                  _params))
1553                         {
1554                                 // Return value of true means that we can stop
1555                                 return true;
1556                         }
1557                         // Return value of false means that we should continue
1558                         // searching.  This could happen if we the call to
1559                         // reportChaseOne() reported a hit, but the user asked for
1560                         // multiple hits and we haven't reached the ceiling yet.
1561                         // This might also happen if the call to reportChaseOne()
1562                         // didn't report a hit because the alignment was spurious
1563                         // (i.e. overlapped some padding).
1564                 }
1565                 // All range elements were examined and we should keep going
1566                 return false;
1567         }
1568
1569         /**
1570          * Report the partial alignment represented by the current stack
1571          * state (_mms[] and stackDepth).
1572          */
1573         bool reportPartial(uint32_t stackDepth) {
1574                 // Sanity-check stack depth
1575                 if(_3revOff != _2revOff) {
1576                         assert_leq(stackDepth, 3);
1577                 } else if(_2revOff != _1revOff) {
1578                         assert_leq(stackDepth, 2);
1579                 } else {
1580                         assert_leq(stackDepth, 1);
1581                 }
1582
1583                 // Possibly report
1584                 assert_gt(_reportPartials, 0);
1585                 assert(_partials != NULL);
1586                 ASSERT_ONLY(uint32_t qualTot = 0);
1587                 PartialAlignment al;
1588                 al.u64.u64 = 0xffffffffffffffffllu;
1589                 assert_leq(stackDepth, 3);
1590                 assert_gt(stackDepth, 0);
1591
1592                 // First mismatch
1593                 assert_gt(_mms.size(), 0);
1594                 assert_lt(_mms[0], _qlen);
1595                 // First, append the mismatch position in the read
1596                 al.entry.pos0 = (uint16_t)_mms[0]; // pos
1597                 ASSERT_ONLY(uint8_t qual0 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[0]])));
1598                 ASSERT_ONLY(qualTot += qual0);
1599                 uint32_t ci = _qlen - _mms[0] - 1;
1600                 // _chars[] is index in terms of RHS-relative depth
1601                 int c = (int)(Dna5)_chars[ci];
1602                 assert_lt(c, 4);
1603                 assert_neq(c, (int)(*_qry)[_mms[0]]);
1604                 // Second, append the substituted character for the position
1605                 al.entry.char0 = c;
1606
1607                 if(stackDepth > 1) {
1608                         assert_gt(_mms.size(), 1);
1609                         // Second mismatch
1610                         assert_lt(_mms[1], _qlen);
1611                         // First, append the mismatch position in the read
1612                         al.entry.pos1 = (uint16_t)_mms[1]; // pos
1613                         ASSERT_ONLY(uint8_t qual1 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[1]])));
1614                         ASSERT_ONLY(qualTot += qual1);
1615                         ci = _qlen - _mms[1] - 1;
1616                         // _chars[] is index in terms of RHS-relative depth
1617                         c = (int)(Dna5)_chars[ci];
1618                         assert_lt(c, 4);
1619                         assert_neq(c, (int)(*_qry)[_mms[1]]);
1620                         // Second, append the substituted character for the position
1621                         al.entry.char1 = c;
1622                         if(stackDepth > 2) {
1623                                 assert_gt(_mms.size(), 2);
1624                                 // Second mismatch
1625                                 assert_lt(_mms[2], _qlen);
1626                                 // First, append the mismatch position in the read
1627                                 al.entry.pos2 = (uint16_t)_mms[2]; // pos
1628                                 ASSERT_ONLY(uint8_t qual2 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[2]])));
1629                                 ASSERT_ONLY(qualTot += qual2);
1630                                 ci = _qlen - _mms[2] - 1;
1631                                 // _chars[] is index in terms of RHS-relative depth
1632                                 c = (int)(Dna5)_chars[ci];
1633                                 assert_lt(c, 4);
1634                                 assert_neq(c, (int)(*_qry)[_mms[2]]);
1635                                 // Second, append the substituted character for the position
1636                                 al.entry.char2 = c;
1637                         } else {
1638                                 // Signal that the '2' slot is empty
1639                                 al.entry.pos2 = 0xffff;
1640                         }
1641                 } else {
1642                         // Signal that the '1' slot is empty
1643                         al.entry.pos1 = 0xffff;
1644                 }
1645
1646                 assert_leq(qualTot, _qualThresh);
1647                 assert(validPartialAlignment(al));
1648 #ifndef NDEBUG
1649                 assert(al.repOk(_qualThresh, _qlen, (*_qual), _maqPenalty));
1650                 for(size_t i = 0; i < _partialsBuf.size(); i++) {
1651                         assert(validPartialAlignment(_partialsBuf[i]));
1652                         assert(!samePartialAlignment(_partialsBuf[i], al));
1653                 }
1654 #endif
1655                 _partialsBuf.push_back(al);
1656                 return true;
1657         }
1658
1659         /**
1660          * Check that the given eligibility parameters (lowAltQual,
1661          * eligibleSz, eligibleNum) are correct, given the appropriate
1662          * inputs (pairs, elims, depth, d, unrevOff)
1663          */
1664         bool sanityCheckEligibility(uint32_t  depth,
1665                                     uint32_t  d,
1666                                     uint32_t  unrevOff,
1667                                     uint32_t  lowAltQual,
1668                                     uint32_t  eligibleSz,
1669                                     uint32_t  eligibleNum,
1670                                     uint32_t* pairs,
1671                                     uint8_t*  elims)
1672         {
1673                 // Sanity check that the lay of the land is as we
1674                 // expect given eligibleNum and eligibleSz
1675                 size_t i = max(depth, unrevOff), j = 0;
1676                 uint32_t cumSz = 0;
1677                 uint32_t eligiblesVisited = 0;
1678                 for(; i <= d; i++) {
1679                         uint32_t icur = _qlen - i - 1; // current offset into _qry
1680                         uint8_t qi = qualAt(icur);
1681                         assert_lt(elims[i], 16);
1682                         if((qi == lowAltQual || !_considerQuals) && elims[i] != 15) {
1683                                 // This is an eligible position with at least
1684                                 // one remaining backtrack target
1685                                 for(j = 0; j < 4; j++) {
1686                                         if((elims[i] & (1 << j)) == 0) {
1687                                                 // This pair has not been eliminated
1688                                                 assert_gt(pairBot(pairs, i, j), pairTop(pairs, i, j));
1689                                                 cumSz += pairSpread(pairs, i, j);
1690                                                 eligiblesVisited++;
1691                                         }
1692                                 }
1693                         }
1694                 }
1695                 assert_eq(cumSz, eligibleSz);
1696                 assert_eq(eligiblesVisited, eligibleNum);
1697                 return true;
1698         }
1699
1700         const BitPairReference* _refs; // reference sequences (or NULL if not colorspace)
1701         String<Dna5>*       _qry;    // query (read) sequence
1702         size_t              _qlen;   // length of _qry
1703         String<char>*       _qual;   // quality values for _qry
1704         String<char>*       _name;   // name of _qry
1705         bool                _color;  // whether read is colorspace
1706         const Ebwt<String<Dna> >* _ebwt;   // Ebwt to search in
1707         const EbwtSearchParams<String<Dna> >& _params;   // Ebwt to search in
1708         uint32_t            _unrevOff; // unrevisitable chunk
1709         uint32_t            _1revOff;  // 1-revisitable chunk
1710         uint32_t            _2revOff;  // 2-revisitable chunk
1711         uint32_t            _3revOff;  // 3-revisitable chunk
1712         /// Whether to round qualities off Maq-style when calculating penalties
1713         bool                _maqPenalty;
1714         uint32_t            _qualThresh; // only accept hits with weighted
1715                                      // hamming distance <= _qualThresh
1716         uint32_t           *_pairs;  // ranges, leveled in parallel
1717                                      // with decision stack
1718         uint8_t            *_elims;  // which ranges have been
1719                                      // eliminated, leveled in parallel
1720                                      // with decision stack
1721         std::vector<uint32_t> _mms;  // array for holding mismatches
1722         std::vector<uint8_t> _refcs;  // array for holding mismatches
1723         // Entries in _mms[] are in terms of offset into
1724         // _qry - not in terms of offset from 3' or 5' end
1725         char               *_chars;  // characters selected so far
1726         // If > 0, report partial alignments up to this many mismatches
1727         uint32_t            _reportPartials;
1728         /// Do not report alignments with stratum < this limit
1729         bool                _reportExacts;
1730         /// When reporting a full alignment, report top/bot; don't chase
1731         /// any of the results
1732         bool                _reportRanges;
1733         /// Append partial alignments here
1734         PartialAlignmentManager *_partials;
1735         /// Set of mutations that apply for a partial alignment
1736         String<QueryMutation> *_muts;
1737         /// Reference texts (NULL if they are unavailable
1738         vector<String<Dna5> >* _os;
1739         /// Whether to use the _os array together with a naive matching
1740         /// algorithm to double-check reported alignments (or the lack
1741         /// thereof)
1742         bool                _sanity;
1743         /// Whether to consider quality values when deciding where to
1744         /// backtrack
1745         bool                _considerQuals;
1746         bool                _halfAndHalf;
1747         /// Depth of 5'-seed-half border
1748         uint32_t            _5depth;
1749         /// Depth of 3'-seed-half border
1750         uint32_t            _3depth;
1751         /// Default quals
1752         String<char>        _qualDefault;
1753         /// Number of backtracks in last call to backtrack()
1754         uint32_t            _numBts;
1755         /// Number of backtracks since last reset
1756         uint32_t            _totNumBts;
1757         /// Max # of backtracks to allow before giving up
1758         uint32_t            _maxBts;
1759         /// Whether we precalcualted the Ebwt locus information for the
1760         /// next top/bot pair
1761         bool    _precalcedSideLocus;
1762         /// Precalculated top locus
1763         SideLocus           _preLtop;
1764         /// Precalculated bot locus
1765         SideLocus           _preLbot;
1766         /// Flag to record whether a 'false' return from backtracker is due
1767         /// to having exceeded one or more backrtacking limits
1768         bool                _bailedOnBacktracks;
1769         /// Source of pseudo-random numbers
1770         RandomSource        _rand;
1771         /// Be talkative
1772         bool                _verbose;
1773         uint64_t            _ihits;
1774         // Holding area for partial alignments
1775         vector<PartialAlignment> _partialsBuf;
1776         // Current range to expose to consumers
1777         Range               _curRange;
1778         uint32_t            _patid;
1779         uint32_t            _seed;
1780 #ifndef NDEBUG
1781         std::set<int64_t>   allTops_;
1782 #endif
1783 };
1784
1785 /**
1786  * Class that coordinates quality- and quantity-aware backtracking over
1787  * some range of a read sequence.
1788  *
1789  * The creator can configure the BacktrackManager to treat different
1790  * stretches of the read differently.
1791  */
1792 class EbwtRangeSource : public RangeSource {
1793         typedef Ebwt<String<Dna> > TEbwt;
1794         typedef std::pair<int, int> TIntPair;
1795 public:
1796         EbwtRangeSource(
1797                         const TEbwt* ebwt,
1798                         bool         fw,
1799                         uint32_t     qualLim,
1800                         bool         reportExacts,
1801                         bool         verbose,
1802                         bool         quiet,
1803                         int          halfAndHalf,
1804                         bool         partial,
1805                         bool         maqPenalty,
1806                         bool         qualOrder,
1807                         AlignerMetrics *metrics = NULL) :
1808                 RangeSource(),
1809                 qry_(NULL),
1810                 qlen_(0),
1811                 qual_(NULL),
1812                 name_(NULL),
1813                 altQry_(NULL),
1814                 altQual_(NULL),
1815                 alts_(0),
1816                 fuzzy_(false),
1817                 ebwt_(ebwt),
1818                 fw_(fw),
1819                 offRev0_(0),
1820                 offRev1_(0),
1821                 offRev2_(0),
1822                 offRev3_(0),
1823                 maqPenalty_(maqPenalty),
1824                 qualOrder_(qualOrder),
1825                 qualLim_(qualLim),
1826                 reportExacts_(reportExacts),
1827                 halfAndHalf_(halfAndHalf),
1828                 partial_(partial),
1829                 depth5_(0),
1830                 depth3_(0),
1831                 verbose_(verbose),
1832                 quiet_(quiet),
1833                 skippingThisRead_(false),
1834                 metrics_(metrics)
1835         { curEbwt_ = ebwt_; }
1836
1837         /**
1838          * Set a new query read.
1839          */
1840         virtual void setQuery(ReadBuf& r, Range *seedRange) {
1841                 const bool ebwtFw = ebwt_->fw();
1842                 if(ebwtFw) {
1843                         qry_  = fw_ ? &r.patFw : &r.patRc;
1844                         qual_ = fw_ ? &r.qual  : &r.qualRev;
1845                         altQry_  = (String<Dna5>*)(fw_ ? r.altPatFw : r.altPatRc);
1846                         altQual_ = (String<char>*)(fw_ ? r.altQual  : r.altQualRev);
1847                 } else {
1848                         qry_  = fw_ ? &r.patFwRev : &r.patRcRev;
1849                         qual_ = fw_ ? &r.qualRev  : &r.qual;
1850                         altQry_  = (String<Dna5>*)(fw_ ? r.altPatFwRev : r.altPatRcRev);
1851                         altQual_ = (String<char>*)(fw_ ? r.altQualRev  : r.altQual);
1852                 }
1853                 alts_ = r.alts;
1854                 name_ = &r.name;
1855                 fuzzy_ = r.fuzzy;
1856                 if(seedRange != NULL) seedRange_ = *seedRange;
1857                 else                  seedRange_.invalidate();
1858                 qlen_ = length(*qry_);
1859                 skippingThisRead_ = false;
1860                 // Apply edits from the partial alignment to the query pattern
1861                 if(seedRange_.valid()) {
1862                         qryBuf_ = *qry_;
1863                         const size_t srSz = seedRange_.mms.size();
1864                         assert_gt(srSz, 0);
1865                         assert_eq(srSz, seedRange_.refcs.size());
1866                         for(size_t i = 0; i < srSz; i++) {
1867                                 assert_lt(seedRange_.mms[i], qlen_);
1868                                 char rc = (char)seedRange_.refcs[i];
1869                                 assert(rc == 'A' || rc == 'C' || rc == 'G' || rc == 'T');
1870                                 ASSERT_ONLY(char oc = (char)qryBuf_[qlen_ - seedRange_.mms[i] - 1]);
1871                                 assert_neq(rc, oc);
1872                                 qryBuf_[qlen_ - seedRange_.mms[i] - 1] = (Dna5)rc;
1873                                 assert_neq((Dna5)rc, (*qry_)[qlen_ - seedRange_.mms[i] - 1]);
1874                         }
1875                         qry_ = &qryBuf_;
1876                 }
1877                 // Make sure every qual is a valid qual ASCII character (>= 33)
1878                 for(size_t i = 0; i < length(*qual_); i++) {
1879                         assert_geq((*qual_)[i], 33);
1880                         for(int j = 0; j < alts_; j++) {
1881                                 assert_geq(altQual_[j][i], 33);
1882                         }
1883                 }
1884                 assert_geq(length(*qual_), qlen_);
1885                 this->done = false;
1886                 this->foundRange = false;
1887                 color_ = r.color;
1888                 rand_.init(r.seed);
1889         }
1890
1891         /**
1892          * Set backtracking constraints.
1893          */
1894         void setOffs(uint32_t depth5,   // depth of far edge of hi-half
1895                      uint32_t depth3,   // depth of far edge of lo-half
1896                      uint32_t unrevOff, // depth above which we cannot backtrack
1897                      uint32_t revOff1,  // depth above which we may backtrack just once
1898                      uint32_t revOff2,  // depth above which we may backtrack just twice
1899                      uint32_t revOff3)  // depth above which we may backtrack just three times
1900         {
1901                 depth5_   = depth5;
1902                 depth3_   = depth3;
1903                 assert_geq(depth3_, depth5_);
1904                 offRev0_ = unrevOff;
1905                 offRev1_  = revOff1;
1906                 offRev2_  = revOff2;
1907                 offRev3_  = revOff3;
1908         }
1909
1910         /**
1911          * Return true iff this RangeSource is allowed to report exact
1912          * alignments (exact = no edits).
1913          */
1914         bool reportExacts() const {
1915                 return reportExacts_;
1916         }
1917
1918         /// Return the current range
1919         virtual Range& range() {
1920                 return curRange_;
1921         }
1922
1923         /**
1924          * Set qlen_ according to parameter, except don't let it fall below
1925          * the length of the query.
1926          */
1927         void setQlen(uint32_t qlen) {
1928                 assert(qry_ != NULL);
1929                 qlen_ = min<uint32_t>(length(*qry_), qlen);
1930         }
1931
1932         /**
1933          * Initiate continuations so that the next call to advance() begins
1934          * a new search.  Note that contMan is empty upon return if there
1935          * are no valid continuations to begin with.  Also note that
1936          * calling initConts() may result in finding a range (i.e., if we
1937          * immediately jump to a valid range using the ftab).
1938          */
1939         virtual void
1940         initBranch(PathManager& pm) {
1941                 assert(curEbwt_ != NULL);
1942                 assert_gt(length(*qry_), 0);
1943                 assert_leq(qlen_, length(*qry_));
1944                 assert_geq(length(*qual_), length(*qry_));
1945                 const Ebwt<String<Dna> >& ebwt = *ebwt_;
1946                 int ftabChars = ebwt._eh._ftabChars;
1947                 this->foundRange = false;
1948                 int nsInSeed = 0; int nsInFtab = 0;
1949                 ASSERT_ONLY(allTops_.clear());
1950                 if(skippingThisRead_) {
1951                         this->done = true;
1952                         return;
1953                 }
1954                 if(qlen_ < 4) {
1955                         uint32_t maxmms = 0;
1956                         if(offRev0_ != offRev1_) maxmms = 1;
1957                         if(offRev1_ != offRev2_) maxmms = 2;
1958                         if(offRev2_ != offRev3_) maxmms = 3;
1959                         if(qlen_ <= maxmms) {
1960                                 if(!quiet_) {
1961                                         ThreadSafe _ts(&gLock);
1962                                         cerr << "Warning: Read (" << (*name_) << ") is less than " << (maxmms+1) << " characters long; skipping..." << endl;
1963                                 }
1964                                 this->done = true;
1965                                 skippingThisRead_ = true;
1966                                 return;
1967                         }
1968                 }
1969                 if(!tallyNs(nsInSeed, nsInFtab)) {
1970                         // No alignments are possible because of the distribution
1971                         // of Ns in the read in combination with the backtracking
1972                         // constraints.
1973                         return;
1974                 }
1975                 // icost = total cost penalty (major bits = stratum, minor bits =
1976                 // quality penalty) incurred so far by partial alignment
1977                 uint16_t icost = (seedRange_.valid()) ? seedRange_.cost : 0;
1978                 // iham = total quality penalty incurred so far by partial alignment
1979                 uint16_t iham = (seedRange_.valid() && qualOrder_) ? (seedRange_.cost & ~0xc000): 0;
1980                 assert_leq(iham, qualLim_);
1981                 // m = depth beyond which ftab must not extend or else we might
1982                 // miss some legitimate paths
1983                 uint32_t m = min<uint32_t>(offRev0_, qlen_);
1984                 // Let skipInvalidExact = true if using the ftab would be a
1985                 // waste because it would jump directly to an alignment we
1986                 // couldn't use.
1987                 bool ftabSkipsToEnd = (qlen_ == (uint32_t)ftabChars);
1988                 bool skipInvalidExact = (!reportExacts_ && ftabSkipsToEnd);
1989
1990                 // If it's OK to use the ftab...
1991                 if(nsInFtab == 0 && m >= (uint32_t)ftabChars && !skipInvalidExact) {
1992                         // Use the ftab to jump 'ftabChars' chars into the read
1993                         // from the right
1994                         uint32_t ftabOff = calcFtabOff();
1995                         uint32_t top = ebwt.ftabHi(ftabOff);
1996                         uint32_t bot = ebwt.ftabLo(ftabOff+1);
1997                         if(qlen_ == (uint32_t)ftabChars && bot > top) {
1998                                 // We found a range with 0 mismatches immediately.  Set
1999                                 // fields to indicate we found a range.
2000                                 assert(reportExacts_);
2001                                 curRange_.top     = top;
2002                                 curRange_.bot     = bot;
2003                                 curRange_.stratum = (icost >> 14);
2004                                 curRange_.cost    = icost;
2005                                 curRange_.numMms  = 0;
2006                                 curRange_.ebwt    = ebwt_;
2007                                 curRange_.fw      = fw_;
2008                                 curRange_.mms.clear(); // no mismatches
2009                                 curRange_.refcs.clear(); // no mismatches
2010                                 // Lump in the edits from the partial alignment
2011                                 addPartialEdits();
2012                                 assert(curRange_.repOk());
2013                                 // no need to do anything with curRange_.refcs
2014                                 this->foundRange  = true;
2015                                 //this->done = true;
2016                                 return;
2017                         } else if (bot > top) {
2018                                 // We have a range to extend
2019                                 assert_leq(top, ebwt._eh._len);
2020                                 assert_leq(bot, ebwt._eh._len);
2021                                 Branch *b = pm.bpool.alloc();
2022                                 if(b == NULL) {
2023                                         assert(pm.empty());
2024                                         return;
2025                                 }
2026                                 if(!b->init(
2027                                         pm.rpool, pm.epool, pm.bpool.lastId(), qlen_,
2028                                         offRev0_, offRev1_, offRev2_, offRev3_,
2029                                         0, ftabChars, icost, iham, top, bot,
2030                                         ebwt._eh, ebwt._ebwt))
2031                                 {
2032                                         // Negative result from b->init() indicates we ran
2033                                         // out of best-first chunk memory
2034                                         assert(pm.empty());
2035                                         return;
2036                                 }
2037                                 assert(!b->curtailed_);
2038                                 assert(!b->exhausted_);
2039                                 assert_gt(b->depth3_, 0);
2040                                 pm.push(b); // insert into priority queue
2041                                 assert(!pm.empty());
2042                         } else {
2043                                 // The arrows are already closed within the
2044                                 // unrevisitable region; give up
2045                         }
2046                 } else {
2047                         // We can't use the ftab, so we start from the rightmost
2048                         // position and use _fchr
2049                         Branch *b = pm.bpool.alloc();
2050                         if(b == NULL) {
2051                                 assert(pm.empty());
2052                                 return;
2053                         }
2054                         if(!b->init(pm.rpool, pm.epool, pm.bpool.lastId(), qlen_,
2055                                 offRev0_, offRev1_, offRev2_, offRev3_,
2056                                 0, 0, icost, iham, 0, 0, ebwt._eh, ebwt._ebwt))
2057                         {
2058                                 // Negative result from b->init() indicates we ran
2059                                 // out of best-first chunk memory
2060                                 assert(pm.empty());
2061                                 return;
2062                         }
2063                         assert(!b->curtailed_);
2064                         assert(!b->exhausted_);
2065                         assert_gt(b->depth3_, 0);
2066                         pm.push(b); // insert into priority queue
2067                         assert(!pm.empty());
2068                 }
2069                 return;
2070         }
2071
2072         /**
2073          * Advance along the lowest-cost branch managed by the given
2074          * PathManager.  Keep advancing until condition 'until' is
2075          * satisfied.  Typically, the stopping condition 'until' is
2076          * set to stop whenever pm's minCost changes.
2077          */
2078         virtual void
2079         advanceBranch(int until, uint16_t minCost, PathManager& pm) {
2080                 assert(curEbwt_ != NULL);
2081
2082                 // Let this->foundRange = false; we'll set it to true iff this call
2083                 // to advance yielded a new valid-alignment range.
2084                 this->foundRange = false;
2085
2086                 // Can't have already exceeded weighted hamming distance threshold
2087                 assert_gt(length(*qry_), 0);
2088                 assert_leq(qlen_, length(*qry_));
2089                 assert_geq(length(*qual_), length(*qry_));
2090                 assert(!pm.empty());
2091
2092                 do {
2093                         assert(pm.repOk());
2094                         // Get the highest-priority branch according to the priority
2095                         // queue in 'pm'
2096                         Branch* br = pm.front();
2097                         // Shouldn't be curtailed or exhausted
2098                         assert(!br->exhausted_);
2099                         assert(!br->curtailed_);
2100                         assert_gt(br->depth3_, 0);
2101                         assert_leq(br->ham_, qualLim_);
2102                         if(verbose_) {
2103                                 br->print((*qry_), (*qual_), minCost, cout, (halfAndHalf_>0), partial_, fw_, ebwt_->fw());
2104                                 if(!br->edits_.empty()) {
2105                                         cout << "Edit: ";
2106                                         for(size_t i = 0; i < br->edits_.size(); i++) {
2107                                                 Edit e = br->edits_.get(i);
2108                                                 cout << (curEbwt_->fw() ? (qlen_ - e.pos - 1) : e.pos)
2109                                                          << (char)e.chr;
2110                                                 if(i < br->edits_.size()-1) cout << " ";
2111                                         }
2112                                         cout << endl;
2113                                 }
2114
2115                         }
2116                         assert(br->repOk(qlen_));
2117
2118                         ASSERT_ONLY(int stratum = br->cost_ >> 14); // shift the stratum over
2119                         assert_lt(stratum, 4);
2120                         // Not necessarily true with rounding
2121                         uint32_t depth = br->tipDepth();
2122
2123                         const Ebwt<String<Dna> >& ebwt = *ebwt_;
2124
2125                         if(halfAndHalf_ > 0) assert_gt(depth3_, depth5_);
2126
2127                         bool reportedPartial = false;
2128                         bool invalidExact = false;
2129                         bool empty = false;
2130                         bool hit = false;
2131                         uint16_t cost = br->cost_;
2132                         uint32_t cur = 0;
2133                         uint32_t nedits = 0;
2134
2135                         if(halfAndHalf_ && !hhCheckTop(br, depth, 0)) {
2136                                 // Stop extending this branch because it violates a half-
2137                                 // and-half constraint
2138                                 if(metrics_ != NULL) metrics_->curBacktracks_++;
2139                                 pm.curtail(br, qlen_, depth3_, qualOrder_);
2140                                 goto bail;
2141                         }
2142
2143                         cur = qlen_ - depth - 1; // current offset into qry_
2144                         if(depth < qlen_) {
2145                                 // Determine whether ranges at this location are candidates
2146                                 // for backtracking
2147                                 int c = (int)(*qry_)[cur]; // get char at this position
2148                                 int nextc = -1;
2149                                 if(cur < qlen_-1) nextc = (int)(*qry_)[cur+1];
2150                                 assert_leq(c, 4);
2151                                 // If any uncalled base's penalty is still under
2152                                 // the ceiling, then this position is an alternative
2153                                 uint8_t q[4] = {'!', '!', '!', '!'};
2154                                 uint8_t bestq;
2155                                 // get unrounded penalties at this position
2156                                 if(fuzzy_) {
2157                                         bestq = penaltiesAt(cur, q, alts_, *qual_, altQry_, altQual_);
2158                                 } else {
2159                                         bestq = q[0] = q[1] = q[2] = q[3] =
2160                                                 mmPenalty(maqPenalty_, qualAt(cur));
2161                                 }
2162
2163                                 // The current query position is a legit alternative if it a) is
2164                                 // not in the unrevisitable region, and b) its selection would
2165                                 // not necessarily cause the quality ceiling (if one exists) to
2166                                 // be exceeded
2167                                 bool curIsAlternative = (depth >= br->depth0_) &&
2168                                                         (br->ham_ + bestq <= qualLim_);
2169                                 ASSERT_ONLY(uint32_t obot = br->bot_);
2170                                 uint32_t otop = br->top_;
2171
2172                                 // If c is 'N', then it's a mismatch
2173                                 if(c == 4 && depth > 0) {
2174                                         // Force the 'else if(curIsAlternative)' or 'else'
2175                                         // branches below
2176                                         br->top_ = br->bot_ = 1;
2177                                 } else if(c == 4) {
2178                                         // We'll take the 'if(br->top == 0 && br->bot == 0)'
2179                                         // branch below
2180                                         assert_eq(0, br->top_);
2181                                         assert_eq(0, br->bot_);
2182                                 }
2183
2184                                 // Get the range state for the current position
2185                                 RangeState *rs = br->rangeState();
2186                                 assert(rs != NULL);
2187                                 // Calculate the ranges for this position
2188                                 if(br->top_ == 0 && br->bot_ == 0) {
2189                                         // Calculate first quartet of ranges using the _fchr[]
2190                                         // array
2191                                                                   rs->tops[0] = ebwt._fchr[0];
2192                                         rs->bots[0] = rs->tops[1] = ebwt._fchr[1];
2193                                         rs->bots[1] = rs->tops[2] = ebwt._fchr[2];
2194                                         rs->bots[2] = rs->tops[3] = ebwt._fchr[3];
2195                                         rs->bots[3]               = ebwt._fchr[4];
2196                                         ASSERT_ONLY(int r =)
2197                                         br->installRanges(c, nextc, fuzzy_, qualLim_ - br->ham_, q);
2198                                         assert(r < 4 || c == 4);
2199                                         // Update top and bot
2200                                         if(c < 4) {
2201                                                 br->top_ = rs->tops[c];
2202                                                 br->bot_ = rs->bots[c];
2203                                         }
2204                                 } else if(curIsAlternative && (br->bot_ > br->top_ || c == 4)) {
2205                                         // Calculate next quartet of ranges.  We hope that the
2206                                         // appropriate cache lines are prefetched.
2207                                         assert(br->ltop_.valid());
2208                                                                   rs->tops[0] =
2209                                         rs->bots[0] = rs->tops[1] =
2210                                         rs->bots[1] = rs->tops[2] =
2211                                         rs->bots[2] = rs->tops[3] =
2212                                         rs->bots[3]               = 0;
2213                                         if(br->lbot_.valid()) {
2214                                                 if(metrics_ != NULL) metrics_->curBwtOps_++;
2215                                                 ebwt.mapLFEx(br->ltop_, br->lbot_, rs->tops, rs->bots);
2216                                         } else {
2217 #ifndef NDEBUG
2218                                                 uint32_t tmptops[] = {0, 0, 0, 0};
2219                                                 uint32_t tmpbots[] = {0, 0, 0, 0};
2220                                                 SideLocus ltop, lbot;
2221                                                 ltop.initFromRow(otop, ebwt_->_eh, ebwt_->_ebwt);
2222                                                 lbot.initFromRow(obot, ebwt_->_eh, ebwt_->_ebwt);
2223                                                 ebwt.mapLFEx(ltop, lbot, tmptops, tmpbots);
2224 #endif
2225                                                 if(metrics_ != NULL) metrics_->curBwtOps_++;
2226                                                 int cc = ebwt.mapLF1(otop, br->ltop_);
2227                                                 br->top_ = otop;
2228                                                 assert(cc == -1 || (cc >= 0 && cc < 4));
2229                                                 if(cc >= 0) {
2230                                                         assert_lt(cc, 4);
2231                                                         rs->tops[cc] = br->top_;
2232                                                         rs->bots[cc] = (br->top_ + 1);
2233                                                 }
2234 #ifndef NDEBUG
2235                                                 for(int i = 0; i < 4; i++) {
2236                                                         assert_eq(tmpbots[i] - tmptops[i],
2237                                                                   rs->bots[i] - rs->tops[i]);
2238                                                 }
2239 #endif
2240                                         }
2241                                         ASSERT_ONLY(int r =)
2242                                         br->installRanges(c, nextc, fuzzy_, qualLim_ - br->ham_, q);
2243                                         assert(r < 4 || c == 4);
2244                                         // Update top and bot
2245                                         if(c < 4) {
2246                                                 br->top_ = rs->tops[c];
2247                                                 br->bot_ = rs->bots[c];
2248                                         } else {
2249                                                 br->top_ = br->bot_ = 1;
2250                                         }
2251                                 } else if(br->bot_ > br->top_) {
2252                                         // This read position is not a legitimate backtracking
2253                                         // alternative.  No need to do the bookkeeping for the
2254                                         // entire quartet, just do c.  We hope that the
2255                                         // appropriate cache lines are prefetched before now;
2256                                         // otherwise, we're about to take an expensive cache
2257                                         // miss.
2258                                         assert(br->ltop_.valid());
2259                                         rs->eliminated_ = true; // eliminate all alternatives leaving this node
2260                                         assert(br->eliminated(br->len_));
2261                                         if(c < 4) {
2262                                                 if(br->top_ + 1 == br->bot_) {
2263                                                         if(metrics_ != NULL) metrics_->curBwtOps_++;
2264                                                         br->bot_ = br->top_ = ebwt.mapLF1(br->top_, br->ltop_, c);
2265                                                         if(br->bot_ != 0xffffffff) br->bot_++;
2266                                                 } else {
2267                                                         if(metrics_ != NULL) metrics_->curBwtOps_++;
2268                                                         br->top_ = ebwt.mapLF(br->ltop_, c);
2269                                                         assert(br->lbot_.valid());
2270                                                         if(metrics_ != NULL) metrics_->curBwtOps_++;
2271                                                         br->bot_ = ebwt.mapLF(br->lbot_, c);
2272                                                 }
2273                                         }
2274                                 } else {
2275                                         rs->eliminated_ = true;
2276                                 }
2277                                 assert(rs->repOk());
2278                                 // br->top_ and br->bot_ now contain the next top and bot
2279                         } else {
2280                                 // The continuation had already processed the whole read
2281                                 assert_eq(qlen_, depth);
2282                                 cur = 0;
2283                         }
2284                         empty = (br->top_ == br->bot_);
2285                         hit = (cur == 0 && !empty);
2286
2287                         // Check whether we've obtained an exact alignment when
2288                         // we've been instructed not to report exact alignments
2289                         nedits = br->edits_.size();
2290                         invalidExact = (hit && nedits == 0 && !reportExacts_);
2291                         assert_leq(br->ham_, qualLim_);
2292
2293                         // Set this to true if the only way to make legal progress
2294                         // is via one or more additional backtracks.
2295                         if(halfAndHalf_ && !hhCheck(br, depth, empty)) {
2296                                 // This alignment doesn't satisfy the half-and-half
2297                                 // requirements; reject it
2298                                 if(metrics_ != NULL) metrics_->curBacktracks_++;
2299                                 pm.curtail(br, qlen_, depth3_, qualOrder_);
2300                                 goto bail;
2301                         }
2302
2303                         if(hit &&            // there is a range to report
2304                            !invalidExact &&  // not disqualified by no-exact-hits setting
2305                            !reportedPartial) // not an already-reported partial alignment
2306                         {
2307                                 if(verbose_) {
2308                                         if(partial_) {
2309                                                 cout << " Partial alignment:" << endl;
2310                                         } else {
2311                                                 cout << " Final alignment:" << endl;
2312                                         }
2313                                         br->len_++;
2314                                         br->print((*qry_), (*qual_), minCost, cout, halfAndHalf_ > 0, partial_, fw_, ebwt_->fw());
2315                                         br->len_--;
2316                                         cout << endl;
2317                                 }
2318                                 assert_gt(br->bot_, br->top_);
2319                                 assert_leq(br->ham_, qualLim_);
2320                                 assert_leq((uint32_t)(br->cost_ & ~0xc000), qualLim_);
2321                                 if(metrics_ != NULL) metrics_->setReadHasRange();
2322                                 curRange_.top     = br->top_;
2323                                 curRange_.bot     = br->bot_;
2324                                 curRange_.cost    = br->cost_;
2325                                 curRange_.stratum = (br->cost_ >> 14);
2326                                 curRange_.numMms  = nedits;
2327                                 curRange_.fw      = fw_;
2328                                 curRange_.mms.clear();
2329                                 curRange_.refcs.clear();
2330                                 for(size_t i = 0; i < nedits; i++) {
2331                                         curRange_.mms.push_back(qlen_ - br->edits_.get(i).pos - 1);
2332                                         curRange_.refcs.push_back((char)br->edits_.get(i).chr);
2333                                 }
2334                                 addPartialEdits();
2335                                 curRange_.ebwt    = ebwt_;
2336                                 this->foundRange  = true;
2337         #ifndef NDEBUG
2338                                 int64_t top2 = (int64_t)br->top_;
2339                                 top2++; // ensure it's not 0
2340                                 if(ebwt_->fw()) top2 = -top2;
2341                                 assert(allTops_.find(top2) == allTops_.end());
2342                                 allTops_.insert(top2);
2343         #endif
2344                                 assert(curRange_.repOk());
2345                                 // Must curtail because we've consumed the whole pattern
2346                                 if(metrics_ != NULL) metrics_->curBacktracks_++;
2347                                 pm.curtail(br, qlen_, depth3_, qualOrder_);
2348                         } else if(empty || cur == 0) {
2349                                 // The branch couldn't be extended further
2350                                 if(metrics_ != NULL) metrics_->curBacktracks_++;
2351                                 pm.curtail(br, qlen_, depth3_, qualOrder_);
2352                         } else {
2353                                 // Extend the branch by one position; no change to its cost
2354                                 // so there's no need to reconsider where it lies in the
2355                                 // priority queue
2356                                 assert_neq(0, cur);
2357                                 br->extend();
2358                         }
2359                 bail:
2360                         // Make sure the front element of the priority queue is
2361                         // extendable (i.e. not curtailed) and then prep it.
2362                         if(!pm.splitAndPrep(rand_, qlen_, qualLim_, depth3_,
2363                                             qualOrder_, fuzzy_,
2364                                             ebwt_->_eh, ebwt_->_ebwt, ebwt_->_fw))
2365                         {
2366                                 pm.reset(0);
2367                                 assert(pm.empty());
2368                         }
2369                         if(pm.empty()) {
2370                                 // No more branches
2371                                 break;
2372                         }
2373                         assert(!pm.front()->curtailed_);
2374                         assert(!pm.front()->exhausted_);
2375
2376                         if(until == ADV_COST_CHANGES && pm.front()->cost_ != cost) break;
2377                         else if(until == ADV_STEP) break;
2378
2379                 } while(!this->foundRange);
2380                 if(!pm.empty()) {
2381                         assert(!pm.front()->curtailed_);
2382                         assert(!pm.front()->exhausted_);
2383                 }
2384         }
2385
2386         /**
2387          * Return true iff we're enforcing a half-and-half constraint
2388          * (forced edits in both seed halves).
2389          */
2390         int halfAndHalf() const {
2391                 return halfAndHalf_;
2392         }
2393
2394 protected:
2395
2396         /**
2397          * Lump all the seed-alignment edits from the seedRange_ range
2398          * found previously to the curRange_ range just found.
2399          */
2400         void addPartialEdits() {
2401                 // Lump in the edits from the partial alignment
2402                 if(seedRange_.valid()) {
2403                         const size_t srSz = seedRange_.mms.size();
2404                         for(size_t i = 0; i < srSz; i++) {
2405                                 curRange_.mms.push_back(qlen_ - seedRange_.mms[i] - 1);
2406                                 curRange_.refcs.push_back(seedRange_.refcs[i]);
2407                         }
2408                         curRange_.numMms += srSz;
2409                 }
2410         }
2411
2412         /**
2413          * Return true iff we're OK to continue after considering which
2414          * half-seed boundary we're passing through, together with the
2415          * number of mismatches accumulated so far.  Return false if we
2416          * should stop because a half-and-half constraint is violated.  If
2417          * we're not currently passing a half-seed boundary, just return
2418          * true.
2419          */
2420         bool hhCheck(Branch *b, uint32_t depth, bool empty) {
2421                 const uint32_t nedits = b->edits_.size();
2422                 ASSERT_ONLY(uint32_t lim3 = (offRev3_ == offRev2_)? 2 : 3);
2423                 ASSERT_ONLY(uint32_t lim5 = (offRev1_ == offRev0_)? 2 : 1);
2424                 if((depth == (depth5_-1)) && !empty) {
2425                         // We're crossing the boundary separating the hi-half
2426                         // from the non-seed portion of the read.
2427                         // We should induce a mismatch if we haven't mismatched
2428                         // yet, so that we don't waste time pursuing a match
2429                         // that was covered by a previous phase
2430                         assert_leq(nedits, lim5);
2431                         return nedits > 0;
2432                 } else if((depth == (depth3_-1)) && !empty) {
2433                         // We're crossing the boundary separating the lo-half
2434                         // from the non-seed portion of the read
2435                         assert_leq(nedits, lim3);
2436                         assert_gt(nedits, 0);
2437                         // Count the mismatches in the lo and hi halves
2438                         uint32_t loHalfMms = 0, hiHalfMms = 0;
2439                         for(size_t i = 0; i < nedits; i++) {
2440                                 uint32_t depth = b->edits_.get(i).pos;
2441                                 if     (depth < depth5_) hiHalfMms++;
2442                                 else if(depth < depth3_) loHalfMms++;
2443                                 else assert(false);
2444                         }
2445                         assert_leq(loHalfMms + hiHalfMms, lim3);
2446                         bool invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
2447                         return (nedits >= (uint32_t)halfAndHalf_ && !invalidHalfAndHalf);
2448                 }
2449 #ifndef NDEBUG
2450                 if(depth < depth5_-1) {
2451                         assert_leq(nedits, lim5);
2452                 }
2453                 else if(depth >= depth5_ && depth < depth3_-1) {
2454                         assert_gt(nedits, 0);
2455                         assert_leq(nedits, lim3);
2456                 }
2457 #endif
2458                 return true;
2459         }
2460
2461         /**
2462          * Return true iff the state of the backtracker as encoded by
2463          * stackDepth, d and iham is compatible with the current half-and-
2464          * half alignment mode.  prehits is for sanity checking when
2465          * bailing.
2466          */
2467         bool hhCheckTop(Branch* b,
2468                         uint32_t d,
2469                         uint32_t iham,
2470                         uint64_t prehits = 0xffffffffffffffffllu)
2471         {
2472                 // Crossing from the hi-half into the lo-half
2473                 ASSERT_ONLY(uint32_t lim3 = (offRev3_ == offRev2_)? 2 : 3);
2474                 ASSERT_ONLY(uint32_t lim5 = (offRev1_ == offRev0_)? 2 : 1);
2475                 const uint32_t nedits = b->edits_.size();
2476                 if(d == depth5_) {
2477                         assert_leq(nedits, lim5);
2478                         if(nedits == 0) {
2479                                 return false;
2480                         }
2481                 } else if(d == depth3_) {
2482                         assert_leq(nedits, lim3);
2483                         if(nedits < (uint32_t)halfAndHalf_) {
2484                                 return false;
2485                         }
2486                 }
2487 #ifndef NDEBUG
2488                 else {
2489                         // We didn't just cross a boundary, so do an in-between check
2490                         if(d >= depth5_) {
2491                                 assert_geq(nedits, 1);
2492                         } else if(d >= depth3_) {
2493                                 assert_geq(nedits, lim3);
2494                         }
2495                 }
2496 #endif
2497                 return true;
2498         }
2499
2500         /**
2501          * Return the Phred-scale quality value at position 'off'
2502          */
2503         inline uint8_t qualAt(size_t off) {
2504                 return phredCharToPhredQual((*qual_)[off]);
2505         }
2506
2507         /**
2508          * Tally how many Ns occur in the seed region and in the ftab-
2509          * jumpable region of the read.  Check whether the mismatches
2510          * induced by the Ns already violates the current policy.  Return
2511          * false if the policy is already violated, true otherwise.
2512          */
2513         bool tallyNs(int& nsInSeed, int& nsInFtab) {
2514                 const Ebwt<String<Dna> >& ebwt = *ebwt_;
2515                 int ftabChars = ebwt._eh._ftabChars;
2516                 // Count Ns in the seed region of the read and short-circuit if
2517                 // the configuration of Ns guarantees that there will be no
2518                 // valid alignments given the backtracking constraints.
2519                 for(size_t i = 0; i < offRev3_; i++) {
2520                         if((int)(*qry_)[qlen_-i-1] == 4) {
2521                                 nsInSeed++;
2522                                 if(nsInSeed == 1) {
2523                                         if(i < offRev0_) {
2524                                                 return false; // Exceeded mm budget on Ns alone
2525                                         }
2526                                 } else if(nsInSeed == 2) {
2527                                         if(i < offRev1_) {
2528                                                 return false; // Exceeded mm budget on Ns alone
2529                                         }
2530                                 } else if(nsInSeed == 3) {
2531                                         if(i < offRev2_) {
2532                                                 return false; // Exceeded mm budget on Ns alone
2533                                         }
2534                                 } else {
2535                                         assert_gt(nsInSeed, 3);
2536                                         return false;     // Exceeded mm budget on Ns alone
2537                                 }
2538                         }
2539                 }
2540                 // Calculate the number of Ns there are in the region that
2541                 // would get jumped over if the ftab were used.
2542                 for(size_t i = 0; i < (size_t)ftabChars && i < qlen_; i++) {
2543                         if((int)(*qry_)[qlen_-i-1] == 4) nsInFtab++;
2544                 }
2545                 return true;
2546         }
2547
2548         /**
2549          * Calculate the offset into the ftab for the rightmost 'ftabChars'
2550          * characters of the current query. Rightmost char gets least
2551          * significant bit-pair.
2552          */
2553         uint32_t calcFtabOff() {
2554                 const Ebwt<String<Dna> >& ebwt = *ebwt_;
2555                 int ftabChars = ebwt._eh._ftabChars;
2556                 uint32_t ftabOff = (*qry_)[qlen_ - ftabChars];
2557                 assert_lt(ftabOff, 4);
2558                 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
2559                 for(int i = ftabChars - 1; i > 0; i--) {
2560                         ftabOff <<= 2;
2561                         assert_lt((uint32_t)(*qry_)[qlen_-i], 4);
2562                         ftabOff |= (uint32_t)(*qry_)[qlen_-i];
2563                         assert_lt(ftabOff, ebwt._eh._ftabLen-1);
2564                 }
2565                 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
2566                 return ftabOff;
2567         }
2568
2569         String<Dna5>*       qry_;    // query (read) sequence
2570         String<Dna5>        qryBuf_; // for composing modified qry_ strings
2571         size_t              qlen_;   // length of _qry
2572         String<char>*       qual_;   // quality values for _qry
2573         String<char>*       name_;   // name of _qry
2574         bool                color_;  // true -> read is colorspace
2575         String<Dna5>*       altQry_; // alternate basecalls
2576         String<char>*       altQual_; // quality values for alternate basecalls
2577         int                 alts_;   // max # alternatives
2578         bool                fuzzy_;  // alternate scoring scheme?
2579         const Ebwt<String<Dna> >*   ebwt_;   // Ebwt to search in
2580         bool                fw_;
2581         uint32_t            offRev0_; // unrevisitable chunk
2582         uint32_t            offRev1_;  // 1-revisitable chunk
2583         uint32_t            offRev2_;  // 2-revisitable chunk
2584         uint32_t            offRev3_;  // 3-revisitable chunk
2585         /// Whether to round qualities off Maq-style when calculating penalties
2586         bool                maqPenalty_;
2587         /// Whether to order paths on our search in a way that takes
2588         /// qualities into account.  If this is false, the effect is that
2589         /// the first path reported is guaranteed to be in the best
2590         /// stratum, but it's not guaranteed to have the best quals.
2591         bool                qualOrder_;
2592         /// Reject alignments where sum of qualities at mismatched
2593         /// positions is greater than qualLim_
2594         uint32_t            qualLim_;
2595         /// Report exact alignments iff this is true
2596         bool                reportExacts_;
2597         /// Whether to use the _os array together with a naive matching
2598         /// algorithm to double-check reported alignments (or the lack
2599         /// thereof)
2600         int                 halfAndHalf_;
2601         /// Whether we're generating partial alignments for a longer
2602         /// alignment in the opposite index.
2603         bool                partial_;
2604         /// Depth of 5'-seed-half border
2605         uint32_t            depth5_;
2606         /// Depth of 3'-seed-half border
2607         uint32_t            depth3_;
2608         /// Source of pseudo-random numbers
2609         RandomSource        rand_;
2610         /// Be talkative
2611         bool                verbose_;
2612         /// Suppress unnecessary output
2613         bool                quiet_;
2614         // Current range to expose to consumers
2615         Range               curRange_;
2616         // Range for the partial alignment we're extending (NULL if we
2617         // aren't extending a partial)
2618         Range               seedRange_;
2619         // Starts as false; set to true as soon as we know we want to skip
2620         // all further processing of this read
2621         bool                skippingThisRead_;
2622         // Object encapsulating metrics
2623         AlignerMetrics*     metrics_;
2624 #ifndef NDEBUG
2625         std::set<int64_t>   allTops_;
2626 #endif
2627 };
2628
2629 /**
2630  * Concrete factory for EbwtRangeSource objects.
2631  */
2632 class EbwtRangeSourceFactory {
2633         typedef Ebwt<String<Dna> > TEbwt;
2634 public:
2635         EbwtRangeSourceFactory(
2636                         const TEbwt* ebwt,
2637                         bool         fw,
2638                         uint32_t     qualThresh,
2639                         bool         reportExacts,
2640                         bool         verbose,
2641                         bool         quiet,
2642                         bool         halfAndHalf,
2643                         bool         seeded,
2644                         bool         maqPenalty,
2645                         bool         qualOrder,
2646                         AlignerMetrics *metrics = NULL) :
2647                         ebwt_(ebwt),
2648                         fw_(fw),
2649                         qualThresh_(qualThresh),
2650                         reportExacts_(reportExacts),
2651                         verbose_(verbose),
2652                         quiet_(quiet),
2653                         halfAndHalf_(halfAndHalf),
2654                         seeded_(seeded),
2655                         maqPenalty_(maqPenalty),
2656                         qualOrder_(qualOrder),
2657                         metrics_(metrics) { }
2658
2659         /**
2660          * Return new EbwtRangeSource with predefined params.s
2661          */
2662         EbwtRangeSource *create() {
2663                 return new EbwtRangeSource(ebwt_, fw_, qualThresh_,
2664                                            reportExacts_, verbose_, quiet_,
2665                                            halfAndHalf_, seeded_, maqPenalty_,
2666                                            qualOrder_, metrics_);
2667         }
2668
2669 protected:
2670         const TEbwt* ebwt_;
2671         bool         fw_;
2672         uint32_t     qualThresh_;
2673         bool         reportExacts_;
2674         bool         verbose_;
2675         bool         quiet_;
2676         bool         halfAndHalf_;
2677         bool         seeded_;
2678         bool         maqPenalty_;
2679         bool         qualOrder_;
2680         AlignerMetrics *metrics_;
2681 };
2682
2683 /**
2684  * What boundary within the alignment to "pin" a particular
2685  * backtracking constraint to.
2686  */
2687 enum SearchConstraintExtent {
2688         PIN_TO_BEGINNING = 1, // depth 0; i.e., constraint is inactive
2689         PIN_TO_LEN,           // constraint applies to while alignment
2690         PIN_TO_HI_HALF_EDGE,  // constraint applies to hi-half of seed region
2691         PIN_TO_SEED_EDGE      // constraint applies to entire seed region
2692 };
2693
2694 /**
2695  * Concrete RangeSourceDriver that deals properly with
2696  * GreedyDFSRangeSource by calling setOffs() with the appropriate
2697  * parameters when initializing it;
2698  */
2699 class EbwtRangeSourceDriver :
2700         public SingleRangeSourceDriver<EbwtRangeSource>
2701 {
2702 public:
2703         EbwtRangeSourceDriver(
2704                         EbwtSearchParams<String<Dna> >& params,
2705                         EbwtRangeSource* rs,
2706                         bool fw,
2707                         bool seed,
2708                         bool maqPenalty,
2709                         bool qualOrder,
2710                         HitSink& sink,
2711                         HitSinkPerThread* sinkPt,
2712                         uint32_t seedLen,
2713                         bool nudgeLeft,
2714                         SearchConstraintExtent rev0Off,
2715                         SearchConstraintExtent rev1Off,
2716                         SearchConstraintExtent rev2Off,
2717                         SearchConstraintExtent rev3Off,
2718                         vector<String<Dna5> >& os,
2719                         bool verbose,
2720                         bool quiet,
2721                         bool mate1,
2722                         ChunkPool* pool,
2723                         int *btCnt) :
2724                         SingleRangeSourceDriver<EbwtRangeSource>(
2725                                         params, rs, fw, sink, sinkPt, os, verbose,
2726                                         quiet, mate1, 0, pool, btCnt),
2727                         seed_(seed),
2728                         maqPenalty_(maqPenalty),
2729                         qualOrder_(qualOrder),
2730                         rs_(rs), seedLen_(seedLen),
2731                         nudgeLeft_(nudgeLeft),
2732                         rev0Off_(rev0Off), rev1Off_(rev1Off),
2733                         rev2Off_(rev2Off), rev3Off_(rev3Off),
2734                         verbose_(verbose), quiet_(quiet)
2735         {
2736                 if(seed_) assert_gt(seedLen, 0);
2737         }
2738
2739         virtual ~EbwtRangeSourceDriver() { }
2740
2741         bool seed() const { return seed_; }
2742
2743         bool ebwtFw() const { return rs_->curEbwt()->fw(); }
2744
2745         /**
2746          * Called every time setQuery() is called in the parent class,
2747          * after setQuery() has been called on the RangeSource but before
2748          * initConts() has been called.
2749          */
2750         virtual void initRangeSource(const String<char>& qual, bool fuzzy,
2751                                      int alts, const String<char>* altQuals)
2752         {
2753                 // If seedLen_ is huge, then it will always cover the whole
2754                 // alignment
2755                 assert_eq(len_, seqan::length(qual));
2756                 uint32_t s = (seedLen_ > 0 ? min(seedLen_, len_) : len_);
2757                 uint32_t sLeft  = s >> 1;
2758                 uint32_t sRight = s >> 1;
2759                 // If seed has odd length, then nudge appropriate half up by 1
2760                 if((s & 1) != 0) { if(nudgeLeft_) sLeft++; else sRight++; }
2761                 uint32_t rev0Off = cextToDepth(rev0Off_, sRight, s, len_);
2762                 uint32_t rev1Off = cextToDepth(rev1Off_, sRight, s, len_);
2763                 uint32_t rev2Off = cextToDepth(rev2Off_, sRight, s, len_);
2764                 uint32_t rev3Off = cextToDepth(rev3Off_, sRight, s, len_);
2765                 // Truncate the pattern if necessary
2766                 uint32_t qlen = seqan::length(qual);
2767                 if(seed_) {
2768                         if(len_ > s) {
2769                                 rs_->setQlen(s);
2770                                 qlen = s;
2771                         }
2772                         assert(!rs_->reportExacts());
2773                 }
2774                 // If there are any Ns in the unrevisitable region, then this
2775                 // driver is guaranteed to yield no fruit.
2776                 uint16_t minCost = 0;