1 #ifndef EBWT_SEARCH_BACKTRACK_H_
2 #define EBWT_SEARCH_BACKTRACK_H_
5 #include <seqan/sequence.h>
8 #include "ebwt_search_util.h"
10 #include "range_source.h"
11 #include "aligner_metrics.h"
12 #include "search_globals.h"
15 * Class that coordinates quality- and quantity-aware backtracking over
16 * some range of a read sequence.
18 * The creator can configure the BacktrackManager to treat different
19 * stretches of the read differently.
// Greedy depth-first-search range source: coordinates quality- and
// quantity-aware backtracking over BWT ranges for one read.
// NOTE(review): this view of the file is elided; the constructor's
// opening declaration line and several initializers are not visible.
21 class GreedyDFSRangeSource {
23 typedef std::pair<int, int> TIntPair;
24 typedef seqan::String<seqan::Dna> DnaString;
// Constructor parameters (fragment): index/search configuration plus
// knobs controlling the mismatch-quality budget and backtrack limits.
28 const Ebwt<DnaString>* ebwt,
29 const EbwtSearchParams<DnaString>& params,
30 const BitPairReference* refs,
31 uint32_t qualThresh, /// max acceptable q-distance
32 const int maxBts, /// maximum # backtracks allowed
33 uint32_t reportPartials = 0,
34 bool reportExacts = true,
35 bool reportRanges = false,
36 PartialAlignmentManager* partials = NULL,
37 String<QueryMutation>* muts = NULL,
39 vector<String<Dna5> >* os = NULL,
40 bool considerQuals = true, // whether to consider quality values when making backtracking decisions
41 bool halfAndHalf = false, // hacky way of supporting separate revisitable regions
42 bool maqPenalty = true) :
// Member-initializer list (fragment): copies configuration into fields.
54 _maqPenalty(maqPenalty),
55 _qualThresh(qualThresh),
61 _reportPartials(reportPartials),
62 _reportExacts(reportExacts),
63 _reportRanges(reportRanges),
// Only run expensive sanity checks when original reference strings were
// supplied. NOTE(review): reads _os, which is presumably initialized
// from 'os' on an elided earlier initializer — confirm member order in
// the full file (initialization order follows declaration order).
67 _sanity(_os != NULL && _os->size() > 0),
68 _considerQuals(considerQuals),
69 _halfAndHalf(halfAndHalf),
75 _precalcedSideLocus(false),
// Destructor: free the per-read scratch arrays (re)allocated lazily in
// setQuery(). Null checks guard against a read never having been set.
82 ~GreedyDFSRangeSource() {
83 if(_pairs != NULL) delete[] _pairs;
84 if(_elims != NULL) delete[] _elims;
85 if(_chars != NULL) delete[] _chars;
89 * Set a new query read.
91 void setQuery(ReadBuf& r) {
92 const bool fw = _params.fw();
93 const bool ebwtFw = _ebwt->fw();
// Pick the read/quality orientation matching the search direction and
// index orientation. NOTE(review): the branch structure between these
// two assignment pairs is elided; presumably keyed on ebwtFw — confirm.
95 _qry = fw ? &r.patFw : &r.patRc;
96 _qual = fw ? &r.qual : &r.qualRev;
98 _qry = fw ? &r.patFwRev : &r.patRcRev;
99 _qual = fw ? &r.qualRev : &r.qual;
// Grow the scratch arrays only when the new read is longer than any
// previous one; _pairs is qlen*qlen*8 words (8 range entries per
// depth per frame), _elims one byte per depth per frame.
103 if(length(*_qry) > _qlen) {
105 _qlen = length(*_qry);
107 if(_pairs != NULL) { delete[] _pairs; }
108 _pairs = new uint32_t[_qlen*_qlen*8];
110 if(_elims != NULL) { delete[] _elims; }
111 _elims = new uint8_t[_qlen*_qlen];
112 memset(_elims, 0, _qlen*_qlen);
114 if(_chars != NULL) { delete[] _chars; }
115 _chars = new char[_qlen];
116 assert(_pairs != NULL && _elims != NULL && _chars != NULL);
// Allocation failure is reported under a global lock so messages from
// concurrent threads don't interleave.
117 } catch(std::bad_alloc& e) {
118 ThreadSafe _ts(&gLock);
119 cerr << "Unable to allocate memory for depth-first "
120 << "backtracking search; new length = " << length(*_qry)
125 // New length is less than old length, so there's no need
126 // to resize any data structures.
127 assert(_pairs != NULL && _elims != NULL && _chars != NULL);
128 _qlen = length(*_qry);
132 assert_geq(length(*_qual), _qlen);
134 cout << "setQuery(_qry=" << (*_qry) << ", _qual=" << (*_qual) << ")" << endl;
136 // Initialize the random source using new read as part of the
145 * Apply a batch of mutations to this read, possibly displacing a
146 * previous batch of mutations.
148 void setMuts(String<QueryMutation>* muts) {
// If a previous batch is installed, roll it back before applying the
// new one (the assignment of _muts = muts happens on an elided line).
150 // Undo previous mutations
151 assert_gt(length(*_muts), 0);
152 undoPartialMutations();
156 assert_gt(length(*_muts), 0);
157 applyPartialMutations();
162 * Set backtracking constraints.
// Configures the half-and-half boundaries (depth5/depth3) and the four
// nested revisitability offsets used by the recursive backtracker.
164 void setOffs(uint32_t depth5, // depth of far edge of hi-half
165 uint32_t depth3, // depth of far edge of lo-half
166 uint32_t unrevOff, // depth above which we cannot backtrack
167 uint32_t revOff1, // depth above which we may backtrack just once
168 uint32_t revOff2, // depth above which we may backtrack just twice
169 uint32_t revOff3) // depth above which we may backtrack just three times
// Lo-half edge must not precede hi-half edge.
173 assert_geq(depth3, depth5);
174 _unrevOff = unrevOff;
181 * Reset number of backtracks to 0.
183 void resetNumBacktracks() {
188 * Return number of backtracks since the last time the count was
// reset (see resetNumBacktracks()).
191 uint32_t numBacktracks() {
196 * Set whether to report exact hits.
// NOTE(review): parameter is an int "stratum" stored directly into
// _reportExacts (declared as a bool in the constructor's parameter
// list) — any nonzero stratum enables exact reporting; confirm the
// intended semantics in the full file.
198 void setReportExacts(int stratum) {
199 _reportExacts = stratum;
203 * Set the Bowtie index to search against.
205 void setEbwt(const Ebwt<String<Dna> >* ebwt) {
210 * Return the current range
217 * Set _qlen. Don't let it exceed length of query.
219 void setQlen(uint32_t qlen) {
220 assert(_qry != NULL);
// Clamp to the current query's length so downstream indexing is safe.
221 _qlen = min<uint32_t>(length(*_qry), qlen);
224 /// Return the maximum number of allowed backtracks in a given call
// to backtrack() (0 presumably means unlimited — confirm in full file).
226 uint32_t maxBacktracks() {
231 * Initiate the recursive backtracking routine starting at the
232 * extreme right-hand side of the pattern. Use the ftab to match
233 * the first several characters in one chomp, as long as doing so
234 * does not "jump over" any legal backtracking targets.
236 * Return true iff the HitSink has indicated that we're done with
// this read. 'ham' is the initial weighted hamming distance.
239 bool backtrack(uint32_t ham = 0) {
240 assert_gt(length(*_qry), 0);
241 assert_leq(_qlen, length(*_qry));
242 assert_geq(length(*_qual), length(*_qry));
243 const Ebwt<String<Dna> >& ebwt = *_ebwt;
244 int ftabChars = ebwt._eh._ftabChars;
// Count Ns; an unfavorable N distribution can rule out all alignments
// up front, and Ns inside the ftab window disable the ftab shortcut.
245 int nsInSeed = 0; int nsInFtab = 0;
246 if(!tallyNs(nsInSeed, nsInFtab)) {
247 // No alignments are possible because of the distribution
248 // of Ns in the read in combination with the backtracking
253 // m = depth beyond which ftab must not extend or else we might
254 // miss some legitimate paths
255 uint32_t m = min<uint32_t>(_unrevOff, _qlen);
// Fast path: the ftab covers only unrevisitable depths, so one lookup
// matches the first ftabChars characters in a single step.
256 if(nsInFtab == 0 && m >= (uint32_t)ftabChars) {
257 uint32_t ftabOff = calcFtabOff();
258 uint32_t top = ebwt.ftabHi(ftabOff);
259 uint32_t bot = ebwt.ftabLo(ftabOff+1);
260 if(_qlen == (uint32_t)ftabChars && bot > top) {
262 if(_reportPartials > 0) {
263 // Oops - we're trying to find seedlings, so we've
264 // gone too far; start again
265 ret = backtrack(0, // depth
272 ret = reportAlignment(0, top, bot, ham);
274 } else if (bot > top) {
275 // We have an arrow pair from which we can backtrack
276 ret = backtrack(ftabChars, // depth
282 // The arrows are already closed; give up
286 // The ftab *does* extend past the unrevisitable portion;
287 // we can't use it in this case, because we might jump past
288 // a legitimate mismatch
289 ret = backtrack(0, // depth
293 // disable ftab jumping if there is more
// Flush any buffered partial alignments before returning.
297 if(finalize()) ret = true;
302 * If there are any buffered results that have yet to be committed,
303 * commit them. This happens when looking for partial alignments.
// NOTE(review): the function signature line for finalize() is elided
// from this view; the body below commits _partialsBuf into _partials.
307 if(_reportPartials > 0) {
308 // We're in partial alignment mode; take elements of the
309 // _partialBuf and install them in the _partials database
310 assert(_partials != NULL);
311 if(_partialsBuf.size() > 0) {
// Sanity-check each buffered partial against the quality budget.
313 for(size_t i = 0; i < _partialsBuf.size(); i++) {
314 assert(_partialsBuf[i].repOk(_qualThresh, _qlen, (*_qual), _maqPenalty));
317 _partials->addPartials(_params.patId(), _partialsBuf);
318 _partialsBuf.clear();
324 assert_eq(0, _partialsBuf.size());
329 * Starting at the given "depth" relative to the 5' end, and the
330 * given top and bot indexes (where top=0 and bot=0 means it's up
331 * to us to calculate the initial range), and initial weighted
332 * hamming distance iham, find a hit using randomized, quality-
333 * aware backtracking.
// Thin wrapper: seeds the recursive overload with stackDepth 0 and the
// configured revisitability offsets, then accounts for backtrack totals.
335 bool backtrack(uint32_t depth,
339 bool disableFtab = false)
341 HitSinkPerThread& sink = _params.sink();
// Remember how many retained hits existed before this search started.
342 _ihits = sink.retainedHits().size();
344 // Initiate the recursive, randomized quality-aware backtracker
345 // with a stack depth of 0 (no backtracks so far)
346 _bailedOnBacktracks = false;
347 bool done = backtrack(0, depth, _unrevOff, _1revOff, _2revOff, _3revOff,
348 top, bot, iham, iham, _pairs, _elims, disableFtab);
350 _totNumBts += _numBts;
// Clear one-shot state so the next call starts fresh.
352 _precalcedSideLocus = false;
353 _bailedOnBacktracks = false;
358 * Recursive routine for progressing to the next backtracking
359 * decision given some initial conditions. If a hit is found, it
360 * is recorded and true is returned. Otherwise, if there are more
361 * backtracking opportunities, the function will call itself
362 * recursively and return the result. As soon as there is a
363 * mismatch and no backtracking opportunities, false is returned.
// NOTE(review): this is the core of the aligner and this view of it is
// elided (many original lines missing); comments below annotate only
// what the visible lines establish.
365 bool backtrack(uint32_t stackDepth, // depth of the recursion stack; = # mismatches so far
366 uint32_t depth, // next depth where a post-pair needs to be calculated
367 uint32_t unrevOff, // depths < unrevOff are unrevisitable
368 uint32_t oneRevOff,// depths < oneRevOff are 1-revisitable
369 uint32_t twoRevOff,// depths < twoRevOff are 2-revisitable
370 uint32_t threeRevOff,// depths < threeRevOff are 3-revisitable
371 uint32_t top, // top arrow in pair prior to 'depth'
372 uint32_t bot, // bottom arrow in pair prior to 'depth'
373 uint32_t ham, // weighted hamming distance so far
374 uint32_t iham, // initial weighted hamming distance
375 uint32_t* pairs, // portion of pairs array to be used for this backtrack frame
376 uint8_t* elims, // portion of elims array to be used for this backtrack frame
377 bool disableFtab = false)
379 // Can't have already exceeded weighted hamming distance threshold
380 assert_leq(stackDepth, depth);
381 assert_gt(length(*_qry), 0);
382 assert_leq(_qlen, length(*_qry));
383 assert_geq(length(*_qual), length(*_qry));
384 assert(_qry != NULL);
385 assert(_qual != NULL);
386 assert(_name != NULL);
388 assert_leq(ham, _qualThresh);
389 assert_lt(depth, _qlen); // can't have run off the end of qry
390 assert_geq(bot, top); // could be that both are 0
391 assert(pairs != NULL);
392 assert(elims != NULL);
393 assert_leq(stackDepth, _qlen);
394 const Ebwt<String<Dna> >& ebwt = *_ebwt;
395 HitSinkPerThread& sink = _params.sink();
// Snapshot of valid-hit count; used later to sanity-check that a
// "done" return really corresponds to new hits.
396 uint64_t prehits = sink.numValidHits();
397 // Half-and-half mode and partial-alignment mode are mutually exclusive.
398 assert_eq(0, _reportPartials);
399 assert_gt(_3depth, _5depth);
401 if(_reportPartials) {
402 assert(!_halfAndHalf);
405 cout << " backtrack(stackDepth=" << stackDepth << ", "
406 << "depth=" << depth << ", "
407 << "top=" << top << ", "
408 << "bot=" << bot << ", "
409 << "ham=" << ham << ", "
410 << "iham=" << iham << ", "
411 << "pairs=" << pairs << ", "
412 << "elims=" << (void*)elims << "): \"";
413 for(int i = (int)depth - 1; i >= 0; i--) {
416 cout << "\"" << endl;
418 // Do this early on so that we can clear _precalcedSideLocus
419 // before we have too many opportunities to bail and leave it
421 SideLocus ltop, lbot;
// Reuse side loci precomputed by the caller (one-shot flag), otherwise
// derive them from the incoming top/bot range.
422 if(_precalcedSideLocus) {
425 _precalcedSideLocus = false;
426 } else if(top != 0 || bot != 0) {
427 SideLocus::initFromTopBot(top, bot, ebwt._eh, ebwt._ebwt, ltop, lbot);
429 // Check whether we've exceeded any backtracking limit
// Bail out (and record that we bailed) once the backtrack budget is spent.
431 if(_maxBts > 0 && _numBts == _maxBts) {
432 _bailedOnBacktracks = true;
437 // # positions with at least one legal outgoing path
439 // # positions tied for "best" outgoing qual
440 uint32_t eligibleNum = 0;
441 // total range-size for all eligibles
442 uint32_t eligibleSz = 0;
443 // If there is just one eligible slot at the moment (a common
444 // case), these are its parameters
446 bool elignore = true; // ignore the el values because they didn't come from a recent override
449 uint32_t elham = ham;
452 // The lowest quality value associated with any alternative
453 // ranges; all alternative ranges with this quality are
455 uint8_t lowAltQual = 0xff;
457 uint32_t cur = _qlen - d - 1; // current offset into _qry
459 // Try to advance further given that
461 cout << " cur=" << cur << " \"";
462 for(int i = (int)d - 1; i >= 0; i--) {
468 // If we're searching for a half-and-half solution, then
469 // enforce the boundary-crossing constraints here.
470 if(_halfAndHalf && !hhCheckTop(stackDepth, d, iham, _mms, prehits)) {
474 bool curIsEligible = false;
475 // Reset eligibleNum and eligibleSz if there are any
476 // eligible pairs discovered at this spot
477 bool curOverridesEligible = false;
478 // Determine whether ranges at this location are
479 // candidates for backtracking
480 int c = (int)(*_qry)[cur];
482 uint8_t q = qualAt(cur);
483 // The current query position is a legit alternative if it a) is
484 // not in the unrevisitable region, and b) the quality ceiling (if
485 // one exists) is not exceeded
486 bool curIsAlternative =
489 (ham + mmPenalty(_maqPenalty, q) <= _qualThresh));
490 if(curIsAlternative) {
492 // Is it the best alternative?
494 // Ranges at this depth in this backtracking frame are
495 // eligible, unless we learn otherwise. Ranges previously
496 // thought to be eligible are not any longer.
497 curIsEligible = true;
498 curOverridesEligible = true;
499 } else if(q == lowAltQual) {
500 // Ranges at this depth in this backtracking frame
501 // are eligible, unless we learn otherwise
502 curIsEligible = true;
505 // When quality values are not considered, all positions
507 curIsEligible = true;
510 if(curIsEligible) assert(curIsAlternative);
511 if(curOverridesEligible) assert(curIsEligible);
512 if(curIsAlternative && !curIsEligible) {
513 assert_gt(eligibleSz, 0);
514 assert_gt(eligibleNum, 0);
517 cout << " alternative: " << curIsAlternative;
518 cout << ", eligible: " << curIsEligible;
519 if(curOverridesEligible) cout << "(overrides)";
523 // If c is 'N', then it's guaranteed to be a mismatch
524 if(c == 4 && d > 0) {
525 // Force the 'else if(curIsAlternative)' branch below
528 // We'll take the 'if(top == 0 && bot == 0)' branch below
// pairs layout: 8 entries per depth — 4 tops at pairs[d*8+0..3]
// followed by 4 bots at pairs[d*8+4..7], one per nucleotide.
532 // Calculate the ranges for this position
533 if(top == 0 && bot == 0) {
534 // Calculate first quartet of ranges using the _fchr[]
536 pairs[0 + 0] = ebwt._fchr[0];
537 pairs[0 + 4] = pairs[1 + 0] = ebwt._fchr[1];
538 pairs[1 + 4] = pairs[2 + 0] = ebwt._fchr[2];
539 pairs[2 + 4] = pairs[3 + 0] = ebwt._fchr[3];
540 pairs[3 + 4] = ebwt._fchr[4];
541 // Update top and bot
543 top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c);
544 assert_geq(bot, top);
546 } else if(curIsAlternative) {
548 memset(&pairs[d*8], 0, 8 * 4);
549 // Calculate next quartet of ranges
550 ebwt.mapLFEx(ltop, lbot, &pairs[d*8], &pairs[(d*8)+4]);
551 // Update top and bot
553 top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c);
554 assert_geq(bot, top);
557 // This query character is not even a legitimate
558 // alternative (because backtracking here would blow
559 // our mismatch quality budget), so no need to do the
560 // bookkeeping for the entire quartet, just do c
563 bot = top = ebwt.mapLF1(top, ltop, c);
564 if(bot != 0xffffffff) bot++;
566 top = ebwt.mapLF(ltop, c); bot = ebwt.mapLF(lbot, c);
567 assert_geq(bot, top);
572 // Calculate loci from row indices; do it now so that
573 // those prefetches are fired off as soon as possible.
574 // This eventually calls SideLocus.initfromRow().
575 SideLocus::initFromTopBot(top, bot, ebwt._eh, ebwt._ebwt, ltop, lbot);
577 // Update the elim array
578 eliminate(elims, d, c);
580 if(curIsAlternative) {
581 // Given the just-calculated range quartet, update
582 // elims, altNum, eligibleNum, eligibleSz
583 for(int i = 0; i < 4; i++) {
585 assert_leq(pairTop(pairs, d, i), pairBot(pairs, d, i));
586 uint32_t spread = pairSpread(pairs, d, i);
588 // Indicate this char at this position is
589 // eliminated as far as this backtracking frame is
590 // concerned, since its range is empty
591 elims[d] |= (1 << i);
592 assert_lt(elims[d], 16);
594 if(spread > 0 && ((elims[d] & (1 << i)) == 0)) {
595 // This char at this position is an alternative
597 if(curOverridesEligible) {
598 // Only now that we know there is at least
599 // one potential backtrack target at this
600 // most-eligible position should we reset
601 // these eligibility parameters
605 curOverridesEligible = false;
606 // Remember these parameters in case
607 // this turns out to be the only
610 eltop = pairTop(pairs, d, i);
611 elbot = pairBot(pairs, d, i);
612 assert_eq(elbot-eltop, spread);
613 elham = mmPenalty(_maqPenalty, q);
618 eligibleSz += spread;
621 assert_gt(eligibleSz, 0);
622 assert_gt(eligibleNum, 0);
628 assert_gt(eligibleSz, 0);
629 assert_gt(eligibleNum, 0);
631 assert_leq(eligibleNum, eligibleSz);
632 assert_leq(eligibleNum, altNum);
633 assert_lt(elims[d], 16);
634 assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
636 // Achieved a match, but need to keep going
637 bool backtrackDespiteMatch = false;
638 bool reportedPartial = false;
639 if(cur == 0 && // we've consumed the entire pattern
640 top < bot && // there's a hit to report
641 stackDepth < _reportPartials && // not yet used up our mismatches
642 _reportPartials > 0) // there are still legal backtracking targets
644 assert(!_halfAndHalf);
645 if(altNum > 0) backtrackDespiteMatch = true;
647 // This is a legit seedling; report it
648 reportPartial(stackDepth);
649 reportedPartial = true;
651 // Now continue on to find legitimate seedlings with
652 // more mismatches than this one
654 // Check whether we've obtained an exact alignment when
655 // we've been instructed not to report exact alignments
656 bool invalidExact = false;
657 if(cur == 0 && stackDepth == 0 && bot > top && !_reportExacts) {
659 backtrackDespiteMatch = true;
661 // Set this to true if the only way to make legal progress
662 // is via one or more additional backtracks. This is
663 // helpful in half-and-half mode.
664 bool mustBacktrack = false;
665 bool invalidHalfAndHalf = false;
667 ASSERT_ONLY(uint32_t lim = (_3revOff == _2revOff)? 2 : 3);
668 if((d == (_5depth-1)) && top < bot) {
669 // We're crossing the boundary separating the hi-half
670 // from the non-seed portion of the read.
671 // We should induce a mismatch if we haven't mismatched
672 // yet, so that we don't waste time pursuing a match
673 // that was covered by a previous phase
674 assert_eq(0, _reportPartials);
675 assert_leq(stackDepth, lim-1);
676 invalidHalfAndHalf = (stackDepth == 0);
677 if(stackDepth == 0 && altNum > 0) {
678 backtrackDespiteMatch = true;
679 mustBacktrack = true;
680 } else if(stackDepth == 0) {
681 // We're returning from the bottommost frame
682 // without having found any hits; let's
683 // sanity-check that there really aren't any
687 else if((d == (_3depth-1)) && top < bot) {
688 // We're crossing the boundary separating the lo-half
689 // from the non-seed portion of the read
690 assert_eq(0, _reportPartials);
691 assert_leq(stackDepth, lim);
692 assert_gt(stackDepth, 0);
693 // Count the mismatches in the lo and hi halves
694 uint32_t loHalfMms = 0, hiHalfMms = 0;
695 assert_geq(_mms.size(), stackDepth);
696 for(size_t i = 0; i < stackDepth; i++) {
// _mms stores offsets into the read; convert to depth-from-5'-end.
697 uint32_t d = _qlen - _mms[i] - 1;
698 if (d < _5depth) hiHalfMms++;
699 else if(d < _3depth) loHalfMms++;
702 assert_leq(loHalfMms + hiHalfMms, lim);
703 invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
704 if((stackDepth < 2 || invalidHalfAndHalf) && altNum > 0) {
705 // We backtracked fewer times than necessary;
707 mustBacktrack = true;
708 backtrackDespiteMatch = true;
709 } else if(stackDepth < 2) {
714 assert_leq(stackDepth, lim-1);
716 else if(d >= _5depth && d < _3depth-1) {
717 assert_gt(stackDepth, 0);
718 assert_leq(stackDepth, lim);
721 // This is necessary for the rare case where we're about
722 // to declare success because bot > top and we've consumed
723 // the final character, but all hits between top and bot
724 // are spurious. This check ensures that we keep looking
725 // for non-spurious hits in that case.
726 if(cur == 0 && // we made it to the left-hand-side of the read
727 bot > top && // there are alignments to report
728 !invalidHalfAndHalf && // alignment isn't disqualified by half-and-half requirement
729 !invalidExact && // alignment isn't disqualified by no-exact-hits setting
730 !reportedPartial) // for when it's a partial alignment we've already reported
732 bool ret = reportAlignment(stackDepth, top, bot, ham);
734 // reportAlignment returned false, so enter the
735 // backtrack loop and keep going
738 // reportAlignment returned true, so stop
743 // Mismatch with alternatives
// Backtracking loop: pick a random eligible (position, char) target,
// recurse from there, and on failure eliminate it and try the next.
745 while((top == bot || backtrackDespiteMatch) && altNum > 0) {
746 if(_verbose) cout << " top (" << top << "), bot ("
747 << bot << ") with " << altNum
748 << " alternatives, eligible: "
749 << eligibleNum << ", " << eligibleSz
751 assert_gt(eligibleSz, 0);
752 assert_gt(eligibleNum, 0);
753 // Mismatch! Must now choose where we are going to
754 // take our quality penalty. We can only look as far
755 // back as our last decision point.
756 assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
757 // Pick out the arrow pair we selected and target it
759 ASSERT_ONLY(uint32_t eligiblesVisited = 0);
761 assert_geq(i, depth);
764 uint32_t btham = ham;
768 // The common case is that eligibleSz == 1
769 if(eligibleNum > 1 || elignore) {
770 bool foundTarget = false;
771 // Walk from left to right
772 for(; i >= depth; i--) {
773 assert_geq(i, unrevOff);
774 icur = _qlen - i - 1; // current offset into _qry
775 uint8_t qi = qualAt(icur);
776 assert_lt(elims[i], 16);
// elims[i] == 15 means all four nucleotides eliminated at depth i.
777 if((qi == lowAltQual || !_considerQuals) && elims[i] != 15) {
778 // This is the leftmost eligible position with at
779 // least one remaining backtrack target
781 // Add up the spreads for A, C, G, T
782 for(j = 0; j < 4; j++) {
783 if((elims[i] & (1 << j)) == 0) {
784 assert_gt(pairSpread(pairs, i, j), 0);
785 posSz += pairSpread(pairs, i, j);
788 // Generate a random number
// Weighted random choice: ranges with larger spread are more likely.
790 uint32_t r = _rand.nextU32() % posSz;
791 for(j = 0; j < 4; j++) {
792 if((elims[i] & (1 << j)) == 0) {
793 // This range has not been eliminated
794 ASSERT_ONLY(eligiblesVisited++);
795 uint32_t spread = pairSpread(pairs, i, j);
797 // This is our randomly-selected
800 bttop = pairTop(pairs, i, j);
801 btbot = pairBot(pairs, i, j);
802 btham += mmPenalty(_maqPenalty, qi);
805 assert_leq(btham, _qualThresh);
806 break; // found our target; we can stop
812 break; // escape left-to-right walk
817 assert_leq(eligiblesVisited, eligibleNum);
819 assert_neq(0, btchar);
820 assert_gt(btbot, bttop);
821 assert_leq(btbot-bttop, eligibleSz);
823 // There was only one eligible target; we can just
824 // copy its parameters
825 assert_eq(1, eligibleNum);
833 assert_neq(0, btchar);
834 assert_gt(btbot, bttop);
835 assert_leq(btbot-bttop, eligibleSz);
837 // This is the earliest that we know what the next top/
838 // bot combo is going to be
839 SideLocus::initFromTopBot(bttop, btbot,
840 ebwt._eh, ebwt._ebwt,
842 icur = _qlen - i - 1; // current offset into _qry
843 // Slide over to the next backtracking frame within
844 // pairs and elims; won't interfere with our frame or
845 // any of our parents' frames
// Frame stride: each recursion level owns _qlen*8 pairs entries and
// _qlen elims bytes (matches the _qlen*_qlen*8 / _qlen*_qlen
// allocations in setQuery()).
846 uint32_t *newPairs = pairs + (_qlen*8);
847 uint8_t *newElims = elims + (_qlen);
848 // If we've selected a backtracking target that's in
849 // the 1-revisitable region, then we ask the recursive
850 // callee to consider the 1-revisitable region as also
851 // being unrevisitable (since we just "used up" all of
853 uint32_t btUnrevOff = unrevOff;
854 uint32_t btOneRevOff = oneRevOff;
855 uint32_t btTwoRevOff = twoRevOff;
856 uint32_t btThreeRevOff = threeRevOff;
857 assert_geq(i, unrevOff);
858 assert_geq(oneRevOff, unrevOff);
859 assert_geq(twoRevOff, oneRevOff);
860 assert_geq(threeRevOff, twoRevOff);
862 // Extend unrevisitable region to include former 1-
863 // revisitable region
864 btUnrevOff = oneRevOff;
865 // Extend 1-revisitable region to include former 2-
866 // revisitable region
867 btOneRevOff = twoRevOff;
868 // Extend 2-revisitable region to include former 3-
869 // revisitable region
870 btTwoRevOff = threeRevOff;
872 else if(i < twoRevOff) {
873 // Extend 1-revisitable region to include former 2-
874 // revisitable region
875 btOneRevOff = twoRevOff;
876 // Extend 2-revisitable region to include former 3-
877 // revisitable region
878 btTwoRevOff = threeRevOff;
880 else if(i < threeRevOff) {
881 // Extend 2-revisitable region to include former 3-
882 // revisitable region
883 btTwoRevOff = threeRevOff;
885 // Note the character that we're backtracking on in the
// Record the mismatch position (and chosen reference char) for this
// stack depth, growing the vectors only on first use.
887 if(_mms.size() <= stackDepth) {
888 assert_eq(_mms.size(), stackDepth);
889 _mms.push_back(icur);
891 _mms[stackDepth] = icur;
893 assert_eq(1, dna4Cat[(int)btchar]);
894 if(_refcs.size() <= stackDepth) {
895 assert_eq(_refcs.size(), stackDepth);
896 _refcs.push_back(btchar);
898 _refcs[stackDepth] = btchar;
901 for(uint32_t j = 0; j < stackDepth; j++) {
902 assert_neq(_mms[j], icur);
906 assert_leq(i+1, _qlen);
909 ret = reportAlignment(stackDepth+1, bttop, btbot, btham);
910 } else if(_halfAndHalf &&
912 _2revOff == _3revOff &&
913 i+1 < (uint32_t)ebwt._eh._ftabChars &&
914 (uint32_t)ebwt._eh._ftabChars <= _5depth)
916 // The ftab doesn't extend past the unrevisitable portion,
917 // so we can go ahead and use it
918 // Rightmost char gets least significant bit-pairs
919 int ftabChars = ebwt._eh._ftabChars;
920 uint32_t ftabOff = (*_qry)[_qlen - ftabChars];
921 assert_lt(ftabOff, 4);
922 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
923 for(int j = ftabChars - 1; j > 0; j--) {
// Substitute the backtrack character at the mismatch position when
// building the ftab offset (handled on an elided line).
925 if(_qlen-j == icur) {
928 assert_lt((uint32_t)(*_qry)[_qlen-j], 4);
929 ftabOff |= (uint32_t)(*_qry)[_qlen-j];
931 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
933 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
934 uint32_t ftabTop = ebwt.ftabHi(ftabOff);
935 uint32_t ftabBot = ebwt.ftabLo(ftabOff+1);
936 assert_geq(ftabBot, ftabTop);
937 if(ftabTop == ftabBot) {
940 assert(!_precalcedSideLocus);
941 assert_leq(iham, _qualThresh);
942 ret = backtrack(stackDepth+1,
944 btUnrevOff, // new unrevisitable boundary
945 btOneRevOff, // new 1-revisitable boundary
946 btTwoRevOff, // new 2-revisitable boundary
947 btThreeRevOff, // new 3-revisitable boundary
948 ftabTop, // top arrow in range prior to 'depth'
949 ftabBot, // bottom arrow in range prior to 'depth'
950 btham, // weighted hamming distance so far
951 iham, // initial weighted hamming distance
956 // We already called initFromTopBot for the range
957 // we're going to continue from
958 _precalcedSideLocus = true;
959 assert_leq(iham, _qualThresh);
960 // Continue from selected alternative range
961 ret = backtrack(stackDepth+1,// added 1 mismatch to alignment
962 i+1, // start from next position after
963 btUnrevOff, // new unrevisitable boundary
964 btOneRevOff, // new 1-revisitable boundary
965 btTwoRevOff, // new 2-revisitable boundary
966 btThreeRevOff, // new 3-revisitable boundary
967 bttop, // top arrow in range prior to 'depth'
968 btbot, // bottom arrow in range prior to 'depth'
969 btham, // weighted hamming distance so far
970 iham, // initial weighted hamming distance
975 assert_gt(sink.numValidHits(), prehits);
976 return true; // return, signaling that we're done
// Propagate the bailed-out condition up the recursion.
978 if(_bailedOnBacktracks ||
979 (_halfAndHalf && (_maxBts > 0) && (_numBts >= _maxBts)))
981 _bailedOnBacktracks = true;
984 // No hit was reported; update elims[], eligibleSz,
985 // eligibleNum, altNum
986 _chars[i] = (*_qry)[icur];
987 assert_neq(15, elims[i]);
988 ASSERT_ONLY(uint8_t oldElim = elims[i]);
989 elims[i] |= (1 << j);
990 assert_lt(elims[i], 16);
991 assert_gt(elims[i], oldElim);
992 eligibleSz -= (btbot-bttop);
995 assert_geq(eligibleNum, 0);
997 assert_geq(altNum, 0);
999 // No alternative backtracking points; all legal
1000 // backtracking targets have been exhausted
1001 assert_eq(0, altNum);
1002 assert_eq(0, eligibleSz);
1003 assert_eq(0, eligibleNum);
1006 else if(eligibleNum == 0 && _considerQuals) {
1007 // Find the next set of eligible backtrack points
1008 // by re-scanning this backtracking frame (from
1009 // 'depth' up to 'd')
// k is unsigned (size_t); the 'k <= _qlen' test also stops the loop
// when k-- wraps past zero.
1011 for(size_t k = d; k >= depth && k <= _qlen; k--) {
1012 uint32_t kcur = _qlen - k - 1; // current offset into _qry
1013 uint8_t kq = qualAt(kcur);
1014 if(k < unrevOff) break; // already visited all revisitable positions
1015 bool kCurIsAlternative = (ham + mmPenalty(_maqPenalty, kq) <= _qualThresh);
1016 bool kCurOverridesEligible = false;
1017 if(kCurIsAlternative) {
1018 if(kq < lowAltQual) {
1019 // This target is more eligible than
1020 // any targets that came before, so we
1021 // set it to supplant/override them
1022 kCurOverridesEligible = true;
1024 if(kq <= lowAltQual) {
1025 // Position is eligible
1026 for(int l = 0; l < 4; l++) {
1027 if((elims[k] & (1 << l)) == 0) {
1028 // Not yet eliminated
1029 uint32_t spread = pairSpread(pairs, k, l);
1030 if(kCurOverridesEligible) {
1031 // Clear previous eligible results;
1032 // this one's better
1034 kCurOverridesEligible = false;
1035 // Keep these parameters in
1036 // case this target turns
1037 // out to be the only
1038 // eligible target and we
1039 // can avoid having to
1044 eltop = pairTop(pairs, k, l);
1045 elbot = pairBot(pairs, k, l);
1046 assert_eq(elbot-eltop, spread);
1047 elham = mmPenalty(_maqPenalty, kq);
1053 assert_gt(spread, 0);
1054 eligibleSz += spread;
1061 assert_gt(eligibleNum, 0);
1062 assert_leq(eligibleNum, altNum);
1063 assert_gt(eligibleSz, 0);
1064 assert_geq(eligibleSz, eligibleNum);
1065 assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
1067 } // while(top == bot && altNum > 0)
1068 if(mustBacktrack || invalidHalfAndHalf || invalidExact) {
1071 // Mismatch with no alternatives
1072 if(top == bot && altNum == 0) {
1073 assert_eq(0, altNum);
1074 assert_eq(0, eligibleSz);
1075 assert_eq(0, eligibleNum);
// Record the matched character for this depth before advancing.
1079 _chars[d] = (*_qry)[cur];
1081 } // while(cur < _qlen)
// Fell off the 5' end of the read with a non-empty range: report.
1082 assert_eq(0xffffffff, cur);
1083 assert_gt(bot, top);
1084 if(_reportPartials > 0) {
1085 // Stack depth should not exceed given hamming distance
1086 assert_leq(stackDepth, _reportPartials);
1089 if(stackDepth >= _reportPartials) {
1090 ret = reportAlignment(stackDepth, top, bot, ham);
1096 * Pretty print a hit along with the backtracking constraints.
// Delegates to the global ::printHit with this searcher's current
// revisitability offsets and index orientation.
1098 void printHit(const Hit& h) {
1099 ::printHit(*_os, h, *_qry, _qlen, _unrevOff, _1revOff, _2revOff, _3revOff, _ebwt->fw());
1103 * Return true iff we're enforcing a half-and-half constraint
1104 * (forced edits in both seed halves).
1106 bool halfAndHalf() const {
1107 return _halfAndHalf;
1113 * Return true iff we're OK to continue after considering which
1114 * half-seed boundary we're passing through, together with the
1115 * number of mismatches accumulated so far. Return false if we
1116 * should stop because a half-and-half constraint is violated. If
1117 * we're not currently passing a half-seed boundary, just return
// true (the final return is on an elided line — confirm in full file).
1120 bool hhCheck(uint32_t stackDepth, uint32_t depth,
1121 const std::vector<uint32_t>& mms, bool empty)
// lim: max mismatches permitted by the current seed config (2 when the
// 2- and 3-revisitable offsets coincide, else 3); assertions only.
1123 ASSERT_ONLY(uint32_t lim = (_3revOff == _2revOff)? 2 : 3);
1124 if((depth == (_5depth-1)) && !empty) {
1125 // We're crossing the boundary separating the hi-half
1126 // from the non-seed portion of the read.
1127 // We should induce a mismatch if we haven't mismatched
1128 // yet, so that we don't waste time pursuing a match
1129 // that was covered by a previous phase
1130 assert_eq(0, _reportPartials);
1131 assert_leq(stackDepth, lim-1);
1132 return stackDepth > 0;
1133 } else if((depth == (_3depth-1)) && !empty) {
1134 // We're crossing the boundary separating the lo-half
1135 // from the non-seed portion of the read
1136 assert_eq(0, _reportPartials);
1137 assert_leq(stackDepth, lim);
1138 assert_gt(stackDepth, 0);
1139 // Count the mismatches in the lo and hi halves
1140 uint32_t loHalfMms = 0, hiHalfMms = 0;
1141 for(size_t i = 0; i < stackDepth; i++) {
// mms stores read offsets; convert to depth from the 5' end.
1142 uint32_t depth = _qlen - mms[i] - 1;
1143 if (depth < _5depth) hiHalfMms++;
1144 else if(depth < _3depth) loHalfMms++;
1147 assert_leq(loHalfMms + hiHalfMms, lim);
// Valid half-and-half alignments need at least one mismatch per half.
1148 bool invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
1149 return (stackDepth >= 2 && !invalidHalfAndHalf);
1151 if(depth < _5depth-1) {
1152 assert_leq(stackDepth, lim-1);
1154 else if(depth >= _5depth && depth < _3depth-1) {
1155 assert_gt(stackDepth, 0);
1156 assert_leq(stackDepth, lim);
1162 * Calculate the stratum of the partial (or full) alignment
1163 * currently under consideration. Stratum is equal to the number
1164 * of mismatches in the seed portion of the alignment.
// NOTE(review): the stratum accumulator's declaration/increment and the
// return statement are on elided lines in this view.
1166 int calcStratum(const std::vector<uint32_t>& mms, uint32_t stackDepth) {
1168 for(size_t i = 0; i < stackDepth; i++) {
// mms[i] is a read offset; offsets >= _qlen - _3revOff lie in the seed.
1169 if(mms[i] >= (_qlen - _3revOff)) {
1170 // This mismatch falls within the seed; count it
1171 // toward the stratum to report
1173 // Don't currently support more than 3
1174 // mismatches in the seed
1175 assert_leq(stratum, 3);
1182 * Mark character c at depth d as being eliminated with respect to
1183 * future backtracks.
// Record in the 4-bit mask elims[d] that character c (0..3) at depth d
// has been eliminated as a backtrack target.  The visible branch resets
// the mask to exactly bit c; after either path the mask stays in [1,15].
1185 void eliminate(uint8_t *elims, uint32_t d, int c) {
1187 elims[d] = (1 << c);
1188 assert_gt(elims[d], 0);
1189 assert_lt(elims[d], 16);
1193 assert_lt(elims[d], 16);
1197 * Return true iff the state of the backtracker as encoded by
1198 * stackDepth, d and iham is compatible with the current half-and-
1199 * half alignment mode. prehits is for sanity checking when
// Check the backtracker state against the half-and-half constraints at
// the two seed-half boundaries (_5depth and _3depth).  'mms' entries are
// offsets into _qry (converted to depths below); 'prehits' is used only
// for sanity checking.  Returns true iff the state is valid to continue.
1202 bool hhCheckTop(uint32_t stackDepth,
1205                 const std::vector<uint32_t>& mms,
1206                 uint64_t prehits = 0xffffffffffffffffllu)
1208 assert_eq(0, _reportPartials);
1209 // Crossing from the hi-half into the lo-half
1211 if(_3revOff == _2revOff) {
1212 // Total of 2 mismatches allowed: 1 hi, 1 lo
1213 // The backtracking logic should have prevented us from
1214 // backtracking more than once into this region
1215 assert_leq(stackDepth, 1);
1216 // Reject if we haven't encountered mismatch by this point
1217 if(stackDepth == 0) {
1220 } else { // if(_3revOff != _2revOff)
1221 // Total of 3 mismatches allowed: 1 hi, 1 or 2 lo
1222 // The backtracking logic should have prevented us from
1223 // backtracking more than twice into this region
1224 assert_leq(stackDepth, 2);
1225 // Reject if we haven't encountered mismatch by this point
1226 if(stackDepth < 1) {
1230 } else if(d == _3depth) {
1231 // Crossing from lo-half to outside of the seed
1232 if(_3revOff == _2revOff) {
1233 // Total of 2 mismatches allowed: 1 hi, 1 lo
1234 // The backtracking logic should have prevented us from
1235 // backtracking more than twice within this region
1236 assert_leq(stackDepth, 2);
1237 // Must have encountered two mismatches by this point
1238 if(stackDepth < 2) {
1239 // We're returning from the bottommost frame
1240 // without having found any hits; let's
1241 // sanity-check that there really aren't any
1244 } else { // if(_3revOff != _2revOff)
1245 // Total of 3 mismatches allowed: 1 hi, 1 or 2 lo
1246 // Count the mismatches in the lo and hi halves
1247 int loHalfMms = 0, hiHalfMms = 0;
1248 assert_geq(mms.size(), stackDepth);
1249 for(size_t i = 0; i < stackDepth; i++) {
// Query offset -> RHS-relative depth
1250 uint32_t d = _qlen - mms[i] - 1;
1251 if (d < _5depth) hiHalfMms++;
1252 else if(d < _3depth) loHalfMms++;
1255 assert_leq(loHalfMms + hiHalfMms, 3);
1256 assert_gt(hiHalfMms, 0);
1257 if(loHalfMms == 0) {
1258 // We're returning from the bottommost frame
1259 // without having found any hits; let's
1260 // sanity-check that there really aren't any
1263 assert_geq(stackDepth, 2);
1264 // The backtracking logic should have prevented us from
1265 // backtracking more than twice within this region
1266 assert_leq(stackDepth, 3);
1269 // We didn't just cross a boundary, so do an in-between check
1271 assert_geq(stackDepth, 1);
1272 } else if(d >= _3depth) {
1273 assert_geq(stackDepth, 2);
1280 * Return the Phred quality value for the most likely base at
1281 * offset 'off' in the read.
1283 inline uint8_t qualAt(size_t off) {
// Decode the ASCII quality character at 'off' into a Phred value
1284 return phredCharToPhredQual((*_qual)[off]);
1287 /// Get the top offset for character c at depth d
1288 inline uint32_t pairTop(uint32_t* pairs, size_t d, size_t c) {
// pairs[] stores 8 words per depth: tops for c=0..3, then bots for c=0..3
1289 return pairs[d*8 + c + 0];
1292 /// Get the bot offset for character c at depth d
1293 inline uint32_t pairBot(uint32_t* pairs, size_t d, size_t c) {
// bots live in the upper 4 words of each depth's 8-word record
1294 return pairs[d*8 + c + 4];
1297 /// Get the spread between the bot and top offsets for character c
1299 inline uint32_t pairSpread(uint32_t* pairs, size_t d, size_t c) {
// Spread = number of BWT rows in the range; bot must never precede top
1300 assert_geq(pairBot(pairs, d, c), pairTop(pairs, d, c));
1301 return pairBot(pairs, d, c) - pairTop(pairs, d, c);
1305 * Tally how many Ns occur in the seed region and in the ftab-
1306 * jumpable region of the read. Check whether the mismatches
1307 * induced by the Ns already violates the current policy. Return
1308 * false if the policy is already violated, true otherwise.
// Count Ns (Dna5 code 4) in the seed region (nsInSeed) and in the
// ftab-jumpable prefix (nsInFtab).  Each N forces a mismatch, so if the
// Ns alone exhaust the mismatch budget we return false ("no alignment
// possible"); otherwise true.
1310 bool tallyNs(int& nsInSeed, int& nsInFtab) {
1311 const Ebwt<String<Dna> >& ebwt = *_ebwt;
1312 int ftabChars = ebwt._eh._ftabChars;
1313 // Count Ns in the seed region of the read and short-circuit if
1314 // the configuration of Ns guarantees that there will be no
1315 // valid alignments given the backtracking constraints.
1315 // valid alignments given the backtracking constraints.
1316 for(size_t i = 0; i < _3revOff; i++) {
// The read is scanned right-to-left; code 4 encodes 'N'
1317 if((int)(*_qry)[_qlen-i-1] == 4) {
1321 return false; // Exceeded mm budget on Ns alone
1323 } else if(nsInSeed == 2) {
1325 return false; // Exceeded mm budget on Ns alone
1327 } else if(nsInSeed == 3) {
1329 return false; // Exceeded mm budget on Ns alone
1332 assert_gt(nsInSeed, 3);
1333 return false; // Exceeded mm budget on Ns alone
1337 // Calculate the number of Ns there are in the region that
1338 // would get jumped over if the ftab were used.
1339 for(size_t i = 0; i < (size_t)ftabChars && i < _qlen; i++) {
1340 if((int)(*_qry)[_qlen-i-1] == 4) nsInFtab++;
1346 * Calculate the offset into the ftab for the rightmost 'ftabChars'
1347 * characters of the current query. Rightmost char gets least
1348 * significant bit-pair.
// Pack the rightmost 'ftabChars' characters of the query into an ftab
// index; the rightmost character occupies the least significant
// bit-pair.  All characters must be non-N (asserted < 4).
1350 uint32_t calcFtabOff() {
1351 const Ebwt<String<Dna> >& ebwt = *_ebwt;
1352 int ftabChars = ebwt._eh._ftabChars;
// Seed the accumulator with the leftmost of the ftab characters
1353 uint32_t ftabOff = (*_qry)[_qlen - ftabChars];
1354 assert_lt(ftabOff, 4);
1355 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
1356 for(int i = ftabChars - 1; i > 0; i--) {
1358 assert_lt((uint32_t)(*_qry)[_qlen-i], 4);
// Fold the next character into the low bit-pair of the accumulator
1359 ftabOff |= (uint32_t)(*_qry)[_qlen-i];
1360 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
1362 assert_lt(ftabOff, ebwt._eh._ftabLen-1);
1367 * Mutate the _qry string according to the contents of the _muts
1368 * array, which represents a partial alignment.
// Apply the partial-alignment substitutions in *_muts to *_qry in place.
// Each mutation is validated (in bounds, a real change, and the query
// currently holds the old base) before it is written.  Inverse of
// undoPartialMutations().
1370 void applyPartialMutations() {
1372 // No mutations to apply
1375 for(size_t i = 0; i < length(*_muts); i++) {
1376 const QueryMutation& m = (*_muts)[i];
1377 assert_lt(m.pos, _qlen);
1378 assert_leq(m.oldBase, 4);
1379 assert_lt(m.newBase, 4);
1380 assert_neq(m.oldBase, m.newBase);
1381 assert_eq((uint32_t)((*_qry)[m.pos]), (uint32_t)m.oldBase);
1382 (*_qry)[m.pos] = (Dna5)(int)m.newBase; // apply it
1387 * Take partial-alignment mutations present in the _muts list and
1388 * place them on the _mm list so that they become part of the
1389 * reported alignment.
// Append the partial-alignment mutations in *_muts to the _mms[] /
// _refcs[] arrays (starting at index 'stackDepth') so they are included
// in the reported alignment.  _mms entries stay in query-offset terms;
// _refcs receives the substituted base as an ASCII character.
1391 void promotePartialMutations(int stackDepth) {
1393 // No mutations to undo
1396 size_t numMuts = length(*_muts);
1397 assert_leq(numMuts, _qlen);
1398 for(size_t i = 0; i < numMuts; i++) {
1399 // Entries in _mms[] are in terms of offset into
1400 // _qry - not in terms of offset from 3' or 5' end
1401 assert_lt(stackDepth + i, _qlen);
1402 // All partial-alignment mutations should fall
1404 assert_lt((*_muts)[i].pos, _qlen);
1405 // All partial-alignment mutations should fall
1406 // within unrevisitable region
1407 assert_lt(_qlen - (*_muts)[i].pos - 1, _unrevOff);
1409 // Shouldn't be any overlap between mismatched positions
1410 // and positions that mismatched in the partial alignment.
1411 for(size_t j = 0; j < stackDepth + i; j++) {
1412 assert_neq(_mms[j], (uint32_t)(*_muts)[i].pos);
// Grow the arrays if needed, otherwise overwrite in place
1415 if(_mms.size() <= stackDepth + i) {
1416 assert_eq(_mms.size(), stackDepth + i);
1417 _mms.push_back((*_muts)[i].pos);
1419 _mms[stackDepth + i] = (*_muts)[i].pos;
1421 if(_refcs.size() <= stackDepth + i) {
1422 assert_eq(_refcs.size(), stackDepth + i);
1423 _refcs.push_back("ACGT"[(*_muts)[i].newBase]);
1425 _refcs[stackDepth + i] = "ACGT"[(*_muts)[i].newBase];
1431 * Undo mutations to the _qry string, returning it to the original
// Revert the substitutions made by applyPartialMutations(), restoring
// *_qry to its original bases.  Asserts the query currently holds each
// mutation's new base before writing the old base back.
1434 void undoPartialMutations() {
1436 // No mutations to undo
1439 for(size_t i = 0; i < length(*_muts); i++) {
1440 const QueryMutation& m = (*_muts)[i];
1441 assert_lt(m.pos, _qlen);
1442 assert_leq(m.oldBase, 4);
1443 assert_lt(m.newBase, 4);
1444 assert_neq(m.oldBase, m.newBase);
1445 assert_eq((uint32_t)((*_qry)[m.pos]), (uint32_t)m.newBase);
1446 (*_qry)[m.pos] = (Dna5)(int)m.oldBase; // undo it
1451 * Report a range of alignments with # mismatches = stackDepth and
1452 * with the mutations (also mismatches) contained in _muts. The
1453 * range is delimited by top and bot. Returns true iff one or more
1454 * full alignments were successfully reported and the caller can
// Report the alignment(s) in BWT range [top, bot) with 'stackDepth'
// mismatches recorded in _mms[].  In partial-alignment mode the hit is
// buffered via reportPartial() and false is returned (keep searching);
// otherwise the stratum is folded into the top two bits of 'cost' and
// reportFullAlignment() decides whether the caller may stop.
1457 bool reportAlignment(uint32_t stackDepth, uint32_t top,
1458                      uint32_t bot, uint16_t cost)
1461 // No two elements of _mms[] should be the same
1462 assert_geq(_mms.size(), stackDepth);
1463 for(size_t i = 0; i < stackDepth; i++) {
1464 for(size_t j = i+1; j < stackDepth; j++) {
1465 assert_neq(_mms[j], _mms[i]);
1467 // All elements of _mms[] should fall within bounds
1468 assert_lt(_mms[i], _qlen);
1471 if(_reportPartials) {
1472 assert_leq(stackDepth, _reportPartials);
1473 if(stackDepth > 0) {
1474 // Report this partial alignment. A partial alignment
1475 // is defined purely by its mismatches; top and bot are
1477 reportPartial(stackDepth);
1479 return false; // keep going - we want to find all partial alignments
1482 if(stackDepth > 0) {
1483 stratum = calcStratum(_mms, stackDepth);
1485 assert_lt(stratum, 4);
1486 assert_geq(stratum, 0);
1488 // If _muts != NULL then this alignment extends a partial
1489 // alignment, so we have to account for the differences present
1492 // Undo partial-alignment mutations to get original _qry
1493 ASSERT_ONLY(String<Dna5> tmp = (*_qry));
1494 undoPartialMutations();
1495 assert_neq(tmp, (*_qry));
1496 // Add the partial-alignment mutations to the _mms[] array
1497 promotePartialMutations(stackDepth);
1498 // All muts are in the seed, so they count toward the stratum
1499 size_t numMuts = length(*_muts);
// Stratum is carried in the top two bits of the 16-bit cost
1501 cost |= (stratum << 14);
1502 assert_geq(cost, (uint32_t)(stratum << 14));
1503 // Report the range of full alignments
1504 hit = reportFullAlignment(stackDepth + numMuts, top, bot, stratum, cost);
1505 // Re-apply partial-alignment mutations
1506 applyPartialMutations();
1507 assert_eq(tmp, (*_qry));
1509 // Report the range of full alignments
1510 cost |= (stratum << 14);
1511 assert_geq(cost, (uint32_t)(stratum << 14));
1512 hit = reportFullAlignment(stackDepth, top, bot, stratum, cost);
1518 * Report a range of full alignments with # mismatches = stackDepth.
1519 * The range is delimited by top and bot. Returns true if one or
1520 * more alignments were successfully reported. Returns true iff
1521 * one or more full alignments were successfully reported and the
1522 * caller can stop searching.
// Report full alignments in BWT range [top, bot).  A random rotation of
// the range is walked so hits are reported in unbiased order; each row
// is resolved via Ebwt::reportChaseOne().  Returns true as soon as the
// caller may stop searching, false if the whole range was examined.
1524 bool reportFullAlignment(uint32_t stackDepth,
1530 assert_gt(bot, top);
1531 if(stackDepth == 0 && !_reportExacts) {
1532 // We are not reporting exact hits (usually because we've
1533 // already reported them as part of a previous invocation
1534 // of the backtracker)
1537 assert(!_reportRanges);
1538 uint32_t spread = bot - top;
1539 // Pick a random spot in the range to begin report
1540 uint32_t r = top + (_rand.nextU32() % spread);
1541 for(uint32_t i = 0; i < spread; i++) {
// Walk the range circularly starting from the random row r
1542 uint32_t ri = r + i;
1543 if(ri >= bot) ri -= spread;
1544 // reportChaseOne takes the _mms[] list in terms of
1545 // their indices into the query string; not in terms
1546 // of their offset from the 3' or 5' end.
1547 assert_geq(cost, (uint32_t)(stratum << 14));
1548 if(_ebwt->reportChaseOne((*_qry), _qual, _name,
1549 _color, colorExEnds, snpPhred, _refs, _mms,
1550 _refcs, stackDepth, ri, top, bot,
1551 _qlen, stratum, cost, _patid, _seed,
1554 // Return value of true means that we can stop
1557 // Return value of false means that we should continue
1558 // searching. This could happen if the call to
1559 // reportChaseOne() reported a hit, but the user asked for
1560 // multiple hits and we haven't reached the ceiling yet.
1561 // This might also happen if the call to reportChaseOne()
1562 // didn't report a hit because the alignment was spurious
1563 // (i.e. overlapped some padding).
1565 // All range elements were examined and we should keep going
1570 * Report the partial alignment represented by the current stack
1571 * state (_mms[] and stackDepth).
// Buffer the current partial alignment (up to 3 mismatches, taken from
// _mms[0..stackDepth)) into _partialsBuf as a packed PartialAlignment.
// Unused mismatch slots are flagged with the sentinel 0xffff.
1573 bool reportPartial(uint32_t stackDepth) {
1574 // Sanity-check stack depth
1575 if(_3revOff != _2revOff) {
1576 assert_leq(stackDepth, 3);
1577 } else if(_2revOff != _1revOff) {
1578 assert_leq(stackDepth, 2);
1580 assert_leq(stackDepth, 1);
1584 assert_gt(_reportPartials, 0);
1585 assert(_partials != NULL);
1586 ASSERT_ONLY(uint32_t qualTot = 0);
1587 PartialAlignment al;
// Initialize all slots to the "empty" sentinel pattern
1588 al.u64.u64 = 0xffffffffffffffffllu;
1589 assert_leq(stackDepth, 3);
1590 assert_gt(stackDepth, 0);
1593 assert_gt(_mms.size(), 0);
1594 assert_lt(_mms[0], _qlen);
1595 // First, append the mismatch position in the read
1596 al.entry.pos0 = (uint16_t)_mms[0]; // pos
1597 ASSERT_ONLY(uint8_t qual0 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[0]])));
1598 ASSERT_ONLY(qualTot += qual0);
// Translate the query offset into an index into _chars[]
1599 uint32_t ci = _qlen - _mms[0] - 1;
1600 // _chars[] is indexed in terms of RHS-relative depth
1601 int c = (int)(Dna5)_chars[ci];
1603 assert_neq(c, (int)(*_qry)[_mms[0]]);
1604 // Second, append the substituted character for the position
1607 if(stackDepth > 1) {
1608 assert_gt(_mms.size(), 1);
1610 assert_lt(_mms[1], _qlen);
1611 // First, append the mismatch position in the read
1612 al.entry.pos1 = (uint16_t)_mms[1]; // pos
1613 ASSERT_ONLY(uint8_t qual1 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[1]])));
1614 ASSERT_ONLY(qualTot += qual1);
1615 ci = _qlen - _mms[1] - 1;
1616 // _chars[] is indexed in terms of RHS-relative depth
1617 c = (int)(Dna5)_chars[ci];
1619 assert_neq(c, (int)(*_qry)[_mms[1]]);
1620 // Second, append the substituted character for the position
1622 if(stackDepth > 2) {
1623 assert_gt(_mms.size(), 2);
1625 assert_lt(_mms[2], _qlen);
1626 // First, append the mismatch position in the read
1627 al.entry.pos2 = (uint16_t)_mms[2]; // pos
1628 ASSERT_ONLY(uint8_t qual2 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[2]])));
1629 ASSERT_ONLY(qualTot += qual2);
1630 ci = _qlen - _mms[2] - 1;
1631 // _chars[] is indexed in terms of RHS-relative depth
1632 c = (int)(Dna5)_chars[ci];
1634 assert_neq(c, (int)(*_qry)[_mms[2]]);
1635 // Second, append the substituted character for the position
1638 // Signal that the '2' slot is empty
1639 al.entry.pos2 = 0xffff;
1642 // Signal that the '1' slot is empty
1643 al.entry.pos1 = 0xffff;
1646 assert_leq(qualTot, _qualThresh);
1647 assert(validPartialAlignment(al));
1649 assert(al.repOk(_qualThresh, _qlen, (*_qual), _maqPenalty));
// No duplicates: the new entry must differ from everything buffered
1650 for(size_t i = 0; i < _partialsBuf.size(); i++) {
1651 assert(validPartialAlignment(_partialsBuf[i]));
1652 assert(!samePartialAlignment(_partialsBuf[i], al));
1655 _partialsBuf.push_back(al);
1660 * Check that the given eligibility parameters (lowAltQual,
1661 * eligibleSz, eligibleNum) are correct, given the appropriate
1662 * inputs (pairs, elims, depth, d, unrevOff)
// Debug-only verification that the cached eligibility summary
// (lowAltQual, eligibleSz, eligibleNum) matches what a fresh scan of
// pairs[]/elims[] over depths [max(depth, unrevOff), d] produces.
// A position is eligible when its quality equals lowAltQual (or quals
// are ignored) and at least one of its four targets is uneliminated.
1664 bool sanityCheckEligibility(uint32_t depth,
1667                             uint32_t lowAltQual,
1668                             uint32_t eligibleSz,
1669                             uint32_t eligibleNum,
1673 // Sanity check that the lay of the land is as we
1674 // expect given eligibleNum and eligibleSz
1675 size_t i = max(depth, unrevOff), j = 0;
1677 uint32_t eligiblesVisited = 0;
1678 for(; i <= d; i++) {
1679 uint32_t icur = _qlen - i - 1; // current offset into _qry
1680 uint8_t qi = qualAt(icur);
1681 assert_lt(elims[i], 16);
// elims[i] == 15 means all four characters were eliminated here
1682 if((qi == lowAltQual || !_considerQuals) && elims[i] != 15) {
1683 // This is an eligible position with at least
1684 // one remaining backtrack target
1685 for(j = 0; j < 4; j++) {
1686 if((elims[i] & (1 << j)) == 0) {
1687 // This pair has not been eliminated
1688 assert_gt(pairBot(pairs, i, j), pairTop(pairs, i, j));
1689 cumSz += pairSpread(pairs, i, j);
1695 assert_eq(cumSz, eligibleSz);
1696 assert_eq(eligiblesVisited, eligibleNum);
// ---- GreedyDFSRangeSource instance state ----
1700 const BitPairReference* _refs; // reference sequences (or NULL if not colorspace)
1701 String<Dna5>* _qry; // query (read) sequence
1702 size_t _qlen; // length of _qry
1703 String<char>* _qual; // quality values for _qry
1704 String<char>* _name; // name of _qry
1705 bool _color; // whether read is colorspace
1706 const Ebwt<String<Dna> >* _ebwt; // Ebwt to search in
1707 const EbwtSearchParams<String<Dna> >& _params; // search parameters
1708 uint32_t _unrevOff; // unrevisitable chunk
1709 uint32_t _1revOff; // 1-revisitable chunk
1710 uint32_t _2revOff; // 2-revisitable chunk
1711 uint32_t _3revOff; // 3-revisitable chunk
1712 /// Whether to round qualities off Maq-style when calculating penalties
1714 uint32_t _qualThresh; // only accept hits with weighted
1715                       // hamming distance <= _qualThresh
1716 uint32_t *_pairs; // ranges, leveled in parallel
1717                   // with decision stack
1718 uint8_t *_elims; // which ranges have been
1719                  // eliminated, leveled in parallel
1720                  // with decision stack
1721 std::vector<uint32_t> _mms; // array for holding mismatches
1722 std::vector<uint8_t> _refcs; // reference characters ("ACGT") at mismatch positions
1723 // Entries in _mms[] are in terms of offset into
1724 // _qry - not in terms of offset from 3' or 5' end
1725 char *_chars; // characters selected so far
1726 // If > 0, report partial alignments up to this many mismatches
1727 uint32_t _reportPartials;
1728 /// Do not report alignments with stratum < this limit
1730 /// When reporting a full alignment, report top/bot; don't chase
1731 /// any of the results
1733 /// Append partial alignments here
1734 PartialAlignmentManager *_partials;
1735 /// Set of mutations that apply for a partial alignment
1736 String<QueryMutation> *_muts;
1737 /// Reference texts (NULL if they are unavailable)
1738 vector<String<Dna5> >* _os;
1739 /// Whether to use the _os array together with a naive matching
1740 /// algorithm to double-check reported alignments (or the lack
1743 /// Whether to consider quality values when deciding where to
1745 bool _considerQuals;
1747 /// Depth of 5'-seed-half border
1749 /// Depth of 3'-seed-half border
1752 String<char> _qualDefault;
1753 /// Number of backtracks in last call to backtrack()
1755 /// Number of backtracks since last reset
1756 uint32_t _totNumBts;
1757 /// Max # of backtracks to allow before giving up
1759 /// Whether we precalculated the Ebwt locus information for the
1760 /// next top/bot pair
1761 bool _precalcedSideLocus;
1762 /// Precalculated top locus
1764 /// Precalculated bot locus
1766 /// Flag to record whether a 'false' return from backtracker is due
1767 /// to having exceeded one or more backtracking limits
1768 bool _bailedOnBacktracks;
1769 /// Source of pseudo-random numbers
1774 // Holding area for partial alignments
1775 vector<PartialAlignment> _partialsBuf;
1776 // Current range to expose to consumers
1781 std::set<int64_t> allTops_;
1786 * Class that coordinates quality- and quantity-aware backtracking over
1787 * some range of a read sequence.
1789 * The creator can configure this range source to treat different
1790 * stretches of the read differently.
1792 class EbwtRangeSource : public RangeSource {
1793 typedef Ebwt<String<Dna> > TEbwt;
1794 typedef std::pair<int, int> TIntPair;
1807 AlignerMetrics *metrics = NULL) :
1823 maqPenalty_(maqPenalty),
1824 qualOrder_(qualOrder),
1826 reportExacts_(reportExacts),
1827 halfAndHalf_(halfAndHalf),
1833 skippingThisRead_(false),
1835 { curEbwt_ = ebwt_; }
1838 * Set a new query read.
// Install a new read.  Picks forward/reverse-complement (and, when the
// index is mirrored, the reversed buffers) according to fw_ and the
// Ebwt orientation, stores or invalidates the seed Range, then applies
// the seed range's reference characters to the query buffer so that
// extension continues from the partially-aligned sequence.
1840 virtual void setQuery(ReadBuf& r, Range *seedRange) {
1841 const bool ebwtFw = ebwt_->fw();
// Forward-index case: use the un-reversed pattern/qual buffers
1843 qry_ = fw_ ? &r.patFw : &r.patRc;
1844 qual_ = fw_ ? &r.qual : &r.qualRev;
1845 altQry_ = (String<Dna5>*)(fw_ ? r.altPatFw : r.altPatRc);
1846 altQual_ = (String<char>*)(fw_ ? r.altQual : r.altQualRev);
// Mirror-index case: use the reversed buffers instead
1848 qry_ = fw_ ? &r.patFwRev : &r.patRcRev;
1849 qual_ = fw_ ? &r.qualRev : &r.qual;
1850 altQry_ = (String<Dna5>*)(fw_ ? r.altPatFwRev : r.altPatRcRev);
1851 altQual_ = (String<char>*)(fw_ ? r.altQualRev : r.altQual);
1856 if(seedRange != NULL) seedRange_ = *seedRange;
1857 else seedRange_.invalidate();
1858 qlen_ = length(*qry_);
1859 skippingThisRead_ = false;
1860 // Apply edits from the partial alignment to the query pattern
1861 if(seedRange_.valid()) {
1863 const size_t srSz = seedRange_.mms.size();
1865 assert_eq(srSz, seedRange_.refcs.size());
1866 for(size_t i = 0; i < srSz; i++) {
1867 assert_lt(seedRange_.mms[i], qlen_);
1868 char rc = (char)seedRange_.refcs[i];
1869 assert(rc == 'A' || rc == 'C' || rc == 'G' || rc == 'T');
1870 ASSERT_ONLY(char oc = (char)qryBuf_[qlen_ - seedRange_.mms[i] - 1]);
// Overwrite the read base with the reference base from the seed hit
1872 qryBuf_[qlen_ - seedRange_.mms[i] - 1] = (Dna5)rc;
1873 assert_neq((Dna5)rc, (*qry_)[qlen_ - seedRange_.mms[i] - 1]);
1877 // Make sure every qual is a valid qual ASCII character (>= 33)
1878 for(size_t i = 0; i < length(*qual_); i++) {
1879 assert_geq((*qual_)[i], 33);
1880 for(int j = 0; j < alts_; j++) {
1881 assert_geq(altQual_[j][i], 33);
1884 assert_geq(length(*qual_), qlen_);
1886 this->foundRange = false;
1892 * Set backtracking constraints.
// Install the half-and-half depths and the four revisitability
// boundaries that constrain where backtracks may be placed.
1894 void setOffs(uint32_t depth5, // depth of far edge of hi-half
1895              uint32_t depth3, // depth of far edge of lo-half
1896              uint32_t unrevOff, // depth above which we cannot backtrack
1897              uint32_t revOff1, // depth above which we may backtrack just once
1898              uint32_t revOff2, // depth above which we may backtrack just twice
1899              uint32_t revOff3) // depth above which we may backtrack just three times
1903 assert_geq(depth3_, depth5_);
1904 offRev0_ = unrevOff;
1911 * Return true iff this RangeSource is allowed to report exact
1912 * alignments (exact = no edits).
// Accessor: whether 0-edit (exact) alignments may be reported.
1914 bool reportExacts() const {
1915 return reportExacts_;
1918 /// Return the current range
1919 virtual Range& range() {
1924 * Set qlen_ according to parameter, except don't let it fall below
1925 * the length of the query.
// Set the effective query length, clamped so it never exceeds the
// actual length of the installed query.
1927 void setQlen(uint32_t qlen) {
1928 assert(qry_ != NULL);
1929 qlen_ = min<uint32_t>(length(*qry_), qlen);
1933 * Initiate continuations so that the next call to advance() begins
1934 * a new search. Note that contMan is empty upon return if there
1935 * are no valid continuations to begin with. Also note that
1936 * calling initConts() may result in finding a range (i.e., if we
1937 * immediately jump to a valid range using the ftab).
// Seed the path manager with the initial Branch for this read.  Skips
// reads shorter than the mismatch budget or whose Ns already rule out
// any alignment; otherwise either (a) jumps 'ftabChars' characters in
// via the ftab — possibly discovering an exact hit immediately — or
// (b) starts from the rightmost character using _fchr.
1940 initBranch(PathManager& pm) {
1941 assert(curEbwt_ != NULL);
1942 assert_gt(length(*qry_), 0);
1943 assert_leq(qlen_, length(*qry_));
1944 assert_geq(length(*qual_), length(*qry_));
1945 const Ebwt<String<Dna> >& ebwt = *ebwt_;
1946 int ftabChars = ebwt._eh._ftabChars;
1947 this->foundRange = false;
1948 int nsInSeed = 0; int nsInFtab = 0;
1949 ASSERT_ONLY(allTops_.clear());
1950 if(skippingThisRead_) {
// Derive the mismatch budget from which boundaries actually differ
1955 uint32_t maxmms = 0;
1956 if(offRev0_ != offRev1_) maxmms = 1;
1957 if(offRev1_ != offRev2_) maxmms = 2;
1958 if(offRev2_ != offRev3_) maxmms = 3;
1959 if(qlen_ <= maxmms) {
// Warning output is serialized under the global lock
1961 ThreadSafe _ts(&gLock);
1962 cerr << "Warning: Read (" << (*name_) << ") is less than " << (maxmms+1) << " characters long; skipping..." << endl;
1965 skippingThisRead_ = true;
1969 if(!tallyNs(nsInSeed, nsInFtab)) {
1970 // No alignments are possible because of the distribution
1971 // of Ns in the read in combination with the backtracking
1975 // icost = total cost penalty (major bits = stratum, minor bits =
1976 // quality penalty) incurred so far by partial alignment
1977 uint16_t icost = (seedRange_.valid()) ? seedRange_.cost : 0;
1978 // iham = total quality penalty incurred so far by partial alignment
1979 uint16_t iham = (seedRange_.valid() && qualOrder_) ? (seedRange_.cost & ~0xc000): 0;
1980 assert_leq(iham, qualLim_);
1981 // m = depth beyond which ftab must not extend or else we might
1982 // miss some legitimate paths
1983 uint32_t m = min<uint32_t>(offRev0_, qlen_);
1984 // Let skipInvalidExact = true if using the ftab would be a
1985 // waste because it would jump directly to an alignment we
1987 bool ftabSkipsToEnd = (qlen_ == (uint32_t)ftabChars);
1988 bool skipInvalidExact = (!reportExacts_ && ftabSkipsToEnd);
1990 // If it's OK to use the ftab...
1991 if(nsInFtab == 0 && m >= (uint32_t)ftabChars && !skipInvalidExact) {
1992 // Use the ftab to jump 'ftabChars' chars into the read
1994 uint32_t ftabOff = calcFtabOff();
1995 uint32_t top = ebwt.ftabHi(ftabOff);
1996 uint32_t bot = ebwt.ftabLo(ftabOff+1);
1997 if(qlen_ == (uint32_t)ftabChars && bot > top) {
1998 // We found a range with 0 mismatches immediately. Set
1999 // fields to indicate we found a range.
2000 assert(reportExacts_);
2001 curRange_.top = top;
2002 curRange_.bot = bot;
// Stratum is carried in the top two bits of the cost
2003 curRange_.stratum = (icost >> 14);
2004 curRange_.cost = icost;
2005 curRange_.numMms = 0;
2006 curRange_.ebwt = ebwt_;
2008 curRange_.mms.clear(); // no mismatches
2009 curRange_.refcs.clear(); // no mismatches
2010 // Lump in the edits from the partial alignment
2012 assert(curRange_.repOk());
2013 // no need to do anything with curRange_.refcs
2014 this->foundRange = true;
2015 //this->done = true;
2017 } else if (bot > top) {
2018 // We have a range to extend
2019 assert_leq(top, ebwt._eh._len);
2020 assert_leq(bot, ebwt._eh._len);
2021 Branch *b = pm.bpool.alloc();
2027 pm.rpool, pm.epool, pm.bpool.lastId(), qlen_,
2028 offRev0_, offRev1_, offRev2_, offRev3_,
2029 0, ftabChars, icost, iham, top, bot,
2030 ebwt._eh, ebwt._ebwt))
2032 // Negative result from b->init() indicates we ran
2033 // out of best-first chunk memory
2037 assert(!b->curtailed_);
2038 assert(!b->exhausted_);
2039 assert_gt(b->depth3_, 0);
2040 pm.push(b); // insert into priority queue
2041 assert(!pm.empty());
2043 // The arrows are already closed within the
2044 // unrevisitable region; give up
2047 // We can't use the ftab, so we start from the rightmost
2048 // position and use _fchr
2049 Branch *b = pm.bpool.alloc();
2054 if(!b->init(pm.rpool, pm.epool, pm.bpool.lastId(), qlen_,
2055 offRev0_, offRev1_, offRev2_, offRev3_,
2056 0, 0, icost, iham, 0, 0, ebwt._eh, ebwt._ebwt))
2058 // Negative result from b->init() indicates we ran
2059 // out of best-first chunk memory
2063 assert(!b->curtailed_);
2064 assert(!b->exhausted_);
2065 assert_gt(b->depth3_, 0);
2066 pm.push(b); // insert into priority queue
2067 assert(!pm.empty());
2073 * Advance along the lowest-cost branch managed by the given
2074 * PathManager. Keep advancing until condition 'until' is
2075 * satisfied. Typically, the stopping condition 'until' is
2076 * set to stop whenever pm's minCost changes.
2079 advanceBranch(int until, uint16_t minCost, PathManager& pm) {
2080 assert(curEbwt_ != NULL);
2082 // Let this->foundRange = false; we'll set it to true iff this call
2083 // to advance yielded a new valid-alignment range.
2084 this->foundRange = false;
2086 // Can't have already exceeded weighted hamming distance threshold
2087 assert_gt(length(*qry_), 0);
2088 assert_leq(qlen_, length(*qry_));
2089 assert_geq(length(*qual_), length(*qry_));
2090 assert(!pm.empty());
2094 // Get the highest-priority branch according to the priority
2096 Branch* br = pm.front();
2097 // Shouldn't be curtailed or exhausted
2098 assert(!br->exhausted_);
2099 assert(!br->curtailed_);
2100 assert_gt(br->depth3_, 0);
2101 assert_leq(br->ham_, qualLim_);
2103 br->print((*qry_), (*qual_), minCost, cout, (halfAndHalf_>0), partial_, fw_, ebwt_->fw());
2104 if(!br->edits_.empty()) {
2106 for(size_t i = 0; i < br->edits_.size(); i++) {
2107 Edit e = br->edits_.get(i);
2108 cout << (curEbwt_->fw() ? (qlen_ - e.pos - 1) : e.pos)
2110 if(i < br->edits_.size()-1) cout << " ";
2116 assert(br->repOk(qlen_));
2118 ASSERT_ONLY(int stratum = br->cost_ >> 14); // shift the stratum over
2119 assert_lt(stratum, 4);
2120 // Not necessarily true with rounding
2121 uint32_t depth = br->tipDepth();
2123 const Ebwt<String<Dna> >& ebwt = *ebwt_;
2125 if(halfAndHalf_ > 0) assert_gt(depth3_, depth5_);
2127 bool reportedPartial = false;
2128 bool invalidExact = false;
2131 uint16_t cost = br->cost_;
2133 uint32_t nedits = 0;
2135 if(halfAndHalf_ && !hhCheckTop(br, depth, 0)) {
2136 // Stop extending this branch because it violates a half-
2137 // and-half constraint
2138 if(metrics_ != NULL) metrics_->curBacktracks_++;
2139 pm.curtail(br, qlen_, depth3_, qualOrder_);
2143 cur = qlen_ - depth - 1; // current offset into qry_
2145 // Determine whether ranges at this location are candidates
2147 int c = (int)(*qry_)[cur]; // get char at this position
2149 if(cur < qlen_-1) nextc = (int)(*qry_)[cur+1];
2151 // If any uncalled base's penalty is still under
2152 // the ceiling, then this position is an alternative
2153 uint8_t q[4] = {'!', '!', '!', '!'};
2155 // get unrounded penalties at this position
2157 bestq = penaltiesAt(cur, q, alts_, *qual_, altQry_, altQual_);
2159 bestq = q[0] = q[1] = q[2] = q[3] =
2160 mmPenalty(maqPenalty_, qualAt(cur));
2163 // The current query position is a legit alternative if it a) is
2164 // not in the unrevisitable region, and b) its selection would
2165 // not necessarily cause the quality ceiling (if one exists) to
2167 bool curIsAlternative = (depth >= br->depth0_) &&
2168 (br->ham_ + bestq <= qualLim_);
2169 ASSERT_ONLY(uint32_t obot = br->bot_);
2170 uint32_t otop = br->top_;
2172 // If c is 'N', then it's a mismatch
2173 if(c == 4 && depth > 0) {
2174 // Force the 'else if(curIsAlternative)' or 'else'
2176 br->top_ = br->bot_ = 1;
2178 // We'll take the 'if(br->top == 0 && br->bot == 0)'
2180 assert_eq(0, br->top_);
2181 assert_eq(0, br->bot_);
2184 // Get the range state for the current position
2185 RangeState *rs = br->rangeState();
2187 // Calculate the ranges for this position
2188 if(br->top_ == 0 && br->bot_ == 0) {
2189 // Calculate first quartet of ranges using the _fchr[]
2191 rs->tops[0] = ebwt._fchr[0];
2192 rs->bots[0] = rs->tops[1] = ebwt._fchr[1];
2193 rs->bots[1] = rs->tops[2] = ebwt._fchr[2];
2194 rs->bots[2] = rs->tops[3] = ebwt._fchr[3];
2195 rs->bots[3] = ebwt._fchr[4];
2196 ASSERT_ONLY(int r =)
2197 br->installRanges(c, nextc, fuzzy_, qualLim_ - br->ham_, q);
2198 assert(r < 4 || c == 4);
2199 // Update top and bot
2201 br->top_ = rs->tops[c];
2202 br->bot_ = rs->bots[c];
2204 } else if(curIsAlternative && (br->bot_ > br->top_ || c == 4)) {
2205 // Calculate next quartet of ranges. We hope that the
2206 // appropriate cache lines are prefetched.
2207 assert(br->ltop_.valid());
2209 rs->bots[0] = rs->tops[1] =
2210 rs->bots[1] = rs->tops[2] =
2211 rs->bots[2] = rs->tops[3] =
2213 if(br->lbot_.valid()) {
2214 if(metrics_ != NULL) metrics_->curBwtOps_++;
2215 ebwt.mapLFEx(br->ltop_, br->lbot_, rs->tops, rs->bots);
2218 uint32_t tmptops[] = {0, 0, 0, 0};
2219 uint32_t tmpbots[] = {0, 0, 0, 0};
2220 SideLocus ltop, lbot;
2221 ltop.initFromRow(otop, ebwt_->_eh, ebwt_->_ebwt);
2222 lbot.initFromRow(obot, ebwt_->_eh, ebwt_->_ebwt);
2223 ebwt.mapLFEx(ltop, lbot, tmptops, tmpbots);
2225 if(metrics_ != NULL) metrics_->curBwtOps_++;
2226 int cc = ebwt.mapLF1(otop, br->ltop_);
2228 assert(cc == -1 || (cc >= 0 && cc < 4));
2231 rs->tops[cc] = br->top_;
2232 rs->bots[cc] = (br->top_ + 1);
2235 for(int i = 0; i < 4; i++) {
2236 assert_eq(tmpbots[i] - tmptops[i],
2237 rs->bots[i] - rs->tops[i]);
2241 ASSERT_ONLY(int r =)
2242 br->installRanges(c, nextc, fuzzy_, qualLim_ - br->ham_, q);
2243 assert(r < 4 || c == 4);
2244 // Update top and bot
2246 br->top_ = rs->tops[c];
2247 br->bot_ = rs->bots[c];
2249 br->top_ = br->bot_ = 1;
2251 } else if(br->bot_ > br->top_) {
2252 // This read position is not a legitimate backtracking
2253 // alternative. No need to do the bookkeeping for the
2254 // entire quartet, just do c. We hope that the
2255 // appropriate cache lines are prefetched before now;
2256 // otherwise, we're about to take an expensive cache
2258 assert(br->ltop_.valid());
2259 rs->eliminated_ = true; // eliminate all alternatives leaving this node
2260 assert(br->eliminated(br->len_));
2262 if(br->top_ + 1 == br->bot_) {
2263 if(metrics_ != NULL) metrics_->curBwtOps_++;
2264 br->bot_ = br->top_ = ebwt.mapLF1(br->top_, br->ltop_, c);
2265 if(br->bot_ != 0xffffffff) br->bot_++;
2267 if(metrics_ != NULL) metrics_->curBwtOps_++;
2268 br->top_ = ebwt.mapLF(br->ltop_, c);
2269 assert(br->lbot_.valid());
2270 if(metrics_ != NULL) metrics_->curBwtOps_++;
2271 br->bot_ = ebwt.mapLF(br->lbot_, c);
2275 rs->eliminated_ = true;
2277 assert(rs->repOk());
2278 // br->top_ and br->bot_ now contain the next top and bot
2280 // The continuation had already processed the whole read
2281 assert_eq(qlen_, depth);
2284 empty = (br->top_ == br->bot_);
2285 hit = (cur == 0 && !empty);
2287 // Check whether we've obtained an exact alignment when
2288 // we've been instructed not to report exact alignments
2289 nedits = br->edits_.size();
2290 invalidExact = (hit && nedits == 0 && !reportExacts_);
2291 assert_leq(br->ham_, qualLim_);
2293 // Set this to true if the only way to make legal progress
2294 // is via one or more additional backtracks.
2295 if(halfAndHalf_ && !hhCheck(br, depth, empty)) {
2296 // This alignment doesn't satisfy the half-and-half
2297 // requirements; reject it
2298 if(metrics_ != NULL) metrics_->curBacktracks_++;
2299 pm.curtail(br, qlen_, depth3_, qualOrder_);
2303 if(hit && // there is a range to report
2304 !invalidExact && // not disqualified by no-exact-hits setting
2305 !reportedPartial) // not an already-reported partial alignment
2309 cout << " Partial alignment:" << endl;
2311 cout << " Final alignment:" << endl;
2314 br->print((*qry_), (*qual_), minCost, cout, halfAndHalf_ > 0, partial_, fw_, ebwt_->fw());
2318 assert_gt(br->bot_, br->top_);
2319 assert_leq(br->ham_, qualLim_);
2320 assert_leq((uint32_t)(br->cost_ & ~0xc000), qualLim_);
2321 if(metrics_ != NULL) metrics_->setReadHasRange();
2322 curRange_.top = br->top_;
2323 curRange_.bot = br->bot_;
2324 curRange_.cost = br->cost_;
2325 curRange_.stratum = (br->cost_ >> 14);
2326 curRange_.numMms = nedits;
2328 curRange_.mms.clear();
2329 curRange_.refcs.clear();
2330 for(size_t i = 0; i < nedits; i++) {
2331 curRange_.mms.push_back(qlen_ - br->edits_.get(i).pos - 1);
2332 curRange_.refcs.push_back((char)br->edits_.get(i).chr);
2335 curRange_.ebwt = ebwt_;
2336 this->foundRange = true;
2338 int64_t top2 = (int64_t)br->top_;
2339 top2++; // ensure it's not 0
2340 if(ebwt_->fw()) top2 = -top2;
2341 assert(allTops_.find(top2) == allTops_.end());
2342 allTops_.insert(top2);
2344 assert(curRange_.repOk());
2345 // Must curtail because we've consumed the whole pattern
2346 if(metrics_ != NULL) metrics_->curBacktracks_++;
2347 pm.curtail(br, qlen_, depth3_, qualOrder_);
2348 } else if(empty || cur == 0) {
2349 // The branch couldn't be extended further
2350 if(metrics_ != NULL) metrics_->curBacktracks_++;
2351 pm.curtail(br, qlen_, depth3_, qualOrder_);
2353 // Extend the branch by one position; no change to its cost
2354 // so there's no need to reconsider where it lies in the
2360 // Make sure the front element of the priority queue is
2361 // extendable (i.e. not curtailed) and then prep it.
2362 if(!pm.splitAndPrep(rand_, qlen_, qualLim_, depth3_,
2364 ebwt_->_eh, ebwt_->_ebwt, ebwt_->_fw))
2373 assert(!pm.front()->curtailed_);
2374 assert(!pm.front()->exhausted_);
2376 if(until == ADV_COST_CHANGES && pm.front()->cost_ != cost) break;
2377 else if(until == ADV_STEP) break;
2379 } while(!this->foundRange);
2381 assert(!pm.front()->curtailed_);
2382 assert(!pm.front()->exhausted_);
2387 * Return true iff we're enforcing a half-and-half constraint
2388 * (forced edits in both seed halves).
// NOTE(review): despite the "true iff" wording in the comment above,
// this returns the numeric half-and-half level (0 = constraint
// disabled); callers compare it against edit counts (e.g.
// nedits >= (uint32_t)halfAndHalf_ in hhCheck) and test it with
// halfAndHalf_ > 0, rather than treating it as a strict bool.
2390	int halfAndHalf() const {
2391		return halfAndHalf_;
2397 * Lump all the seed-alignment edits from the seedRange_ range
2398 * found previously to the curRange_ range just found.
// Merge the mismatch records from a previously-found seed ("partial")
// alignment (seedRange_) into the range just found (curRange_), so the
// final reported range carries the full set of edits. No-op when there
// is no valid partial alignment.
2400	void addPartialEdits() {
2401		// Lump in the edits from the partial alignment
2402		if(seedRange_.valid()) {
2403			const size_t srSz = seedRange_.mms.size();
2404			for(size_t i = 0; i < srSz; i++) {
// Flip each stored edit position into the qlen_ - pos - 1 coordinate
// space — the same transform applied to curRange_'s own edits when the
// range is installed (see the edits_ loop in the advance routine).
2405				curRange_.mms.push_back(qlen_ - seedRange_.mms[i] - 1);
2406				curRange_.refcs.push_back(seedRange_.refcs[i]);
// Account for the merged edits in the range's mismatch count.
2408			curRange_.numMms += srSz;
2413 * Return true iff we're OK to continue after considering which
2414 * half-seed boundary we're passing through, together with the
2415 * number of mismatches accumulated so far. Return false if we
2416 * should stop because a half-and-half constraint is violated. If
2417 * we're not currently passing a half-seed boundary, just return
// Decide whether branch 'b', currently at read offset 'depth', still
// satisfies the half-and-half constraint (mismatches required in both
// seed halves). Returns false when the constraint is already violated
// and the branch should be curtailed. 'empty' indicates the branch's
// BWT range has become empty at this depth.
// Boundaries: depth5_ separates the hi-half from the rest of the seed;
// depth3_ separates the seed from the non-seed portion.
2420	bool hhCheck(Branch *b, uint32_t depth, bool empty) {
2421		const uint32_t nedits = b->edits_.size();
// lim3/lim5 are sanity-only bounds on how many edits each region may
// legally carry; they depend on whether the offRev windows coincide.
// NOTE(review): exact meaning of offRev0_..offRev3_ is defined outside
// this chunk — presumably the revisitable-region offsets; confirm.
2422		ASSERT_ONLY(uint32_t lim3 = (offRev3_ == offRev2_)? 2 : 3);
2423		ASSERT_ONLY(uint32_t lim5 = (offRev1_ == offRev0_)? 2 : 1);
2424		if((depth == (depth5_-1)) && !empty) {
2425			// We're crossing the boundary separating the hi-half
2426			// from the non-seed portion of the read.
2427			// We should induce a mismatch if we haven't mismatched
2428			// yet, so that we don't waste time pursuing a match
2429			// that was covered by a previous phase
2430			assert_leq(nedits, lim5);
2432		} else if((depth == (depth3_-1)) && !empty) {
2433			// We're crossing the boundary separating the lo-half
2434			// from the non-seed portion of the read
2435			assert_leq(nedits, lim3);
2436			assert_gt(nedits, 0);
2437			// Count the mismatches in the lo and hi halves
2438			uint32_t loHalfMms = 0, hiHalfMms = 0;
2439			for(size_t i = 0; i < nedits; i++) {
// NOTE(review): this local 'depth' shadows the function parameter
// 'depth'. Behavior is correct (only the edit position is used inside
// the loop), but renaming it (e.g. 'editPos') would avoid confusion.
2440				uint32_t depth = b->edits_.get(i).pos;
2441				if     (depth < depth5_) hiHalfMms++;
2442				else if(depth < depth3_) loHalfMms++;
2445			assert_leq(loHalfMms + hiHalfMms, lim3);
// Valid only if BOTH halves contain at least one mismatch and the
// total reaches the configured half-and-half level.
2446			bool invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
2447			return (nedits >= (uint32_t)halfAndHalf_ && !invalidHalfAndHalf);
// Not exactly on a boundary: perform in-between sanity checks only.
2450		if(depth < depth5_-1) {
2451			assert_leq(nedits, lim5);
2453		else if(depth >= depth5_ && depth < depth3_-1) {
2454			assert_gt(nedits, 0);
2455			assert_leq(nedits, lim3);
2462 * Return true iff the state of the backtracker as encoded by
2463 * stackDepth, d and iham is compatible with the current half-and-
2464 * half alignment mode. prehits is for sanity checking when
2467 bool hhCheckTop(Branch* b,
2470 uint64_t prehits = 0xffffffffffffffffllu)
2472 // Crossing from the hi-half into the lo-half
2473 ASSERT_ONLY(uint32_t lim3 = (offRev3_ == offRev2_)? 2 : 3);
2474 ASSERT_ONLY(uint32_t lim5 = (offRev1_ == offRev0_)? 2 : 1);
2475 const uint32_t nedits = b->edits_.size();
2477 assert_leq(nedits, lim5);
2481 } else if(d == depth3_) {
2482 assert_leq(nedits, lim3);
2483 if(nedits < (uint32_t)halfAndHalf_) {
2489 // We didn't just cross a boundary, so do an in-between check
2491 assert_geq(nedits, 1);
2492 } else if(d >= depth3_) {
2493 assert_geq(nedits, lim3);