5 #ifndef RANGE_SOURCE_H_
6 #define RANGE_SOURCE_H_
10 #include "seqan/sequence.h"
// NOTE(review): this chunk is a sparsely sampled extraction -- interior
// lines (including the enclosing class header and many braces) are missing,
// and each surviving line carries a residual listing number. Only comments
// are added/fixed below; visible code text is byte-identical.
//
// EditList: a small-vector-style container of Edit records with three
// storage tiers: an inline array (edits_), then a pool-allocated overflow
// array (moreEdits_, numMoreEdits entries), then a final pool-allocated
// array sized from the read length (yetMoreEdits_).
23 * List of Edits that automatically expands as edits are added.
// Start empty; overflow tiers are allocated lazily from the pool.
27 EditList() : sz_(0), moreEdits_(NULL), yetMoreEdits_(NULL) { }
30 * Add an edit to the edit list.
// Returns false on pool exhaustion (see the NULL checks after each
// pool.alloc below); callers must handle that failure.
32 bool add(const Edit& e, AllocOnlyPool<Edit>& pool, size_t qlen) {
// Capacity invariant: never more than qlen + 10 edits.
33 assert_lt(sz_, qlen + 10);
35 assert(moreEdits_ == NULL);
36 assert(yetMoreEdits_ == NULL);
// Inline tier just filled; allocate the first overflow tier.
38 } else if(sz_ == numEdits) {
39 assert(moreEdits_ == NULL);
40 assert(yetMoreEdits_ == NULL);
41 moreEdits_ = pool.alloc(numMoreEdits);
42 if(moreEdits_ == NULL) {
45 assert(moreEdits_ != NULL);
48 } else if(sz_ < (numEdits + numMoreEdits)) {
49 assert(moreEdits_ != NULL);
50 assert(yetMoreEdits_ == NULL);
51 moreEdits_[sz_ - numEdits] = e;
// First overflow tier full; allocate the final, read-length-sized tier.
53 } else if(sz_ == (numEdits + numMoreEdits)) {
54 assert(moreEdits_ != NULL);
55 assert(yetMoreEdits_ == NULL);
56 yetMoreEdits_ = pool.alloc(qlen + 10 - numMoreEdits - numEdits);
57 if(yetMoreEdits_ == NULL) {
60 assert(yetMoreEdits_ != NULL);
64 assert(moreEdits_ != NULL);
65 assert(yetMoreEdits_ != NULL);
66 yetMoreEdits_[sz_ - numEdits - numMoreEdits] = e;
73 * Return a const reference to the ith Edit in the list.
// Same three-tier dispatch as add(): inline, then moreEdits_, then
// yetMoreEdits_, with the index rebased at each tier boundary.
75 const Edit& get(size_t i) const {
79 } else if(i < (numEdits + numMoreEdits)) {
80 assert(moreEdits_ != NULL);
81 return moreEdits_[i-numEdits];
83 assert(moreEdits_ != NULL);
84 assert(yetMoreEdits_ != NULL);
85 return yetMoreEdits_[i-numEdits-numMoreEdits];
90 * Get most recently added Edit.
92 const Edit& top() const {
98 * Return true iff no Edits have been added.
100 bool empty() const { return size() == 0; }
103 * Set a particular element of the EditList.
// Writing counterpart of get(); same tier dispatch.
105 void set(size_t i, const Edit& e) {
109 } else if(i < (numEdits + numMoreEdits)) {
110 assert(moreEdits_ != NULL);
111 moreEdits_[i-numEdits] = e;
113 assert(moreEdits_ != NULL);
114 assert(yetMoreEdits_ != NULL);
115 yetMoreEdits_[i-numEdits-numMoreEdits] = e;
120 * Remove all Edits from the list.
125 yetMoreEdits_ = NULL;
129 * Return number of Edits in the List.
131 size_t size() const { return sz_; }
134 * Free all the heap-allocated edit lists
// Sizes passed to epool.free must match those requested in add() so the
// pool's bookkeeping balances.
136 void free(AllocOnlyPool<Edit>& epool, size_t qlen) {
137 if(yetMoreEdits_ != NULL)
138 epool.free(yetMoreEdits_, qlen + 10 - numMoreEdits - numEdits);
139 if(moreEdits_ != NULL)
140 epool.free(moreEdits_, numMoreEdits);
143 const static size_t numEdits = 6; // part of object allocation
144 const static size_t numMoreEdits = 16; // first extra allocation
145 size_t sz_; // number of Edits stored in the EditList
146 Edit edits_[numEdits]; // first numEdits (6) edits; typically, no more are needed
147 Edit *moreEdits_; // if used, size is dictated by numMoreEdits
148 Edit *yetMoreEdits_; // if used, size is dictated by length of read
// NOTE(review): sampled extraction -- struct/union headers and many braces
// are missing; only comments are added here.
//
// ElimsAndQual packs per-position state into one 64-bit word via three
// overlapping bitfield layouts (each sums to 64 bits: 13 elim flags +
// 6 x 7-bit quality fields (42) + 9 reserved).
152 * Holds per-position information about what outgoing paths have been
153 * eliminated and what the quality value(s) is (are) at the position.
158 * Assuming qual A/C/G/T are already set, set quallo and quallo2
159 * to the additional cost incurred by the least and second-least
// NOTE(review): unlike the C/G/T cases below, the A case never updates
// quallo2 (those updates are commented out) -- presumably safe because A
// is processed first while quallo/quallo2 still hold sentinels; confirm
// against the missing initialization lines.
166 // A mismatch to an A in the genome has not been ruled out
167 if(flags.qualA < flags.quallo) {
168 //flags.quallo2 = flags.quallo;
169 flags.quallo = flags.qualA;
171 //else if(flags.qualA == flags.quallo) {
172 // flags.quallo2 = flags.quallo;
173 //} else if(flags.qualA < flags.quallo2) {
174 // flags.quallo2 = flags.qualA;
178 // A mismatch to a C in the genome has not been ruled out
// Standard "track two smallest" update: demote quallo into quallo2 when
// beaten, or tighten quallo2 alone on a tie/second-best.
179 if(flags.qualC < flags.quallo) {
180 flags.quallo2 = flags.quallo;
181 flags.quallo = flags.qualC;
182 } else if(flags.qualC == flags.quallo) {
183 flags.quallo2 = flags.quallo;
184 } else if(flags.qualC < flags.quallo2) {
185 flags.quallo2 = flags.qualC;
189 // A mismatch to a G in the genome has not been ruled out
190 if(flags.qualG < flags.quallo) {
191 flags.quallo2 = flags.quallo;
192 flags.quallo = flags.qualG;
193 } else if(flags.qualG == flags.quallo) {
194 flags.quallo2 = flags.quallo;
195 } else if(flags.qualG < flags.quallo2) {
196 flags.quallo2 = flags.qualG;
200 // A mismatch to a T in the genome has not been ruled out
201 if(flags.qualT < flags.quallo) {
202 flags.quallo2 = flags.quallo;
203 flags.quallo = flags.qualT;
204 } else if(flags.qualT == flags.quallo) {
205 flags.quallo2 = flags.quallo;
206 } else if(flags.qualT < flags.quallo2) {
207 flags.quallo2 = flags.qualT;
214 * Set all 13 elimination bits of the flags field to 1, indicating
215 * that all outgoing paths are eliminated.
217 inline void eliminate() {
// (1 << 13) - 1 == 0x1fff: ones in all 13 elim bits, zeros elsewhere.
218 join.elims = ((1 << 13) - 1);
222 * Internal consistency check. Basically just checks that lo and
223 * lo2 are set correctly.
// Quality fields are 7-bit; 127 acts as the "unset"/sentinel maximum.
228 assert_lt(flags.qualA, 127);
229 assert_lt(flags.qualC, 127);
230 assert_lt(flags.qualG, 127);
231 assert_lt(flags.qualT, 127);
// Recompute lo/lo2 from scratch (A case again skips the lo2 update,
// mirroring updateLo above).
233 if(flags.qualA < lo) {
236 //else if(flags.qualA == lo || flags.qualA < lo2) {
237 // lo2 = flags.qualA;
241 if(flags.qualC < lo) {
244 } else if(flags.qualC == lo || flags.qualC < lo2) {
249 if(flags.qualG < lo) {
252 } else if(flags.qualG == lo || flags.qualG < lo2) {
257 if(flags.qualT < lo) {
260 } else if(flags.qualT == lo || flags.qualT < lo2) {
// Recomputed values must agree with the stored quallo/quallo2.
264 assert_eq((int)lo, (int)flags.quallo);
265 assert_eq((int)lo2, (int)flags.quallo2);
// Layout 1: every flag and quality field individually addressable.
270 uint64_t mmA : 1; // A in ref aligns to non-A char in read
271 uint64_t mmC : 1; // C in ref aligns to non-C char in read
272 uint64_t mmG : 1; // G in ref aligns to non-G char in read
273 uint64_t mmT : 1; // T in ref aligns to non-T char in read
274 uint64_t snpA : 1; // Same as mmA, but we think it's a SNP and not a miscall
275 uint64_t snpC : 1; // Same as mmC, but we think it's a SNP and not a miscall
276 uint64_t snpG : 1; // Same as mmG, but we think it's a SNP and not a miscall
277 uint64_t snpT : 1; // Same as mmT, but we think it's a SNP and not a miscall
278 uint64_t insA : 1; // A insertion in reference w/r/t read
279 uint64_t insC : 1; // C insertion in reference w/r/t read
280 uint64_t insG : 1; // G insertion in reference w/r/t read
281 uint64_t insT : 1; // T insertion in reference w/r/t read
282 uint64_t del : 1; // deletion of read character
283 uint64_t qualA : 7; // quality penalty for picking A at this position
284 uint64_t qualC : 7; // quality penalty for picking C at this position
285 uint64_t qualG : 7; // quality penalty for picking G at this position
286 uint64_t qualT : 7; // quality penalty for picking T at this position
287 uint64_t quallo : 7; // lowest quality penalty at this position
288 uint64_t quallo2 : 7; // 2nd-lowest quality penalty at this position
289 uint64_t reserved : 9;
// Layout 2: all 13 elim bits addressable at once (used by eliminate()).
292 uint64_t elims : 13; // all of the edit-elim flags bundled together
293 uint64_t quals : 42; // quality of positions
294 uint64_t reserved : 9;
// Layout 3: elim bits grouped by edit type (mm / snp / ins / del).
297 uint64_t mmElims : 4; // substitution flags bundled together
298 uint64_t snpElims : 4; // SNP-substitution flags bundled together
299 uint64_t insElims : 4; // inserts-in-reference flags bundled together
300 uint64_t delElims : 1; // deletion of read character
301 uint64_t quals : 42; // quality of positions
302 uint64_t reserved : 9;
// NOTE(review): sampled extraction -- many interior lines of pickEdit
// (branch bodies, returns, braces) are missing; comments only.
//
// RangeState: per-position search state -- the four outgoing
// Burrows-Wheeler ranges (tops/bots), the packed elim/quality word (eq),
// and whether every outgoing path is already eliminated.
307 * All per-position state, including the ranges calculated for each
308 * character, the quality value at the position, and a set of flags
309 * recording whether we've tried each way of proceeding from this
315 * Using randomness when picking from among multiple choices, pick
316 * an edit to make. TODO: Only knows how to pick mismatches for
319 Edit pickEdit(int pos, RandomSource& rand, bool fuzzy,
320 uint32_t& top, uint32_t& bot, bool indels,
// Only mismatch edits are produced here (see TODO above).
325 ret.type = EDIT_TYPE_MM;
330 assert(!eliminated_);
331 assert(!fuzzy || eq.repOk());
// At least one substitution must remain uneliminated.
332 assert(!eq.flags.mmA || !eq.flags.mmC || !eq.flags.mmG || !eq.flags.mmT);
// num = count of still-available substitution choices (0/1 flags negated).
333 int num = !eq.flags.mmA + !eq.flags.mmC + !eq.flags.mmG + !eq.flags.mmT;
336 uint8_t lo2 = eq.flags.quallo2;
// With two choices left, the 2nd-lowest qual reverts to sentinel 127.
337 if(num == 2) eq.flags.quallo2 = 127;
338 // Only need to pick randomly if there's a quality tie
339 if(num > 1 && (!fuzzy || eq.flags.quallo == lo2)) {
340 last = false; // not the last at this pos
341 // Sum up range sizes and do a random weighted pick
343 bool candA = !eq.flags.mmA; bool candC = !eq.flags.mmC;
344 bool candG = !eq.flags.mmG; bool candT = !eq.flags.mmT;
345 bool candInsA = false, candInsC = false;
346 bool candInsG = false, candInsT = false;
347 bool candDel = false;
349 // Insertions and deletions can only be candidates
350 // if the user asked for indels
351 candInsA = !eq.flags.insA;
352 candInsC = !eq.flags.insC;
353 candInsG = !eq.flags.insG;
354 candInsT = !eq.flags.insT;
355 candDel = !eq.flags.del;
358 // To be a candidate in fuzzy mode, you have to both
359 // (a) not have been eliminated, and (b) be tied for
361 candA = (candA && eq.flags.qualA == eq.flags.quallo);
362 candC = (candC && eq.flags.qualC == eq.flags.quallo);
363 candG = (candG && eq.flags.qualG == eq.flags.quallo);
364 candT = (candT && eq.flags.qualT == eq.flags.quallo);
366 ASSERT_ONLY(int origNum = num);
// Accumulate BW range sizes (bots[i]-tops[i]) of the candidates into
// 'tot' so the dart throw below is weighted by range size.
368 assert_gt(bots[0], tops[0]);
369 tot += (bots[0] - tops[0]);
374 assert_gt(bots[1], tops[1]);
375 tot += (bots[1] - tops[1]);
380 assert_gt(bots[2], tops[2]);
381 tot += (bots[2] - tops[2]);
386 assert_gt(bots[3], tops[3]);
387 tot += (bots[3] - tops[3]);
393 assert_gt(bots[0], tops[0]);
394 tot += (bots[0] - tops[0]);
399 assert_gt(bots[1], tops[1]);
400 tot += (bots[1] - tops[1]);
405 assert_gt(bots[2], tops[2]);
406 tot += (bots[2] - tops[2]);
411 assert_gt(bots[3], tops[3]);
412 tot += (bots[3] - tops[3]);
417 // Always a candidate?
418 // Always a candidate just within the window?
419 // We can eliminate some indels based on the content downstream, but not many
424 assert_lt(num, origNum);
425 // Throw a dart randomly that hits one of the possible
426 // substitutions, with likelihoods weighted by range size
427 uint32_t dart = rand.nextU32() % tot;
429 if(dart < (bots[0] - tops[0])) {
430 // Eliminate A mismatch
434 assert_lt(eq.join2.mmElims, 15);
// 'color' selects color-space ('0'..'3') vs nucleotide output -- presumably
// SOLiD colorspace support; confirm against missing context.
435 ret.chr = color ? '0' : 'A';
436 if(fuzzy) eq.updateLo();
439 dart -= (bots[0] - tops[0]);
442 if(dart < (bots[1] - tops[1])) {
443 // Eliminate C mismatch
447 assert_lt(eq.join2.mmElims, 15);
448 ret.chr = color ? '1' : 'C';
449 if(fuzzy) eq.updateLo();
452 dart -= (bots[1] - tops[1]);
455 if(dart < (bots[2] - tops[2])) {
456 // Eliminate G mismatch
460 assert_lt(eq.join2.mmElims, 15);
461 ret.chr = color ? '2' : 'G';
462 if(fuzzy) eq.updateLo();
465 dart -= (bots[2] - tops[2]);
// Dart exhausted A/C/G ranges; T is the only remaining possibility.
468 assert_lt(dart, (bots[3] - tops[3]));
469 // Eliminate T mismatch
473 assert_lt(eq.join2.mmElims, 15);
474 ret.chr = color ? '3' : 'T';
475 if(fuzzy) eq.updateLo();
// Fuzzy, no tie: deterministically take the uneliminated char whose qual
// equals quallo (first match in A,C,G,T order).
481 if(eq.flags.qualA == eq.flags.quallo && eq.flags.mmA == 0) {
484 } else if(eq.flags.qualC == eq.flags.quallo && eq.flags.mmC == 0) {
487 } else if(eq.flags.qualG == eq.flags.quallo && eq.flags.mmG == 0) {
491 assert_eq(eq.flags.qualT, eq.flags.quallo);
492 assert_eq(0, eq.flags.mmT);
496 ret.chr = color ? "0123"[chr] : "ACGT"[chr];
500 if(!last) eq.updateLo();
502 last = true; // last at this pos
503 // There's only one; pick it!
507 } else if(!eq.flags.mmC) {
509 } else if(!eq.flags.mmG) {
512 assert(!eq.flags.mmT);
515 ret.chr = color ? "0123"[chr] : "ACGT"[chr];
518 //assert_eq(15, eq.join2.mmElims);
519 // Mark entire position as eliminated
527 * Return true (without assertion) iff this RangeState is
528 * internally consistent.
531 if(eliminated_) return true;
532 // Uneliminated chars must have non-empty ranges
533 if(!eq.flags.mmA || !eq.flags.insA) assert_gt(bots[0], tops[0]);
534 if(!eq.flags.mmC || !eq.flags.insC) assert_gt(bots[1], tops[1]);
535 if(!eq.flags.mmG || !eq.flags.insG) assert_gt(bots[2], tops[2]);
536 if(!eq.flags.mmT || !eq.flags.insT) assert_gt(bots[3], tops[3]);
540 // Outgoing ranges; if the position being described is not a
541 // legitimate jumping-off point for a branch, tops[] and bots[]
542 // will be filled with 0s and all possibilities in eq will be
544 uint32_t tops[4]; // A, C, G, T top offsets
545 uint32_t bots[4]; // A, C, G, T bot offsets
546 ElimsAndQual eq; // Which outgoing paths have been tried already
547 bool eliminated_; // Whether all outgoing paths have been eliminated
// NOTE(review): sampled extraction -- the Branch class header, many method
// bodies, braces and returns are missing; comments only, code untouched.
//
// Branch: one node of the best-first backtracking search. Carries the BW
// range at its root (top_/bot_), per-position RangeStates for revisitable
// positions (ranges_), the EditList leading to its root, and a 16-bit
// cost_ whose top 2 bits are the stratum and bottom 14 the quality-ham.
551 * Encapsulates a "branch" of the search space; i.e. all of the
552 * information deduced by walking along a path with only matches, along
553 * with information about the decisions that lead to the root of that
557 typedef std::pair<uint32_t, uint32_t> U32Pair;
560 delayedCost_(0), curtailed_(false), exhausted_(false),
561 prepped_(false), delayedIncrease_(false) { }
564 * Initialize a new branch object with an empty path.
// Returns false on RangeStatePool exhaustion (see allocC below); the
// caller must treat the branch as unusable in that case.
566 bool init(AllocOnlyPool<RangeState>& rsPool,
567 AllocOnlyPool<Edit>& epool,
580 const EbwtParams& ep,
582 const EditList* edits = NULL)
590 assert_gt(depth3_, 0);
599 // Care about both top and bot
600 SideLocus::initFromTopBot(itop, ibot, ep, ebwt, ltop_, lbot_);
601 } else if(ibot > itop) {
602 // Only care about top
603 ltop_.initFromRow(itop, ep, ebwt);
// One RangeState per remaining read position beyond the root depth.
606 if(qlen - rdepth_ > 0) {
607 ranges_ = rsPool.allocC(qlen - rdepth_); // allocated from the RangeStatePool
608 if(ranges_ == NULL) {
609 return false; // RangeStatePool exhausted
611 rangesSz_ = qlen - rdepth_;
// allocC is expected to zero-initialize; verify below (debug-only).
617 for(size_t i = 0; i < (qlen - rdepth_); i++) {
618 for(int j = 0; j < 4; j++) {
619 assert_eq(0, ranges_[i].tops[j]);
620 assert_eq(0, ranges_[i].bots[j]);
627 delayedIncrease_ = false;
// Copy the parent's edits (if any) into this branch's own EditList.
630 const size_t numEdits = edits->size();
631 for(size_t i = 0; i < numEdits; i++) {
632 edits_.add(edits->get(i), epool, qlen);
635 // If we're starting with a non-zero length, that means we're
636 // jumping over a bunch of unrevisitable positions.
637 for(size_t i = 0; i < len_; i++) {
638 ranges_[i].eliminated_ = true;
639 assert(eliminated(i));
646 * Depth of the deepest tip of the branch.
648 uint16_t tipDepth() const {
649 return rdepth_ + len_;
653 * Return true iff all outgoing edges from position i have been
656 inline bool eliminated(int i) const {
658 if(i <= len_ && i < rangesSz_) {
659 assert(ranges_ != NULL);
661 if(!ranges_[i].eliminated_) {
662 // Someone must be as-yet-uneliminated
663 assert(!ranges_[i].eq.flags.mmA ||
664 !ranges_[i].eq.flags.mmC ||
665 !ranges_[i].eq.flags.mmG ||
666 !ranges_[i].eq.flags.mmT);
667 assert_lt(ranges_[i].eq.flags.quallo, 127);
670 return ranges_[i].eliminated_;
676 * Split off a new branch by selecting a good outgoing path and
677 * creating a new Branch object for it and inserting that branch
678 * into the priority queue. Mark that outgoing path from the
679 * parent branch as eliminated. If the second-best outgoing path
680 * cost more, add the difference to the cost of this branch (since
681 * that's the best we can do starting from here from now on).
683 Branch* splitBranch(AllocOnlyPool<RangeState>& rpool,
684 AllocOnlyPool<Edit>& epool,
685 AllocOnlyPool<Branch>& bpool,
687 RandomSource& rand, uint32_t qlen,
688 uint32_t qualLim, int seedLen,
689 bool qualOrder, const EbwtParams& ep,
690 const uint8_t* ebwt, bool ebwtFw,
696 assert(ranges_ != NULL);
699 Branch *newBranch = bpool.alloc();
700 if(newBranch == NULL) {
703 assert(newBranch != NULL);
704 uint32_t id = bpool.lastId();
// Track up to 3 positions tied for lowest marginal cost; ties beyond
// 3 displace the oldest (see shift at listing lines 750-752).
705 int tiedPositions[3];
706 int numTiedPositions = 0;
707 // Lowest marginal cost incurred by any of the positions with
708 // non-eliminated outgoing edges
709 uint16_t bestCost = 0xffff;
711 uint16_t nextCost = 0xffff;
712 int numNotEliminated = 0;
713 int i = (int)depth0_;
714 i = max(0, i - rdepth_);
715 // Iterate over revisitable positions in the path
716 for(; i <= len_; i++) {
717 // If there are still valid options for leaving out of this
722 assert_lt(ranges_[i].eq.flags.quallo, 127);
723 if(ranges_[i].eq.flags.quallo2 < 127) {
// Positions inside the seed belong to a higher stratum (bit 14 set),
// so seed edits always cost more than any quality difference.
727 uint16_t stratum = (rdepth_ + i < seedLen) ? (1 << 14) : 0;
728 uint16_t cost = stratum;
729 cost |= (qualOrder ? ranges_[i].eq.flags.quallo : 0);
730 if(cost < bestCost) {
731 // Demote the old best to the next-best
734 if(ranges_[i].eq.flags.quallo2 < 127) {
735 nextCost = min<uint16_t>(
736 nextCost, ranges_[i].eq.flags.quallo2 | stratum);
739 // Update the new best
741 numTiedPositions = 1;
742 tiedPositions[0] = i;
743 } else if(cost == bestCost) {
744 // As good as the best so far
745 if(fuzzy) nextCost = cost;
746 assert_gt(numTiedPositions, 0);
747 if(numTiedPositions < 3) {
748 tiedPositions[numTiedPositions++] = i;
750 tiedPositions[0] = tiedPositions[1];
751 tiedPositions[1] = tiedPositions[2];
752 tiedPositions[2] = i;
754 } else if(cost < nextCost) {
755 // 'cost' isn't better than the best, but it is
756 // better than the next-best
761 assert_gt(numNotEliminated, 0);
762 assert_gt(numTiedPositions, 0);
763 //if(nextCost != 0xffff) assert_gt(nextCost, bestCost);
// Break ties among equally-cheap positions at random.
765 if(numTiedPositions > 1) {
766 r = rand.nextU32() % numTiedPositions;
768 int pos = tiedPositions[r];
770 // Pick an edit from among the edits tied for lowest cost
771 // (using randomness to break ties). If the selected edit is
772 // the last remaining one at this position, 'last' is set to
774 uint32_t top = 0, bot = 0;
775 Edit e = ranges_[pos].pickEdit(pos + rdepth_, rand, fuzzy, top,
778 // Create and initialize a new Branch
779 uint16_t newRdepth = rdepth_ + pos + 1;
780 assert_lt((bestCost >> 14), 4);
// hamadd: quality portion of bestCost with the 2 stratum bits masked off.
781 uint32_t hamadd = (bestCost & ~0xc000);
782 uint16_t depth = pos + rdepth_;
783 assert_geq(depth, depth0_);
// Editing at this depth consumes one allowance; shift the depth
// thresholds so the child permits one fewer edit below 'depth'.
784 uint16_t newDepth0 = depth0_;
785 uint16_t newDepth1 = depth1_;
786 uint16_t newDepth2 = depth2_;
787 uint16_t newDepth3 = depth3_;
788 if(depth < depth1_) newDepth0 = depth1_;
789 if(depth < depth2_) newDepth1 = depth2_;
790 if(depth < depth3_) newDepth2 = depth3_;
791 assert_eq((uint32_t)(cost_ & ~0xc000), (uint32_t)(ham_ + hamadd));
793 rpool, epool, id, qlen,
794 newDepth0, newDepth1, newDepth2, newDepth3,
795 newRdepth, 0, cost_, ham_ + hamadd,
796 top, bot, ep, ebwt, &edits_))
// The child inherits this branch's edits plus the newly picked one.
801 newBranch->edits_.add(e, epool, qlen);
802 if(numNotEliminated == 1 && last) {
803 // This branch is totally exhausted; there are no more
804 // valid outgoing paths from any positions within it.
805 // Remove it from the PathManager and mark it as exhausted.
806 // The caller should delete it.
808 if(ranges_ != NULL) {
809 assert_gt(rangesSz_, 0);
810 if(rpool.free(ranges_, rangesSz_)) {
816 else if(fuzzy && bestCost != nextCost) {
817 // We exhausted the last outgoing edge at the current best
818 // cost; update the best cost to be the next-best
819 assert_gt(nextCost, bestCost);
820 delayedCost_ = (cost_ - bestCost + nextCost);
821 assert_gt(delayedCost_, cost_);
822 delayedIncrease_ = true;
824 else if(!fuzzy && numTiedPositions == 1 && last) {
825 // We exhausted the last outgoing edge at the current best
826 // cost; update the best cost to be the next-best
827 assert_neq(0xffff, nextCost);
828 if(bestCost != nextCost) {
829 assert_gt(nextCost, bestCost);
830 delayedCost_ = (cost_ - bestCost + nextCost);
831 delayedIncrease_ = true;
838 * Free a branch and all its contents.
840 void free(uint32_t qlen,
841 AllocOnlyPool<RangeState>& rpool,
842 AllocOnlyPool<Edit>& epool,
843 AllocOnlyPool<Branch>& bpool)
845 edits_.free(epool, qlen);
846 if(ranges_ != NULL) {
847 assert_gt(rangesSz_, 0);
848 rpool.free(ranges_, rangesSz_);
856 * Pretty-print the state of this branch.
858 void print(const String<Dna5>& qry,
859 const String<char>& quals,
869 const size_t qlen = seqan::length(qry);
// One-letter status prefix: E = exhausted, C = curtailed.
870 if(exhausted_) out << "E ";
871 else if(curtailed_) out << "C ";
873 if(ebwtFw) out << "<";
877 std::stringstream ss;
// Pad the formatted number to width 6.
881 for(size_t i = 0; i < 6 - s.length(); i++) {
886 std::stringstream ss2;
890 for(size_t i = 0; i < 6 - s.length(); i++) {
895 if(halfAndHalf) out << " h ";
896 else if(seeded) out << " s ";
898 std::stringstream ss3;
899 const size_t numEdits = edits_.size();
// Print the already-committed (root) portion of the read; edited
// positions are shown lowercased.
901 for(size_t i = 0; i < rdepth_; i++) {
902 if(editidx < numEdits && edits_.get(editidx).pos == i) {
903 ss3 << " " << (char)tolower(edits_.get(editidx).chr);
906 ss3 << " " << (char)qry[qlen - i - 1];
914 for(size_t i = 0; i < len_; i++) {
915 if(editidx < numEdits && edits_.get(editidx).pos == printed) {
916 ss3 << (char)tolower(edits_.get(editidx).chr) << " ";
919 ss3 << (char)qry[qlen - printed - 1] << " ";
923 assert_eq(editidx, edits_.size());
924 for(size_t i = printed; i < qlen; i++) {
929 std::reverse(s.begin(), s.end());
935 * Called when the most recent branch extension resulted in an
936 * empty range or some other constraint violation (e.g., a
937 * half-and-half constraint).
939 void curtail(AllocOnlyPool<RangeState>& rpool, int seedLen, bool qualOrder) {
942 if(ranges_ == NULL) {
947 uint16_t lowestCost = 0xffff;
948 // Iterate over positions in the path looking for the cost of
949 // the lowest-cost non-eliminated position
950 uint32_t eliminatedStretch = 0;
951 int i = (int)depth0_;
952 i = max(0, i - rdepth_);
953 // TODO: It matters whether an insertion/deletion at given
954 // position would be a gap open or a gap extension
955 for(; i <= len_; i++) {
957 eliminatedStretch = 0;
958 uint16_t stratum = (rdepth_ + i < seedLen) ? (1 << 14) : 0;
959 uint16_t cost = (qualOrder ? /*TODO*/ ranges_[i].eq.flags.quallo : 0) | stratum;
960 if(cost < lowestCost) lowestCost = cost;
961 } else if(i < rangesSz_) {
965 if(lowestCost > 0 && lowestCost != 0xffff) {
966 // This branch's cost will change when curtailed; the
967 // caller should re-insert it into the priority queue so
968 // that the new cost takes effect.
970 } else if(lowestCost == 0xffff) {
971 // This branch is totally exhausted; there are no more
972 // valid outgoing paths from any positions within it.
973 // Remove it from the PathManager and mark it as exhausted.
974 // The caller should delete it.
976 if(ranges_ != NULL) {
977 assert_gt(rangesSz_, 0);
978 if(rpool.free(ranges_, rangesSz_)) {
984 // Just mark it as curtailed and keep the same cost
986 if(ranges_ != NULL) {
987 // Try to trim off no-longer-relevant elements of the
// Return the tail of ranges_ (unreachable slots plus the trailing run
// of fully-eliminated positions) to the pool.
990 assert_gt(rangesSz_, 0);
991 uint32_t trim = (rangesSz_ - len_ - 1) + eliminatedStretch;
992 assert_leq(trim, rangesSz_);
993 if(rpool.free(ranges_ + rangesSz_ - trim, trim)) {
1004 * Prep this branch for the next extension by calculating the
1005 * SideLocus information and prefetching cache lines from the
1008 void prep(const EbwtParams& ep, const uint8_t* ebwt) {
1010 SideLocus::initFromTopBot(top_, bot_, ep, ebwt, ltop_, lbot_);
1011 } else if(bot_ > top_) {
1012 ltop_.initFromRow(top_, ep, ebwt);
1019 * Get the furthest-out RangeState.
1021 RangeState* rangeState() {
1022 assert(!exhausted_);
1023 assert(ranges_ != NULL);
1024 assert_lt(len_, rangesSz_);
1025 return &ranges_[len_];
1029 * Set the elims to match the ranges in ranges_[len_], already
1030 * calculated by the caller. Only does mismatches for now.
1032 int installRanges(int c, int nextc, bool fuzzy, uint32_t qAllow,
1035 assert(!exhausted_);
1036 assert(ranges_ != NULL);
1037 RangeState& r = ranges_[len_];
1039 r.eliminated_ = true; // start with everything eliminated
1040 r.eq.eliminate(); // set all elim flags to 1
1041 assert_lt(qs[0], 127);
1042 assert_lt(qs[1], 127);
1043 assert_lt(qs[2], 127);
1044 assert_lt(qs[3], 127);
1046 assert_eq(qs[0], qs[1]);
1047 assert_eq(qs[0], qs[2]);
1048 assert_eq(qs[0], qs[3]);
1049 r.eq.flags.quallo = qs[0];
// Quality penalty exceeds the remaining allowance: nothing to install.
1050 if(qs[0] > qAllow) return 0;
1052 // Set one/both of these to true to do the accounting for
1053 // insertions and deletions as well as mismatches
1054 bool doInserts = false;
1055 bool doDeletes = false;
// For each base != the read's base c: un-eliminate its mismatch if its
// BW range is non-empty and its quality penalty fits the allowance.
1056 // We can proceed on an A
1057 if(c != 0 && r.bots[0] > r.tops[0] && qs[0] <= qAllow) {
1058 r.eliminated_ = false;
1059 r.eq.flags.mmA = 0; // A substitution is an option
1060 if(doInserts) r.eq.flags.insA = 0;
1061 if(doDeletes && nextc == 0) r.eq.flags.del = 0;
1064 // We can proceed on a C
1065 if(c != 1 && r.bots[1] > r.tops[1] && qs[1] <= qAllow) {
1066 r.eliminated_ = false;
1067 r.eq.flags.mmC = 0; // C substitution is an option
1068 if(doInserts) r.eq.flags.insC = 0;
1069 if(doDeletes && nextc == 1) r.eq.flags.del = 0;
1072 // We can proceed on a G
1073 if(c != 2 && r.bots[2] > r.tops[2] && qs[2] <= qAllow) {
1074 r.eliminated_ = false;
1075 r.eq.flags.mmG = 0; // G substitution is an option
1076 if(doInserts) r.eq.flags.insG = 0;
1077 if(doDeletes && nextc == 2) r.eq.flags.del = 0;
1080 // We can proceed on a T
1081 if(c != 3 && r.bots[3] > r.tops[3] && qs[3] <= qAllow) {
1082 r.eliminated_ = false;
1083 r.eq.flags.mmT = 0; // T substitution is an option
1084 if(doInserts) r.eq.flags.insT = 0;
1085 if(doDeletes && nextc == 3) r.eq.flags.del = 0;
1088 if(!r.eliminated_ && fuzzy) {
1090 r.eq.flags.qualA = qs[0];
1091 r.eq.flags.qualC = qs[1];
1092 r.eq.flags.qualG = qs[2];
1093 r.eq.flags.qualT = qs[3];
1095 // Now that the quals are set and the elim flags are set
1096 // according to which Burrows-Wheeler ranges are empty,
1097 // determine best and second-best quals
1100 assert_lt(r.eq.flags.quallo, 127);
1106 * Extend this branch by one position.
1109 assert(!exhausted_);
1110 assert(!curtailed_);
1111 assert(ranges_ != NULL);
1118 * Do an internal consistency check
1120 bool repOk(uint32_t qlen = 0) const{
1121 assert_leq(depth0_, depth1_);
1122 assert_leq(depth1_, depth2_);
1123 assert_leq(depth2_, depth3_);
1124 assert_gt(depth3_, 0);
1126 assert_leq(edits_.size(), qlen); // might have to relax this with inserts
1127 assert_leq(rdepth_, qlen);
1129 for(int i = 0; i < len_; i++) {
1130 if(!eliminated(i)) {
1131 assert_lt(i, (int)(len_));
1132 assert(ranges_[i].repOk());
1135 const size_t numEdits = edits_.size();
// O(numEdits^2) pairwise check; numEdits is small in practice (bounded
// by qlen per the assert above).
1136 for(size_t i = 0; i < numEdits; i++) {
1137 for(size_t j = i+1; j < numEdits; j++) {
1138 // No two edits should be at the same position (might
1139 // have to relax this with inserts)
1140 assert_neq(edits_.get(i).pos, edits_.get(j).pos);
1143 assert_lt((cost_ >> 14), 4);
1147 uint32_t id_; // branch id; needed to make the ordering of
1148 // branches that are tied in the priority queue
1149 // totally unambiguous. Otherwise, things start
1150 // getting non-deterministic.
1151 uint16_t depth0_; // no edits at depths < depth0
1152 uint16_t depth1_; // at most 1 edit at depths < depth1
1153 uint16_t depth2_; // at most 2 edits at depths < depth2
1154 uint16_t depth3_; // at most 3 edits at depths < depth3
1155 uint16_t rdepth_; // offset in read space from root of search space
1156 uint16_t len_; // length of the branch
1157 uint16_t cost_; // top 2 bits = stratum, bottom 14 = qual ham
1158 // it's up to Branch to keep this updated with the
1159 // cumulative cost of the best path leaving the
1160 // branch; if the branch hasn't been fully
1161 // extended yet, then that path will always be the
1162 // one that extends it by one more
1163 uint16_t ham_; // quality-weighted hamming distance so far
1164 RangeState *ranges_; // Allocated from the RangeStatePool
1166 uint32_t top_; // top offset leading to the root of this subtree
1167 uint32_t bot_; // bot offset leading to the root of this subtree
1170 EditList edits_; // edits leading to the root of the branch
1172 uint16_t delayedCost_; // new cost to adopt once delayedIncrease_ is handled
1174 bool curtailed_; // can't be extended anymore without using edits
1175 bool exhausted_; // all outgoing edges exhausted, including all edits
1176 bool prepped_; // whether SideLocus's are inited
1177 bool delayedIncrease_; // caller should re-queue this branch at delayedCost_
// NOTE(review): sampled extraction -- struct header, returns and braces
// missing; comments only.
//
// Comparator for the Branch priority queue (see TBranchQueue below).
// std::priority_queue pops the *greatest* element under its comparator,
// so returning true ("b before a") makes lower-cost branches pop first.
1181 * Order two Branches based on cost.
1186 * true -> b before a
1187 * false -> a before b
1189 bool operator()(const Branch* a, const Branch* b) const {
1190 bool aUnextendable = a->curtailed_ || a->exhausted_;
1191 bool bUnextendable = b->curtailed_ || b->exhausted_;
1192 // Branch with the best cost
// Tie-break chain when costs are equal: extendability, then tip depth,
// then branch id (for fully deterministic ordering).
1193 if(a->cost_ == b->cost_) {
1194 // If one or the other is curtailed, take the one that's
1195 // still getting extended
1196 if(bUnextendable && !aUnextendable) {
1197 // a still being extended, return false
1200 if(aUnextendable && !bUnextendable) {
1201 // b still being extended, return true
1204 // Either both are curtailed or both are still being
1205 // extended, pick based on which one is deeper
1206 if(a->tipDepth() != b->tipDepth()) {
1207 // Expression is true if b is deeper
1208 return a->tipDepth() < b->tipDepth();
1210 // Keep things deterministic by providing an unambiguous
1211 // order using the id_ field
1212 assert_neq(b->id_, a->id_);
1213 return b->id_ < a->id_;
// Unequal costs: true iff b is cheaper, i.e. b outranks a.
1215 return b->cost_ < a->cost_;
// Equivalence used by BranchQueue's push/top sanity assertions.
1219 static bool equal(const Branch* a, const Branch* b) {
1220 return a->cost_ == b->cost_ && a->curtailed_ == b->curtailed_ && a->tipDepth() == b->tipDepth();
// NOTE(review): sampled extraction -- class header, returns and braces
// missing; comments only.
//
// BranchQueue: thin wrapper around std::priority_queue<Branch*> keeping an
// explicit element count (sz_) and optional verbose trace output keyed by
// the current pattern id (patid_).
1225 * A priority queue for Branch objects; makes it easy to process
1226 * branches in a best-first manner by prioritizing branches with lower
1227 * cumulative costs over branches with higher cumulative costs.
1231 typedef std::pair<int, int> TIntPair;
1232 typedef std::priority_queue<Branch*, std::vector<Branch*>, CostCompare> TBranchQueue;
1236 BranchQueue(bool verbose, bool quiet) :
1237 sz_(0), branchQ_(), patid_(0), verbose_(verbose), quiet_(quiet)
1241 * Return the front (highest-priority) element of the queue.
1244 Branch *b = branchQ_.top();
// Verbose trace of the fronted branch and the size transition.
1247 ss << patid_ << ": Fronting " << b->id_ << ", " << b << ", " << b->cost_ << ", " << b->exhausted_ << ", " << b->curtailed_ << ", " << sz_ << "->" << (sz_-1);
1254 * Remove and return the front (highest-priority) element of the
1258 Branch *b = branchQ_.top(); // get it
1259 branchQ_.pop(); // remove it
1262 ss << patid_ << ": Popping " << b->id_ << ", " << b << ", " << b->cost_ << ", " << b->exhausted_ << ", " << b->curtailed_ << ", " << sz_ << "->" << (sz_-1);
1270 * Insert a new Branch into the sorted priority queue.
1272 void push(Branch *b) {
// b "is better" iff it would not be ordered below the current top.
1274 bool bIsBetter = empty() || !CostCompare()(b, branchQ_.top());
1278 ss << patid_ << ": Pushing " << b->id_ << ", " << b << ", " << b->cost_ << ", " << b->exhausted_ << ", " << b->curtailed_ << ", " << sz_ << "->" << (sz_+1);
// Sanity: after the push, the top must be consistent with bIsBetter
// (modulo branches that compare equal under CostCompare::equal).
1283 assert(bIsBetter || branchQ_.top() != b || CostCompare::equal(branchQ_.top(), b));
1284 assert(!bIsBetter || branchQ_.top() == b || CostCompare::equal(branchQ_.top(), b));
1290 * Empty the priority queue and reset the count.
1292 void reset(uint32_t patid) {
// std::priority_queue has no clear(); swap in a fresh instance.
1294 branchQ_ = TBranchQueue();
1299 * Return true iff the priority queue of branches is empty.
1301 bool empty() const {
// sz_ must agree with the underlying queue's emptiness.
1302 bool ret = branchQ_.empty();
1303 assert(ret || sz_ > 0);
1304 assert(!ret || sz_ == 0);
1309 * Return the number of Branches in the queue.
1311 uint32_t size() const {
1317 * Consistency check.
1319 bool repOk(std::set<Branch*>& bset) {
// The queue's top must carry the best (stratum, ham) over the whole set.
1320 TIntPair pair = bestStratumAndHam(bset);
1321 Branch *b = branchQ_.top();
1322 assert_eq(pair.first, (b->cost_ >> 14));
1323 assert_eq(pair.second, (b->cost_ & ~0xc000));
1324 std::set<Branch*>::iterator it;
1325 for(it = bset.begin(); it != bset.end(); it++) {
1326 assert_gt((*it)->depth3_, 0);
1336 * Return the stratum and quality-weight (sum of qualities of all
1337 * edited positions) of the lowest-cost branch.
1339 TIntPair bestStratumAndHam(std::set<Branch*>& bset) const {
// Sentinel pair; any real branch beats (0xffff, 0xffff).
1340 TIntPair ret = make_pair(0xffff, 0xffff);
1341 std::set<Branch*>::iterator it;
1342 for(it = bset.begin(); it != bset.end(); it++) {
// cost_ layout: top 2 bits stratum, bottom 14 quality-ham (mask 0xc000).
1344 int stratum = b->cost_ >> 14;
1345 assert_lt(stratum, 4);
1346 int qual = b->cost_ & ~0xc000;
1347 if(stratum < ret.first ||
1348 (stratum == ret.first && qual < ret.second))
1350 ret.first = stratum;
1359 TBranchQueue branchQ_; // priority queue of branches
1366 * A class that both contains Branches and determines how those
1367 * branches are extended to form longer paths. The overall goal is to
1368 * find the best full alignment(s) as quickly as possible so that a
1369 * successful search can finish quickly. A second goal is to ensure
1370 * that the most "promising" paths are tried first so that, if there is
1371 * a limit on the amount of effort spent searching before we give up,
1372 * we can be as sensitive as possible given that limit.
1374 * The quality (or cost) of a given alignment path will ultimately be
1375 * configurable. The default cost model is:
1377 * 1. Mismatches incur one "edit" penalty and a "quality" penalty with
1378 * magnitude equal to the Phred quality score of the read position
1379 * involved in the edit (note that insertions into the read are a
1381 * 2. Edit penalties are all more costly than any quality penalty; i.e.
1382 * the policy sorts alignments first by edit penalty, then by
1384 * 3. For the Maq-like alignment policy, edit penalties saturate (don't
1385 * get any greater) after leaving the seed region of the alignment.
// Construct a PathManager around a shared ChunkPool; the three
// AllocOnlyPools (Branches, RangeStates, Edits) draw from that pool.
// NOTE(review): listing elided between 1392 and 1394 -- presumably the
// member init for cpool; confirm against the full file.
1391 PathManager(ChunkPool* cpool_, int *btCnt, bool verbose, bool quiet) :
1392 branchQ_(verbose, quiet),
1394 bpool(cpool, "branch"),
1395 rpool(cpool, "range state"),
1396 epool(cpool, "edit"),
1397 minCost(0), btCnt_(btCnt),
1405 * Return the "front" (highest-priority) branch in the collection.
// Asserted invariant: the front branch has depth3_ > 0.
1409 assert_gt(branchQ_.front()->depth3_, 0);
1410 return branchQ_.front();
1414 * Pop the highest-priority (lowest cost) element from the
// Remove and return the lowest-cost Branch, keeping the assert-only
// mirror set (branchSet_) and the cached minCost consistent.
1418 Branch* b = branchQ_.pop();
1419 assert_gt(b->depth3_, 0);
1421 // Also remove it from the set
1422 assert(branchSet_.find(b) != branchSet_.end());
1423 ASSERT_ONLY(size_t setSz = branchSet_.size());
1424 branchSet_.erase(branchSet_.find(b));
1425 assert_eq(setSz-1, branchSet_.size());
1426 if(!branchQ_.empty()) {
1427 // Top shouldn't be b any more
1428 Branch *newtop = branchQ_.front();
1429 assert(b != newtop);
1432 // Update this PathManager's cost
// minCost tracks the cost of whatever branch is now at the front.
1433 minCost = branchQ_.front()->cost_;
1439 * Push a new element onto the priority queue.
1441 void push(Branch *b) {
// Only live (non-exhausted, depth3_ > 0) branches may be queued.
1442 assert(!b->exhausted_);
1443 assert_gt(b->depth3_, 0);
1446 // Also insert it into the set
1447 assert(branchSet_.find(b) == branchSet_.end());
1448 branchSet_.insert(b);
1450 // Update this PathManager's cost
// Re-read the front: the pushed branch may or may not be it.
1451 minCost = branchQ_.front()->cost_;
1455 * Return the number of active branches in the best-first
// Delegates directly to the underlying BranchQueue.
1459 return branchQ_.size();
1463 * Reset the PathManager, clearing out the priority queue and
1464 * resetting the RangeStatePool.
// Also resets the Branch and Edit pools and (assert-only) the mirror
// branch set; after this call everything must be empty.
1466 void reset(uint32_t patid) {
1467 branchQ_.reset(patid); // reset the priority queue
1468 assert(branchQ_.empty());
1469 bpool.reset(); // reset the Branch pool
1470 epool.reset(); // reset the Edit pool
1471 rpool.reset(); // reset the RangeState pool
1472 assert(bpool.empty());
1473 assert(epool.empty());
1474 assert(rpool.empty());
1475 ASSERT_ONLY(branchSet_.clear());
1476 assert_eq(0, branchSet_.size());
1477 assert_eq(0, branchQ_.size());
1483 * Return true iff Branch b is in the priority queue;
// Membership is checked against the assert-only mirror set.
1485 bool contains(Branch *b) const {
1486 bool ret = branchSet_.find(b) != branchSet_.end();
// A contained branch must still be live (not exhausted).
1487 assert(!ret || !b->exhausted_);
1492 * Do a consistency-check on the collection of branches contained
1493 * in this PathManager.
// Trivially OK when empty; otherwise delegate to the queue's own
// check, cross-validated against the mirror branch set.
1496 if(empty()) return true;
1497 assert(branchQ_.repOk(branchSet_));
1503 * Return true iff the priority queue of branches is empty.
1505 bool empty() const {
1506 bool ret = branchQ_.empty();
// Queue and mirror set must agree on emptiness.
1507 assert_eq(ret, branchSet_.empty());
1512 * Curtail the given branch, and possibly remove it from or
1513 * re-insert it into the priority queue.
// Curtailing may raise the branch's cost or exhaust it entirely:
//  - exhausted: pop it and free its resources back to the pools;
//  - cost changed: pop it (re-insert statements are elided in this
//    listing -- presumably a push follows; confirm in the full file).
1515 void curtail(Branch *br, uint32_t qlen, int seedLen, bool qualOrder) {
1516 assert(!br->exhausted_);
1517 assert(!br->curtailed_);
1518 uint16_t origCost = br->cost_;
1519 br->curtail(rpool, seedLen, qualOrder);
1520 assert(br->curtailed_);
// Curtailing can only keep or increase cost, never lower it.
1521 assert_geq(br->cost_, origCost);
1522 if(br->exhausted_) {
1523 assert(br == front());
1524 ASSERT_ONLY(Branch *popped =) pop();
1525 assert(popped == br);
1526 br->free(qlen, rpool, epool, bpool);
1527 } else if(br->cost_ != origCost) {
1528 // Re-insert the newly-curtailed branch
1529 assert(br == front());
1530 Branch *popped = pop();
1531 assert(popped == br);
1537 * If the frontmost branch is a curtailed branch, split off an
1538 * extendable branch and add it to the queue.
// Apparent contract: returns false when the backtrack budget
// (*btCnt_) is spent and the search must end abruptly; returns true
// otherwise (including the empty() case). Early-return statements
// are elided in this listing -- confirm against the full file.
1540 bool splitAndPrep(RandomSource& rand, uint32_t qlen,
1541 uint32_t qualLim, int seedLen,
1542 bool qualOrder, bool fuzzy,
1543 const EbwtParams& ep, const uint8_t* ebwt,
1546 if(empty()) return true;
1547 // This counts as a backtrack
1548 if(btCnt_ != NULL && (*btCnt_ == 0)) {
1549 // Abruptly end search
1552 Branch *f = front();
1553 assert(!f->exhausted_);
// Apply any delayed cost increase before splitting: pop the branch,
// install the delayed cost, then re-push so it sorts correctly.
1554 while(f->delayedIncrease_) {
1555 assert(!f->exhausted_);
1556 if(f->delayedIncrease_) {
1557 assert_neq(0, f->delayedCost_);
1558 ASSERT_ONLY(Branch *popped =) pop();
1559 assert(popped == f);
1560 f->cost_ = f->delayedCost_;
1561 f->delayedIncrease_ = false;
1562 f->delayedCost_ = 0;
1563 push(f); // re-insert it
1567 assert(!f->exhausted_);
1570 ASSERT_ONLY(uint16_t origCost = f->cost_);
1571 // This counts as a backtrack
1572 if(btCnt_ != NULL) {
1573 if(--(*btCnt_) == 0) {
1574 // Abruptly end search
// Split an extendable branch off of the curtailed front branch.
1578 Branch* newbr = splitBranch(
1579 f, rand, qlen, qualLim, seedLen, qualOrder, fuzzy, ep, ebwt,
1584 // If f is exhausted, get rid of it immediately
1586 assert(!f->delayedIncrease_);
1587 ASSERT_ONLY(Branch *popped =) pop();
1588 assert(popped == f);
1589 f->free(qlen, rpool, epool, bpool);
1591 assert_eq(origCost, f->cost_);
1592 assert(newbr != NULL);
// The freshly split branch should now be the cheapest in the queue.
1594 assert(newbr == front());
1601 * Return true iff the front element of the queue is prepped.
1604 return front()->prepped_;
1610 * Split off an extendable branch from a curtailed branch.
// Thin wrapper: delegates to Branch::splitBranch with this manager's
// pools and the current queue size, then sanity-checks the result.
1612 Branch* splitBranch(Branch* src, RandomSource& rand, uint32_t qlen,
1613 uint32_t qualLim, int seedLen, bool qualOrder, bool fuzzy,
1614 const EbwtParams& ep, const uint8_t* ebwt,
1617 Branch* dst = src->splitBranch(
1618 rpool, epool, bpool, size(), rand,
1619 qlen, qualLim, seedLen, qualOrder, ep, ebwt, ebwtFw, fuzzy,
1621 assert(dst->repOk());
1626 * Prep the next branch to be extended in advanceBranch().
// No-op when the queue is empty.
1628 void prep(const EbwtParams& ep, const uint8_t* ebwt) {
1629 if(!branchQ_.empty()) {
1630 branchQ_.front()->prep(ep, ebwt);
1634 BranchQueue branchQ_; // priority queue for selecting lowest-cost Branch
1635 // set of branches in priority queue, for sanity checks
1636 ASSERT_ONLY(std::set<Branch*> branchSet_);
1640 ChunkPool *cpool; // pool for generic chunks of memory
1641 AllocOnlyPool<Branch> bpool; // pool for allocating Branches
1642 AllocOnlyPool<RangeState> rpool; // pool for allocating RangeStates
1643 AllocOnlyPool<Edit> epool; // pool for allocating Edits
1644 /// The minimum possible cost for any alignments obtained by
1645 /// advancing further
// NOTE(review): the declarations documented by the two comments below
// (minCost, btCnt_) are elided from this listing.
1649 /// Pointer to the aligner's per-read backtrack counter. We
1650 /// increment it in splitBranch.
1657 * Encapsulates an algorithm that navigates the Bowtie index to produce
1658 * candidate ranges of alignments in the Burrows-Wheeler matrix. A
1659 * higher authority is responsible for reporting hits out of those
1660 * ranges, and stopping when the consumer is satisfied.
// Abstract interface for producers of BW-matrix ranges; concrete
// subclasses implement setQuery/initBranch/advanceBranch/range.
1663 typedef Ebwt<String<Dna> > TEbwt;
1666 done(false), foundRange(false), curEbwt_(NULL) { }
1668 virtual ~RangeSource() { }
1670 /// Set query to find ranges for
1671 virtual void setQuery(ReadBuf& r, Range *partial) = 0;
1672 /// Set up the range search.
1673 virtual void initBranch(PathManager& pm) = 0;
1674 /// Advance the range search by one memory op
1675 virtual void advanceBranch(int until, uint16_t minCost, PathManager& pm) = 0;
1677 /// Return the last valid range found
1678 virtual Range& range() = 0;
1679 /// Return ptr to index this RangeSource is currently getting ranges from
1680 const TEbwt *curEbwt() const { return curEbwt_; }
1682 /// All searching w/r/t the current query is finished
1684 /// Set to true iff the last call to advance yielded a range
1687 /// ptr to index this RangeSource is currently getting ranges from
1688 const TEbwt *curEbwt_;
1692 * Abstract parent of RangeSourceDrivers.
// Abstract driver: wraps a TRangeSource and exposes the
// setQuery/advance protocol used by the aligner. The public
// setQuery/advance methods add assert-only bookkeeping around the
// pure-virtual *Impl hooks that subclasses implement.
1694 template<typename TRangeSource>
1695 class RangeSourceDriver {
1696 typedef Ebwt<String<Dna> > TEbwt;
1698 RangeSourceDriver(bool _done, uint32_t minCostAdjustment = 0) :
1699 foundRange(false), done(_done), minCostAdjustment_(minCostAdjustment)
1701 minCost = minCostAdjustment_;
1704 virtual ~RangeSourceDriver() { }
1707 * Prepare this aligner for the next read.
1709 virtual void setQuery(PatternSourcePerThread* patsrc, Range *r) {
1710 // Clear our buffer of previously-dished-out top offsets
1711 ASSERT_ONLY(allTops_.clear());
1712 setQueryImpl(patsrc, r);
1716 * Prepare this aligner for the next read.
1718 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) = 0;
1721 * Advance the aligner by one memory op. Return true iff we're
1722 * done with this read.
1724 virtual void advance(int until) {
1727 if(this->foundRange) {
1728 // Assert that we have not yet dished out a range with this
// Encode top as a signed key (negated for reverse-index ranges,
// +1 so 0 stays distinguishable) and check it is new.
1730 assert_gt(range().bot, range().top);
1731 assert(range().ebwt != NULL);
1732 int64_t top = (int64_t)range().top;
1733 top++; // ensure it's not 0
1734 if(!range().ebwt->fw()) top = -top;
1735 assert(allTops_.find(top) == allTops_.end());
1736 allTops_.insert(top);
1741 * Advance the aligner by one memory op. Return true iff we're
1742 * done with this read.
1744 virtual void advanceImpl(int until) = 0;
1746 * Return the range found.
1748 virtual Range& range() = 0;
1751 * Return whether we're generating ranges for the first or the
1754 virtual bool mate1() const = 0;
1757 * Return true iff current pattern is forward-oriented.
1759 virtual bool fw() const = 0;
1761 virtual void removeMate(int m) { }
1763 /// Set to true iff we just found a range.
1767 * Set to true if all searching w/r/t the current query is
1768 * finished or if there is no current query.
1773 * The minimum "combined" stratum/qual cost that could possibly
1774 * result from subsequent calls to advance() for this driver.
1779 * Adjustment to the minCost given by the caller that constructed
1780 * the object. This is useful if we know the lowest-cost branch is
1781 * likely to cost less than any of the alignments that could
1782 * possibly result from advancing (e.g. when we're going to force a
1783 * mismatch somewhere down the line).
1785 uint16_t minCostAdjustment_;
// Assert-only record of every range 'top' already dished out.
1790 std::set<int64_t> allTops_;
1795 * A concrete driver wrapper for a single RangeSource.
// Concrete driver around exactly one TRangeSource, owning its own
// PathManager (pm_). NOTE(review): this listing is elided throughout;
// several parameters, statements and closing braces are not shown.
1797 template<typename TRangeSource>
1798 class SingleRangeSourceDriver : public RangeSourceDriver<TRangeSource> {
1800 typedef Ebwt<String<Dna> > TEbwt;
1803 SingleRangeSourceDriver(
1804 EbwtSearchParams<String<Dna> >& params,
1808 HitSinkPerThread* sinkPt,
1809 vector<String<Dna5> >& os,
1813 uint32_t minCostAdjustment,
1816 RangeSourceDriver<TRangeSource>(true, minCostAdjustment),
1817 len_(0), mate1_(mate1),
1821 ebwtFw_(rs_->curEbwt()->fw()),
1822 pm_(pool, btCnt, verbose, quiet)
1824 assert(rs_ != NULL);
// Owns rs_; destroyed here.
1827 virtual ~SingleRangeSourceDriver() {
1828 delete rs_; rs_ = NULL;
1832 * Prepare this aligner for the next read.
1834 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) {
1836 pm_.reset(patsrc->patid());
// Pick mate-1 or mate-2 buffer depending on which mate we drive.
1837 ReadBuf* buf = mate1_ ? &patsrc->bufa() : &patsrc->bufb();
1838 len_ = buf->length();
1839 rs_->setQuery(*buf, r);
// Use reversed qualities when read and index orientations differ.
1840 initRangeSource((fw_ == ebwtFw_) ? buf->qual : buf->qualRev,
1841 buf->fuzzy, buf->alts,
1842 (fw_ == ebwtFw_) ? buf->altQual : buf->altQualRev);
1844 if(this->done) return;
1845 ASSERT_ONLY(allTops_.clear());
1847 rs_->initBranch(pm_); // set up initial branch
// Seed minCost from the partial-alignment range (if any), floored
// by the caller-supplied adjustment.
1849 uint16_t icost = (r != NULL) ? r->cost : 0;
1850 this->minCost = max<uint16_t>(icost, this->minCostAdjustment_);
1851 this->done = rs_->done;
1852 this->foundRange = rs_->foundRange;
1854 assert(!pm_.front()->curtailed_);
1855 assert(!pm_.front()->exhausted_);
1860 * Advance the aligner by one memory op. Return true iff we're
1861 * done with this read.
1863 virtual void advanceImpl(int until) {
1864 if(this->done || pm_.empty()) {
1868 assert(!pm_.empty());
1869 assert(!pm_.front()->curtailed_);
1870 assert(!pm_.front()->exhausted_);
1872 // Advance the RangeSource for the forward-oriented read
1873 ASSERT_ONLY(uint16_t oldMinCost = this->minCost);
1874 ASSERT_ONLY(uint16_t oldPmMinCost = pm_.minCost);
1875 rs_->advanceBranch(until, this->minCost, pm_);
1876 this->done = pm_.empty();
// pm_.minCost == 0 is a sentinel (see comment below); only adopt
// it as our minCost when non-zero.
1877 if(pm_.minCost != 0) {
1878 this->minCost = max(pm_.minCost, this->minCostAdjustment_);
1880 // pm_.minCost is 0 because we reset it due to exceptional
// Debug-only diagnostics: costs must be monotonically
// non-decreasing as the search advances.
1886 if(pm_.minCost != 0 && pm_.minCost < oldPmMinCost) {
1887 cerr << "PathManager's cost went down" << endl;
1890 if(this->minCost < oldMinCost) {
1891 cerr << "this->minCost cost went down" << endl;
1895 cerr << "pm.minCost went from " << oldPmMinCost
1896 << " to " << pm_.minCost << endl;
1897 cerr << "this->minCost went from " << oldMinCost
1898 << " to " << this->minCost << endl;
1899 cerr << "this->minCostAdjustment_ == "
1900 << this->minCostAdjustment_ << endl;
1905 this->foundRange = rs_->foundRange;
1907 if(this->foundRange) {
1908 if(until >= ADV_COST_CHANGES) assert_eq(oldMinCost, range().cost);
1909 // Assert that we have not yet dished out a range with this
// Same signed-top dedup-key scheme as the base class.
1911 assert_gt(range().bot, range().top);
1912 assert(range().ebwt != NULL);
1913 int64_t top = (int64_t)range().top;
1914 top++; // ensure it's not 0
1915 if(!range().ebwt->fw()) top = -top;
1916 assert(allTops_.find(top) == allTops_.end());
1917 allTops_.insert(top);
1920 assert(!pm_.front()->curtailed_);
1921 assert(!pm_.front()->exhausted_);
1927 * Return the range found.
// Stamps the range with this driver's orientation and mate before
// handing it out.
1929 virtual Range& range() {
1930 rs_->range().fw = fw_;
1931 rs_->range().mate1 = mate1_;
1932 return rs_->range();
1936 * Return whether we're generating ranges for the first or the
1939 bool mate1() const {
1944 * Return true iff current pattern is forward-oriented.
// Hook for subclasses to configure the wrapped range source.
1950 virtual void initRangeSource(const String<char>& qual, bool fuzzy,
1951 int alts, const String<char>* altQuals) = 0;
1959 // Temporary HitSink; to be deleted
1960 HitSinkPerThread* sinkPt_;
1962 // State for alignment
1963 EbwtSearchParams<String<Dna> >& params_;
1965 TRangeSource* rs_; // delete this in destructor
1968 ASSERT_ONLY(std::set<int64_t> allTops_);
1972 * Encapsulates an algorithm that navigates the Bowtie index to produce
1973 * candidate ranges of alignments in the Burrows-Wheeler matrix. A
1974 * higher authority is responsible for reporting hits out of those
1975 * ranges, and stopping when the consumer is satisfied.
// Null-object driver: starts not-done, never finds a range, and
// range() must never be called (it throws).
1977 template<typename TRangeSource>
1978 class StubRangeSourceDriver : public RangeSourceDriver<TRangeSource> {
1980 typedef Ebwt<String<Dna> > TEbwt;
1981 typedef std::vector<RangeSourceDriver<TRangeSource>*> TRangeSrcDrPtrVec;
1985 StubRangeSourceDriver() :
1986 RangeSourceDriver<TRangeSource>(false)
1989 this->foundRange = false;
1992 virtual ~StubRangeSourceDriver() { }
1994 /// Set query to find ranges for
1995 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) { }
1997 /// Advance the range search by one memory op
1998 virtual void advanceImpl(int until) { }
2000 /// Return the last valid range found
2001 virtual Range& range() { throw 1; }
2004 * Return whether we're generating ranges for the first or the
2007 virtual bool mate1() const {
2012 * Return true iff current pattern is forward-oriented.
2014 virtual bool fw() const {
2021 * Encapsulates an algorithm that navigates the Bowtie index to produce
2022 * candidate ranges of alignments in the Burrows-Wheeler matrix. A
2023 * higher authority is responsible for reporting hits out of those
2024 * ranges, and stopping when the consumer is satisfied.
// Sequential composite driver: exhausts each contained
// RangeSourceDriver in list order before moving to the next.
// NOTE(review): listing elided in places (line numbers jump).
2026 template<typename TRangeSource>
2027 class ListRangeSourceDriver : public RangeSourceDriver<TRangeSource> {
2029 typedef Ebwt<String<Dna> > TEbwt;
2030 typedef std::vector<RangeSourceDriver<TRangeSource>*> TRangeSrcDrPtrVec;
2034 ListRangeSourceDriver(const TRangeSrcDrPtrVec& rss) :
2035 RangeSourceDriver<TRangeSource>(false),
2036 cur_(0), ham_(0), rss_(rss) /* copy */,
2037 patsrc_(NULL), seedRange_(NULL)
2039 assert_gt(rss_.size(), 0);
2040 assert(!this->done);
// Owns the contained drivers; deleted here.
2043 virtual ~ListRangeSourceDriver() {
2044 for(size_t i = 0; i < rss_.size(); i++) {
2049 /// Set query to find ranges for
2050 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) {
2051 cur_ = 0; // go back to first RangeSource in list
2053 rss_[cur_]->setQuery(patsrc, r);
2054 patsrc_ = patsrc; // so that we can call setQuery on the other elements later
// Done only when the last driver in the list is done.
2056 this->done = (cur_ == rss_.size()-1) && rss_[cur_]->done;
2057 this->minCost = max(rss_[cur_]->minCost, this->minCostAdjustment_);
2058 this->foundRange = rss_[cur_]->foundRange;
2061 /// Advance the range search by one memory op
2062 virtual void advanceImpl(int until) {
2063 assert(!this->done);
2064 assert_lt(cur_, rss_.size());
2065 if(rss_[cur_]->done) {
2066 // Move on to next RangeSourceDriver
2067 if(cur_ < rss_.size()-1) {
2068 rss_[++cur_]->setQuery(patsrc_, seedRange_);
2069 this->minCost = max(rss_[cur_]->minCost, this->minCostAdjustment_);
2070 this->foundRange = rss_[cur_]->foundRange;
2072 // No RangeSources in list; done
2077 // Advance current RangeSource
2078 rss_[cur_]->advance(until);
2079 this->done = (cur_ == rss_.size()-1 && rss_[cur_]->done);
2080 this->foundRange = rss_[cur_]->foundRange;
2081 this->minCost = max(rss_[cur_]->minCost, this->minCostAdjustment_);
2085 /// Return the last valid range found
2086 virtual Range& range() { return rss_[cur_]->range(); }
2089 * Return whether we're generating ranges for the first or the
// Delegated to the first driver; presumably all drivers in the
// list share the same mate/orientation -- confirm in full file.
2092 virtual bool mate1() const {
2093 return rss_[0]->mate1();
2097 * Return true iff current pattern is forward-oriented.
2099 virtual bool fw() const {
2100 return rss_[0]->fw();
2107 TRangeSrcDrPtrVec rss_;
2108 PatternSourcePerThread* patsrc_;
2113 * A RangeSourceDriver that wraps a set of other RangeSourceDrivers and
2114 * chooses which one to advance at any given moment by picking one with
2115 * minimal "cumulative cost" so far.
2117 * Note that costs have to be "adjusted" to account for the fact that
2118 * the alignment policy for the underlying RangeSource might force
// Cost-aware composite driver: keeps its children sorted by
// cumulative minCost and always advances the cheapest one.
// NOTE(review): listing elided in places (line numbers jump).
2121 template<typename TRangeSource>
2122 class CostAwareRangeSourceDriver : public RangeSourceDriver<TRangeSource> {
2124 typedef Ebwt<String<Dna> > TEbwt;
2125 typedef RangeSourceDriver<TRangeSource>* TRangeSrcDrPtr;
2126 typedef std::vector<TRangeSrcDrPtr> TRangeSrcDrPtrVec;
2130 CostAwareRangeSourceDriver(
2132 const TRangeSrcDrPtrVec* rss,
2136 RangeSourceDriver<TRangeSource>(false),
2137 rss_(), active_(), strandFix_(strandFix),
2138 lastRange_(NULL), delayedRange_(NULL), patsrc_(NULL),
2139 verbose_(verbose), quiet_(quiet), mixesReads_(mixesReads)
2145 this->foundRange = false;
2155 /// Destroy all underlying RangeSourceDrivers
2156 virtual ~CostAwareRangeSourceDriver() {
2157 const size_t rssSz = rss_.size();
2158 for(size_t i = 0; i < rssSz; i++) {
2165 /// Set query to find ranges for
2166 virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) {
2168 this->foundRange = false;
2170 delayedRange_ = NULL;
2171 ASSERT_ONLY(allTopsRc_.clear());
// Seed the tie-breaking RNG from the read's own seed so results
// are reproducible per read.
2173 rand_.init(patsrc->bufa().seed);
2174 const size_t rssSz = rss_.size();
2175 if(rssSz == 0) return;
2176 for(size_t i = 0; i < rssSz; i++) {
2177 // Assuming that all
2178 rss_[i]->setQuery(patsrc, r);
2186 * Add a new RangeSource to the list and re-sort it.
2188 void addSource(TRangeSrcDrPtr p, Range *r) {
2189 assert(!this->foundRange);
2190 this->lastRange_ = NULL;
2191 this->delayedRange_ = NULL;
// If a query is already set, propagate it to the newcomer.
2193 if(patsrc_ != NULL) {
2194 p->setQuery(patsrc_, r);
2197 active_.push_back(p);
2204 * Free and remove all contained RangeSources.
2206 void clearSources() {
2207 const size_t rssSz = rss_.size();
2208 for(size_t i = 0; i < rssSz; i++) {
2217 * Return the number of RangeSources contained within.
2219 size_t size() const {
2224 * Return true iff no RangeSources are contained within.
2226 bool empty() const {
2227 return rss_.empty();
2231 * Advance the aligner by one memory op. Return true iff we're
2232 * done with this read.
2234 virtual void advance(int until) {
2235 ASSERT_ONLY(uint16_t precost = this->minCost);
2236 assert(!this->done);
2237 assert(!this->foundRange);
// Never advance with a weaker stopping condition than
// ADV_COST_CHANGES; cost bookkeeping depends on it.
2238 until = max<int>(until, ADV_COST_CHANGES);
2240 assert(!this->foundRange || lastRange_ != NULL);
2241 if(this->foundRange) {
2242 assert_eq(range().cost, precost);
2246 /// Advance the range search
// If a delayed (opposite-strand) range is pending, emit it first;
// otherwise advance the cheapest active driver and harvest any
// range it found, re-sorting the active list when costs changed.
2247 virtual void advanceImpl(int until) {
2249 ASSERT_ONLY(uint16_t iminCost = this->minCost);
2250 const size_t actSz = active_.size();
2251 assert(sortedActives());
2252 if(delayedRange_ != NULL) {
2253 assert_eq(iminCost, delayedRange_->cost);
2254 lastRange_ = delayedRange_;
2255 delayedRange_ = NULL;
2256 this->foundRange = true;
2257 assert_eq(range().cost, iminCost);
2258 if(!active_.empty()) {
2259 assert_geq(active_[0]->minCost, this->minCost);
2260 this->minCost = max(active_[0]->minCost, this->minCost);
2264 return; // found a range
2266 assert(delayedRange_ == NULL);
2267 if(mateEliminated() || actSz == 0) {
2268 // No more alternatives; clear the active set and signal
2274 // Advance lowest-cost RangeSourceDriver
2275 TRangeSrcDrPtr p = active_[0];
2276 uint16_t precost = p->minCost;
2277 assert(!p->done || p->foundRange);
2278 if(!p->foundRange) {
2281 bool needsSort = false;
2283 Range *r = &p->range();
2284 assert_eq(r->cost, iminCost);
2285 needsSort = foundFirstRange(r); // may set delayedRange_; re-sorts active_
2286 assert_eq(lastRange_->cost, iminCost);
2287 if(delayedRange_ != NULL) assert_eq(delayedRange_->cost, iminCost);
2288 p->foundRange = false;
// Re-sort if the driver finished, its cost moved, or
// foundFirstRange perturbed costs.
2290 if(p->done || (precost != p->minCost) || needsSort) {
2292 if(mateEliminated() || active_.empty()) {
2294 this->done = (delayedRange_ == NULL);
2297 assert(sortedActives());
2298 assert(lastRange_ == NULL || lastRange_->cost == iminCost);
2299 assert(delayedRange_ == NULL || delayedRange_->cost == iminCost);
2302 /// Return the last valid range found
2303 virtual Range& range() {
2304 assert(lastRange_ != NULL);
2309 * Return whether we're generating ranges for the first or the
2312 virtual bool mate1() const {
2313 return rss_[0]->mate1();
2317 * Return true iff current pattern is forward-oriented.
2319 virtual bool fw() const {
2320 return rss_[0]->fw();
// Mark every active driver for the given mate (1 or 2) as done.
2323 virtual void removeMate(int m) {
2324 bool qmate1 = (m == 1);
2326 for(size_t i = 0; i < active_.size(); i++) {
2327 if(active_[i]->mate1() == qmate1) {
2328 active_[i]->done = true;
2332 assert(mateEliminated());
2338 * Set paired_ to true iff there are mate1 and mate2 range sources
2339 * in the rss_ vector.
2342 const size_t rssSz = rss_.size();
2345 for(size_t i = 0; i < rssSz; i++) {
2346 if(rss_[i]->mate1()) saw1 = true;
2349 assert(saw1 || saw2);
2350 paired_ = saw1 && saw2;
2354 * Return true iff one mate or the other has been eliminated.
// Only meaningful for paired input; scans the active (not-done)
// drivers for surviving mate-1 and mate-2 sources.
2356 bool mateEliminated() {
2357 if(!paired_) return false;
2358 bool mate1sLeft = false;
2359 bool mate2sLeft = false;
2360 // If this RangeSourceDriver is done, shift everyone else
2362 const size_t rssSz = active_.size();
2363 for(size_t i = 0; i < rssSz; i++) {
2364 if(!active_[i]->done) {
2365 if(active_[i]->mate1()) mate1sLeft = true;
2366 else mate2sLeft = true;
2369 return !mate1sLeft || !mate2sLeft;
2374 * Check that the given range has not yet been reported.
// Assert-only dedup using the signed-top key; ranges are only
// recorded when this driver doesn't mix reads.
2376 bool checkRange(Range* r) {
2377 // Assert that we have not yet dished out a range with this
2379 assert_gt(r->bot, r->top);
2380 assert(r->ebwt != NULL);
2381 int64_t top = (int64_t)r->top;
2382 top++; // ensure it's not 0
2383 if(!r->ebwt->fw()) top = -top;
2385 assert(this->allTops_.find(top) == this->allTops_.end());
2386 if(!mixesReads_) this->allTops_.insert(top);
2388 assert(this->allTopsRc_.find(top) == this->allTopsRc_.end());
2389 if(!mixesReads_) this->allTopsRc_.insert(top);
2396 * We found a range; check whether we should attempt to find a
2397 * range of equal quality from the opposite strand so that we can
2398 * resolve the strand bias. Return true iff we modified the cost
2399 * of one or more items after the first item.
2401 bool foundFirstRange(Range* r) {
2403 assert(checkRange(r));
2404 this->foundRange = true;
2407 // We found a range but there may be an equally good range
2408 // on the other strand; let's try to get it.
2409 const size_t sz = active_.size();
2410 for(size_t i = 1; i < sz; i++) {
2411 // Same mate, different orientation?
2412 if(rss_[i]->mate1() == r->mate1 && rss_[i]->fw() != r->fw) {
2413 // Yes; see if it has the same cost
2414 TRangeSrcDrPtr p = active_[i];
2415 uint16_t minCost = max(this->minCost, p->minCost);
// Opposite-strand driver is already more expensive; give up.
2416 if(minCost > r->cost) break;
2417 // Yes, it has the same cost
2418 assert_eq(minCost, r->cost); // can't be better
2419 // Advance it until it's done, we've found a range,
2420 // or its cost increases
2421 if(this->verbose_) cout << " Looking for opposite range to avoid strand bias:" << endl;
2422 while(!p->done && !p->foundRange) {
2423 p->advance(ADV_COST_CHANGES);
2424 assert_geq(p->minCost, minCost);
2425 if(p->minCost > minCost) break;
2428 // Found one! Now we have to choose which one
2429 // to give out first; we choose randomly using
2430 // the size of the ranges as weights.
2431 delayedRange_ = &p->range();
2432 assert(checkRange(delayedRange_));
// Weighted coin flip: probability proportional to each
// range's width (bot - top).
2433 size_t tot = (delayedRange_->bot - delayedRange_->top) +
2434 (lastRange_->bot - lastRange_->top);
2435 uint32_t rq = rand_.nextU32() % tot;
2436 // We picked this range, not the first one
2437 if(rq < (delayedRange_->bot - delayedRange_->top)) {
2438 Range *tmp = lastRange_;
2439 lastRange_ = delayedRange_;
2440 delayedRange_ = tmp;
2442 p->foundRange = false;
2444 // Return true iff we need to force a re-sort
2448 // OK, now we have a choice of two equally good ranges from
2455 * Sort all of the RangeSourceDriver ptrs in the rss_ array so that
2456 * the one with the lowest cumulative cost is at the top. Break
2457 * ties randomly. Just do selection sort for now; we don't expect
2458 * the list to be long.
2460 void sortActives() {
2461 TRangeSrcDrPtrVec& vec = active_;
2462 size_t sz = vec.size();
2463 // Selection sort / removal outer loop
2464 for(size_t i = 0; i < sz;) {
2465 // Remove elements that we're done with
2466 if(vec[i]->done && !vec[i]->foundRange) {
2467 vec.erase(vec.begin() + i);
2472 uint16_t minCost = vec[i]->minCost;
2474 // Selection sort inner loop
2475 for(size_t j = i+1; j < sz; j++) {
2476 if(vec[j]->done && !vec[j]->foundRange) {
2477 // We'll get rid of this guy later
2480 if(vec[j]->minCost < minCost) {
2481 minCost = vec[j]->minCost;
2483 } else if(vec[j]->minCost == minCost) {
2484 // Possibly randomly pick the other
// NOTE(review): tests a single fixed bit of the random
// word (& 0x1000), i.e. a 50/50 choice per comparison.
2485 if(rand_.nextU32() & 0x1000) {
2490 // Do the swap, if necessary
2492 assert_leq(minCost, vec[i]->minCost);
2493 TRangeSrcDrPtr tmp = vec[i];
2494 vec[i] = vec[minOff];
// After sorting, the composite's minCost follows the new head
// unless a delayed range is pending (its cost governs then).
2499 if(delayedRange_ == NULL) {
2500 assert_geq(this->minCost, this->minCostAdjustment_);
2501 assert_geq(vec[0]->minCost, this->minCost);
2502 this->minCost = vec[0]->minCost;
2504 assert(sortedActives());
2509 * Check that the rss_ array is sorted by minCost; assert if it's
// Debug-only O(n^2) verification of the active list's ordering.
2512 bool sortedActives() const {
2513 // Selection sort outer loop
2514 const TRangeSrcDrPtrVec& vec = active_;
2515 const size_t sz = vec.size();
2516 for(size_t i = 0; i < sz; i++) {
2517 assert(!vec[i]->done || vec[i]->foundRange);
2518 for(size_t j = i+1; j < sz; j++) {
2519 assert(!vec[j]->done || vec[j]->foundRange);
2520 assert_leq(vec[i]->minCost, vec[j]->minCost);
2523 if(delayedRange_ == NULL && sz > 0) {
2524 // Only assert this if there's no delayed range; if there's
2525 // a delayed range, the minCost is its cost, not the 0th
2527 assert_leq(vec[0]->minCost, this->minCost);
2533 /// List of all the drivers
2534 TRangeSrcDrPtrVec rss_;
2535 /// List of all the as-yet-uneliminated drivers
2536 TRangeSrcDrPtrVec active_;
2537 /// Whether the list of drivers contains drivers for both mates 1 and 2
2539 /// If true, this driver will make an attempt to dish out ranges in
2540 /// a way that approaches the right distribution based on the
2541 /// number of hits on both strands.
2544 /// The random seed from the Aligner, which we use to randomly break ties
// NOTE(review): several member declarations (paired_, strandFix_,
// rand_, lastRange_, ...) are elided from this listing.
2547 Range *delayedRange_;
2548 PatternSourcePerThread* patsrc_;
2552 ASSERT_ONLY(std::set<int64_t> allTopsRc_);
2555 #endif /* RANGE_SOURCE_H_ */