4 * A generic class providing a stateful way to find alignments.
13 #include "seqan/sequence.h"
14 #include "assert_helpers.h"
18 #include "range_source.h"
19 #include "range_chaser.h"
20 #include "ref_aligner.h"
21 #include "reference.h"
22 #include "aligner_metrics.h"
23 #include "search_globals.h"
26 * State machine for carrying out an alignment, which usually consists
27 * of a series of phases that conduct different alignments using
28 * different backtracking constraints.
30 * Each Aligner should have a dedicated PatternSourcePerThread.
34 Aligner(bool _done, bool rangeMode) :
35 done(_done), patsrc_(NULL), bufa_(NULL), bufb_(NULL),
39 virtual ~Aligner() { }
40 /// Advance the range search by one memory op
41 virtual bool advance() = 0;
43 /// Prepare Aligner for the next read
44 virtual void setQuery(PatternSourcePerThread *patsrc) {
45 assert(patsrc != NULL);
47 bufa_ = &patsrc->bufa();
48 assert(bufa_ != NULL);
49 bufb_ = &patsrc->bufb();
50 alen_ = bufa_->length();
51 blen_ = (bufb_ != NULL) ? bufb_->length() : 0;
52 rand_.init(bufa_->seed);
56 * Set to true if all searching w/r/t the current query is
57 * finished or if there is no current query.
64 PatternSourcePerThread* patsrc_;
74 * Abstract parent factory class for constructing aligners of all kinds.
76 class AlignerFactory {
78 virtual ~AlignerFactory() { }
79 virtual Aligner* create() const = 0;
82 * Allocate a vector of n Aligners; use destroy(std::vector...) to
85 virtual std::vector<Aligner*>* create(uint32_t n) const {
86 std::vector<Aligner*>* v = new std::vector<Aligner*>;
87 for(uint32_t i = 0; i < n; i++) {
88 v->push_back(create());
89 assert(v->back() != NULL);
94 /// Free memory associated with the aligner
95 virtual void destroy(Aligner* al) const {
101 /// Free memory associated with an aligner list
102 virtual void destroy(std::vector<Aligner*>* als) const {
104 // Free all of the Aligners
105 for(size_t i = 0; i < als->size(); i++) {
106 if((*als)[i] != NULL) {
117 * Coordinates multiple aligners of the same type (i.e. either all
118 * single-end or all paired-end).
125 const AlignerFactory& alignFact,
126 const PatternSourcePerThreadFactory& patsrcFact) :
127 n_(n), qUpto_(qUpto),
128 alignFact_(alignFact), patsrcFact_(patsrcFact),
129 aligners_(NULL), patsrcs_(NULL)
131 aligners_ = alignFact_.create(n_);
132 assert(aligners_ != NULL);
133 patsrcs_ = patsrcFact_.create(n_);
134 assert(patsrcs_ != NULL);
137 /// Free memory associated with the aligners and their pattern sources.
138 virtual ~MultiAligner() {
139 alignFact_.destroy(aligners_);
140 patsrcFact_.destroy(patsrcs_);
144 * Advance an array of aligners in parallel, using prefetches to
145 * try to hide all the latency.
151 for(uint32_t i = 0; i < n_; i++) {
152 if(!(*aligners_)[i]->done) {
153 // Advance an aligner already in progress
155 (*aligners_)[i]->advance();
157 // Get a new read and initialize an aligner with it
158 (*patsrcs_)[i]->nextReadPair();
159 if(!(*patsrcs_)[i]->empty() && (*patsrcs_)[i]->patid() < qUpto_) {
160 (*aligners_)[i]->setQuery((*patsrcs_)[i]);
161 assert(!(*aligners_)[i]->done);
164 // No more reads; if done == true, it remains
173 uint32_t n_; /// Number of aligners
174 uint32_t qUpto_; /// Number of reads to align before stopping
175 const AlignerFactory& alignFact_;
176 const PatternSourcePerThreadFactory& patsrcFact_;
177 std::vector<Aligner *>* aligners_;
178 std::vector<PatternSourcePerThread *>* patsrcs_;
182 * Coordinates multiple single-end and paired-end aligners, routing
183 * reads to one or the other type as appropriate.
185 class MixedMultiAligner {
190 const AlignerFactory& alignSEFact,
191 const AlignerFactory& alignPEFact,
192 const PatternSourcePerThreadFactory& patsrcFact) :
193 n_(n), qUpto_(qUpto),
194 alignSEFact_(alignSEFact),
195 alignPEFact_(alignPEFact),
196 patsrcFact_(patsrcFact),
202 // Instantiate all single-end aligners
203 alignersSE_ = alignSEFact_.create(n_);
204 assert(alignersSE_ != NULL);
205 // Instantiate all paired-end aligners
206 alignersPE_ = alignPEFact_.create(n_);
207 assert(alignersPE_ != NULL);
208 // Allocate array of boolean flags indicating whether each of
209 // the slots is currently using the single-end or paired-end
211 seOrPe_ = new bool[n_];
212 for(uint32_t i = 0; i < n_; i++) {
215 // Instantiate all read sources
216 patsrcs_ = patsrcFact_.create(n_);
217 assert(patsrcs_ != NULL);
220 /// Free memory associated with the aligners and their pattern sources.
221 virtual ~MixedMultiAligner() {
222 alignSEFact_.destroy(alignersSE_);
223 alignPEFact_.destroy(alignersPE_);
224 patsrcFact_.destroy(patsrcs_);
229 * Advance an array of aligners in parallel, using prefetches to
230 * try to hide all the latency.
232 void run(bool verbose = false) {
236 Aligner *al = seOrPe_[0] ? (*alignersSE_)[0] : (*alignersPE_)[0];
237 PatternSourcePerThread *ps = (*patsrcs_)[0];
240 if(!first && !al->done) {
241 // Advance an aligner already in progress; this is
248 if(ps->patid() < qUpto_ && !ps->empty()) {
250 // Read currently in buffer is paired-end
251 (*alignersPE_)[0]->setQuery(ps);
252 al = (*alignersPE_)[0];
253 seOrPe_[0] = false; // false -> paired
255 // Read currently in buffer is single-end
256 (*alignersSE_)[0]->setQuery(ps);
257 al = (*alignersSE_)[0];
258 seOrPe_[0] = true; // true = unpaired
262 // No more reads; if done == true, it remains
271 for(uint32_t i = 0; i < n_; i++) {
272 Aligner *al = seOrPe_[i] ? (*alignersSE_)[i] :
274 if(!first && !al->done) {
275 // Advance an aligner already in progress; this is
280 // Feed a new read to a vacant aligner
281 PatternSourcePerThread *ps = (*patsrcs_)[i];
284 if(ps->patid() < qUpto_ && !ps->empty()) {
286 // Read currently in buffer is paired-end
287 (*alignersPE_)[i]->setQuery(ps);
288 seOrPe_[i] = false; // false -> paired
290 // Read currently in buffer is single-end
291 (*alignersSE_)[i]->setQuery(ps);
292 seOrPe_[i] = true; // true = unpaired
296 // No more reads; if done == true, it remains
307 uint32_t n_; /// Number of aligners
308 uint32_t qUpto_; /// Number of reads to align before stopping
309 const AlignerFactory& alignSEFact_;
310 const AlignerFactory& alignPEFact_;
311 const PatternSourcePerThreadFactory& patsrcFact_;
312 std::vector<Aligner *>* alignersSE_;
313 std::vector<Aligner *>* alignersPE_;
315 std::vector<PatternSourcePerThread *>* patsrcs_;
319 * An aligner for finding exact matches of unpaired reads. Always
320 * tries the forward-oriented version of the read before the reverse-
323 template<typename TRangeSource>
324 class UnpairedAlignerV2 : public Aligner {
325 typedef RangeSourceDriver<TRangeSource> TDriver;
328 EbwtSearchParams<String<Dna> >* params,
330 RangeChaser<String<Dna> >* rchase,
332 const HitSinkPerThreadFactory& sinkPtFactory,
333 HitSinkPerThread* sinkPt,
334 vector<String<Dna5> >& os,
335 const BitPairReference* refs,
342 AlignerMetrics *metrics = NULL) :
343 Aligner(true, rangeMode),
348 sinkPtFactory_(sinkPtFactory),
360 assert(pool_ != NULL);
361 assert(sinkPt_ != NULL);
362 assert(params_ != NULL);
363 assert(driver_ != NULL);
366 virtual ~UnpairedAlignerV2() {
367 delete driver_; driver_ = NULL;
368 delete params_; params_ = NULL;
369 delete rchase_; rchase_ = NULL;
370 delete[] btCnt_; btCnt_ = NULL;
371 sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL;
375 * Prepare this aligner for the next read.
377 virtual void setQuery(PatternSourcePerThread* patsrc) {
378 Aligner::setQuery(patsrc); // set fields & random seed
379 if(metrics_ != NULL) {
380 metrics_->nextRead(patsrc->bufa().patFw);
382 pool_->reset(&patsrc->bufa().name, patsrc->patid());
383 if(patsrc->bufa().length() < 4) {
385 cerr << "Warning: Skipping read " << patsrc->bufa().name
386 << " because it is less than 4 characters long" << endl;
389 sinkPt_->finishRead(*patsrc_, true, true);
392 driver_->setQuery(patsrc, NULL);
393 this->done = driver_->done;
395 // Reset #-backtrack countdown
396 if(btCnt_ != NULL) *btCnt_ = maxBts_;
397 if(sinkPt_->setHits(patsrc->bufa().hitset)) {
399 sinkPt_->finishRead(*patsrc_, true, true);
401 // Grab a bit from the pseudo-random seed to determine whether
402 // to start with forward or reverse complement
403 firstIsFw_ = ((patsrc->bufa().seed & 0x10) == 0);
408 * Helper for reporting an alignment.
410 inline bool report(const Range& ra,
415 bool ebwtFw = ra.ebwt->fw();
416 params_->setFw(ra.fw);
417 assert_eq(bufa_->color, color);
418 return params_->reportHit(
419 ra.fw ? (ebwtFw? bufa_->patFw : bufa_->patFwRev) :
420 (ebwtFw? bufa_->patRc : bufa_->patRcRev),
421 ra.fw ? (ebwtFw? &bufa_->qual : &bufa_->qualRev) :
422 (ebwtFw? &bufa_->qualRev : &bufa_->qual),
430 ra.mms, // mismatch positions
431 ra.refcs, // reference characters for mms
432 ra.numMms, // # mismatches
433 make_pair(first, second), // position
434 make_pair(0, 0), // (bogus) mate position
435 true, // (bogus) mate orientation
436 0, // (bogus) mate length
437 make_pair(ra.top, ra.bot),// arrows
440 ra.stratum, // alignment stratum
441 ra.cost, // cost, including qual penalty
442 ra.bot - ra.top - 1, // # other hits
443 patsrc_->patid(), // pattern id
444 bufa_->seed, // pseudo-random seed
445 0); // mate (0 = unpaired)
449 * Advance the aligner. Return true iff we're
450 * done with this read.
452 virtual bool advance() {
456 assert(driver_->foundRange);
457 assert(!sinkPt_->irrelevantCost(driver_->range().cost));
458 if(!rchase_->foundOff() && !rchase_->done) {
462 if(rchase_->foundOff()) {
463 this->done = report(driver_->range(), rchase_->off().first,
464 rchase_->off().second, rchase_->tlen());
467 assert(rchase_->done);
468 // Forget this range; keep looking for ranges
470 driver_->foundRange = false;
471 this->done = driver_->done;
475 if(!this->done && !chase_) {
476 assert(!driver_->done || driver_->foundRange);
477 if(driver_->foundRange) {
478 const Range& ra = driver_->range();
479 assert(!sinkPt_->irrelevantCost(ra.cost));
482 this->done = report(ra, ra.top, ra.bot, 0);
483 driver_->foundRange = false;
485 rchase_->setTopBot(ra.top, ra.bot, alen_, rand_, ra.ebwt);
486 if(rchase_->foundOff()) {
488 ra, rchase_->off().first,
489 rchase_->off().second, rchase_->tlen());
492 if(!rchase_->done && !sinkPt_->irrelevantCost(ra.cost)) {
493 // Keep chasing this range
496 driver_->foundRange = false;
500 this->done = sinkPt_->irrelevantCost(driver_->minCost);
502 driver_->advance(ADV_COST_CHANGES);
504 // No longer necessarily true with chain input
505 //assert(!sinkPt_->spanStrata());
508 if(driver_->done && !driver_->foundRange && !chase_) {
513 sinkPt_->finishRead(*patsrc_, true, true);
520 // Reference sequences (needed for colorspace decoding)
521 const BitPairReference* refs_;
528 // Temporary HitSink; to be deleted
529 const HitSinkPerThreadFactory& sinkPtFactory_;
530 HitSinkPerThread* sinkPt_;
532 // State for alignment
533 EbwtSearchParams<String<Dna> >* params_;
535 // State for getting alignments from ranges statefully
536 RangeChaser<String<Dna> >* rchase_;
538 // Range-finding state
541 bool verbose_; // be talkative
542 bool quiet_; // don't print informational/warning info
547 AlignerMetrics *metrics_;
551 * An aligner for finding paired alignments while operating entirely
552 * within the Burrows-Wheeler domain.
554 template<typename TRangeSource>
555 class PairedBWAlignerV1 : public Aligner {
557 typedef std::pair<uint32_t,uint32_t> U32Pair;
558 typedef std::vector<U32Pair> U32PairVec;
559 typedef std::vector<Range> TRangeVec;
560 typedef RangeSourceDriver<TRangeSource> TDriver;
561 typedef std::pair<uint64_t, uint64_t> TU64Pair;
562 typedef std::set<TU64Pair> TSetPairs;
566 EbwtSearchParams<String<Dna> >* params,
567 TDriver* driver1Fw, TDriver* driver1Rc,
568 TDriver* driver2Fw, TDriver* driver2Rc,
569 RefAligner<String<Dna5> >* refAligner,
570 RangeChaser<String<Dna> >* rchase,
572 const HitSinkPerThreadFactory& sinkPtFactory,
573 HitSinkPerThread* sinkPt,
579 uint32_t mixedThresh,
580 uint32_t mixedAttemptLim,
581 const BitPairReference* refs,
588 Aligner(true, rangeMode),
590 patsrc_(NULL), qlen1_(0), qlen2_(0), doneFw_(true),
592 chase1Fw_(false), chase1Rc_(false),
593 chase2Fw_(false), chase2Rc_(false),
594 delayedChase1Fw_(false), delayedChase1Rc_(false),
595 delayedChase2Fw_(false), delayedChase2Rc_(false),
596 refAligner_(refAligner),
597 sinkPtFactory_(sinkPtFactory),
600 minInsert_(minInsert),
601 maxInsert_(maxInsert),
602 dontReconcile_(dontReconcile),
603 symCeiling_(symCeiling),
604 mixedThresh_(mixedThresh),
605 mixedAttemptLim_(mixedAttemptLim),
607 fw1_(fw1), fw2_(fw2),
614 driver1Fw_(driver1Fw), driver1Rc_(driver1Rc),
615 offs1FwSz_(0), offs1RcSz_(0),
616 driver2Fw_(driver2Fw), driver2Rc_(driver2Rc),
617 offs2FwSz_(0), offs2RcSz_(0),
619 chaseL_fw_ (fw1_ ? chase1Fw_ : chase1Rc_),
620 chaseR_fw_ (fw2_ ? chase2Fw_ : chase2Rc_),
621 delayedchaseL_fw_(fw1_ ? delayedChase1Fw_ : delayedChase1Rc_),
622 delayedchaseR_fw_(fw2_ ? delayedChase2Fw_ : delayedChase2Rc_),
623 drL_fw_ (fw1_ ? *driver1Fw_ : *driver1Rc_),
624 drR_fw_ (fw2_ ? *driver2Fw_ : *driver2Rc_),
625 offsLarr_fw_ (fw1_ ? offs1FwArr_ : offs1RcArr_),
626 offsRarr_fw_ (fw2_ ? offs2FwArr_ : offs2RcArr_),
627 rangesLarr_fw_ (fw1_ ? ranges1FwArr_ : ranges1RcArr_),
628 rangesRarr_fw_ (fw2_ ? ranges2FwArr_ : ranges2RcArr_),
629 offsLsz_fw_ (fw1_ ? offs1FwSz_ : offs1RcSz_),
630 offsRsz_fw_ (fw2_ ? offs2FwSz_ : offs2RcSz_),
632 chaseL_rc_ (fw2_ ? chase2Rc_ : chase2Fw_),
633 chaseR_rc_ (fw1_ ? chase1Rc_ : chase1Fw_),
634 delayedchaseL_rc_(fw2_ ? delayedChase2Rc_ : delayedChase2Fw_),
635 delayedchaseR_rc_(fw1_ ? delayedChase1Rc_ : delayedChase1Fw_),
636 drL_rc_ (fw2_ ? *driver2Rc_ : *driver2Fw_),
637 drR_rc_ (fw1_ ? *driver1Rc_ : *driver1Fw_),
638 offsLarr_rc_ (fw2_ ? offs2RcArr_ : offs2FwArr_),
639 offsRarr_rc_ (fw1_ ? offs1RcArr_ : offs1FwArr_),
640 rangesLarr_rc_ (fw2_ ? ranges2RcArr_ : ranges2FwArr_),
641 rangesRarr_rc_ (fw1_ ? ranges1RcArr_ : ranges1FwArr_),
642 offsLsz_rc_ (fw2_ ? offs2RcSz_ : offs2FwSz_),
643 offsRsz_rc_ (fw1_ ? offs1RcSz_ : offs1FwSz_),
645 chaseL_ (&chaseL_fw_),
646 chaseR_ (&chaseR_fw_),
647 delayedchaseL_(&delayedchaseL_fw_),
648 delayedchaseR_(&delayedchaseR_fw_),
651 offsLarr_ (offsLarr_fw_),
652 offsRarr_ (offsRarr_fw_),
653 rangesLarr_ (rangesLarr_fw_),
654 rangesRarr_ (rangesRarr_fw_),
655 offsLsz_ (&offsLsz_fw_),
656 offsRsz_ (&offsRsz_fw_),
657 donePair_ (&doneFw_),
662 assert(pool_ != NULL);
663 assert(sinkPt_ != NULL);
664 assert(params_ != NULL);
665 assert(driver1Fw_ != NULL);
666 assert(driver1Rc_ != NULL);
667 assert(driver2Fw_ != NULL);
668 assert(driver2Rc_ != NULL);
671 virtual ~PairedBWAlignerV1() {
672 delete driver1Fw_; driver1Fw_ = NULL;
673 delete driver1Rc_; driver1Rc_ = NULL;
674 delete driver2Fw_; driver2Fw_ = NULL;
675 delete driver2Rc_; driver2Rc_ = NULL;
676 delete params_; params_ = NULL;
677 delete rchase_; rchase_ = NULL;
678 delete[] btCnt_; btCnt_ = NULL;
679 delete refAligner_; refAligner_ = NULL;
680 sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL;
684 * Prepare this aligner for the next read.
686 virtual void setQuery(PatternSourcePerThread* patsrc) {
687 assert(!patsrc->bufa().empty());
688 Aligner::setQuery(patsrc); // set fields & random seed
689 assert(!patsrc->bufb().empty());
690 // Give all of the drivers pointers to the relevant read info
692 pool_->reset(&patsrc->bufa().name, patsrc->patid());
693 if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) {
695 cerr << "Warning: Skipping pair " << patsrc->bufa().name
696 << " because a mate is less than 4 characters long" << endl;
699 sinkPt_->finishRead(*patsrc_, true, true);
702 driver1Fw_->setQuery(patsrc, NULL);
703 driver1Rc_->setQuery(patsrc, NULL);
704 driver2Fw_->setQuery(patsrc, NULL);
705 driver2Rc_->setQuery(patsrc, NULL);
706 qlen1_ = patsrc_->bufa().length();
707 qlen2_ = patsrc_->bufb().length();
708 if(btCnt_ != NULL) (*btCnt_) = maxBts_;
709 // Neither orientation is done
713 // No ranges are being chased yet
718 delayedChase1Fw_ = false;
719 delayedChase1Rc_ = false;
720 delayedChase2Fw_ = false;
721 delayedChase2Rc_ = false;
722 // Clear all intermediate ranges
723 for(size_t i = 0; i < 32; i++) {
724 offs1FwArr_[i].clear(); offs1RcArr_[i].clear();
725 offs2FwArr_[i].clear(); offs2RcArr_[i].clear();
726 ranges1FwArr_[i].clear(); ranges1RcArr_[i].clear();
727 ranges2FwArr_[i].clear(); ranges2RcArr_[i].clear();
729 offs1FwSz_ = offs1RcSz_ = offs2FwSz_ = offs2RcSz_ = 0;
730 chaseL_ = &chaseL_fw_;
731 chaseR_ = &chaseR_fw_;
732 delayedchaseL_ = &delayedchaseL_fw_;
733 delayedchaseR_ = &delayedchaseR_fw_;
736 offsLarr_ = offsLarr_fw_;
737 offsRarr_ = offsRarr_fw_;
738 rangesLarr_ = rangesLarr_fw_;
739 rangesRarr_ = rangesRarr_fw_;
740 offsLsz_ = &offsLsz_fw_;
741 offsRsz_ = &offsRsz_fw_;
742 donePair_ = &doneFw_;
749 allTopsL_fw_.clear();
750 allTopsR_fw_.clear();
751 allTopsL_rc_.clear();
752 allTopsR_rc_.clear();
757 * Advance the aligner by one memory op. Return true iff we're
758 * done with this read.
760 * A call to this function does one of many things:
761 * 1. Advance a RangeSourceDriver and check if it found a new range
762 * 2. Advance a RowChaseDriver and check if it found a reference
763 * offset for a an alignment in a range
765 virtual bool advance() {
767 if(doneFw_ && doneFwFirst_) {
768 if(verbose2_) cout << "--" << endl;
769 chaseL_ = &chaseL_rc_;
770 chaseR_ = &chaseR_rc_;
771 delayedchaseL_ = &delayedchaseL_rc_;
772 delayedchaseR_ = &delayedchaseR_rc_;
775 offsLarr_ = offsLarr_rc_;
776 offsRarr_ = offsRarr_rc_;
777 rangesLarr_ = rangesLarr_rc_;
778 rangesRarr_ = rangesRarr_rc_;
779 offsLsz_ = &offsLsz_rc_;
780 offsRsz_ = &offsRsz_rc_;
781 donePair_ = &this->done;
784 doneFwFirst_ = false;
787 bool chasing = *chaseL_ || *chaseR_;
788 if(chasing && !rchase_->foundOff() && !rchase_->done) {
792 advanceOrientation(!doneFw_);
794 if(verbose2_) cout << "----" << endl;
795 sinkPt_->finishRead(*patsrc_, true, true);
803 * Helper for reporting a pair of alignments. As of now, we report
804 * a paired alignment by reporting two consecutive alignments, one
807 bool report(const Range& rL, // range for upstream mate
808 const Range& rR, // range for downstream mate
809 uint32_t first, // ref idx
810 uint32_t upstreamOff, // offset for upstream mate
811 uint32_t dnstreamOff, // offset for downstream mate
812 uint32_t tlen, // length of ref
813 bool pairFw, // whether the pair is being mapped to fw strand
816 const ReferenceMap* rmap)
818 assert_lt(upstreamOff, dnstreamOff);
819 uint32_t spreadL = rL.bot - rL.top;
820 uint32_t spreadR = rR.bot - rR.top;
821 uint32_t oms = min(spreadL, spreadR) - 1;
822 ReadBuf* bufL = pairFw ? bufa_ : bufb_;
823 ReadBuf* bufR = pairFw ? bufb_ : bufa_;
824 uint32_t lenL = pairFw ? alen_ : blen_;
825 uint32_t lenR = pairFw ? blen_ : alen_;
827 assert(!params_->sink().exceededOverThresh());
828 params_->setFw(rL.fw);
829 assert_eq(bufL->color, color);
830 // Print upstream mate first
831 ret = params_->reportHit(
832 rL.fw ? (ebwtFwL? bufL->patFw : bufL->patFwRev) :
833 (ebwtFwL? bufL->patRc : bufL->patRcRev),
834 rL.fw ? (ebwtFwL? &bufL->qual : &bufL->qualRev) :
835 (ebwtFwL? &bufL->qualRev : &bufL->qual),
843 rL.mms, // mismatch positions
844 rL.refcs, // reference characters for mms
845 rL.numMms, // # mismatches
846 make_pair(first, upstreamOff),// position
847 make_pair(first, dnstreamOff),// mate position
848 rR.fw, // mate orientation
850 make_pair(rL.top, rL.bot), // arrows
853 rL.stratum, // alignment stratum
854 rL.cost, // cost, including quality penalty
860 return true; // can happen when -m is set
862 params_->setFw(rR.fw);
863 assert_eq(bufR->color, color);
864 ret = params_->reportHit(
865 rR.fw ? (ebwtFwR? bufR->patFw : bufR->patFwRev) :
866 (ebwtFwR? bufR->patRc : bufR->patRcRev),
867 rR.fw ? (ebwtFwR? &bufR->qual : &bufR->qualRev) :
868 (ebwtFwR? &bufR->qualRev : &bufR->qual),
876 rR.mms, // mismatch positions
877 rR.refcs, // reference characters for mms
878 rR.numMms, // # mismatches
879 make_pair(first, dnstreamOff),// position
880 make_pair(first, upstreamOff),// mate position
881 rL.fw, // mate orientation
883 make_pair(rR.top, rR.bot), // arrows
886 rR.stratum, // alignment stratum
887 rR.cost, // cost, including quality penalty
895 bool report(const Range& rL, // range for upstream mate
896 const Range& rR, // range for downstream mate
897 uint32_t first, // ref idx
898 uint32_t upstreamOff, // offset for upstream mate
899 uint32_t dnstreamOff, // offset for downstream mate
900 uint32_t tlen, // length of ref
901 bool pairFw, // whether the pair is being mapped to fw strand
902 const ReferenceMap* rmap)
904 return report(rL, rR, first, upstreamOff,
906 pairFw, rL.ebwt->fw(), rR.ebwt->fw(), rmap);
910 * Given a vector of reference positions where one of the two mates
911 * (the "anchor" mate) has aligned, look directly at the reference
912 * sequence for instances where the other mate (the "outstanding"
913 * mate) aligns such that mating constraint is satisfied.
915 * This function picks up to 'pick' anchors at random from the
916 * 'offs' array. It returns the number that it actually picked.
918 bool resolveOutstandingInRef(const bool off1,
923 assert(refs_->loaded());
924 assert_lt(off.first, refs_->numRefs());
925 // If matchRight is true, then we're trying to align the other
926 // mate to the right of the already-aligned mate. Otherwise,
928 bool matchRight = (off1 ? !doneFw_ : doneFw_);
929 // Sequence and quals for mate to be matched
930 bool fw = off1 ? fw2_ : fw1_; // whether outstanding mate is fw/rc
931 if(doneFw_) fw = !fw;
932 // 'seq' gets sequence of outstanding mate w/r/t the forward
934 const String<Dna5>& seq = fw ? (off1 ? patsrc_->bufb().patFw :
935 patsrc_->bufa().patFw) :
936 (off1 ? patsrc_->bufb().patRc :
937 patsrc_->bufa().patRc);
938 // 'seq' gets qualities of outstanding mate w/r/t the forward
940 const String<char>& qual = fw ? (off1 ? patsrc_->bufb().qual :
941 patsrc_->bufa().qual) :
942 (off1 ? patsrc_->bufb().qualRev :
943 patsrc_->bufa().qualRev);
944 uint32_t qlen = seqan::length(seq); // length of outstanding mate
945 uint32_t alen = (off1 ? patsrc_->bufa().length() :
946 patsrc_->bufb().length());
947 int minins = minInsert_;
948 int maxins = maxInsert_;
950 minins = max<int>(0, minins - patsrc_->bufa().trimmed5);
951 maxins = max<int>(0, maxins - patsrc_->bufa().trimmed5);
953 minins = max<int>(0, minins - patsrc_->bufa().trimmed3);
954 maxins = max<int>(0, maxins - patsrc_->bufa().trimmed3);
957 minins = max<int>(0, minins - patsrc_->bufb().trimmed3);
958 maxins = max<int>(0, maxins - patsrc_->bufb().trimmed3);
960 minins = max<int>(0, minins - patsrc_->bufb().trimmed5);
961 maxins = max<int>(0, maxins - patsrc_->bufb().trimmed5);
963 assert_geq(minins, 0);
964 assert_geq(maxins, 0);
965 // Don't even try if either of the mates is longer than the
966 // maximum insert size.
967 if((uint32_t)maxins <= max(qlen, alen)) {
970 const uint32_t tidx = off.first;
971 const uint32_t toff = off.second;
972 // Set begin/end to be a range of all reference
973 // positions that are legally permitted to be involved in
974 // the alignment of the outstanding mate. It's up to the
975 // callee to worry about how to scan these positions.
977 assert_geq(maxins, minins);
978 uint32_t insDiff = maxins - minins;
982 if(qlen < alen) begin += alen-qlen;
983 if(end > insDiff + qlen) {
984 begin = max<uint32_t>(begin, end - insDiff - qlen);
986 end = min<uint32_t>(refs_->approxLen(tidx), end);
987 begin = min<uint32_t>(refs_->approxLen(tidx), begin);
989 if(toff + alen < (uint32_t)maxins) {
992 begin = toff + alen - maxins;
994 uint32_t mi = min<uint32_t>(alen, qlen);
996 end = min<uint32_t>(end, toff + alen - minins + qlen - 1);
997 if(toff + alen + qlen < (uint32_t)minins + 1) end = 0;
999 // Check if there's not enough space in the range to fit an
1000 // alignment for the outstanding mate.
1001 if(end - begin < qlen) return false;
1002 std::vector<Range> ranges;
1003 std::vector<uint32_t> offs;
1004 refAligner_->find(1, tidx, refs_, seq, qual, begin, end, ranges,
1005 offs, doneFw_ ? &pairs_rc_ : &pairs_fw_,
1007 assert_eq(ranges.size(), offs.size());
1008 for(size_t i = 0; i < ranges.size(); i++) {
1009 Range& r = ranges[i];
1011 r.cost |= (r.stratum << 14);
1013 const uint32_t result = offs[i];
1014 // Just copy the known range's top and bot for now
1017 bool ebwtLFw = matchRight ? range.ebwt->fw() : true;
1018 bool ebwtRFw = matchRight ? true : range.ebwt->fw();
1020 matchRight ? range : r, // range for upstream mate
1021 matchRight ? r : range, // range for downstream mate
1023 matchRight ? toff : result, // upstream offset
1024 matchRight ? result : toff, // downstream offset
1025 tlen, // length of ref
1026 !doneFw_, // whether the pair is being mapped to fw strand
1029 range.ebwt->rmap())) return true;
1035 * Advance paired-end alignment.
1037 void advanceOrientation(bool pairFw) {
1038 assert(!this->done);
1039 assert(!*donePair_);
1040 assert(!*chaseL_ || !*chaseR_);
1042 assert(!rangeMode_);
1043 assert(!*delayedchaseL_);
1044 assert(drL_->foundRange);
1045 assert(rchase_->foundOff() || rchase_->done);
1046 if(rchase_->foundOff()) {
1047 // Resolve this against the reference loci
1048 // determined for the other mate
1049 const bool overThresh = (*offsLsz_ + *offsRsz_) > mixedThresh_;
1050 if(!this->done && (overThresh || dontReconcile_)) {
1051 // Because the total size of both ranges exceeds
1052 // our threshold, we're now operating in "mixed
1054 const Range& r = drL_->range();
1056 if(verbose_) cout << "Making an attempt to find the outstanding mate" << endl;
1057 this->done = resolveOutstandingInRef(
1058 pairFw, rchase_->off(),
1059 r.ebwt->_plen[rchase_->off().first], r);
1060 if(++mixedAttempts_ > mixedAttemptLim_) {
1061 // Give up on this pair
1068 assert(rchase_->done);
1069 // Forget this range; keep looking for ranges
1071 drL_->foundRange = false;
1072 if(verbose_) cout << "Done with chase for first mate" << endl;
1073 if(*delayedchaseR_) {
1074 // Start chasing the delayed range
1075 if(verbose_) cout << "Resuming delayed chase for second mate" << endl;
1076 assert(drR_->foundRange);
1077 const Range& r = drR_->range();
1079 uint32_t top = r.top;
1080 uint32_t bot = r.bot;
1081 uint32_t qlen = doneFw_? qlen1_ : qlen2_;
1082 rchase_->setTopBot(top, bot, qlen, rand_, r.ebwt);
1084 *delayedchaseR_ = false;
1087 } else if(*chaseR_) {
1088 assert(!rangeMode_);
1089 assert(!*delayedchaseR_);
1090 assert(drR_->foundRange);
1091 assert(rchase_->foundOff() || rchase_->done);
1092 if(rchase_->foundOff()) {
1093 // Resolve this against the reference loci
1094 // determined for the other mate
1095 const bool overThresh = (*offsLsz_ + *offsRsz_) > mixedThresh_;
1096 if(!this->done && (overThresh || dontReconcile_)) {
1097 // Because the total size of both ranges exceeds
1098 // our threshold, we're now operating in "mixed
1100 const Range& r = drR_->range();
1101 if(verbose_) cout << "Making an attempt to find the outstanding mate" << endl;
1102 this->done = resolveOutstandingInRef(
1103 !pairFw, rchase_->off(),
1104 r.ebwt->_plen[rchase_->off().first], r);
1105 if(++mixedAttempts_ > mixedAttemptLim_) {
1106 // Give up on this pair
1113 assert(rchase_->done);
1114 // Forget this range; keep looking for ranges
1116 drR_->foundRange = false;
1117 if(verbose_) cout << "Done with chase for second mate" << endl;
1118 if(*delayedchaseL_) {
1119 // Start chasing the delayed range
1120 if(verbose_) cout << "Resuming delayed chase for first mate" << endl;
1121 assert(drL_->foundRange);
1122 const Range& r = drL_->range();
1124 uint32_t top = r.top;
1125 uint32_t bot = r.bot;
1126 uint32_t qlen = doneFw_? qlen2_ : qlen1_;
1127 rchase_->setTopBot(top, bot, qlen, rand_, r.ebwt);
1129 *delayedchaseL_ = false;
1133 if(!this->done && !*donePair_ && !*chaseL_ && !*chaseR_) {
1134 // Search for more ranges for whichever mate currently has
1135 // fewer candidate alignments
1136 if((*offsLsz_ < *offsRsz_ || drR_->done) && !drL_->done) {
1137 // If there are no more ranges for the other mate and
1138 // there are no candidate alignments either, then we're
1139 // not going to find a paired alignment in this
1141 if(drR_->done && *offsRsz_ == 0) {
1142 // Give up on this orientation
1143 if(verbose_) cout << "Giving up on paired orientation " << (pairFw? "fw" : "rc") << " in mate 1" << endl;
1145 if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << endl;
1148 assert(!*delayedchaseL_);
1149 if(!drL_->foundRange) drL_->advance(ADV_FOUND_RANGE);
1150 if(drL_->foundRange) {
1153 std::set<int64_t>& s = (pairFw ? allTopsL_fw_ : allTopsL_rc_);
1154 int64_t t = drL_->range().top + 1; // add 1 to avoid 0
1155 if(!drL_->range().ebwt->fw()) t = -t; // invert for bw index
1156 assert(s.find(t) == s.end());
1160 // Add the size of this range to the total for this mate
1161 *offsLsz_ += (drL_->range().bot - drL_->range().top);
1162 if(*offsRsz_ == 0 && (!dontReconcile_ || *offsLsz_ > 3)) {
1163 // Delay chasing this range; we delay to avoid
1164 // needlessly chasing rows in this range when
1165 // the other mate doesn't end up aligning
1167 if(verbose_) cout << "Delaying a chase for first mate" << endl;
1168 *delayedchaseL_ = true;
1170 // Start chasing this range
1171 if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << " " << drL_->range().top << endl;
1172 if(verbose_) cout << "Chasing a range for first mate" << endl;
1173 if(*offsLsz_ > symCeiling_ && *offsRsz_ > symCeiling_) {
1174 // Too many candidates for both mates; abort
1175 // without any more searching
1179 // If this is the first range for both mates,
1180 // choose the smaller range to chase down first
1181 if(*delayedchaseR_ && (*offsRsz_ < *offsLsz_)) {
1182 assert(drR_->foundRange);
1183 *delayedchaseR_ = false;
1184 *delayedchaseL_ = true;
1186 const Range& r = drR_->range();
1188 uint32_t qlen = doneFw_? qlen1_ : qlen2_;
1189 rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1191 // Use Burrows-Wheeler for this pair (as
1194 const Range& r = drL_->range();
1195 uint32_t qlen = doneFw_? qlen2_ : qlen1_;
1196 rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1200 } else if(!drR_->done) {
1201 // If there are no more ranges for the other mate and
1202 // there are no candidate alignments either, then we're
1203 // not going to find a paired alignment in this
1205 if(drL_->done && *offsLsz_ == 0) {
1206 // Give up on this orientation
1207 if(verbose_) cout << "Giving up on paired orientation " << (pairFw? "fw" : "rc") << " in mate 2" << endl;
1208 if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << endl;
1212 assert(!*delayedchaseR_);
1213 if(!drR_->foundRange) drR_->advance(ADV_FOUND_RANGE);
1214 if(drR_->foundRange) {
1217 std::set<int64_t>& s = (pairFw ? allTopsR_fw_ : allTopsR_rc_);
1218 int64_t t = drR_->range().top + 1; // add 1 to avoid 0
1219 if(!drR_->range().ebwt->fw()) t = -t; // invert for bw index
1220 assert(s.find(t) == s.end());
1224 // Add the size of this range to the total for this mate
1225 *offsRsz_ += (drR_->range().bot - drR_->range().top);
1226 if(*offsLsz_ == 0 && (!dontReconcile_ || *offsRsz_ > 3)) {
1227 // Delay chasing this range; we delay to avoid
1228 // needlessly chasing rows in this range when
1229 // the other mate doesn't end up aligning
1231 if(verbose_) cout << "Delaying a chase for second mate" << endl;
1232 *delayedchaseR_ = true;
1234 // Start chasing this range
1235 if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << " " << drR_->range().top << endl;
1236 if(verbose_) cout << "Chasing a range for second mate" << endl;
1237 if(*offsLsz_ > symCeiling_ && *offsRsz_ > symCeiling_) {
1238 // Too many candidates for both mates; abort
1239 // without any more searching
1243 // If this is the first range for both mates,
1244 // choose the smaller range to chase down first
1245 if(*delayedchaseL_ && *offsLsz_ < *offsRsz_) {
1246 assert(drL_->foundRange);
1247 *delayedchaseL_ = false;
1248 *delayedchaseR_ = true;
1250 const Range& r = drL_->range();
1252 uint32_t qlen = doneFw_? qlen2_ : qlen1_;
1253 rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1255 // Use Burrows-Wheeler for this pair (as
1258 const Range& r = drR_->range();
1260 uint32_t qlen = doneFw_? qlen1_ : qlen2_;
1261 rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1266 // Finished processing ranges for both mates
1267 assert(drL_->done && drR_->done);
1273 const BitPairReference* refs_;
1275 PatternSourcePerThread *patsrc_;
1280 bool doneFw_; // finished with forward orientation of both mates?
1288 bool delayedChase1Fw_;
1289 bool delayedChase1Rc_;
1290 bool delayedChase2Fw_;
1291 bool delayedChase2Rc_;
1293 // For searching for outstanding mates
1294 RefAligner<String<Dna5> >* refAligner_;
1296 // Temporary HitSink; to be deleted
1297 const HitSinkPerThreadFactory& sinkPtFactory_;
1298 HitSinkPerThread* sinkPt_;
1300 // State for alignment
1301 EbwtSearchParams<String<Dna> >* params_;
1303 // Paired-end boundaries
1304 const uint32_t minInsert_;
1305 const uint32_t maxInsert_;
1307 // Don't attempt pairwise all-versus-all style of mate
1308 // reconciliation; just rely on mixed mode
1309 const bool dontReconcile_;
1311 // If both mates in a given orientation align >= symCeiling times,
1312 // then immediately give up
1313 const uint32_t symCeiling_;
1315 // If the total number of alignments for both mates in a given
1316 // orientation exceeds mixedThresh, then switch to mixed mode
1317 const uint32_t mixedThresh_;
1318 const uint32_t mixedAttemptLim_;
1319 uint32_t mixedAttempts_;
1321 // Orientation of upstream/downstream mates when aligning to
1326 // State for getting alignments from ranges statefully
1327 RangeChaser<String<Dna> >* rchase_;
1329 // true -> be talkative
1331 // true -> suppress warnings
1338 // Range-finding state for first mate
1339 TDriver* driver1Fw_;
1340 TDriver* driver1Rc_;
1341 U32PairVec offs1FwArr_[32];
1342 TRangeVec ranges1FwArr_[32];
1343 uint32_t offs1FwSz_; // total size of all ranges found in this category
1344 U32PairVec offs1RcArr_[32];
1345 TRangeVec ranges1RcArr_[32];
1346 uint32_t offs1RcSz_; // total size of all ranges found in this category
1348 // Range-finding state for second mate
1349 TDriver* driver2Fw_;
1350 TDriver* driver2Rc_;
1351 U32PairVec offs2FwArr_[32];
1352 TRangeVec ranges2FwArr_[32];
1353 uint32_t offs2FwSz_; // total size of all ranges found in this category
1354 U32PairVec offs2RcArr_[32];
1355 TRangeVec ranges2RcArr_[32];
1356 uint32_t offs2RcSz_; // total size of all ranges found in this category
1360 bool& delayedchaseL_fw_;
1361 bool& delayedchaseR_fw_;
1364 U32PairVec* offsLarr_fw_;
1365 U32PairVec* offsRarr_fw_;
1366 TRangeVec* rangesLarr_fw_;
1367 TRangeVec* rangesRarr_fw_;
1368 uint32_t& offsLsz_fw_;
1369 uint32_t& offsRsz_fw_;
1373 bool& delayedchaseL_rc_;
1374 bool& delayedchaseR_rc_;
1377 U32PairVec* offsLarr_rc_;
1378 U32PairVec* offsRarr_rc_;
1379 TRangeVec* rangesLarr_rc_;
1380 TRangeVec* rangesRarr_rc_;
1381 uint32_t& offsLsz_rc_;
1382 uint32_t& offsRsz_rc_;
1386 bool* delayedchaseL_;
1387 bool* delayedchaseR_;
1390 U32PairVec* offsLarr_;
1391 U32PairVec* offsRarr_;
1392 TRangeVec* rangesLarr_;
1393 TRangeVec* rangesRarr_;
1400 /// For keeping track of paired alignments that have already been
1401 /// found for the forward and reverse-comp pair orientations
1402 TSetPairs pairs_fw_;
1403 TSetPairs pairs_rc_;
1406 std::set<int64_t> allTopsL_fw_;
1407 std::set<int64_t> allTopsR_fw_;
1408 std::set<int64_t> allTopsL_rc_;
1409 std::set<int64_t> allTopsR_rc_;
1416 * Helper struct that holds a Range together with the coordinates where it al
1418 struct RangeWithCoords {
1424 * An aligner for finding paired alignments while operating entirely
1425 * within the Burrows-Wheeler domain.
1427 template<typename TRangeSource>
1428 class PairedBWAlignerV2 : public Aligner {
1430 typedef std::pair<uint32_t,uint32_t> U32Pair;
1431 typedef std::vector<U32Pair> U32PairVec;
1432 typedef std::vector<Range> TRangeVec;
1433 typedef RangeSourceDriver<TRangeSource> TDriver;
1434 typedef std::pair<uint64_t, uint64_t> TU64Pair;
1435 typedef std::set<TU64Pair> TSetPairs;
1439 EbwtSearchParams<String<Dna> >* params,
1440 EbwtSearchParams<String<Dna> >* paramsSe1,
1441 EbwtSearchParams<String<Dna> >* paramsSe2,
1443 RefAligner<String<Dna5> >* refAligner,
1444 RangeChaser<String<Dna> >* rchase,
1446 const HitSinkPerThreadFactory& sinkPtFactory,
1447 HitSinkPerThread* sinkPt,
1448 HitSinkPerThread* sinkPtSe1,
1449 HitSinkPerThread* sinkPtSe2,
1453 uint32_t mixedAttemptLim,
1454 const BitPairReference* refs,
1461 Aligner(true, rangeMode),
1464 qlen1_(0), qlen2_(0),
1469 refAligner_(refAligner),
1470 sinkPtFactory_(sinkPtFactory),
1472 sinkPtSe1_(sinkPtSe1),
1473 sinkPtSe2_(sinkPtSe2),
1475 paramsSe1_(paramsSe1),
1476 paramsSe2_(paramsSe2),
1477 minInsert_(minInsert),
1478 maxInsert_(maxInsert),
1479 mixedAttemptLim_(mixedAttemptLim),
1481 fw1_(fw1), fw2_(fw2),
1490 assert(sinkPt_ != NULL);
1491 assert(params_ != NULL);
1492 assert(driver_ != NULL);
1495 virtual ~PairedBWAlignerV2() {
1496 delete driver_; driver_ = NULL;
1497 delete params_; params_ = NULL;
1498 if(paramsSe1_ != NULL) {
1499 delete paramsSe1_; paramsSe1_ = NULL;
1500 delete paramsSe2_; paramsSe2_ = NULL;
1502 delete rchase_; rchase_ = NULL;
1503 delete[] btCnt_; btCnt_ = NULL;
1504 delete refAligner_; refAligner_ = NULL;
1505 sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL;
1506 if(sinkPtSe1_ != NULL) {
1507 sinkPtFactory_.destroy(sinkPtSe1_); sinkPtSe1_ = NULL;
1508 sinkPtFactory_.destroy(sinkPtSe2_); sinkPtSe2_ = NULL;
1513 * Prepare this aligner for the next read.
1515 virtual void setQuery(PatternSourcePerThread* patsrc) {
1516 assert(!patsrc->bufa().empty());
1517 Aligner::setQuery(patsrc); // set fields & random seed
1518 assert(!patsrc->bufb().empty());
1519 // Give all of the drivers pointers to the relevant read info
1521 pool_->reset(&patsrc->bufa().name, patsrc->patid());
1522 if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) {
1524 cerr << "Warning: Skipping pair " << patsrc->bufa().name
1525 << " because a mate is less than 4 characters long" << endl;
1528 sinkPt_->finishRead(*patsrc_, true, true);
1531 driver_->setQuery(patsrc, NULL);
1532 qlen1_ = patsrc_->bufa().length();
1533 qlen2_ = patsrc_->bufb().length();
1534 if(btCnt_ != NULL) (*btCnt_) = maxBts_;
1536 // Neither orientation is done
1538 // No ranges are being chased yet
1540 donePe_ = doneSe1_ = doneSe2_ = false;
1546 * Advance the aligner by one memory op. Return true iff we're
1547 * done with this read.
1549 * A call to this function does one of many things:
1550 * 1. Advance a RangeSourceDriver and check if it found a new range
1551 * 2. Advance a RowChaseDriver and check if it found a reference
1552 * offset for a an alignment in a range
1554 virtual bool advance() {
1555 assert(!this->done);
1557 assert(!rangeMode_); // chasing ranges
1558 if(!rchase_->foundOff() && !rchase_->done) {
1562 assert(rchase_->foundOff() || rchase_->done);
1563 if(rchase_->foundOff()) {
1564 const Range& r = driver_->range();
1568 r.ebwt->_plen[rchase_->off().first], r);
1571 assert(rchase_->done);
1572 // Forget this range; keep looking for ranges
1574 this->done = driver_->done;
1578 if(!this->done && !chase_) {
1579 // Search for more ranges for whichever mate currently has
1580 // fewer candidate alignments
1581 if(!driver_->done) {
1584 // Check whether any of the PE/SE possibilities
1585 // have become impossible due to the minCost
1588 assert(!this->done);
1589 donePe_ = sinkPt_->irrelevantCost(driver_->minCost);
1590 if(donePe_ && (!sinkPt_->empty() || sinkPtSe1_ == NULL)) {
1591 // Paired-end alignment(s) were found, no
1592 // more will be found, and no unpaired
1593 // alignments are requested, so stop
1596 if(donePe_ && sinkPtSe1_ != NULL) {
1597 // Note: removeMate affects minCost
1598 if(doneSe1_) driver_->removeMate(1);
1599 if(doneSe2_) driver_->removeMate(2);
1602 if(!this->done && sinkPtSe1_ != NULL) {
1604 doneSe1_ = sinkPtSe1_->irrelevantCost(driver_->minCost);
1605 if(doneSe1_ && donePe_) driver_->removeMate(1);
1608 doneSe2_ = sinkPtSe2_->irrelevantCost(driver_->minCost);
1609 if(doneSe2_ && donePe_) driver_->removeMate(2);
1611 // Do Se1 again, because removing Se2 may have
1612 // nudged minCost over the threshold
1614 doneSe1_ = sinkPtSe1_->irrelevantCost(driver_->minCost);
1615 if(doneSe1_ && donePe_) driver_->removeMate(1);
1617 if(doneSe1_ && doneSe2_) assert(donePe_);
1618 this->done = donePe_ && doneSe1_ && doneSe2_;
1622 if(sinkPtSe1_ != NULL) {
1623 assert(doneSe1_ || !sinkPtSe1_->irrelevantCost(driver_->minCost));
1624 assert(doneSe2_ || !sinkPtSe2_->irrelevantCost(driver_->minCost));
1626 assert(donePe_ || !sinkPt_->irrelevantCost(driver_->minCost));
1627 driver_->advance(ADV_COST_CHANGES);
1630 if(driver_->foundRange) {
1631 // Use Burrows-Wheeler for this pair (as usual)
1633 driver_->foundRange = false;
1634 const Range& r = driver_->range();
1636 rchase_->setTopBot(r.top, r.bot,
1637 r.mate1 ? qlen1_ : qlen2_,
1646 bool reportedPe = (sinkPt_->finishRead(*patsrc_, true, true) > 0);
1647 if(sinkPtSe1_ != NULL) {
1648 sinkPtSe1_->finishRead(*patsrc_, !reportedPe, false);
1649 sinkPtSe2_->finishRead(*patsrc_, !reportedPe, false);
1658 * Helper for reporting a pair of alignments. As of now, we report
1659 * a paired alignment by reporting two consecutive alignments, one
1662 bool report(const Range& rL, // range for upstream mate
1663 const Range& rR, // range for downstream mate
1664 uint32_t first, // ref idx
1665 uint32_t upstreamOff, // offset for upstream mate
1666 uint32_t dnstreamOff, // offset for downstream mate
1667 uint32_t tlen, // length of ref
1668 bool pairFw, // whether the pair is being mapped to fw strand
1671 const ReferenceMap *rmap)
1673 assert_lt(upstreamOff, dnstreamOff);
1674 uint32_t spreadL = rL.bot - rL.top;
1675 uint32_t spreadR = rR.bot - rR.top;
1676 uint32_t oms = min(spreadL, spreadR) - 1;
1677 ReadBuf* bufL = pairFw ? bufa_ : bufb_;
1678 ReadBuf* bufR = pairFw ? bufb_ : bufa_;
1679 uint32_t lenL = pairFw ? alen_ : blen_;
1680 uint32_t lenR = pairFw ? blen_ : alen_;
1682 assert(!params_->sink().exceededOverThresh());
1683 params_->setFw(rL.fw);
1684 assert_eq(bufL->color, color);
1685 // Print upstream mate first
1686 ret = params_->reportHit(
1687 rL.fw ? (ebwtFwL? bufL->patFw : bufL->patFwRev) :
1688 (ebwtFwL? bufL->patRc : bufL->patRcRev),
1689 rL.fw ? (ebwtFwL? &bufL->qual : &bufL->qualRev) :
1690 (ebwtFwL? &bufL->qualRev : &bufL->qual),
1698 rL.mms, // mismatch positions
1699 rL.refcs, // reference characters for mms
1700 rL.numMms, // # mismatches
1701 make_pair(first, upstreamOff),// position
1702 make_pair(first, dnstreamOff),// mate position
1703 rR.fw, // mate orientation
1704 lenR, // mate length
1705 make_pair(rL.top, rL.bot), // arrows
1708 rL.stratum, // alignment stratum
1709 rL.cost, // cost, including quality penalty
1710 oms, // # other hits
1715 return true; // can happen when -m is set
1717 params_->setFw(rR.fw);
1718 assert_eq(bufR->color, color);
1719 ret = params_->reportHit(
1720 rR.fw ? (ebwtFwR? bufR->patFw : bufR->patFwRev) :
1721 (ebwtFwR? bufR->patRc : bufR->patRcRev),
1722 rR.fw ? (ebwtFwR? &bufR->qual : &bufR->qualRev) :
1723 (ebwtFwR? &bufR->qualRev : &bufR->qual),
1731 rR.mms, // mismatch positions
1732 rR.refcs, // reference characters for mms
1733 rR.numMms, // # mismatches
1734 make_pair(first, dnstreamOff),// position
1735 make_pair(first, upstreamOff),// mate position
1736 rL.fw, // mate orientation
1737 lenL, // mate length
1738 make_pair(rR.top, rR.bot), // arrows
1741 rR.stratum, // alignment stratum
1742 rR.cost, // cost, including quality penalty
1743 oms, // # other hits
1751 * Helper for reporting a pair of alignments. As of now, we report
1752 * a paired alignment by reporting two consecutive alignments, one
1755 void reportSe(const Range& r, U32Pair h, uint32_t tlen) {
1756 EbwtSearchParams<String<Dna> >*params = (r.mate1 ? paramsSe1_ : paramsSe2_);
1757 assert(!(r.mate1 ? doneSe1_ : doneSe2_));
1758 params->setFw(r.fw);
1759 ReadBuf* buf = r.mate1 ? bufa_ : bufb_;
1760 bool ebwtFw = r.ebwt->fw();
1761 uint32_t len = r.mate1 ? alen_ : blen_;
1762 assert_eq(buf->color, color);
1763 // Print upstream mate first
1764 if(params->reportHit(
1765 r.fw ? (ebwtFw? buf->patFw : buf->patFwRev) :
1766 (ebwtFw? buf->patRc : buf->patRcRev),
1767 r.fw ? (ebwtFw? &buf->qual : &buf->qualRev) :
1768 (ebwtFw? &buf->qualRev : &buf->qual),
1776 r.mms, // mismatch positions
1777 r.refcs, // reference characters for mms
1778 r.numMms, // # mismatches
1780 make_pair(0, 0), // (bogus) mate coords
1781 true, // (bogus) mate orientation
1782 0, // (bogus) mate length
1783 make_pair(r.top, r.bot), // arrows
1786 r.stratum, // alignment stratum
1787 r.cost, // cost, including quality penalty
1788 r.bot - r.top - 1, // # other hits
1793 if(r.mate1) doneSe1_ = true;
1794 else doneSe2_ = true;
1795 if(donePe_) driver_->removeMate(r.mate1 ? 1 : 2);
1799 void resolveOutstanding(const U32Pair& off,
1800 const uint32_t tlen,
1803 assert(!this->done);
1805 bool ret = resolveOutstandingInRef(off, tlen, range);
1806 if(++mixedAttempts_ > mixedAttemptLim_ || ret) {
1807 // Give up on this pair
1809 if(sinkPtSe1_ != NULL) {
1810 if(doneSe1_) driver_->removeMate(1);
1811 if(doneSe2_) driver_->removeMate(2);
1814 this->done = (donePe_ && (!sinkPt_->empty() || sinkPtSe1_ == NULL || (doneSe1_ && doneSe2_)));
1816 if(!this->done && sinkPtSe1_ != NULL) {
1817 bool doneSe = (range.mate1 ? doneSe1_ : doneSe2_);
1819 // Hold onto this single-end alignment in case we don't
1820 // find any paired alignments
1821 reportSe(range, off, tlen);
1823 this->done = doneSe1_ && doneSe2_ && donePe_;
1828 * Given a vector of reference positions where one of the two mates
1829 * (the "anchor" mate) has aligned, look directly at the reference
1830 * sequence for instances where the other mate (the "outstanding"
1831 * mate) aligns such that mating constraint is satisfied.
1833 * This function picks up to 'pick' anchors at random from the
1834 * 'offs' array. It returns the number that it actually picked.
1836 bool resolveOutstandingInRef(const U32Pair& off,
1837 const uint32_t tlen,
1841 assert(refs_->loaded());
1842 assert_lt(off.first, refs_->numRefs());
1843 // pairFw = true if the anchor indicates that the pair will
1844 // align in its forward orientation (i.e. with mate1 to the
1846 bool pairFw = (range.mate1)? (range.fw == fw1_) : (range.fw == fw2_);
1847 // matchRight = true, if the opposite mate will be to the right
1848 // of the anchor mate
1849 bool matchRight = (pairFw ? range.mate1 : !range.mate1);
1850 // fw = orientation of the opposite mate
1851 bool fw = range.mate1 ? fw2_ : fw1_; // whether outstanding mate is fw/rc
1852 if(!pairFw) fw = !fw;
1853 // 'seq' = sequence for opposite mate
1854 const String<Dna5>& seq =
1855 fw ? (range.mate1 ? patsrc_->bufb().patFw :
1856 patsrc_->bufa().patFw) :
1857 (range.mate1 ? patsrc_->bufb().patRc :
1858 patsrc_->bufa().patRc);
1859 // 'qual' = qualities for opposite mate
1860 const String<char>& qual =
1861 fw ? (range.mate1 ? patsrc_->bufb().qual :
1862 patsrc_->bufa().qual) :
1863 (range.mate1 ? patsrc_->bufb().qualRev :
1864 patsrc_->bufa().qualRev);
1865 uint32_t qlen = seqan::length(seq); // length of outstanding mate
1866 uint32_t alen = (range.mate1 ? patsrc_->bufa().length() :
1867 patsrc_->bufb().length());
1868 int minins = minInsert_;
1869 int maxins = maxInsert_;
1871 minins = max<int>(0, minins - patsrc_->bufa().trimmed5);
1872 maxins = max<int>(0, maxins - patsrc_->bufa().trimmed5);
1874 minins = max<int>(0, minins - patsrc_->bufa().trimmed3);
1875 maxins = max<int>(0, maxins - patsrc_->bufa().trimmed3);
1878 minins = max<int>(0, minins - patsrc_->bufb().trimmed3);
1879 maxins = max<int>(0, maxins - patsrc_->bufb().trimmed3);
1881 minins = max<int>(0, minins - patsrc_->bufb().trimmed5);
1882 maxins = max<int>(0, maxins - patsrc_->bufb().trimmed5);
1884 assert_geq(minins, 0);
1885 assert_geq(maxins, 0);
1886 // Don't even try if either of the mates is longer than the
1887 // maximum insert size.
1888 if((uint32_t)maxins <= max(qlen, alen)) {
1891 const uint32_t tidx = off.first; // text id where anchor mate hit
1892 const uint32_t toff = off.second; // offset where anchor mate hit
1893 // Set begin/end to the range of reference positions where
1894 // outstanding mate may align while fulfilling insert-length
1896 uint32_t begin, end;
1897 assert_geq(maxins, minins);
1898 uint32_t insDiff = maxins - minins;
1900 end = toff + maxins;
1902 if(qlen < alen) begin += alen-qlen;
1903 if(end > insDiff + qlen) {
1904 begin = max<uint32_t>(begin, end - insDiff - qlen);
1906 end = min<uint32_t>(refs_->approxLen(tidx), end);
1907 begin = min<uint32_t>(refs_->approxLen(tidx), begin);
1909 if(toff + alen < (uint32_t)maxins) {
1912 begin = toff + alen - maxins;
1914 uint32_t mi = min<uint32_t>(alen, qlen);
1915 end = toff + mi - 1;
1916 end = min<uint32_t>(end, toff + alen - minins + qlen - 1);
1917 if(toff + alen + qlen < (uint32_t)(minins + 1)) end = 0;
1919 // Check if there's not enough space in the range to fit an
1920 // alignment for the outstanding mate.
1921 if(end - begin < qlen) return false;
1922 std::vector<Range> ranges;
1923 std::vector<uint32_t> offs;
1924 refAligner_->find(1, tidx, refs_, seq, qual, begin, end, ranges,
1925 offs, pairFw ? &pairs_fw_ : &pairs_rc_,
1927 assert_eq(ranges.size(), offs.size());
1928 for(size_t i = 0; i < ranges.size(); i++) {
1929 Range& r = ranges[i];
1931 r.cost |= (r.stratum << 14);
1932 r.mate1 = !range.mate1;
1933 const uint32_t result = offs[i];
1934 // Just copy the known range's top and bot for now
1937 bool ebwtLFw = matchRight ? range.ebwt->fw() : true;
1938 bool ebwtRFw = matchRight ? true : range.ebwt->fw();
1940 matchRight ? range : r, // range for upstream mate
1941 matchRight ? r : range, // range for downstream mate
1943 matchRight ? toff : result, // upstream offset
1944 matchRight ? result : toff, // downstream offset
1945 tlen, // length of ref
1946 pairFw, // whether the pair is being mapped to fw strand
1949 range.ebwt->rmap())) return true;
1954 const BitPairReference* refs_;
1956 PatternSourcePerThread *patsrc_;
1957 uint32_t qlen1_, qlen2_;
1960 // true -> we're no longer shooting for paired-end alignments;
1961 // just collecting single-end ones
1962 bool donePe_, doneSe1_, doneSe2_;
1964 // For searching for outstanding mates
1965 RefAligner<String<Dna5> >* refAligner_;
1967 // Temporary HitSink; to be deleted
1968 const HitSinkPerThreadFactory& sinkPtFactory_;
1969 HitSinkPerThread* sinkPt_;
1970 HitSinkPerThread* sinkPtSe1_, * sinkPtSe2_;
1972 // State for alignment
1973 EbwtSearchParams<String<Dna> >* params_;
1975 EbwtSearchParams<String<Dna> >* paramsSe1_, * paramsSe2_;
1977 // Paired-end boundaries
1978 const uint32_t minInsert_;
1979 const uint32_t maxInsert_;
1981 const uint32_t mixedAttemptLim_;
1982 uint32_t mixedAttempts_;
1984 // Orientation of upstream/downstream mates when aligning to
1986 const bool fw1_, fw2_;
1988 // State for getting alignments from ranges statefully
1989 RangeChaser<String<Dna> >* rchase_;
1991 // Range-finding state for first mate
1994 // Pool for distributing chunks of best-first path descriptor memory
2000 int maxBts_; // maximum allowed # backtracks
2001 int *btCnt_; // current backtrack count
2003 /// For keeping track of paired alignments that have already been
2004 /// found for the forward and reverse-comp pair orientations
2005 TSetPairs pairs_fw_, pairs_rc_;
2008 #endif /* ALIGNER_H_ */