Imported Upstream version 0.12.7
[bowtie.git] / aligner.h
1 /**
2  * aligner.h
3  *
4  * A generic class providing a stateful way to find alignments.
5  */
6
7 #ifndef ALIGNER_H_
8 #define ALIGNER_H_
9
10 #include <iostream>
11 #include <set>
12 #include <stdint.h>
13 #include "seqan/sequence.h"
14 #include "assert_helpers.h"
15 #include "ebwt.h"
16 #include "pat.h"
17 #include "range.h"
18 #include "range_source.h"
19 #include "range_chaser.h"
20 #include "ref_aligner.h"
21 #include "reference.h"
22 #include "aligner_metrics.h"
23 #include "search_globals.h"
24
25 /**
26  * State machine for carrying out an alignment, which usually consists
27  * of a series of phases that conduct different alignments using
28  * different backtracking constraints.
29  *
30  * Each Aligner should have a dedicated PatternSourcePerThread.
31  */
32 class Aligner {
33 public:
34         Aligner(bool _done, bool rangeMode) :
35                 done(_done), patsrc_(NULL), bufa_(NULL), bufb_(NULL),
36                 rangeMode_(rangeMode)
37         { }
38
39         virtual ~Aligner() { }
40         /// Advance the range search by one memory op
41         virtual bool advance() = 0;
42
43         /// Prepare Aligner for the next read
44         virtual void setQuery(PatternSourcePerThread *patsrc) {
45                 assert(patsrc != NULL);
46                 patsrc_ = patsrc;
47                 bufa_ = &patsrc->bufa();
48                 assert(bufa_ != NULL);
49                 bufb_ = &patsrc->bufb();
50                 alen_ = bufa_->length();
51                 blen_ = (bufb_ != NULL) ? bufb_->length() : 0;
52                 rand_.init(bufa_->seed);
53         }
54
55         /**
56          * Set to true if all searching w/r/t the current query is
57          * finished or if there is no current query.
58          */
59         bool done;
60
61 protected:
62
63         // Current read pair
64         PatternSourcePerThread* patsrc_;
65         ReadBuf* bufa_;
66         uint32_t alen_;
67         ReadBuf* bufb_;
68         uint32_t blen_;
69         bool rangeMode_;
70         RandomSource rand_;
71 };
72
73 /**
74  * Abstract parent factory class for constructing aligners of all kinds.
75  */
76 class AlignerFactory {
77 public:
78         virtual ~AlignerFactory() { }
79         virtual Aligner* create() const = 0;
80
81         /**
82          * Allocate a vector of n Aligners; use destroy(std::vector...) to
83          * free the memory.
84          */
85         virtual std::vector<Aligner*>* create(uint32_t n) const {
86                 std::vector<Aligner*>* v = new std::vector<Aligner*>;
87                 for(uint32_t i = 0; i < n; i++) {
88                         v->push_back(create());
89                         assert(v->back() != NULL);
90                 }
91                 return v;
92         }
93
94         /// Free memory associated with the aligner
95         virtual void destroy(Aligner* al) const {
96                 assert(al != NULL);
97                 // Free the Aligner
98                 delete al;
99         }
100
101         /// Free memory associated with an aligner list
102         virtual void destroy(std::vector<Aligner*>* als) const {
103                 assert(als != NULL);
104                 // Free all of the Aligners
105                 for(size_t i = 0; i < als->size(); i++) {
106                         if((*als)[i] != NULL) {
107                                 delete (*als)[i];
108                                 (*als)[i] = NULL;
109                         }
110                 }
111                 // Free the vector
112                 delete als;
113         }
114 };
115
116 /**
117  * Coordinates multiple aligners of the same type (i.e. either all
118  * single-end or all paired-end).
119  */
120 class MultiAligner {
121 public:
122         MultiAligner(
123                         uint32_t n,
124                         uint32_t qUpto,
125                         const AlignerFactory& alignFact,
126                         const PatternSourcePerThreadFactory& patsrcFact) :
127                         n_(n), qUpto_(qUpto),
128                         alignFact_(alignFact), patsrcFact_(patsrcFact),
129                         aligners_(NULL), patsrcs_(NULL)
130         {
131                 aligners_ = alignFact_.create(n_);
132                 assert(aligners_ != NULL);
133                 patsrcs_ = patsrcFact_.create(n_);
134                 assert(patsrcs_ != NULL);
135         }
136
137         /// Free memory associated with the aligners and their pattern sources.
138         virtual ~MultiAligner() {
139                 alignFact_.destroy(aligners_);
140                 patsrcFact_.destroy(patsrcs_);
141         }
142
143         /**
144          * Advance an array of aligners in parallel, using prefetches to
145          * try to hide all the latency.
146          */
147         void run() {
148                 bool done = false;
149                 while(!done) {
150                         done = true;
151                         for(uint32_t i = 0; i < n_; i++) {
152                                 if(!(*aligners_)[i]->done) {
153                                         // Advance an aligner already in progress
154                                         done = false;
155                                         (*aligners_)[i]->advance();
156                                 } else {
157                                         // Get a new read and initialize an aligner with it
158                                         (*patsrcs_)[i]->nextReadPair();
159                                         if(!(*patsrcs_)[i]->empty() && (*patsrcs_)[i]->patid() < qUpto_) {
160                                                 (*aligners_)[i]->setQuery((*patsrcs_)[i]);
161                                                 assert(!(*aligners_)[i]->done);
162                                                 done = false;
163                                         } else {
164                                                 // No more reads; if done == true, it remains
165                                                 // true
166                                         }
167                                 }
168                         }
169                 }
170         }
171
172 protected:
173         uint32_t n_;     /// Number of aligners
174         uint32_t qUpto_; /// Number of reads to align before stopping
175         const AlignerFactory&                  alignFact_;
176         const PatternSourcePerThreadFactory&   patsrcFact_;
177         std::vector<Aligner *>*                aligners_;
178         std::vector<PatternSourcePerThread *>* patsrcs_;
179 };
180
181 /**
182  * Coordinates multiple single-end and paired-end aligners, routing
183  * reads to one or the other type as appropriate.
184  */
185 class MixedMultiAligner {
186 public:
187         MixedMultiAligner(
188                         uint32_t n,
189                         uint32_t qUpto,
190                         const AlignerFactory& alignSEFact,
191                         const AlignerFactory& alignPEFact,
192                         const PatternSourcePerThreadFactory& patsrcFact) :
193                         n_(n), qUpto_(qUpto),
194                         alignSEFact_(alignSEFact),
195                         alignPEFact_(alignPEFact),
196                         patsrcFact_(patsrcFact),
197                         alignersSE_(NULL),
198                         alignersPE_(NULL),
199                         seOrPe_(NULL),
200                         patsrcs_(NULL)
201         {
202                 // Instantiate all single-end aligners
203                 alignersSE_ = alignSEFact_.create(n_);
204                 assert(alignersSE_ != NULL);
205                 // Instantiate all paired-end aligners
206                 alignersPE_ = alignPEFact_.create(n_);
207                 assert(alignersPE_ != NULL);
208                 // Allocate array of boolean flags indicating whether each of
209                 // the slots is currently using the single-end or paired-end
210                 // aligner
211                 seOrPe_ = new bool[n_];
212                 for(uint32_t i = 0; i < n_; i++) {
213                         seOrPe_[i] = true;
214                 }
215                 // Instantiate all read sources
216                 patsrcs_ = patsrcFact_.create(n_);
217                 assert(patsrcs_ != NULL);
218         }
219
220         /// Free memory associated with the aligners and their pattern sources.
221         virtual ~MixedMultiAligner() {
222                 alignSEFact_.destroy(alignersSE_);
223                 alignPEFact_.destroy(alignersPE_);
224                 patsrcFact_.destroy(patsrcs_);
225                 delete[] seOrPe_;
226         }
227
228         /**
229          * Advance an array of aligners in parallel, using prefetches to
230          * try to hide all the latency.
231          */
232         void run(bool verbose = false) {
233                 bool done = false;
234                 bool first = true;
235                 if(n_ == 1) {
236                         Aligner *al = seOrPe_[0] ? (*alignersSE_)[0] : (*alignersPE_)[0];
237                         PatternSourcePerThread *ps = (*patsrcs_)[0];
238                         while(!done) {
239                                 done = true;
240                                 if(!first && !al->done) {
241                                         // Advance an aligner already in progress; this is
242                                         // the common case
243                                         done = false;
244                                         al->advance();
245                                 } else {
246                                         // Get a new read
247                                         ps->nextReadPair();
248                                         if(ps->patid() < qUpto_ && !ps->empty()) {
249                                                 if(ps->paired()) {
250                                                         // Read currently in buffer is paired-end
251                                                         (*alignersPE_)[0]->setQuery(ps);
252                                                         al = (*alignersPE_)[0];
253                                                         seOrPe_[0] = false; // false -> paired
254                                                 } else {
255                                                         // Read currently in buffer is single-end
256                                                         (*alignersSE_)[0]->setQuery(ps);
257                                                         al = (*alignersSE_)[0];
258                                                         seOrPe_[0] = true; // true = unpaired
259                                                 }
260                                                 done = false;
261                                         } else {
262                                                 // No more reads; if done == true, it remains
263                                                 // true
264                                         }
265                                 }
266                                 first = false;
267                         }
268                 } else {
269                         while(!done) {
270                                 done = true;
271                                 for(uint32_t i = 0; i < n_; i++) {
272                                         Aligner *al = seOrPe_[i] ? (*alignersSE_)[i] :
273                                                                                            (*alignersPE_)[i];
274                                         if(!first && !al->done) {
275                                                 // Advance an aligner already in progress; this is
276                                                 // the common case
277                                                 done = false;
278                                                 al->advance();
279                                         } else {
280                                                 // Feed a new read to a vacant aligner
281                                                 PatternSourcePerThread *ps = (*patsrcs_)[i];
282                                                 // Get a new read
283                                                 ps->nextReadPair();
284                                                 if(ps->patid() < qUpto_ && !ps->empty()) {
285                                                         if(ps->paired()) {
286                                                                 // Read currently in buffer is paired-end
287                                                                 (*alignersPE_)[i]->setQuery(ps);
288                                                                 seOrPe_[i] = false; // false -> paired
289                                                         } else {
290                                                                 // Read currently in buffer is single-end
291                                                                 (*alignersSE_)[i]->setQuery(ps);
292                                                                 seOrPe_[i] = true; // true = unpaired
293                                                         }
294                                                         done = false;
295                                                 } else {
296                                                         // No more reads; if done == true, it remains
297                                                         // true
298                                                 }
299                                         }
300                                 }
301                                 first = false;
302                         }
303                 }
304         }
305
306 protected:
307         uint32_t n_;     /// Number of aligners
308         uint32_t qUpto_; /// Number of reads to align before stopping
309         const AlignerFactory&                  alignSEFact_;
310         const AlignerFactory&                  alignPEFact_;
311         const PatternSourcePerThreadFactory&   patsrcFact_;
312         std::vector<Aligner *>*                alignersSE_;
313         std::vector<Aligner *>*                alignersPE_;
314         bool *                                 seOrPe_;
315         std::vector<PatternSourcePerThread *>* patsrcs_;
316 };
317
318 /**
319  * An aligner for finding exact matches of unpaired reads.  Always
320  * tries the forward-oriented version of the read before the reverse-
321  * oriented read.
322  */
323 template<typename TRangeSource>
324 class UnpairedAlignerV2 : public Aligner {
325         typedef RangeSourceDriver<TRangeSource> TDriver;
326 public:
327         UnpairedAlignerV2(
328                 EbwtSearchParams<String<Dna> >* params,
329                 TDriver* driver,
330                 RangeChaser<String<Dna> >* rchase,
331                 HitSink& sink,
332                 const HitSinkPerThreadFactory& sinkPtFactory,
333                 HitSinkPerThread* sinkPt,
334                 vector<String<Dna5> >& os,
335                 const BitPairReference* refs,
336                 bool rangeMode,
337                 bool verbose,
338                 bool quiet,
339                 int maxBts,
340                 ChunkPool *pool,
341                 int *btCnt = NULL,
342                 AlignerMetrics *metrics = NULL) :
343                 Aligner(true, rangeMode),
344                 refs_(refs),
345                 doneFirst_(true),
346                 firstIsFw_(true),
347                 chase_(false),
348                 sinkPtFactory_(sinkPtFactory),
349                 sinkPt_(sinkPt),
350                 params_(params),
351                 rchase_(rchase),
352                 driver_(driver),
353                 verbose_(verbose),
354                 quiet_(quiet),
355                 maxBts_(maxBts),
356                 pool_(pool),
357                 btCnt_(btCnt),
358                 metrics_(metrics)
359         {
360                 assert(pool_   != NULL);
361                 assert(sinkPt_ != NULL);
362                 assert(params_ != NULL);
363                 assert(driver_ != NULL);
364         }
365
366         virtual ~UnpairedAlignerV2() {
367                 delete driver_;  driver_  = NULL;
368                 delete params_;  params_  = NULL;
369                 delete rchase_;  rchase_  = NULL;
370                 delete[] btCnt_; btCnt_   = NULL;
371                 sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL;
372         }
373
374         /**
375          * Prepare this aligner for the next read.
376          */
377         virtual void setQuery(PatternSourcePerThread* patsrc) {
378                 Aligner::setQuery(patsrc); // set fields & random seed
379                 if(metrics_ != NULL) {
380                         metrics_->nextRead(patsrc->bufa().patFw);
381                 }
382                 pool_->reset(&patsrc->bufa().name, patsrc->patid());
383                 if(patsrc->bufa().length() < 4) {
384                         if(!quiet_) {
385                                 cerr << "Warning: Skipping read " << patsrc->bufa().name
386                                      << " because it is less than 4 characters long" << endl;
387                         }
388                         this->done = true;
389                         sinkPt_->finishRead(*patsrc_, true, true);
390                         return;
391                 }
392                 driver_->setQuery(patsrc, NULL);
393                 this->done = driver_->done;
394                 doneFirst_ = false;
395                 // Reset #-backtrack countdown
396                 if(btCnt_ != NULL) *btCnt_ = maxBts_;
397                 if(sinkPt_->setHits(patsrc->bufa().hitset)) {
398                         this->done = true;
399                         sinkPt_->finishRead(*patsrc_, true, true);
400                 }
401                 // Grab a bit from the pseudo-random seed to determine whether
402                 // to start with forward or reverse complement
403                 firstIsFw_ = ((patsrc->bufa().seed & 0x10) == 0);
404                 chase_ = false;
405         }
406
407         /**
408          * Helper for reporting an alignment.
409          */
410         inline bool report(const Range& ra,
411                            uint32_t first,
412                            uint32_t second,
413                            uint32_t tlen)
414         {
415                 bool ebwtFw = ra.ebwt->fw();
416                 params_->setFw(ra.fw);
417                 assert_eq(bufa_->color, color);
418                 return params_->reportHit(
419                                 ra.fw ? (ebwtFw? bufa_->patFw    : bufa_->patFwRev) :
420                                         (ebwtFw? bufa_->patRc    : bufa_->patRcRev),
421                                 ra.fw ? (ebwtFw? &bufa_->qual    : &bufa_->qualRev) :
422                                         (ebwtFw? &bufa_->qualRev : &bufa_->qual),
423                                 &bufa_->name,
424                                 bufa_->color,
425                                 colorExEnds,
426                                 snpPhred,
427                                 refs_,
428                                 ra.ebwt->rmap(),
429                                 ebwtFw,
430                                 ra.mms,                   // mismatch positions
431                                 ra.refcs,                 // reference characters for mms
432                                 ra.numMms,                // # mismatches
433                                 make_pair(first, second), // position
434                                 make_pair(0, 0),          // (bogus) mate position
435                                 true,                     // (bogus) mate orientation
436                                 0,                        // (bogus) mate length
437                                 make_pair(ra.top, ra.bot),// arrows
438                                 tlen,                     // textlen
439                                 alen_,                    // qlen
440                                 ra.stratum,               // alignment stratum
441                                 ra.cost,                  // cost, including qual penalty
442                                 ra.bot - ra.top - 1,      // # other hits
443                                 patsrc_->patid(),         // pattern id
444                                 bufa_->seed,              // pseudo-random seed
445                                 0);                       // mate (0 = unpaired)
446         }
447
448         /**
449          * Advance the aligner.  Return true iff we're
450          * done with this read.
451          */
452         virtual bool advance() {
453                 assert(!this->done);
454                 if(chase_) {
455                         assert(!rangeMode_);
456                         assert(driver_->foundRange);
457                         assert(!sinkPt_->irrelevantCost(driver_->range().cost));
458                         if(!rchase_->foundOff() && !rchase_->done) {
459                                 rchase_->advance();
460                                 return false;
461                         }
462                         if(rchase_->foundOff()) {
463                                 this->done = report(driver_->range(), rchase_->off().first,
464                                                     rchase_->off().second, rchase_->tlen());
465                                 rchase_->reset();
466                         } else {
467                                 assert(rchase_->done);
468                                 // Forget this range; keep looking for ranges
469                                 chase_ = false;
470                                 driver_->foundRange = false;
471                                 this->done = driver_->done;
472                         }
473                 }
474                 // Still advancing a
475                 if(!this->done && !chase_) {
476                         assert(!driver_->done || driver_->foundRange);
477                         if(driver_->foundRange) {
478                                 const Range& ra = driver_->range();
479                                 assert(!sinkPt_->irrelevantCost(ra.cost));
480                                 assert(ra.repOk());
481                                 if(rangeMode_) {
482                                         this->done = report(ra, ra.top, ra.bot, 0);
483                                         driver_->foundRange = false;
484                                 } else {
485                                         rchase_->setTopBot(ra.top, ra.bot, alen_, rand_, ra.ebwt);
486                                         if(rchase_->foundOff()) {
487                                                 this->done = report(
488                                                                 ra, rchase_->off().first,
489                                                                 rchase_->off().second, rchase_->tlen());
490                                                 rchase_->reset();
491                                         }
492                                         if(!rchase_->done && !sinkPt_->irrelevantCost(ra.cost)) {
493                                                 // Keep chasing this range
494                                                 chase_ = true;
495                                         } else {
496                                                 driver_->foundRange = false;
497                                         }
498                                 }
499                         } else {
500                                 this->done = sinkPt_->irrelevantCost(driver_->minCost);
501                                 if(!this->done) {
502                                         driver_->advance(ADV_COST_CHANGES);
503                                 } else {
504                                         // No longer necessarily true with chain input
505                                         //assert(!sinkPt_->spanStrata());
506                                 }
507                         }
508                         if(driver_->done && !driver_->foundRange && !chase_) {
509                                 this->done = true;
510                         }
511                 }
512                 if(this->done) {
513                         sinkPt_->finishRead(*patsrc_, true, true);
514                 }
515                 return this->done;
516         }
517
518 protected:
519
520         // Reference sequences (needed for colorspace decoding)
521         const BitPairReference* refs_;
522
523         // Progress state
524         bool doneFirst_;
525         bool firstIsFw_;
526         bool chase_;
527
528         // Temporary HitSink; to be deleted
529         const HitSinkPerThreadFactory& sinkPtFactory_;
530         HitSinkPerThread* sinkPt_;
531
532         // State for alignment
533         EbwtSearchParams<String<Dna> >* params_;
534
535         // State for getting alignments from ranges statefully
536         RangeChaser<String<Dna> >* rchase_;
537
538         // Range-finding state
539         TDriver* driver_;
540
541         bool verbose_; // be talkative
542         bool quiet_; // don't print informational/warning info
543
544         const int maxBts_;
545         ChunkPool *pool_;
546         int *btCnt_;
547         AlignerMetrics *metrics_;
548 };
549
550 /**
551  * An aligner for finding paired alignments while operating entirely
552  * within the Burrows-Wheeler domain.
553  */
554 template<typename TRangeSource>
555 class PairedBWAlignerV1 : public Aligner {
556
557         typedef std::pair<uint32_t,uint32_t> U32Pair;
558         typedef std::vector<U32Pair> U32PairVec;
559         typedef std::vector<Range> TRangeVec;
560         typedef RangeSourceDriver<TRangeSource> TDriver;
561         typedef std::pair<uint64_t, uint64_t> TU64Pair;
562         typedef std::set<TU64Pair> TSetPairs;
563
564 public:
565         PairedBWAlignerV1(
566                 EbwtSearchParams<String<Dna> >* params,
567                 TDriver* driver1Fw, TDriver* driver1Rc,
568                 TDriver* driver2Fw, TDriver* driver2Rc,
569                 RefAligner<String<Dna5> >* refAligner,
570                 RangeChaser<String<Dna> >* rchase,
571                 HitSink& sink,
572                 const HitSinkPerThreadFactory& sinkPtFactory,
573                 HitSinkPerThread* sinkPt,
574                 bool fw1, bool fw2,
575                 uint32_t minInsert,
576                 uint32_t maxInsert,
577                 bool dontReconcile,
578                 uint32_t symCeiling,
579                 uint32_t mixedThresh,
580                 uint32_t mixedAttemptLim,
581                 const BitPairReference* refs,
582                 bool rangeMode,
583                 bool verbose,
584                 bool quiet,
585                 int maxBts,
586                 ChunkPool *pool,
587                 int *btCnt) :
588                 Aligner(true, rangeMode),
589                 refs_(refs),
590                 patsrc_(NULL), qlen1_(0), qlen2_(0), doneFw_(true),
591                 doneFwFirst_(true),
592                 chase1Fw_(false), chase1Rc_(false),
593                 chase2Fw_(false), chase2Rc_(false),
594                 delayedChase1Fw_(false), delayedChase1Rc_(false),
595                 delayedChase2Fw_(false), delayedChase2Rc_(false),
596                 refAligner_(refAligner),
597                 sinkPtFactory_(sinkPtFactory),
598                 sinkPt_(sinkPt),
599                 params_(params),
600                 minInsert_(minInsert),
601                 maxInsert_(maxInsert),
602                 dontReconcile_(dontReconcile),
603                 symCeiling_(symCeiling),
604                 mixedThresh_(mixedThresh),
605                 mixedAttemptLim_(mixedAttemptLim),
606                 mixedAttempts_(0),
607                 fw1_(fw1), fw2_(fw2),
608                 rchase_(rchase),
609                 verbose_(verbose),
610                 quiet_(quiet),
611                 maxBts_(maxBts),
612                 pool_(pool),
613                 btCnt_(btCnt),
614                 driver1Fw_(driver1Fw), driver1Rc_(driver1Rc),
615                 offs1FwSz_(0), offs1RcSz_(0),
616                 driver2Fw_(driver2Fw), driver2Rc_(driver2Rc),
617                 offs2FwSz_(0), offs2RcSz_(0),
618
619                 chaseL_fw_       (fw1_ ? chase1Fw_        : chase1Rc_),
620                 chaseR_fw_       (fw2_ ? chase2Fw_        : chase2Rc_),
621                 delayedchaseL_fw_(fw1_ ? delayedChase1Fw_ : delayedChase1Rc_),
622                 delayedchaseR_fw_(fw2_ ? delayedChase2Fw_ : delayedChase2Rc_),
623                 drL_fw_          (fw1_ ? *driver1Fw_      : *driver1Rc_),
624                 drR_fw_          (fw2_ ? *driver2Fw_      : *driver2Rc_),
625                 offsLarr_fw_     (fw1_ ? offs1FwArr_      : offs1RcArr_),
626                 offsRarr_fw_     (fw2_ ? offs2FwArr_      : offs2RcArr_),
627                 rangesLarr_fw_   (fw1_ ? ranges1FwArr_    : ranges1RcArr_),
628                 rangesRarr_fw_   (fw2_ ? ranges2FwArr_    : ranges2RcArr_),
629                 offsLsz_fw_      (fw1_ ? offs1FwSz_       : offs1RcSz_),
630                 offsRsz_fw_      (fw2_ ? offs2FwSz_       : offs2RcSz_),
631
632                 chaseL_rc_       (fw2_ ? chase2Rc_        : chase2Fw_),
633                 chaseR_rc_       (fw1_ ? chase1Rc_        : chase1Fw_),
634                 delayedchaseL_rc_(fw2_ ? delayedChase2Rc_ : delayedChase2Fw_),
635                 delayedchaseR_rc_(fw1_ ? delayedChase1Rc_ : delayedChase1Fw_),
636                 drL_rc_          (fw2_ ? *driver2Rc_      : *driver2Fw_),
637                 drR_rc_          (fw1_ ? *driver1Rc_      : *driver1Fw_),
638                 offsLarr_rc_     (fw2_ ? offs2RcArr_      : offs2FwArr_),
639                 offsRarr_rc_     (fw1_ ? offs1RcArr_      : offs1FwArr_),
640                 rangesLarr_rc_   (fw2_ ? ranges2RcArr_    : ranges2FwArr_),
641                 rangesRarr_rc_   (fw1_ ? ranges1RcArr_    : ranges1FwArr_),
642                 offsLsz_rc_      (fw2_ ? offs2RcSz_       : offs2FwSz_),
643                 offsRsz_rc_      (fw1_ ? offs1RcSz_       : offs1FwSz_),
644
645                 chaseL_       (&chaseL_fw_),
646                 chaseR_       (&chaseR_fw_),
647                 delayedchaseL_(&delayedchaseL_fw_),
648                 delayedchaseR_(&delayedchaseR_fw_),
649                 drL_          (&drL_fw_),
650                 drR_          (&drR_fw_),
651                 offsLarr_     (offsLarr_fw_),
652                 offsRarr_     (offsRarr_fw_),
653                 rangesLarr_   (rangesLarr_fw_),
654                 rangesRarr_   (rangesRarr_fw_),
655                 offsLsz_      (&offsLsz_fw_),
656                 offsRsz_      (&offsRsz_fw_),
657                 donePair_     (&doneFw_),
658                 fwL_(fw1),
659                 fwR_(fw2),
660                 verbose2_(false)
661         {
662                 assert(pool_      != NULL);
663                 assert(sinkPt_    != NULL);
664                 assert(params_    != NULL);
665                 assert(driver1Fw_ != NULL);
666                 assert(driver1Rc_ != NULL);
667                 assert(driver2Fw_ != NULL);
668                 assert(driver2Rc_ != NULL);
669         }
670
671         virtual ~PairedBWAlignerV1() {
672                 delete driver1Fw_; driver1Fw_ = NULL;
673                 delete driver1Rc_; driver1Rc_ = NULL;
674                 delete driver2Fw_; driver2Fw_ = NULL;
675                 delete driver2Rc_; driver2Rc_ = NULL;
676                 delete params_;    params_    = NULL;
677                 delete rchase_;    rchase_    = NULL;
678                 delete[] btCnt_;   btCnt_     = NULL;
679                 delete refAligner_; refAligner_ = NULL;
680                 sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL;
681         }
682
683         /**
684          * Prepare this aligner for the next read.
685          */
686         virtual void setQuery(PatternSourcePerThread* patsrc) {
687                 assert(!patsrc->bufa().empty());
688                 Aligner::setQuery(patsrc); // set fields & random seed
689                 assert(!patsrc->bufb().empty());
690                 // Give all of the drivers pointers to the relevant read info
691                 patsrc_ = patsrc;
692                 pool_->reset(&patsrc->bufa().name, patsrc->patid());
693                 if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) {
694                         if(!quiet_) {
695                                 cerr << "Warning: Skipping pair " << patsrc->bufa().name
696                                          << " because a mate is less than 4 characters long" << endl;
697                         }
698                         this->done = true;
699                         sinkPt_->finishRead(*patsrc_, true, true);
700                         return;
701                 }
702                 driver1Fw_->setQuery(patsrc, NULL);
703                 driver1Rc_->setQuery(patsrc, NULL);
704                 driver2Fw_->setQuery(patsrc, NULL);
705                 driver2Rc_->setQuery(patsrc, NULL);
706                 qlen1_ = patsrc_->bufa().length();
707                 qlen2_ = patsrc_->bufb().length();
708                 if(btCnt_ != NULL) (*btCnt_) = maxBts_;
709                 // Neither orientation is done
710                 doneFw_   = false;
711                 doneFwFirst_ = true;
712                 this->done   = false;
713                 // No ranges are being chased yet
714                 chase1Fw_ = false;
715                 chase1Rc_ = false;
716                 chase2Fw_ = false;
717                 chase2Rc_ = false;
718                 delayedChase1Fw_ = false;
719                 delayedChase1Rc_ = false;
720                 delayedChase2Fw_ = false;
721                 delayedChase2Rc_ = false;
722                 // Clear all intermediate ranges
723                 for(size_t i = 0; i < 32; i++) {
724                         offs1FwArr_[i].clear();   offs1RcArr_[i].clear();
725                         offs2FwArr_[i].clear();   offs2RcArr_[i].clear();
726                         ranges1FwArr_[i].clear(); ranges1RcArr_[i].clear();
727                         ranges2FwArr_[i].clear(); ranges2RcArr_[i].clear();
728                 }
729                 offs1FwSz_ = offs1RcSz_ = offs2FwSz_ = offs2RcSz_ = 0;
730                 chaseL_        = &chaseL_fw_;
731                 chaseR_        = &chaseR_fw_;
732                 delayedchaseL_ = &delayedchaseL_fw_;
733                 delayedchaseR_ = &delayedchaseR_fw_;
734                 drL_           = &drL_fw_;
735                 drR_           = &drR_fw_;
736                 offsLarr_      = offsLarr_fw_;
737                 offsRarr_      = offsRarr_fw_;
738                 rangesLarr_    = rangesLarr_fw_;
739                 rangesRarr_    = rangesRarr_fw_;
740                 offsLsz_       = &offsLsz_fw_;
741                 offsRsz_       = &offsRsz_fw_;
742                 donePair_      = &doneFw_;
743                 fwL_           = fw1_;
744                 fwR_           = fw2_;
745                 mixedAttempts_ = 0;
746                 pairs_fw_.clear();
747                 pairs_rc_.clear();
748 #ifndef NDEBUG
749                 allTopsL_fw_.clear();
750                 allTopsR_fw_.clear();
751                 allTopsL_rc_.clear();
752                 allTopsR_rc_.clear();
753 #endif
754         }
755
756         /**
757          * Advance the aligner by one memory op.  Return true iff we're
758          * done with this read.
759          *
760          * A call to this function does one of many things:
761          * 1. Advance a RangeSourceDriver and check if it found a new range
762          * 2. Advance a RowChaseDriver and check if it found a reference
763          *    offset for a an alignment in a range
764          */
765         virtual bool advance() {
766                 assert(!this->done);
767                 if(doneFw_ && doneFwFirst_) {
768                         if(verbose2_) cout << "--" << endl;
769                         chaseL_        = &chaseL_rc_;
770                         chaseR_        = &chaseR_rc_;
771                         delayedchaseL_ = &delayedchaseL_rc_;
772                         delayedchaseR_ = &delayedchaseR_rc_;
773                         drL_           = &drL_rc_;
774                         drR_           = &drR_rc_;
775                         offsLarr_      = offsLarr_rc_;
776                         offsRarr_      = offsRarr_rc_;
777                         rangesLarr_    = rangesLarr_rc_;
778                         rangesRarr_    = rangesRarr_rc_;
779                         offsLsz_       = &offsLsz_rc_;
780                         offsRsz_       = &offsRsz_rc_;
781                         donePair_      = &this->done;
782                         fwL_           = !fw2_;
783                         fwR_           = !fw1_;
784                         doneFwFirst_   = false;
785                         mixedAttempts_ = 0;
786                 }
787                 bool chasing = *chaseL_ || *chaseR_;
788                 if(chasing && !rchase_->foundOff() && !rchase_->done) {
789                         rchase_->advance();
790                         return false;
791                 }
792                 advanceOrientation(!doneFw_);
793                 if(this->done) {
794                         if(verbose2_) cout << "----" << endl;
795                         sinkPt_->finishRead(*patsrc_, true, true);
796                 }
797                 return this->done;
798         }
799
800 protected:
801
802         /**
803          * Helper for reporting a pair of alignments.  As of now, we report
804          * a paired alignment by reporting two consecutive alignments, one
805          * for each mate.
806          */
807         bool report(const Range& rL, // range for upstream mate
808                     const Range& rR, // range for downstream mate
809                     uint32_t first,  // ref idx
810                     uint32_t upstreamOff, // offset for upstream mate
811                     uint32_t dnstreamOff, // offset for downstream mate
812                     uint32_t tlen, // length of ref
813                     bool pairFw,   // whether the pair is being mapped to fw strand
814                     bool ebwtFwL,
815                     bool ebwtFwR,
816                     const ReferenceMap* rmap)
817         {
818                 assert_lt(upstreamOff, dnstreamOff);
819                 uint32_t spreadL = rL.bot - rL.top;
820                 uint32_t spreadR = rR.bot - rR.top;
821                 uint32_t oms = min(spreadL, spreadR) - 1;
822                 ReadBuf* bufL = pairFw ? bufa_ : bufb_;
823                 ReadBuf* bufR = pairFw ? bufb_ : bufa_;
824                 uint32_t lenL = pairFw ? alen_ : blen_;
825                 uint32_t lenR = pairFw ? blen_ : alen_;
826                 bool ret;
827                 assert(!params_->sink().exceededOverThresh());
828                 params_->setFw(rL.fw);
829                 assert_eq(bufL->color, color);
830                 // Print upstream mate first
831                 ret = params_->reportHit(
832                                 rL.fw ? (ebwtFwL?  bufL->patFw  :  bufL->patFwRev) :
833                                             (ebwtFwL?  bufL->patRc  :  bufL->patRcRev),
834                                 rL.fw ? (ebwtFwL? &bufL->qual    : &bufL->qualRev) :
835                                         (ebwtFwL? &bufL->qualRev : &bufL->qual),
836                                 &bufL->name,
837                                 bufL->color,
838                                 colorExEnds,
839                                 snpPhred,
840                                 refs_,
841                                 rmap,
842                                 ebwtFwL,
843                                 rL.mms,                       // mismatch positions
844                                 rL.refcs,                     // reference characters for mms
845                                 rL.numMms,                    // # mismatches
846                                 make_pair(first, upstreamOff),// position
847                                 make_pair(first, dnstreamOff),// mate position
848                                 rR.fw,                        // mate orientation
849                                 lenR,                         // mate length
850                                 make_pair(rL.top, rL.bot),    // arrows
851                                 tlen,                         // textlen
852                                 lenL,                         // qlen
853                                 rL.stratum,                   // alignment stratum
854                                 rL.cost,                      // cost, including quality penalty
855                                 oms,                          // # other hits
856                                 bufL->patid,
857                                 bufL->seed,
858                                 pairFw ? 1 : 2);
859                 if(ret) {
860                         return true; // can happen when -m is set
861                 }
862                 params_->setFw(rR.fw);
863                 assert_eq(bufR->color, color);
864                 ret = params_->reportHit(
865                                 rR.fw ? (ebwtFwR?  bufR->patFw  :  bufR->patFwRev) :
866                                             (ebwtFwR?  bufR->patRc  :  bufR->patRcRev),
867                                 rR.fw ? (ebwtFwR? &bufR->qual    : &bufR->qualRev) :
868                                         (ebwtFwR? &bufR->qualRev : &bufR->qual),
869                                 &bufR->name,
870                                 bufR->color,
871                                 colorExEnds,
872                                 snpPhred,
873                                 refs_,
874                                 rmap,
875                                 ebwtFwR,
876                                 rR.mms,                       // mismatch positions
877                                 rR.refcs,                     // reference characters for mms
878                                 rR.numMms,                    // # mismatches
879                                 make_pair(first, dnstreamOff),// position
880                                 make_pair(first, upstreamOff),// mate position
881                                 rL.fw,                        // mate orientation
882                                 lenL,                         // mate length
883                                 make_pair(rR.top, rR.bot),    // arrows
884                                 tlen,                         // textlen
885                                 lenR,                         // qlen
886                                 rR.stratum,                   // alignment stratum
887                                 rR.cost,                      // cost, including quality penalty
888                                 oms,                          // # other hits
889                                 bufR->patid,
890                                 bufR->seed,
891                                 pairFw ? 2 : 1);
892                 return ret;
893         }
894
895         bool report(const Range& rL, // range for upstream mate
896                     const Range& rR, // range for downstream mate
897                     uint32_t first,  // ref idx
898                     uint32_t upstreamOff, // offset for upstream mate
899                     uint32_t dnstreamOff, // offset for downstream mate
900                     uint32_t tlen, // length of ref
901                     bool pairFw,   // whether the pair is being mapped to fw strand
902                     const ReferenceMap* rmap)
903         {
904                 return report(rL, rR, first, upstreamOff,
905                               dnstreamOff, tlen,
906                               pairFw, rL.ebwt->fw(), rR.ebwt->fw(), rmap);
907         }
908
909         /**
910          * Given a vector of reference positions where one of the two mates
911          * (the "anchor" mate) has aligned, look directly at the reference
912          * sequence for instances where the other mate (the "outstanding"
913          * mate) aligns such that mating constraint is satisfied.
914          *
915          * This function picks up to 'pick' anchors at random from the
916          * 'offs' array.  It returns the number that it actually picked.
917          */
918         bool resolveOutstandingInRef(const bool off1,
919                                      const U32Pair& off,
920                                      const uint32_t tlen,
921                                      const Range& range)
922         {
923                 assert(refs_->loaded());
924                 assert_lt(off.first, refs_->numRefs());
925                 // If matchRight is true, then we're trying to align the other
926                 // mate to the right of the already-aligned mate.  Otherwise,
927                 // to the left.
928                 bool matchRight = (off1 ? !doneFw_ : doneFw_);
929                 // Sequence and quals for mate to be matched
930                 bool fw = off1 ? fw2_ : fw1_; // whether outstanding mate is fw/rc
931                 if(doneFw_) fw = !fw;
932                 // 'seq' gets sequence of outstanding mate w/r/t the forward
933                 // reference strand
934                 const String<Dna5>& seq  = fw ? (off1 ? patsrc_->bufb().patFw   :
935                                                         patsrc_->bufa().patFw)  :
936                                                 (off1 ? patsrc_->bufb().patRc   :
937                                                         patsrc_->bufa().patRc);
938                 // 'seq' gets qualities of outstanding mate w/r/t the forward
939                 // reference strand
940                 const String<char>& qual = fw ? (off1 ? patsrc_->bufb().qual  :
941                                                         patsrc_->bufa().qual) :
942                                                 (off1 ? patsrc_->bufb().qualRev  :
943                                                         patsrc_->bufa().qualRev);
944                 uint32_t qlen = seqan::length(seq);  // length of outstanding mate
945                 uint32_t alen = (off1 ? patsrc_->bufa().length() :
946                                         patsrc_->bufb().length());
947                 int minins = minInsert_;
948                 int maxins = maxInsert_;
949                 if(fw1_) {
950                         minins = max<int>(0, minins - patsrc_->bufa().trimmed5);
951                         maxins = max<int>(0, maxins - patsrc_->bufa().trimmed5);
952                 } else {
953                         minins = max<int>(0, minins - patsrc_->bufa().trimmed3);
954                         maxins = max<int>(0, maxins - patsrc_->bufa().trimmed3);
955                 }
956                 if(fw2_) {
957                         minins = max<int>(0, minins - patsrc_->bufb().trimmed3);
958                         maxins = max<int>(0, maxins - patsrc_->bufb().trimmed3);
959                 } else {
960                         minins = max<int>(0, minins - patsrc_->bufb().trimmed5);
961                         maxins = max<int>(0, maxins - patsrc_->bufb().trimmed5);
962                 }
963                 assert_geq(minins, 0);
964                 assert_geq(maxins, 0);
965                 // Don't even try if either of the mates is longer than the
966                 // maximum insert size.
967                 if((uint32_t)maxins <= max(qlen, alen)) {
968                         return false;
969                 }
970                 const uint32_t tidx = off.first;
971                 const uint32_t toff = off.second;
972                 // Set begin/end to be a range of all reference
973                 // positions that are legally permitted to be involved in
974                 // the alignment of the outstanding mate.  It's up to the
975                 // callee to worry about how to scan these positions.
976                 uint32_t begin, end;
977                 assert_geq(maxins, minins);
978                 uint32_t insDiff = maxins - minins;
979                 if(matchRight) {
980                         end = toff + maxins;
981                         begin = toff + 1;
982                         if(qlen < alen) begin += alen-qlen;
983                         if(end > insDiff + qlen) {
984                                 begin = max<uint32_t>(begin, end - insDiff - qlen);
985                         }
986                         end = min<uint32_t>(refs_->approxLen(tidx), end);
987                         begin = min<uint32_t>(refs_->approxLen(tidx), begin);
988                 } else {
989                         if(toff + alen < (uint32_t)maxins) {
990                                 begin = 0;
991                         } else {
992                                 begin = toff + alen - maxins;
993                         }
994                         uint32_t mi = min<uint32_t>(alen, qlen);
995                         end = toff + mi - 1;
996                         end = min<uint32_t>(end, toff + alen - minins + qlen - 1);
997                         if(toff + alen + qlen < (uint32_t)minins + 1) end = 0;
998                 }
999                 // Check if there's not enough space in the range to fit an
1000                 // alignment for the outstanding mate.
1001                 if(end - begin < qlen) return false;
1002                 std::vector<Range> ranges;
1003                 std::vector<uint32_t> offs;
1004                 refAligner_->find(1, tidx, refs_, seq, qual, begin, end, ranges,
1005                                   offs, doneFw_ ? &pairs_rc_ : &pairs_fw_,
1006                                   toff, fw);
1007                 assert_eq(ranges.size(), offs.size());
1008                 for(size_t i = 0; i < ranges.size(); i++) {
1009                         Range& r = ranges[i];
1010                         r.fw = fw;
1011                         r.cost |= (r.stratum << 14);
1012                         r.mate1 = !off1;
1013                         const uint32_t result = offs[i];
1014                         // Just copy the known range's top and bot for now
1015                         r.top = range.top;
1016                         r.bot = range.bot;
1017                         bool ebwtLFw = matchRight ? range.ebwt->fw() : true;
1018                         bool ebwtRFw = matchRight ? true : range.ebwt->fw();
1019                         if(report(
1020                                 matchRight ? range : r, // range for upstream mate
1021                                 matchRight ? r : range, // range for downstream mate
1022                                 tidx,                   // ref idx
1023                                 matchRight ? toff : result, // upstream offset
1024                                 matchRight ? result : toff, // downstream offset
1025                                 tlen,       // length of ref
1026                                 !doneFw_,   // whether the pair is being mapped to fw strand
1027                                 ebwtLFw,
1028                                 ebwtRFw,
1029                                 range.ebwt->rmap())) return true;
1030                 }
1031                 return false;
1032         }
1033
1034         /**
1035          * Advance paired-end alignment.
1036          */
1037         void advanceOrientation(bool pairFw) {
1038                 assert(!this->done);
1039                 assert(!*donePair_);
1040                 assert(!*chaseL_ || !*chaseR_);
1041                 if(*chaseL_) {
1042                         assert(!rangeMode_);
1043                         assert(!*delayedchaseL_);
1044                         assert(drL_->foundRange);
1045                         assert(rchase_->foundOff() || rchase_->done);
1046                         if(rchase_->foundOff()) {
1047                                 // Resolve this against the reference loci
1048                                 // determined for the other mate
1049                                 const bool overThresh = (*offsLsz_ + *offsRsz_) > mixedThresh_;
1050                                 if(!this->done && (overThresh || dontReconcile_)) {
1051                                         // Because the total size of both ranges exceeds
1052                                         // our threshold, we're now operating in "mixed
1053                                         // mode"
1054                                         const Range& r = drL_->range();
1055                                         assert(r.repOk());
1056                                         if(verbose_) cout << "Making an attempt to find the outstanding mate" << endl;
1057                                         this->done = resolveOutstandingInRef(
1058                                                         pairFw, rchase_->off(),
1059                                                 r.ebwt->_plen[rchase_->off().first], r);
1060                                         if(++mixedAttempts_ > mixedAttemptLim_) {
1061                                                 // Give up on this pair
1062                                                 *donePair_ = true;
1063                                                 return;
1064                                         }
1065                                 }
1066                                 rchase_->reset();
1067                         } else {
1068                                 assert(rchase_->done);
1069                                 // Forget this range; keep looking for ranges
1070                                 *chaseL_ = false;
1071                                 drL_->foundRange = false;
1072                                 if(verbose_) cout << "Done with chase for first mate" << endl;
1073                                 if(*delayedchaseR_) {
1074                                         // Start chasing the delayed range
1075                                         if(verbose_) cout << "Resuming delayed chase for second mate" << endl;
1076                                         assert(drR_->foundRange);
1077                                         const Range& r = drR_->range();
1078                                         assert(r.repOk());
1079                                         uint32_t top = r.top;
1080                                         uint32_t bot = r.bot;
1081                                         uint32_t qlen = doneFw_? qlen1_ : qlen2_;
1082                                         rchase_->setTopBot(top, bot, qlen, rand_, r.ebwt);
1083                                         *chaseR_ = true;
1084                                         *delayedchaseR_ = false;
1085                                 }
1086                         }
1087                 } else if(*chaseR_) {
1088                         assert(!rangeMode_);
1089                         assert(!*delayedchaseR_);
1090                         assert(drR_->foundRange);
1091                         assert(rchase_->foundOff() || rchase_->done);
1092                         if(rchase_->foundOff()) {
1093                                 // Resolve this against the reference loci
1094                                 // determined for the other mate
1095                                 const bool overThresh = (*offsLsz_ + *offsRsz_) > mixedThresh_;
1096                                 if(!this->done && (overThresh || dontReconcile_)) {
1097                                         // Because the total size of both ranges exceeds
1098                                         // our threshold, we're now operating in "mixed
1099                                         // mode"
1100                                         const Range& r = drR_->range();
1101                                         if(verbose_) cout << "Making an attempt to find the outstanding mate" << endl;
1102                                         this->done = resolveOutstandingInRef(
1103                                                         !pairFw, rchase_->off(),
1104                                                 r.ebwt->_plen[rchase_->off().first], r);
1105                                         if(++mixedAttempts_ > mixedAttemptLim_) {
1106                                                 // Give up on this pair
1107                                                 *donePair_ = true;
1108                                                 return;
1109                                         }
1110                                 }
1111                                 rchase_->reset();
1112                         } else {
1113                                 assert(rchase_->done);
1114                                 // Forget this range; keep looking for ranges
1115                                 *chaseR_ = false;
1116                                 drR_->foundRange = false;
1117                                 if(verbose_) cout << "Done with chase for second mate" << endl;
1118                                 if(*delayedchaseL_) {
1119                                         // Start chasing the delayed range
1120                                         if(verbose_) cout << "Resuming delayed chase for first mate" << endl;
1121                                         assert(drL_->foundRange);
1122                                         const Range& r = drL_->range();
1123                                         assert(r.repOk());
1124                                         uint32_t top = r.top;
1125                                         uint32_t bot = r.bot;
1126                                         uint32_t qlen = doneFw_? qlen2_ : qlen1_;
1127                                         rchase_->setTopBot(top, bot, qlen, rand_, r.ebwt);
1128                                         *chaseL_ = true;
1129                                         *delayedchaseL_ = false;
1130                                 }
1131                         }
1132                 }
1133                 if(!this->done && !*donePair_ && !*chaseL_ && !*chaseR_) {
1134                         // Search for more ranges for whichever mate currently has
1135                         // fewer candidate alignments
1136                         if((*offsLsz_ < *offsRsz_ || drR_->done) && !drL_->done) {
1137                                 // If there are no more ranges for the other mate and
1138                                 // there are no candidate alignments either, then we're
1139                                 // not going to find a paired alignment in this
1140                                 // orientation.
1141                                 if(drR_->done && *offsRsz_ == 0) {
1142                                         // Give up on this orientation
1143                                         if(verbose_) cout << "Giving up on paired orientation " << (pairFw? "fw" : "rc") << " in mate 1" << endl;
1144                                         *donePair_ = true;
1145                                         if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << endl;
1146                                         return;
1147                                 }
1148                                 assert(!*delayedchaseL_);
1149                                 if(!drL_->foundRange) drL_->advance(ADV_FOUND_RANGE);
1150                                 if(drL_->foundRange) {
1151 #ifndef NDEBUG
1152                                         {
1153                                                 std::set<int64_t>& s = (pairFw ? allTopsL_fw_ : allTopsL_rc_);
1154                                                 int64_t t = drL_->range().top + 1; // add 1 to avoid 0
1155                                                 if(!drL_->range().ebwt->fw()) t = -t; // invert for bw index
1156                                                 assert(s.find(t) == s.end());
1157                                                 s.insert(t);
1158                                         }
1159 #endif
1160                                         // Add the size of this range to the total for this mate
1161                                         *offsLsz_ += (drL_->range().bot - drL_->range().top);
1162                                         if(*offsRsz_ == 0 && (!dontReconcile_ || *offsLsz_ > 3)) {
1163                                                 // Delay chasing this range; we delay to avoid
1164                                                 // needlessly chasing rows in this range when
1165                                                 // the other mate doesn't end up aligning
1166                                                 // anywhere
1167                                                 if(verbose_) cout << "Delaying a chase for first mate" << endl;
1168                                                 *delayedchaseL_ = true;
1169                                         } else {
1170                                                 // Start chasing this range
1171                                                 if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << " " << drL_->range().top << endl;
1172                                                 if(verbose_) cout << "Chasing a range for first mate" << endl;
1173                                                 if(*offsLsz_ > symCeiling_ && *offsRsz_ > symCeiling_) {
1174                                                         // Too many candidates for both mates; abort
1175                                                         // without any more searching
1176                                                         *donePair_ = true;
1177                                                         return;
1178                                                 }
1179                                                 // If this is the first range for both mates,
1180                                                 // choose the smaller range to chase down first
1181                                                 if(*delayedchaseR_ && (*offsRsz_ < *offsLsz_)) {
1182                                                         assert(drR_->foundRange);
1183                                                         *delayedchaseR_ = false;
1184                                                         *delayedchaseL_ = true;
1185                                                         *chaseR_ = true;
1186                                                         const Range& r = drR_->range();
1187                                                         assert(r.repOk());
1188                                                         uint32_t qlen = doneFw_? qlen1_ : qlen2_;
1189                                                         rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1190                                                 } else {
1191                                                         // Use Burrows-Wheeler for this pair (as
1192                                                         // usual)
1193                                                         *chaseL_ = true;
1194                                                         const Range& r = drL_->range();
1195                                                         uint32_t qlen = doneFw_? qlen2_ : qlen1_;
1196                                                         rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1197                                                 }
1198                                         }
1199                                 }
1200                         } else if(!drR_->done) {
1201                                 // If there are no more ranges for the other mate and
1202                                 // there are no candidate alignments either, then we're
1203                                 // not going to find a paired alignment in this
1204                                 // orientation.
1205                                 if(drL_->done && *offsLsz_ == 0) {
1206                                         // Give up on this orientation
1207                                         if(verbose_) cout << "Giving up on paired orientation " << (pairFw? "fw" : "rc") << " in mate 2" << endl;
1208                                         if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << endl;
1209                                         *donePair_ = true;
1210                                         return;
1211                                 }
1212                                 assert(!*delayedchaseR_);
1213                                 if(!drR_->foundRange) drR_->advance(ADV_FOUND_RANGE);
1214                                 if(drR_->foundRange) {
1215 #ifndef NDEBUG
1216                                         {
1217                                                 std::set<int64_t>& s = (pairFw ? allTopsR_fw_ : allTopsR_rc_);
1218                                                 int64_t t = drR_->range().top + 1; // add 1 to avoid 0
1219                                                 if(!drR_->range().ebwt->fw()) t = -t; // invert for bw index
1220                                                 assert(s.find(t) == s.end());
1221                                                 s.insert(t);
1222                                         }
1223 #endif
1224                                         // Add the size of this range to the total for this mate
1225                                         *offsRsz_ += (drR_->range().bot - drR_->range().top);
1226                                         if(*offsLsz_ == 0 && (!dontReconcile_ || *offsRsz_ > 3)) {
1227                                                 // Delay chasing this range; we delay to avoid
1228                                                 // needlessly chasing rows in this range when
1229                                                 // the other mate doesn't end up aligning
1230                                                 // anywhere
1231                                                 if(verbose_) cout << "Delaying a chase for second mate" << endl;
1232                                                 *delayedchaseR_ = true;
1233                                         } else {
1234                                                 // Start chasing this range
1235                                                 if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << " " << drR_->range().top << endl;
1236                                                 if(verbose_) cout << "Chasing a range for second mate" << endl;
1237                                                 if(*offsLsz_ > symCeiling_ && *offsRsz_ > symCeiling_) {
1238                                                         // Too many candidates for both mates; abort
1239                                                         // without any more searching
1240                                                         *donePair_ = true;
1241                                                         return;
1242                                                 }
1243                                                 // If this is the first range for both mates,
1244                                                 // choose the smaller range to chase down first
1245                                                 if(*delayedchaseL_ && *offsLsz_ < *offsRsz_) {
1246                                                         assert(drL_->foundRange);
1247                                                         *delayedchaseL_ = false;
1248                                                         *delayedchaseR_ = true;
1249                                                         *chaseL_ = true;
1250                                                         const Range& r = drL_->range();
1251                                                         assert(r.repOk());
1252                                                         uint32_t qlen = doneFw_? qlen2_ : qlen1_;
1253                                                         rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1254                                                 } else {
1255                                                         // Use Burrows-Wheeler for this pair (as
1256                                                         // usual)
1257                                                         *chaseR_ = true;
1258                                                         const Range& r = drR_->range();
1259                                                         assert(r.repOk());
1260                                                         uint32_t qlen = doneFw_? qlen1_ : qlen2_;
1261                                                         rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt);
1262                                                 }
1263                                         }
1264                                 }
1265                         } else {
1266                                 // Finished processing ranges for both mates
1267                                 assert(drL_->done && drR_->done);
1268                                 *donePair_ = true;
1269                         }
1270                 }
1271         }
1272
1273         const BitPairReference* refs_;
1274
1275         PatternSourcePerThread *patsrc_;
1276         uint32_t qlen1_;
1277         uint32_t qlen2_;
1278
1279         // Progress state
1280         bool doneFw_;   // finished with forward orientation of both mates?
1281         bool doneFwFirst_;
1282
1283         bool chase1Fw_;
1284         bool chase1Rc_;
1285         bool chase2Fw_;
1286         bool chase2Rc_;
1287
1288         bool delayedChase1Fw_;
1289         bool delayedChase1Rc_;
1290         bool delayedChase2Fw_;
1291         bool delayedChase2Rc_;
1292
1293         // For searching for outstanding mates
1294         RefAligner<String<Dna5> >* refAligner_;
1295
1296         // Temporary HitSink; to be deleted
1297         const HitSinkPerThreadFactory& sinkPtFactory_;
1298         HitSinkPerThread* sinkPt_;
1299
1300         // State for alignment
1301         EbwtSearchParams<String<Dna> >* params_;
1302
1303         // Paired-end boundaries
1304         const uint32_t minInsert_;
1305         const uint32_t maxInsert_;
1306
1307         // Don't attempt pairwise all-versus-all style of mate
1308         // reconciliation; just rely on mixed mode
1309         const bool dontReconcile_;
1310
1311         // If both mates in a given orientation align >= symCeiling times,
1312         // then immediately give up
1313         const uint32_t symCeiling_;
1314
1315         // If the total number of alignments for both mates in a given
1316         // orientation exceeds mixedThresh, then switch to mixed mode
1317         const uint32_t mixedThresh_;
1318         const uint32_t mixedAttemptLim_;
1319         uint32_t mixedAttempts_;
1320
1321         // Orientation of upstream/downstream mates when aligning to
1322         // forward strand
1323         const bool fw1_;
1324         const bool fw2_;
1325
1326         // State for getting alignments from ranges statefully
1327         RangeChaser<String<Dna> >* rchase_;
1328
1329         // true -> be talkative
1330         bool verbose_;
1331         // true -> suppress warnings
1332         bool quiet_;
1333
1334         int maxBts_;
1335         ChunkPool *pool_;
1336         int *btCnt_;
1337
1338         // Range-finding state for first mate
1339         TDriver*      driver1Fw_;
1340         TDriver*      driver1Rc_;
1341         U32PairVec    offs1FwArr_[32];
1342         TRangeVec     ranges1FwArr_[32];
1343         uint32_t      offs1FwSz_; // total size of all ranges found in this category
1344         U32PairVec    offs1RcArr_[32];
1345         TRangeVec     ranges1RcArr_[32];
1346         uint32_t      offs1RcSz_; // total size of all ranges found in this category
1347
1348         // Range-finding state for second mate
1349         TDriver*      driver2Fw_;
1350         TDriver*      driver2Rc_;
1351         U32PairVec    offs2FwArr_[32];
1352         TRangeVec     ranges2FwArr_[32];
1353         uint32_t      offs2FwSz_; // total size of all ranges found in this category
1354         U32PairVec    offs2RcArr_[32];
1355         TRangeVec     ranges2RcArr_[32];
1356         uint32_t      offs2RcSz_; // total size of all ranges found in this category
1357
1358         bool&       chaseL_fw_;
1359         bool&       chaseR_fw_;
1360         bool&       delayedchaseL_fw_;
1361         bool&       delayedchaseR_fw_;
1362         TDriver&    drL_fw_;
1363         TDriver&    drR_fw_;
1364         U32PairVec* offsLarr_fw_;
1365         U32PairVec* offsRarr_fw_;
1366         TRangeVec*  rangesLarr_fw_;
1367         TRangeVec*  rangesRarr_fw_;
1368         uint32_t&   offsLsz_fw_;
1369         uint32_t&   offsRsz_fw_;
1370
1371         bool&       chaseL_rc_;
1372         bool&       chaseR_rc_;
1373         bool&       delayedchaseL_rc_;
1374         bool&       delayedchaseR_rc_;
1375         TDriver&    drL_rc_;
1376         TDriver&    drR_rc_;
1377         U32PairVec* offsLarr_rc_;
1378         U32PairVec* offsRarr_rc_;
1379         TRangeVec*  rangesLarr_rc_;
1380         TRangeVec*  rangesRarr_rc_;
1381         uint32_t&   offsLsz_rc_;
1382         uint32_t&   offsRsz_rc_;
1383
1384         bool*       chaseL_;
1385         bool*       chaseR_;
1386         bool*       delayedchaseL_;
1387         bool*       delayedchaseR_;
1388         TDriver*    drL_;
1389         TDriver*    drR_;
1390         U32PairVec* offsLarr_;
1391         U32PairVec* offsRarr_;
1392         TRangeVec*  rangesLarr_;
1393         TRangeVec*  rangesRarr_;
1394         uint32_t*   offsLsz_;
1395         uint32_t*   offsRsz_;
1396         bool*       donePair_;
1397         bool        fwL_;
1398         bool        fwR_;
1399
1400         /// For keeping track of paired alignments that have already been
1401         /// found for the forward and reverse-comp pair orientations
1402         TSetPairs   pairs_fw_;
1403         TSetPairs   pairs_rc_;
1404
1405 #ifndef NDEBUG
1406         std::set<int64_t> allTopsL_fw_;
1407         std::set<int64_t> allTopsR_fw_;
1408         std::set<int64_t> allTopsL_rc_;
1409         std::set<int64_t> allTopsR_rc_;
1410 #endif
1411
1412         bool verbose2_;
1413 };
1414
1415 /**
1416  * Helper struct that holds a Range together with the coordinates where it al
1417  */
1418 struct RangeWithCoords {
1419         Range r;
1420         U32Pair h;
1421 };
1422
1423 /**
1424  * An aligner for finding paired alignments while operating entirely
1425  * within the Burrows-Wheeler domain.
1426  */
1427 template<typename TRangeSource>
1428 class PairedBWAlignerV2 : public Aligner {
1429
1430         typedef std::pair<uint32_t,uint32_t> U32Pair;
1431         typedef std::vector<U32Pair> U32PairVec;
1432         typedef std::vector<Range> TRangeVec;
1433         typedef RangeSourceDriver<TRangeSource> TDriver;
1434         typedef std::pair<uint64_t, uint64_t> TU64Pair;
1435         typedef std::set<TU64Pair> TSetPairs;
1436
1437 public:
1438         PairedBWAlignerV2(
1439                 EbwtSearchParams<String<Dna> >* params,
1440                 EbwtSearchParams<String<Dna> >* paramsSe1,
1441                 EbwtSearchParams<String<Dna> >* paramsSe2,
1442                 TDriver* driver,
1443                 RefAligner<String<Dna5> >* refAligner,
1444                 RangeChaser<String<Dna> >* rchase,
1445                 HitSink& sink,
1446                 const HitSinkPerThreadFactory& sinkPtFactory,
1447                 HitSinkPerThread* sinkPt,
1448                 HitSinkPerThread* sinkPtSe1,
1449                 HitSinkPerThread* sinkPtSe2,
1450                 bool fw1, bool fw2,
1451                 uint32_t minInsert,
1452                 uint32_t maxInsert,
1453                 uint32_t mixedAttemptLim,
1454                 const BitPairReference* refs,
1455                 bool rangeMode,
1456                 bool verbose,
1457                 bool quiet,
1458                 int maxBts,
1459                 ChunkPool *pool,
1460                 int *btCnt) :
1461                 Aligner(true, rangeMode),
1462                 refs_(refs),
1463                 patsrc_(NULL),
1464                 qlen1_(0), qlen2_(0),
1465                 chase_(false),
1466                 donePe_(false),
1467                 doneSe1_(false),
1468                 doneSe2_(false),
1469                 refAligner_(refAligner),
1470                 sinkPtFactory_(sinkPtFactory),
1471                 sinkPt_(sinkPt),
1472                 sinkPtSe1_(sinkPtSe1),
1473                 sinkPtSe2_(sinkPtSe2),
1474                 params_(params),
1475                 paramsSe1_(paramsSe1),
1476                 paramsSe2_(paramsSe2),
1477                 minInsert_(minInsert),
1478                 maxInsert_(maxInsert),
1479                 mixedAttemptLim_(mixedAttemptLim),
1480                 mixedAttempts_(0),
1481                 fw1_(fw1), fw2_(fw2),
1482                 rchase_(rchase),
1483                 driver_(driver),
1484                 pool_(pool),
1485                 verbose_(verbose),
1486                 quiet_(quiet),
1487                 maxBts_(maxBts),
1488                 btCnt_(btCnt)
1489         {
1490                 assert(sinkPt_ != NULL);
1491                 assert(params_ != NULL);
1492                 assert(driver_ != NULL);
1493         }
1494
1495         virtual ~PairedBWAlignerV2() {
1496                 delete driver_; driver_ = NULL;
1497                 delete params_; params_ = NULL;
1498                 if(paramsSe1_ != NULL) {
1499                         delete paramsSe1_; paramsSe1_ = NULL;
1500                         delete paramsSe2_; paramsSe2_ = NULL;
1501                 }
1502                 delete rchase_; rchase_ = NULL;
1503                 delete[] btCnt_; btCnt_ = NULL;
1504                 delete refAligner_; refAligner_ = NULL;
1505                 sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL;
1506                 if(sinkPtSe1_ != NULL) {
1507                         sinkPtFactory_.destroy(sinkPtSe1_); sinkPtSe1_ = NULL;
1508                         sinkPtFactory_.destroy(sinkPtSe2_); sinkPtSe2_ = NULL;
1509                 }
1510         }
1511
1512         /**
1513          * Prepare this aligner for the next read.
1514          */
1515         virtual void setQuery(PatternSourcePerThread* patsrc) {
1516                 assert(!patsrc->bufa().empty());
1517                 Aligner::setQuery(patsrc); // set fields & random seed
1518                 assert(!patsrc->bufb().empty());
1519                 // Give all of the drivers pointers to the relevant read info
1520                 patsrc_ = patsrc;
1521                 pool_->reset(&patsrc->bufa().name, patsrc->patid());
1522                 if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) {
1523                         if(!quiet_) {
1524                                 cerr << "Warning: Skipping pair " << patsrc->bufa().name
1525                                      << " because a mate is less than 4 characters long" << endl;
1526                         }
1527                         this->done = true;
1528                         sinkPt_->finishRead(*patsrc_, true, true);
1529                         return;
1530                 }
1531                 driver_->setQuery(patsrc, NULL);
1532                 qlen1_ = patsrc_->bufa().length();
1533                 qlen2_ = patsrc_->bufb().length();
1534                 if(btCnt_ != NULL) (*btCnt_) = maxBts_;
1535                 mixedAttempts_ = 0;
1536                 // Neither orientation is done
1537                 this->done = false;
1538                 // No ranges are being chased yet
1539                 chase_ = false;
1540                 donePe_ = doneSe1_ = doneSe2_ = false;
1541                 pairs_fw_.clear();
1542                 pairs_rc_.clear();
1543         }
1544
1545         /**
1546          * Advance the aligner by one memory op.  Return true iff we're
1547          * done with this read.
1548          *
1549          * A call to this function does one of many things:
1550          * 1. Advance a RangeSourceDriver and check if it found a new range
1551          * 2. Advance a RowChaseDriver and check if it found a reference
1552          *    offset for a an alignment in a range
1553          */
1554         virtual bool advance() {
1555                 assert(!this->done);
1556                 if(chase_) {
1557                         assert(!rangeMode_); // chasing ranges
1558                         if(!rchase_->foundOff() && !rchase_->done) {
1559                                 rchase_->advance();
1560                                 return false;
1561                         }
1562                         assert(rchase_->foundOff() || rchase_->done);
1563                         if(rchase_->foundOff()) {
1564                                 const Range& r = driver_->range();
1565                                 assert(r.repOk());
1566                                 resolveOutstanding(
1567                                         rchase_->off(),
1568                                         r.ebwt->_plen[rchase_->off().first], r);
1569                                 rchase_->reset();
1570                         } else {
1571                                 assert(rchase_->done);
1572                                 // Forget this range; keep looking for ranges
1573                                 chase_ = false;
1574                                 this->done = driver_->done;
1575                         }
1576                 }
1577
1578                 if(!this->done && !chase_) {
1579                         // Search for more ranges for whichever mate currently has
1580                         // fewer candidate alignments
1581                         if(!driver_->done) {
1582                                 if(!this->done) {
1583                                         //
1584                                         // Check whether any of the PE/SE possibilities
1585                                         // have become impossible due to the minCost
1586                                         //
1587                                         if(!donePe_) {
1588                                                 assert(!this->done);
1589                                                 donePe_ = sinkPt_->irrelevantCost(driver_->minCost);
1590                                                 if(donePe_ && (!sinkPt_->empty() || sinkPtSe1_ == NULL)) {
1591                                                         // Paired-end alignment(s) were found, no
1592                                                         // more will be found, and no unpaired
1593                                                         // alignments are requested, so stop
1594                                                         this->done = true;
1595                                                 }
1596                                                 if(donePe_ && sinkPtSe1_ != NULL) {
1597                                                         // Note: removeMate affects minCost
1598                                                         if(doneSe1_) driver_->removeMate(1);
1599                                                         if(doneSe2_) driver_->removeMate(2);
1600                                                 }
1601                                         }
1602                                         if(!this->done && sinkPtSe1_ != NULL) {
1603                                                 if(!doneSe1_) {
1604                                                         doneSe1_ = sinkPtSe1_->irrelevantCost(driver_->minCost);
1605                                                         if(doneSe1_ && donePe_) driver_->removeMate(1);
1606                                                 }
1607                                                 if(!doneSe2_) {
1608                                                         doneSe2_ = sinkPtSe2_->irrelevantCost(driver_->minCost);
1609                                                         if(doneSe2_ && donePe_) driver_->removeMate(2);
1610                                                 }
1611                                                 // Do Se1 again, because removing Se2 may have
1612                                                 // nudged minCost over the threshold
1613                                                 if(!doneSe1_) {
1614                                                         doneSe1_ = sinkPtSe1_->irrelevantCost(driver_->minCost);
1615                                                         if(doneSe1_ && donePe_) driver_->removeMate(1);
1616                                                 }
1617                                                 if(doneSe1_ && doneSe2_) assert(donePe_);
1618                                                 this->done = donePe_ && doneSe1_ && doneSe2_;
1619                                         }
1620
1621                                         if(!this->done) {
1622                                                 if(sinkPtSe1_ != NULL) {
1623                                                         assert(doneSe1_ || !sinkPtSe1_->irrelevantCost(driver_->minCost));
1624                                                         assert(doneSe2_ || !sinkPtSe2_->irrelevantCost(driver_->minCost));
1625                                                 }
1626                                                 assert(donePe_ || !sinkPt_->irrelevantCost(driver_->minCost));
1627                                                 driver_->advance(ADV_COST_CHANGES);
1628                                         }
1629                                 }
1630                                 if(driver_->foundRange) {
1631                                         // Use Burrows-Wheeler for this pair (as usual)
1632                                         chase_ = true;
1633                                         driver_->foundRange = false;
1634                                         const Range& r = driver_->range();
1635                                         assert(r.repOk());
1636                                         rchase_->setTopBot(r.top, r.bot,
1637                                                            r.mate1 ? qlen1_ : qlen2_,
1638                                                            rand_, r.ebwt);
1639                                 }
1640                         } else {
1641                                 this->done = true;
1642                         }
1643                 }
1644
1645                 if(this->done) {
1646                         bool reportedPe = (sinkPt_->finishRead(*patsrc_, true, true) > 0);
1647                         if(sinkPtSe1_ != NULL) {
1648                                 sinkPtSe1_->finishRead(*patsrc_, !reportedPe, false);
1649                                 sinkPtSe2_->finishRead(*patsrc_, !reportedPe, false);
1650                         }
1651                 }
1652                 return this->done;
1653         }
1654
1655 protected:
1656
1657         /**
1658          * Helper for reporting a pair of alignments.  As of now, we report
1659          * a paired alignment by reporting two consecutive alignments, one
1660          * for each mate.
1661          */
1662         bool report(const Range& rL, // range for upstream mate
1663                     const Range& rR, // range for downstream mate
1664                     uint32_t first,  // ref idx
1665                     uint32_t upstreamOff, // offset for upstream mate
1666                     uint32_t dnstreamOff, // offset for downstream mate
1667                     uint32_t tlen, // length of ref
1668                     bool pairFw,   // whether the pair is being mapped to fw strand
1669                     bool ebwtFwL,
1670                     bool ebwtFwR,
1671                     const ReferenceMap *rmap)
1672         {
1673                 assert_lt(upstreamOff, dnstreamOff);
1674                 uint32_t spreadL = rL.bot - rL.top;
1675                 uint32_t spreadR = rR.bot - rR.top;
1676                 uint32_t oms = min(spreadL, spreadR) - 1;
1677                 ReadBuf* bufL = pairFw ? bufa_ : bufb_;
1678                 ReadBuf* bufR = pairFw ? bufb_ : bufa_;
1679                 uint32_t lenL = pairFw ? alen_ : blen_;
1680                 uint32_t lenR = pairFw ? blen_ : alen_;
1681                 bool ret;
1682                 assert(!params_->sink().exceededOverThresh());
1683                 params_->setFw(rL.fw);
1684                 assert_eq(bufL->color, color);
1685                 // Print upstream mate first
1686                 ret = params_->reportHit(
1687                                 rL.fw ? (ebwtFwL?  bufL->patFw  :  bufL->patFwRev) :
1688                                         (ebwtFwL?  bufL->patRc  :  bufL->patRcRev),
1689                                 rL.fw ? (ebwtFwL? &bufL->qual    : &bufL->qualRev) :
1690                                         (ebwtFwL? &bufL->qualRev : &bufL->qual),
1691                                 &bufL->name,
1692                                 bufL->color,
1693                                 colorExEnds,
1694                                 snpPhred,
1695                                 refs_,
1696                                 rmap,
1697                                 ebwtFwL,
1698                                 rL.mms,                       // mismatch positions
1699                                 rL.refcs,                     // reference characters for mms
1700                                 rL.numMms,                    // # mismatches
1701                                 make_pair(first, upstreamOff),// position
1702                                 make_pair(first, dnstreamOff),// mate position
1703                                 rR.fw,                        // mate orientation
1704                                 lenR,                         // mate length
1705                                 make_pair(rL.top, rL.bot),    // arrows
1706                                 tlen,                         // textlen
1707                                 lenL,                         // qlen
1708                                 rL.stratum,                   // alignment stratum
1709                                 rL.cost,                      // cost, including quality penalty
1710                                 oms,                          // # other hits
1711                                 bufL->patid,
1712                                 bufL->seed,
1713                                 pairFw ? 1 : 2);
1714                 if(ret) {
1715                         return true; // can happen when -m is set
1716                 }
1717                 params_->setFw(rR.fw);
1718                 assert_eq(bufR->color, color);
1719                 ret = params_->reportHit(
1720                                 rR.fw ? (ebwtFwR?  bufR->patFw  :  bufR->patFwRev) :
1721                                         (ebwtFwR?  bufR->patRc  :  bufR->patRcRev),
1722                                 rR.fw ? (ebwtFwR? &bufR->qual    : &bufR->qualRev) :
1723                                         (ebwtFwR? &bufR->qualRev : &bufR->qual),
1724                                 &bufR->name,
1725                                 bufR->color,
1726                                 colorExEnds,
1727                                 snpPhred,
1728                                 refs_,
1729                                 rmap,
1730                                 ebwtFwR,
1731                                 rR.mms,                       // mismatch positions
1732                                 rR.refcs,                     // reference characters for mms
1733                                 rR.numMms,                    // # mismatches
1734                                 make_pair(first, dnstreamOff),// position
1735                                 make_pair(first, upstreamOff),// mate position
1736                                 rL.fw,                        // mate orientation
1737                                 lenL,                         // mate length
1738                                 make_pair(rR.top, rR.bot),    // arrows
1739                                 tlen,                         // textlen
1740                                 lenR,                         // qlen
1741                                 rR.stratum,                   // alignment stratum
1742                                 rR.cost,                      // cost, including quality penalty
1743                                 oms,                          // # other hits
1744                                 bufR->patid,
1745                                 bufR->seed,
1746                                 pairFw ? 2 : 1);
1747                 return ret;
1748         }
1749
1750         /**
1751          * Helper for reporting a pair of alignments.  As of now, we report
1752          * a paired alignment by reporting two consecutive alignments, one
1753          * for each mate.
1754          */
1755         void reportSe(const Range& r, U32Pair h, uint32_t tlen) {
1756                 EbwtSearchParams<String<Dna> >*params = (r.mate1 ? paramsSe1_ : paramsSe2_);
1757                 assert(!(r.mate1 ? doneSe1_ : doneSe2_));
1758                 params->setFw(r.fw);
1759                 ReadBuf* buf = r.mate1 ? bufa_ : bufb_;
1760                 bool ebwtFw = r.ebwt->fw();
1761                 uint32_t len = r.mate1 ? alen_ : blen_;
1762                 assert_eq(buf->color, color);
1763                 // Print upstream mate first
1764                 if(params->reportHit(
1765                         r.fw ? (ebwtFw?  buf->patFw   :  buf->patFwRev) :
1766                                (ebwtFw?  buf->patRc   :  buf->patRcRev),
1767                         r.fw ? (ebwtFw? &buf->qual    : &buf->qualRev) :
1768                                (ebwtFw? &buf->qualRev : &buf->qual),
1769                         &buf->name,
1770                         buf->color,
1771                         colorExEnds,
1772                         snpPhred,
1773                         refs_,
1774                         r.ebwt->rmap(),
1775                         ebwtFw,
1776                         r.mms,                   // mismatch positions
1777                         r.refcs,                 // reference characters for mms
1778                         r.numMms,                // # mismatches
1779                         h,                       // position
1780                         make_pair(0, 0),         // (bogus) mate coords
1781                         true,                    // (bogus) mate orientation
1782                         0,                       // (bogus) mate length
1783                         make_pair(r.top, r.bot), // arrows
1784                         tlen,                    // textlen
1785                         len,                     // qlen
1786                         r.stratum,               // alignment stratum
1787                         r.cost,                  // cost, including quality penalty
1788                         r.bot - r.top - 1,       // # other hits
1789                         buf->patid,
1790                         buf->seed,
1791                         0))
1792                 {
1793                         if(r.mate1) doneSe1_ = true;
1794                         else        doneSe2_ = true;
1795                         if(donePe_) driver_->removeMate(r.mate1 ? 1 : 2);
1796                 }
1797         }
1798
1799         void resolveOutstanding(const U32Pair& off,
1800                                 const uint32_t tlen,
1801                                 const Range& range)
1802         {
1803                 assert(!this->done);
1804                 if(!donePe_) {
1805                         bool ret = resolveOutstandingInRef(off, tlen, range);
1806                         if(++mixedAttempts_ > mixedAttemptLim_ || ret) {
1807                                 // Give up on this pair
1808                                 donePe_ = true;
1809                                 if(sinkPtSe1_ != NULL) {
1810                                         if(doneSe1_) driver_->removeMate(1);
1811                                         if(doneSe2_) driver_->removeMate(2);
1812                                 }
1813                         }
1814                         this->done = (donePe_ && (!sinkPt_->empty() || sinkPtSe1_ == NULL || (doneSe1_ && doneSe2_)));
1815                 }
1816                 if(!this->done && sinkPtSe1_ != NULL) {
1817                         bool doneSe = (range.mate1 ? doneSe1_ : doneSe2_);
1818                         if(!doneSe) {
1819                                 // Hold onto this single-end alignment in case we don't
1820                                 // find any paired alignments
1821                                 reportSe(range, off, tlen);
1822                         }
1823                         this->done = doneSe1_ && doneSe2_ && donePe_;
1824                 }
1825         }
1826
1827         /**
1828          * Given a vector of reference positions where one of the two mates
1829          * (the "anchor" mate) has aligned, look directly at the reference
1830          * sequence for instances where the other mate (the "outstanding"
1831          * mate) aligns such that mating constraint is satisfied.
1832          *
1833          * This function picks up to 'pick' anchors at random from the
1834          * 'offs' array.  It returns the number that it actually picked.
1835          */
1836         bool resolveOutstandingInRef(const U32Pair& off,
1837                                      const uint32_t tlen,
1838                                      const Range& range)
1839         {
1840                 assert(!donePe_);
1841                 assert(refs_->loaded());
1842                 assert_lt(off.first, refs_->numRefs());
1843                 // pairFw = true if the anchor indicates that the pair will
1844                 // align in its forward orientation (i.e. with mate1 to the
1845                 // left of mate2)
1846                 bool pairFw = (range.mate1)? (range.fw == fw1_) : (range.fw == fw2_);
1847                 // matchRight = true, if the opposite mate will be to the right
1848                 // of the anchor mate
1849                 bool matchRight = (pairFw ? range.mate1 : !range.mate1);
1850                 // fw = orientation of the opposite mate
1851                 bool fw = range.mate1 ? fw2_ : fw1_; // whether outstanding mate is fw/rc
1852                 if(!pairFw) fw = !fw;
1853                 // 'seq' = sequence for opposite mate
1854                 const String<Dna5>& seq  =
1855                         fw ? (range.mate1 ? patsrc_->bufb().patFw   :
1856                                         patsrc_->bufa().patFw)  :
1857                          (range.mate1 ? patsrc_->bufb().patRc   :
1858                                         patsrc_->bufa().patRc);
1859                 // 'qual' = qualities for opposite mate
1860                 const String<char>& qual =
1861                         fw ? (range.mate1 ? patsrc_->bufb().qual  :
1862                                             patsrc_->bufa().qual) :
1863                              (range.mate1 ? patsrc_->bufb().qualRev :
1864                                             patsrc_->bufa().qualRev);
1865                 uint32_t qlen = seqan::length(seq);  // length of outstanding mate
1866                 uint32_t alen = (range.mate1 ? patsrc_->bufa().length() :
1867                                                patsrc_->bufb().length());
1868                 int minins = minInsert_;
1869                 int maxins = maxInsert_;
1870                 if(fw1_) {
1871                         minins = max<int>(0, minins - patsrc_->bufa().trimmed5);
1872                         maxins = max<int>(0, maxins - patsrc_->bufa().trimmed5);
1873                 } else {
1874                         minins = max<int>(0, minins - patsrc_->bufa().trimmed3);
1875                         maxins = max<int>(0, maxins - patsrc_->bufa().trimmed3);
1876                 }
1877                 if(fw2_) {
1878                         minins = max<int>(0, minins - patsrc_->bufb().trimmed3);
1879                         maxins = max<int>(0, maxins - patsrc_->bufb().trimmed3);
1880                 } else {
1881                         minins = max<int>(0, minins - patsrc_->bufb().trimmed5);
1882                         maxins = max<int>(0, maxins - patsrc_->bufb().trimmed5);
1883                 }
1884                 assert_geq(minins, 0);
1885                 assert_geq(maxins, 0);
1886                 // Don't even try if either of the mates is longer than the
1887                 // maximum insert size.
1888                 if((uint32_t)maxins <= max(qlen, alen)) {
1889                         return false;
1890                 }
1891                 const uint32_t tidx = off.first;  // text id where anchor mate hit
1892                 const uint32_t toff = off.second; // offset where anchor mate hit
1893                 // Set begin/end to the range of reference positions where
1894                 // outstanding mate may align while fulfilling insert-length
1895                 // constraints.
1896                 uint32_t begin, end;
1897                 assert_geq(maxins, minins);
1898                 uint32_t insDiff = maxins - minins;
1899                 if(matchRight) {
1900                         end = toff + maxins;
1901                         begin = toff + 1;
1902                         if(qlen < alen) begin += alen-qlen;
1903                         if(end > insDiff + qlen) {
1904                                 begin = max<uint32_t>(begin, end - insDiff - qlen);
1905                         }
1906                         end = min<uint32_t>(refs_->approxLen(tidx), end);
1907                         begin = min<uint32_t>(refs_->approxLen(tidx), begin);
1908                 } else {
1909                         if(toff + alen < (uint32_t)maxins) {
1910                                 begin = 0;
1911                         } else {
1912                                 begin = toff + alen - maxins;
1913                         }
1914                         uint32_t mi = min<uint32_t>(alen, qlen);
1915                         end = toff + mi - 1;
1916                         end = min<uint32_t>(end, toff + alen - minins + qlen - 1);
1917                         if(toff + alen + qlen < (uint32_t)(minins + 1)) end = 0;
1918                 }
1919                 // Check if there's not enough space in the range to fit an
1920                 // alignment for the outstanding mate.
1921                 if(end - begin < qlen) return false;
1922                 std::vector<Range> ranges;
1923                 std::vector<uint32_t> offs;
1924                 refAligner_->find(1, tidx, refs_, seq, qual, begin, end, ranges,
1925                                   offs, pairFw ? &pairs_fw_ : &pairs_rc_,
1926                                   toff, fw);
1927                 assert_eq(ranges.size(), offs.size());
1928                 for(size_t i = 0; i < ranges.size(); i++) {
1929                         Range& r = ranges[i];
1930                         r.fw = fw;
1931                         r.cost |= (r.stratum << 14);
1932                         r.mate1 = !range.mate1;
1933                         const uint32_t result = offs[i];
1934                         // Just copy the known range's top and bot for now
1935                         r.top = range.top;
1936                         r.bot = range.bot;
1937                         bool ebwtLFw = matchRight ? range.ebwt->fw() : true;
1938                         bool ebwtRFw = matchRight ? true : range.ebwt->fw();
1939                         if(report(
1940                                 matchRight ? range : r, // range for upstream mate
1941                                 matchRight ? r : range, // range for downstream mate
1942                                 tidx,                   // ref idx
1943                                 matchRight ? toff : result, // upstream offset
1944                                 matchRight ? result : toff, // downstream offset
1945                                 tlen,       // length of ref
1946                                 pairFw,     // whether the pair is being mapped to fw strand
1947                                 ebwtLFw,
1948                                 ebwtRFw,
1949                                 range.ebwt->rmap())) return true;
1950                 }
1951                 return false;
1952         }
1953
1954         const BitPairReference* refs_;
1955
1956         PatternSourcePerThread *patsrc_;
1957         uint32_t qlen1_, qlen2_;
1958         bool chase_;
1959
1960         // true -> we're no longer shooting for paired-end alignments;
1961         // just collecting single-end ones
1962         bool donePe_, doneSe1_, doneSe2_;
1963
1964         // For searching for outstanding mates
1965         RefAligner<String<Dna5> >* refAligner_;
1966
1967         // Temporary HitSink; to be deleted
1968         const HitSinkPerThreadFactory& sinkPtFactory_;
1969         HitSinkPerThread* sinkPt_;
1970         HitSinkPerThread* sinkPtSe1_, * sinkPtSe2_;
1971
1972         // State for alignment
1973         EbwtSearchParams<String<Dna> >* params_;
1974         // for single-end:
1975         EbwtSearchParams<String<Dna> >* paramsSe1_, * paramsSe2_;
1976
1977         // Paired-end boundaries
1978         const uint32_t minInsert_;
1979         const uint32_t maxInsert_;
1980
1981         const uint32_t mixedAttemptLim_;
1982         uint32_t mixedAttempts_;
1983
1984         // Orientation of upstream/downstream mates when aligning to
1985         // forward strand
1986         const bool fw1_, fw2_;
1987
1988         // State for getting alignments from ranges statefully
1989         RangeChaser<String<Dna> >* rchase_;
1990
1991         // Range-finding state for first mate
1992         TDriver* driver_;
1993
1994         // Pool for distributing chunks of best-first path descriptor memory
1995         ChunkPool *pool_;
1996
1997         bool verbose_;
1998         bool quiet_;
1999
2000         int maxBts_; // maximum allowed # backtracks
2001         int *btCnt_; // current backtrack count
2002
2003         /// For keeping track of paired alignments that have already been
2004         /// found for the forward and reverse-comp pair orientations
2005         TSetPairs pairs_fw_, pairs_rc_;
2006 };
2007
2008 #endif /* ALIGNER_H_ */