Commit patch to not break on spaces.
[bowtie.git] / ref_aligner.h
1 /*
2  * ref_aligner.h
3  */
4
5 #ifndef REF_ALIGNER_H_
6 #define REF_ALIGNER_H_
7
8 #include <stdint.h>
9 #include <iostream>
10 #include <vector>
11 #include <stdexcept>
12 #include "seqan/sequence.h"
13 #include "alphabet.h"
14 #include "range.h"
15 #include "reference.h"
16
17 // Let the reference-aligner buffer size be 16K by default.  If more
18 // room is required, a new buffer must be allocated from the heap.
19 const static int REF_ALIGNER_BUFSZ = 16 * 1024;
20
21 /**
22  * Abstract parent class for classes that look for alignments by
23  * matching against the reference sequence directly.  This is useful
24  * both for sanity-checking results from the Bowtie index and for
25  * finding mates when the reference location of the opposite mate is
26  * known.
27  */
28 template<typename TStr>
29 class RefAligner {
30
31         typedef seqan::String<seqan::Dna5> TDna5Str;
32         typedef seqan::String<char> TCharStr;
33         typedef std::vector<uint32_t> TU32Vec;
34         typedef std::vector<Range> TRangeVec;
35         typedef std::pair<uint64_t, uint64_t> TU64Pair;
36         typedef std::set<TU64Pair> TSetPairs;
37
38 public:
39         RefAligner(bool color,
40                    bool verbose = false,
41                    bool quiet = false,
42                    uint32_t seedLen = 0,
43                    uint32_t qualMax = 0xffffffff,
44                    bool maqPenalty = false) :
45                 color_(color), verbose_(verbose), seedLen_(seedLen),
46                 qualMax_(qualMax), maqPenalty_(maqPenalty), refbuf_(buf_),
47                 refbufSz_(REF_ALIGNER_BUFSZ), freeRefbuf_(false)
48                 { }
49
50         /**
51          * Free the reference-space alignment buffer if this object
52          * allocated it.
53          */
54         virtual ~RefAligner() {
55                 if(freeRefbuf_) {
56                         delete[] refbuf_;
57                 }
58         }
59
60         /**
61          * Find one alignment of qry:quals in the range begin-end in
62          * reference string ref.  Store the alignment details in range.
63          */
64         virtual void find(uint32_t numToFind,
65                           const uint32_t tidx,
66                           const BitPairReference *refs,
67                           const TDna5Str& qry,
68                           const TCharStr& quals,
69                           uint32_t begin,
70                           uint32_t end,
71                           TRangeVec& ranges,
72                           TU32Vec& results,
73                           TSetPairs* pairs = NULL,
74                           uint32_t aoff = 0xffffffff,
75                           bool seedOnLeft = false)
76         {
77                 assert_gt(numToFind, 0);
78                 assert_gt(end, begin);
79                 uint32_t spread = end - begin + (color_ ? 1 : 0);
80                 uint32_t spreadPlus = spread + 12;
81                 // Make sure the buffer is large enough to accommodate the spread
82                 if(spreadPlus > this->refbufSz_) {
83                         this->newBuf(spreadPlus);
84                 }
85                 // Read in the relevant stretch of the reference string
86                 int offset = refs->getStretch(this->refbuf_, tidx, begin, spread);
87                 uint8_t *buf = ((uint8_t*)this->refbuf_) + offset;
88                 if(color_) {
89                         // Colorize buffer
90                         for(size_t i = 0; i < (end-begin); i++) {
91                                 assert_leq((int)buf[i], 4);
92                                 buf[i] = dinuc2color[(int)buf[i]][(int)buf[i+1]];
93                         }
94                 }
95                 // Look for alignments
96                 ASSERT_ONLY(uint32_t irsz = ranges.size());
97                 anchor64Find(numToFind, tidx, buf, qry, quals, begin,
98                              end, ranges, results, pairs, aoff, seedOnLeft);
99 #ifndef NDEBUG
100                 for(size_t i = irsz; i < results.size(); i++) {
101                         assert_eq(ranges[i].numMms, ranges[i].mms.size());
102                         assert_eq(ranges[i].numMms, ranges[i].refcs.size());
103                 }
104 #endif
105         }
106
107         /**
108          * Find one alignment of qry:quals in the range begin-end in
109          * reference string ref.  Store the alignment details in range.
110          * Uses a combination of the anchor bases and 64-bit arithmetic to
111          * find anchors quickly.
112          */
113         virtual void anchor64Find(uint32_t numToFind,
114                         uint32_t tidx,
115                         uint8_t* ref,
116                     const TDna5Str& qry,
117                     const TCharStr& quals,
118                     uint32_t begin,
119                     uint32_t end,
120                         TRangeVec& ranges,
121                         TU32Vec& results,
122                         TSetPairs* pairs = NULL,
123                         uint32_t aoff = 0xffffffff,
124                         bool seedOnLeft = false) const = 0;
125
126         /**
127          * Set a new reference-sequence buffer.
128          */
129         void setBuf(uint32_t *newbuf, uint32_t newsz) {
130                 if(freeRefbuf_) {
131                         delete[] refbuf_;
132                         freeRefbuf_ = false;
133                 }
134                 refbuf_ = newbuf;
135                 refbufSz_ = newsz;
136         }
137
138         /**
139          * Set a new reference-sequence buffer.
140          */
141         void newBuf(uint32_t newsz) {
142                 if(freeRefbuf_) {
143                         delete[] refbuf_;
144                 }
145                 try {
146                         refbuf_ = new uint32_t[(newsz + 3) / 4];
147                         if(refbuf_ == NULL) throw std::bad_alloc();
148                 } catch(std::bad_alloc& e) {
149                         cerr << "Error: Could not allocate reference-space alignment buffer of " << newsz << "B" << endl;
150                         throw 1;
151                 }
152                 refbufSz_ = newsz;
153                 freeRefbuf_ = true;
154         }
155
156 protected:
157         bool      color_;     /// whether to colorize reference buffers before handing off
158         bool      verbose_;   /// be talkative
159         uint32_t  seedLen_;   /// length of seed region for read
160         uint32_t  qualMax_;   /// maximum sum of quality penalties
161         bool      maqPenalty_;/// whether to round as in Maq
162         uint32_t *refbuf_;    /// pointer to current reference buffer
163         uint32_t  refbufSz_;  /// size of current reference buffer
164         uint32_t  buf_[REF_ALIGNER_BUFSZ / 4]; /// built-in reference buffer (may be superseded)
165         bool      freeRefbuf_; /// whether refbuf_ points to something we should delete
166 };
167
168 /**
169  * Concrete RefAligner for finding nearby exact hits given an anchor
170  * hit.
171  */
172 template<typename TStr>
173 class ExactRefAligner : public RefAligner<TStr> {
174
175         typedef seqan::String<seqan::Dna5> TDna5Str;
176         typedef seqan::String<char> TCharStr;
177         typedef std::vector<uint32_t> TU32Vec;
178         typedef std::vector<Range> TRangeVec;
179         typedef std::pair<uint64_t, uint64_t> TU64Pair;
180         typedef std::set<TU64Pair> TSetPairs;
181
182 public:
183
184         ExactRefAligner(bool color, bool verbose, bool quiet) :
185                 RefAligner<TStr>(color, verbose, quiet) { }
186
187         virtual ~ExactRefAligner() { }
188
189 protected:
190         /**
191          * Because we're doing end-to-end exact, we don't care which end of
192          * 'qry' is the 5' end.
193          */
194         void naiveFind(uint32_t numToFind,
195                        uint32_t tidx,
196                        uint8_t* ref,
197                    const TDna5Str& qry,
198                    const TCharStr& quals,
199                    uint32_t begin,
200                    uint32_t end,
201                        TRangeVec& ranges,
202                        TU32Vec& results,
203                        TSetPairs* pairs,
204                        uint32_t aoff,
205                        bool seedOnLeft) const
206         {
207                 assert_gt(numToFind, 0);
208                 const uint32_t qlen = seqan::length(qry);
209                 assert_geq(end - begin, qlen); // caller should have checked this
210                 assert_gt(end, begin);
211                 assert_gt(qlen, 0);
212                 uint32_t qend = end - qlen;
213                 uint32_t lim = qend - begin;
214                 uint32_t halfway = begin + (lim >> 1);
215                 bool hi = false;
216                 for(uint32_t i = 1; i <= lim+1; i++) {
217                         uint32_t ri;  // leftmost position in candidate alignment
218                         uint32_t rir; // same, minus begin; for indexing into ref[]
219                         if(hi) {
220                                 ri = halfway + (i >> 1); rir = ri - begin;
221                                 assert_leq(ri, qend);
222                         } else {
223                                 ri = halfway - (i >> 1); rir = ri - begin;
224                                 assert_geq(ri, begin);
225                         }
226                         hi = !hi;
227                         // Do the naive comparison
228                         bool match = true;
229                         for(uint32_t j = 0; j < qlen; j++) {
230 #if 0
231                                 // Count Ns in the reference as mismatches
232                                 const int q = (int)qry[j];
233                                 const int r = (int)ref[rir + j];
234                                 assert_leq(q, 4);
235                                 assert_leq(r, 4);
236                                 if(q == 4 || r == 4 || q != r) {
237                                         // Mismatch
238                                         match = false;
239                                         break;
240                                 }
241                                 // Match; continue
242 #else
243                                 // Disallow alignments that involve an N in the
244                                 // reference
245                                 const int r = (int)ref[rir + j];
246                                 if(r & 4) {
247                                         // N in reference; bail
248                                         match = false;
249                                         break;
250                                 }
251                                 const int q = (int)qry[j];
252                                 assert_leq(q, 4);
253                                 assert_lt(r, 4);
254                                 if(q != r) {
255                                         // Mismatch
256                                         match = false;
257                                         break;
258                                 }
259                                 // Match; continue
260 #endif
261                         }
262                         if(match) {
263                                 ranges.resize(ranges.size()+1);
264                                 Range& range = ranges.back();
265                                 range.stratum = 0;
266                                 range.numMms = 0;
267                                 assert_eq(0, range.mms.size());
268                                 assert_eq(0, range.refcs.size());
269                                 results.push_back(ri);
270                         }
271                 }
272                 return; // no match
273         }
274
275         /**
276          * Because we're doing end-to-end exact, we don't care which end of
277          * 'qry' is the 5' end.
278          */
279         virtual void anchor64Find(uint32_t numToFind,
280                     uint32_t tidx,
281                         uint8_t *ref,
282                     const TDna5Str& qry,
283                     const TCharStr& quals,
284                     uint32_t begin,
285                     uint32_t end,
286                         TRangeVec& ranges,
287                         TU32Vec& results,
288                         TSetPairs* pairs,
289                         uint32_t aoff, // offset of anchor mate
290                         bool seedOnLeft) const
291         {
292                 assert_gt(numToFind, 0);
293                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
294                 ASSERT_ONLY(uint32_t duplicates = 0);
295                 ASSERT_ONLY(uint32_t r2i = 0);
296                 const uint32_t qlen = seqan::length(qry);
297                 assert_geq(end - begin, qlen); // caller should have checked this
298                 assert_gt(end, begin);
299                 assert_gt(qlen, 0);
300 #ifndef NDEBUG
301                 // Get all naive hits
302                 TRangeVec r2; TU32Vec re2;
303                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
304                           re2, pairs, aoff, seedOnLeft);
305 #endif
306                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
307                 // anchorOverhang = # read bases not included in the anchor
308                 const uint32_t anchorOverhang = qlen <= 32 ? 0 : qlen - 32;
309                 const uint32_t lim = end - qlen - begin;
310                 const uint32_t halfway = begin + (lim >> 1);
311                 uint64_t anchor = 0llu;
312                 uint64_t buffw = 0llu;
313                 // Set up a mask that we'll apply to the two bufs every round
314                 // to discard bits that were rotated out of the anchor area
315                 uint64_t clearMask = 0xffffffffffffffffllu;
316                 bool useMask = false;
317                 if(anchorBitPairs < 32) {
318                         clearMask >>= ((32-anchorBitPairs) << 1);
319                         useMask = true;
320                 }
321                 const int lhsShift = ((anchorBitPairs - 1) << 1);
322                 // Build the contents of the 'anchor' dword and the initial
323                 // contents of the 'buffw' dword.  If there are fewer than 32
324                 // anchorBitPairs, the content will be packed into the least
325                 // significant bits of the word.
326                 uint32_t skipLeftToRights = 0;
327                 uint32_t skipRightToLefts = 0;
328                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
329                         int c = (int)qry[i]; // next query character
330                         assert_leq(c, 4);
331                         if(c & 4) {
332                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
333                                 return; // can't match if query has Ns
334                         }
335                         int r = (int)ref[halfway - begin + i]; // next reference character
336                         if(r & 4) {
337                                 r = 0;
338                                 // The right-to-left direction absorbs the candidate
339                                 // alignment based at 'halfway'; so skipLeftToRights is
340                                 // i, not i+1
341                                 skipLeftToRights = max(skipLeftToRights, i);
342                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
343                         }
344                         assert_lt(r, 4);
345                         assert_lt(c, 4);
346                         anchor  = ((anchor  << 2llu) | c);
347                         buffw = ((buffw << 2llu) | r);
348                 }
349                 // Check whether read is disqualified by Ns outside of the anchor
350                 // region
351                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
352                         if((int)qry[i] == 4) {
353                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
354                                 return; // can't match if query has Ns
355                         }
356                 }
357                 uint64_t bufbw = buffw;
358                 // We're moving the right-hand edge of the anchor along until
359                 // it's 'anchorOverhang' chars from the end of the target region.
360                 // Note that we're not making a 3'/5' distinction here; if we
361                 // were, we might need to make the 'anchorOverhang' adjustment on
362                 // the left end of the range rather than the right.
363                 bool hi = false;
364                 uint32_t riHi  = halfway;
365                 uint32_t rirHi = halfway - begin;
366                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
367                 uint32_t riLo  = halfway + 1;
368                 uint32_t rirLo = halfway - begin + 1;
369                 for(uint32_t i = 1; i <= lim + 1; i++) {
370                         int r;       // new reference char
371                         assert_lt(skipLeftToRights, qlen);
372                         assert_leq(skipRightToLefts, qlen);
373                         if(hi) {
374                                 hi = false;
375                                 // Moving left-to-right
376                                 riHi++; rirHi++; rirHiAnchor++;
377                                 r = (int)ref[rirHiAnchor];
378                                 if(r & 4) {
379                                         r = 0;
380                                         skipLeftToRights = anchorBitPairs;
381                                 }
382                                 assert_lt(r, 4);
383                                 // Bring in new base pair at the least significant
384                                 // position
385                                 buffw = ((buffw << 2llu) | r);
386                                 if(useMask) buffw &= clearMask;
387                                 if(skipLeftToRights > 0) {
388                                         skipLeftToRights--;
389                                         continue;
390                                 }
391                                 if(buffw != anchor) {
392                                         continue;
393                                 }
394                         } else {
395                                 hi = true;
396                                 // Moving right-to-left
397                                 riLo--; rirLo--;
398                                 r = (int)ref[rirLo];
399                                 if(r & 4) {
400                                         r = 0;
401                                         skipRightToLefts = qlen;
402                                 }
403                                 assert_lt(r, 4);
404                                 if(i >= 2) {
405                                         bufbw >>= 2llu;
406                                         // Bring in new base pair at the most significant
407                                         // position
408                                         bufbw |= ((uint64_t)r << lhsShift);
409                                 }
410                                 if(skipRightToLefts > 0) {
411                                         skipRightToLefts--;
412                                         continue;
413                                 }
414                                 if(bufbw != anchor) {
415                                         continue;
416                                 }
417                         }
418                         // Seed hit!
419                         bool foundHit = true;
420                         uint32_t ri = hi ? riLo : riHi;
421                         uint32_t rir = hi ? rirLo : rirHi;
422                         if(anchorOverhang > 0) {
423                                 // Does the non-anchor part of the alignment (the
424                                 // "overhang") ruin it?
425                                 bool skipCandidate = false;
426                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
427                                         assert_lt(ri + anchorBitPairs + j, end);
428                                         int rc = (int)ref[rir + anchorBitPairs + j];
429                                         if(rc == 4) {
430                                                 // Oops, encountered an N in the reference in
431                                                 // the overhang portion of the candidate
432                                                 // alignment
433                                                 // (Note that we inverted hi earlier)
434                                                 if(hi) {
435                                                         // Right-to-left
436                                                         skipRightToLefts = anchorOverhang - j - 1;
437                                                 } else {
438                                                         // Left-to-right
439                                                         skipLeftToRights = anchorBitPairs + j;
440                                                 }
441                                                 skipCandidate = true;
442                                                 break; // Skip this candidate
443                                         }
444                                         if((int)qry[32 + j] != rc) {
445                                                 // Yes, overhang ruins it
446                                                 foundHit = false;
447                                                 break;
448                                         }
449                                 }
450                                 if(skipCandidate) continue;
451                         }
452                         if(foundHit) {
453                                 if(pairs != NULL) {
454                                         TU64Pair p;
455                                         if(ri < aoff) {
456                                                 // By convention, the upstream mate's
457                                                 // coordinates go in the 'first' field
458                                                 p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
459                                                 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
460                                         } else {
461                                                 p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
462                                                 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
463                                         }
464                                         if(pairs->find(p) != pairs->end()) {
465                                                 // We already found this hit!  Continue.
466                                                 ASSERT_ONLY(duplicates++);
467                                                 ASSERT_ONLY(r2i++);
468                                                 continue;
469                                         } else {
470                                                 // Record this hit
471                                                 pairs->insert(p);
472                                         }
473                                 }
474                                 assert_lt(r2i, r2.size());
475                                 assert_eq(re2[r2i], ri);
476                                 ranges.resize(ranges.size()+1);
477                                 Range& range = ranges.back();
478                                 range.stratum = 0;
479                                 range.numMms = 0;
480                                 assert_eq(0, range.mms.size());
481                                 assert_eq(0, range.refcs.size());
482                                 ASSERT_ONLY(r2i++);
483                                 results.push_back(ri);
484                                 if(--numToFind == 0) return;
485                         } else {
486                                 // Keep scanning
487                         }
488                 }
489                 assert_leq(duplicates, r2.size());
490                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
491                 return; // no hit
492         }
493 };
494
495 /**
496  * Defined in ref_aligner.cpp.  Maps an octet representing the XOR of
497  * two two-bit-per-base-encoded DNA sequences to the number of bases
498  * that mismatch between the two.
499  */
500 extern unsigned char u8toMms[];
501
502 /**
503  * Concrete RefAligner for finding nearby 1-mismatch hits given an
504  * anchor hit.
505  */
506 template<typename TStr>
507 class OneMMRefAligner : public RefAligner<TStr> {
508
509         typedef seqan::String<seqan::Dna5> TDna5Str;
510         typedef seqan::String<char> TCharStr;
511         typedef std::vector<Range> TRangeVec;
512         typedef std::vector<uint32_t> TU32Vec;
513         typedef std::pair<uint64_t, uint64_t> TU64Pair;
514         typedef std::set<TU64Pair> TSetPairs;
515
516 public:
517
518         OneMMRefAligner(bool color, bool verbose, bool quiet) :
519                 RefAligner<TStr>(color, verbose, quiet) { }
520
521         virtual ~OneMMRefAligner() { }
522
523 protected:
524         /**
525          * Because we're doing end-to-end exact, we don't care which end of
526          * 'qry' is the 5' end.
527          */
528         void naiveFind(uint32_t numToFind,
529                        uint32_t tidx,
530                        uint8_t* ref,
531                    const TDna5Str& qry,
532                    const TCharStr& quals,
533                    uint32_t begin,
534                    uint32_t end,
535                        TRangeVec& ranges,
536                        TU32Vec& results,
537                        TSetPairs* pairs,
538                        uint32_t aoff,
539                        bool seedOnLeft) const
540         {
541                 assert_gt(numToFind, 0);
542                 const uint32_t qlen = seqan::length(qry);
543                 assert_geq(end - begin, qlen); // caller should have checked this
544                 assert_gt(end, begin);
545                 assert_gt(qlen, 0);
546                 uint32_t qend = end - qlen;
547                 uint32_t lim = qend - begin;
548                 uint32_t halfway = begin + (lim >> 1);
549                 bool hi = false;
550                 for(uint32_t i = 1; i <= lim+1; i++) {
551                         uint32_t ri;  // leftmost position in candidate alignment
552                         uint32_t rir; // same, minus begin; for indexing into ref[]
553                         if(hi) {
554                                 ri = halfway + (i >> 1); rir = ri - begin;
555                                 assert_leq(ri, qend);
556                         } else {
557                                 ri = halfway - (i >> 1); rir = ri - begin;
558                                 assert_geq(ri, begin);
559                         }
560                         hi = !hi;
561                         // Do the naive comparison
562                         bool match = true;
563                         int refc = -1;
564                         uint32_t mmOff = 0xffffffff;
565                         int mms = 0;
566                         for(uint32_t j = 0; j < qlen; j++) {
567 #if 0
568                                 // Count Ns in the reference as mismatches
569                                 const int q = (int)qry[j];
570                                 const int r = (int)ref[rir + j];
571                                 assert_leq(q, 4);
572                                 assert_leq(r, 4);
573                                 if(q == 4 || r == 4 || q != r) {
574 #else
575                                 // Disallow alignments that involve an N in the
576                                 // reference
577                                 const int r = (int)ref[rir + j];
578                                 if(r & 4) {
579                                         // N in reference; bail
580                                         match = false;
581                                         break;
582                                 }
583                                 const int q = (int)qry[j];
584                                 assert_leq(q, 4);
585                                 assert_lt(r, 4);
586                                 if(q != r) {
587 #endif
588                                         // Mismatch!
589                                         if(++mms > 1) {
590                                                 // Too many; reject this alignment
591                                                 match = false;
592                                                 break;
593                                         } else {
594                                                 // First one; remember offset and ref char
595                                                 refc = "ACGTN"[r];
596                                                 mmOff = j;
597                                         }
598                                 }
599                         }
600                         if(match) {
601                                 assert_leq(mms, 1);
602                                 ranges.resize(ranges.size()+1);
603                                 Range& range = ranges.back();
604                                 range.stratum = mms;
605                                 range.numMms = mms;
606                                 assert_eq(0, range.mms.size());
607                                 assert_eq(0, range.refcs.size());
608                                 if(mms == 1) {
609                                         assert_lt(mmOff, qlen);
610                                         range.mms.push_back(mmOff);
611                                         range.refcs.push_back(refc);
612                                 }
613                                 results.push_back(ri);
614                         }
615                 }
616                 return;
617         }
618
619         /**
620          * Because we're doing end-to-end exact, we don't care which end of
621          * 'qry' is the 5' end.
622          */
623         virtual void anchor64Find(uint32_t numToFind,
624                         uint32_t tidx,
625                         uint8_t* ref,
626                     const TDna5Str& qry,
627                     const TCharStr& quals,
628                     uint32_t begin,
629                     uint32_t end,
630                         TRangeVec& ranges,
631                         TU32Vec& results,
632                         TSetPairs* pairs = NULL,
633                         uint32_t aoff = 0xffffffff,
634                         bool seedOnLeft = false) const
635         {
636                 assert_gt(numToFind, 0);
637                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
638                 ASSERT_ONLY(uint32_t duplicates = 0);
639                 ASSERT_ONLY(uint32_t r2i = 0);
640                 const uint32_t qlen = seqan::length(qry);
641                 assert_geq(end - begin, qlen); // caller should have checked this
642                 assert_gt(end, begin);
643                 assert_gt(qlen, 0);
644 #ifndef NDEBUG
645                 // Get results from the naive matcher for sanity-checking
646                 TRangeVec r2; TU32Vec re2;
647                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
648                           re2, pairs, aoff, seedOnLeft);
649 #endif
650                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
651                 const int lhsShift = ((anchorBitPairs - 1) << 1);
652                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
653                 // anchorOverhang = # read bases not included in the anchor
654                 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
655                 const uint32_t lim = end - qlen - begin;
656                 const uint32_t halfway = begin + (lim >> 1);
657                 uint64_t anchor = 0llu;
658                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
659                 // OR the 'diff' buffer with this so that we can always count
660                 // 'N's as mismatches
661                 uint64_t diffMask = 0llu;
662                 // Set up a mask that we'll apply to the two bufs every round
663                 // to discard bits that were rotated out of the anchor area
664                 uint64_t clearMask = 0xffffffffffffffffllu;
665                 bool useMask = false;
666                 if(anchorBitPairs < 32) {
667                         clearMask >>= ((32-anchorBitPairs) << 1);
668                         useMask = true;
669                 }
670                 int nsInAnchor = 0;
671                 int nPos = -1;
672                 uint32_t skipLeftToRights = 0;
673                 uint32_t skipRightToLefts = 0;
674                 // Construct the 'anchor' 64-bit buffer so that it holds all of
675                 // the first 'anchorBitPairs' bit pairs of the query.
676                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
677                         int c = (int)qry[i]; // next query character
678                         int r = (int)ref[halfway - begin + i]; // next reference character
679                         if(r & 4) {
680                                 r = 0;
681                                 // The right-to-left direction absorbs the candidate
682                                 // alignment based at 'halfway'; so skipLeftToRights is
683                                 // i, not i+1
684                                 skipLeftToRights = max(skipLeftToRights, i);
685                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
686                         }
687                         assert_leq(c, 4);
688                         assert_lt(r, 4);
689                         // Special case: query has an 'N'
690                         if(c == 4) {
691                                 if(++nsInAnchor > 1) {
692                                         // More than one 'N' in the anchor region; can't
693                                         // possibly have a 1-mismatch hit anywhere
694                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
695                                         return;   // can't match if query has Ns
696                                 }
697                                 nPos = (int)i;
698                                 // Make it look like an 'A' in the anchor
699                                 c = 0;
700                                 diffMask = (diffMask << 2llu) | 1llu;
701                         } else {
702                                 diffMask <<= 2llu;
703                         }
704                         anchor  = ((anchor  << 2llu) | c);
705                         buffw = ((buffw << 2llu) | r);
706                 }
707                 // Check whether read is disqualified by Ns outside of the anchor
708                 // region
709                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
710                         if((int)qry[i] == 4) {
711                                 if(++nsInAnchor > 1) {
712                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
713                                         return; // can't match if query has Ns
714                                 }
715                         }
716                 }
717                 uint64_t bufbw = buffw;
718                 // We're moving the right-hand edge of the anchor along until
719                 // it's 'anchorOverhang' chars from the end of the target region.
720                 // Note that we're not making a 3'/5' distinction here; if we
721                 // were, we might need to make the 'anchorOverhang' adjustment on
722                 // the left end of the range rather than the right.
723                 bool hi = false;
724                 uint32_t riHi  = halfway;
725                 uint32_t rirHi = halfway - begin;
726                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
727                 uint32_t riLo  = halfway + 1;
728                 uint32_t rirLo = halfway - begin + 1;
729                 for(uint32_t i = 1; i <= lim + 1; i++) {
730                         int r;       // new reference char
731                         uint64_t diff;
732                         assert_lt(skipLeftToRights, qlen);
733                         assert_leq(skipRightToLefts, qlen);
734                         if(hi) {
735                                 hi = false;
736                                 // Moving left-to-right
737                                 riHi++;
738                                 rirHi++;
739                                 rirHiAnchor++;
740                                 r = (int)ref[rirHiAnchor];
741                                 if(r & 4) {
742                                         r = 0;
743                                         skipLeftToRights = anchorBitPairs;
744                                 }
745                                 assert_lt(r, 4);
746                                 // Bring in new base pair at the least significant
747                                 // position
748                                 buffw = ((buffw << 2llu) | r);
749                                 if(useMask) buffw &= clearMask;
750                                 if(skipLeftToRights > 0) {
751                                         skipLeftToRights--;
752                                         continue;
753                                 }
754                                 diff = (buffw ^ anchor) | diffMask;
755                         } else {
756                                 hi = true;
757                                 // Moving right-to-left
758                                 riLo--;
759                                 rirLo--;
760                                 r = (int)ref[rirLo];
761                                 if(r & 4) {
762                                         r = 0;
763                                         skipRightToLefts = qlen;
764                                 }
765                                 assert_lt(r, 4);
766                                 if(i >= 2) {
767                                         bufbw >>= 2llu;
768                                         // Bring in new base pair at the most significant
769                                         // position
770                                         bufbw |= ((uint64_t)r << lhsShift);
771                                 }
772                                 if(skipRightToLefts > 0) {
773                                         skipRightToLefts--;
774                                         continue;
775                                 }
776                                 diff = (bufbw ^ anchor) | diffMask;
777                         }
778                         if((diff & 0xffffffff00000000llu) &&
779                            (diff & 0x00000000ffffffffllu)) continue;
780                         uint32_t ri  = hi ? riLo  : riHi;
781                         uint32_t rir = hi ? rirLo : rirHi;
782                         // Could use pop count
783                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
784                         // As a first cut, see if there are too many mismatches in
785                         // the first and last parts of the anchor
786                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
787                         if(diffs > 1) continue;
788                         diffs += u8toMms[(int)diff8[1]] +
789                                  u8toMms[(int)diff8[2]] +
790                                  u8toMms[(int)diff8[3]] +
791                                  u8toMms[(int)diff8[4]] +
792                                  u8toMms[(int)diff8[5]] +
793                                  u8toMms[(int)diff8[6]];
794                         uint32_t mmpos = 0xffffffff;
795                         int refc = -1;
796                         if(diffs > 1) {
797                                 // Too many differences
798                                 continue;
799                         } else if(diffs == 1 && nPos != -1) {
800                                 // There was one difference, but there was also one N,
801                                 // so we already know where the difference is
802                                 mmpos = nPos;
803                                 refc = "ACGT"[(int)ref[rir + nPos]];
804                         } else if(diffs == 1) {
805                                 // Figure out which position mismatched
806                                 mmpos = 31;
807                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos -= 16; }
808                                 assert_neq(0, diff);
809                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos -=  8; }
810                                 assert_neq(0, diff);
811                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos -=  4; }
812                                 assert_neq(0, diff);
813                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos -=  2; }
814                                 assert_neq(0, diff);
815                                 if((diff & 0x3llu) == 0)        { mmpos--; }
816                                 assert_neq(0, diff);
817                                 assert_geq(mmpos, 0);
818                                 assert_lt(mmpos, 32);
819                                 mmpos -= anchorCushion;
820                                 refc = "ACGT"[(int)ref[rir + mmpos]];
821                         }
822                         // Now extend the anchor into a longer alignment
823                         bool foundHit = true;
824                         if(anchorOverhang > 0) {
825                                 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
826                                 bool skipCandidate = false;
827                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
828                                         int rc = (int)ref[rir + 32 + j];
829                                         if(rc == 4) {
830                                                 // Oops, encountered an N in the reference in
831                                                 // the overhang portion of the candidate
832                                                 // alignment
833                                                 // (Note that we inverted hi earlier)
834                                                 if(hi) {
835                                                         // Right-to-left
836                                                         skipRightToLefts = anchorOverhang - j - 1;
837                                                 } else {
838                                                         // Left-to-right
839                                                         skipLeftToRights = anchorBitPairs + j;
840                                                 }
841                                                 skipCandidate = true; // Skip this candidate
842                                                 break;
843                                         }
844                                         if((int)qry[32 + j] != rc) {
845                                                 if(++diffs > 1) {
846                                                         foundHit = false;
847                                                         break;
848                                                 } else {
849                                                         mmpos = 32 + j;
850                                                         refc = "ACGT"[(int)ref[rir + 32 + j]];
851                                                 }
852                                         }
853                                 }
854                                 if(skipCandidate) continue;
855                         }
856                         if(!foundHit) continue;
857                         if(pairs != NULL) {
858                                 TU64Pair p;
859                                 if(ri < aoff) {
860                                         // By convention, the upstream mate's
861                                         // coordinates go in the 'first' field
862                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
863                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
864                                 } else {
865                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
866                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
867                                 }
868                                 if(pairs->find(p) != pairs->end()) {
869                                         // We already found this hit!  Continue.
870                                         ASSERT_ONLY(duplicates++);
871                                         ASSERT_ONLY(r2i++);
872                                         continue;
873                                 } else {
874                                         // Record this hit
875                                         pairs->insert(p);
876                                 }
877                         }
878                         assert_leq(diffs, 1);
879                         assert_lt(r2i, r2.size());
880                         assert_eq(re2[r2i], ri);
881                         ranges.resize(ranges.size()+1);
882                         Range& range = ranges.back();
883                         assert_eq(diffs, r2[r2i].numMms);
884                         range.stratum = diffs;
885                         range.numMms = diffs;
886                         assert_eq(0, range.mms.size());
887                         assert_eq(0, range.refcs.size());
888                         if(diffs == 1) {
889                                 assert_neq(mmpos, 0xffffffff);
890                                 assert_eq(mmpos, r2[r2i].mms[0]);
891                                 assert_neq(-1, refc);
892                                 assert_eq(refc, r2[r2i].refcs[0]);
893                                 range.mms.push_back(mmpos);
894                                 range.refcs.push_back(refc);
895                         }
896                         ASSERT_ONLY(r2i++);
897                         results.push_back(ri);
898                         if(--numToFind == 0) return;
899                 }
900                 assert_leq(duplicates, r2.size());
901                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
902                 return; // no hit
903         }
904 };
905
906 /**
907  * Concrete RefAligner for finding nearby 2-mismatch hits given an
908  * anchor hit.
909  */
910 template<typename TStr>
911 class TwoMMRefAligner : public RefAligner<TStr> {
912
913         typedef seqan::String<seqan::Dna5> TDna5Str;
914         typedef seqan::String<char> TCharStr;
915         typedef std::vector<Range> TRangeVec;
916         typedef std::vector<uint32_t> TU32Vec;
917         typedef std::pair<uint64_t, uint64_t> TU64Pair;
918         typedef std::set<TU64Pair> TSetPairs;
919
920 public:
921
922         TwoMMRefAligner(bool color, bool verbose, bool quiet) :
923                 RefAligner<TStr>(color, verbose, quiet) { }
924
925         virtual ~TwoMMRefAligner() { }
926
927 protected:
928         /**
929          * Because we're doing end-to-end exact, we don't care which end of
930          * 'qry' is the 5' end.
931          */
932         void naiveFind(uint32_t numToFind,
933                                    uint32_t tidx,
934                                    uint8_t* ref,
935                                    const TDna5Str& qry,
936                                    const TCharStr& quals,
937                                    uint32_t begin,
938                                    uint32_t end,
939                                    TRangeVec& ranges,
940                                    TU32Vec& results,
941                                    TSetPairs* pairs,
942                                    uint32_t aoff,
943                                    bool seedOnLeft) const
944         {
945                 assert_gt(numToFind, 0);
946                 const uint32_t qlen = seqan::length(qry);
947                 assert_geq(end - begin, qlen); // caller should have checked this
948                 assert_gt(end, begin);
949                 assert_gt(qlen, 0);
950                 uint32_t qend = end - qlen;
951                 uint32_t lim = qend - begin;
952                 uint32_t halfway = begin + (lim >> 1);
953                 bool hi = false;
954                 for(uint32_t i = 1; i <= lim+1; i++) {
955                         uint32_t ri;  // leftmost position in candidate alignment
956                         uint32_t rir; // same, minus begin; for indexing into ref[]
957                         if(hi) {
958                                 ri = halfway + (i >> 1); rir = ri - begin;
959                                 assert_leq(ri, qend);
960                         } else {
961                                 ri = halfway - (i >> 1); rir = ri - begin;
962                                 assert_geq(ri, begin);
963                         }
964                         hi = !hi;
965                         // Do the naive comparison
966                         bool match = true;
967                         int refc1 = -1;
968                         uint32_t mmOff1 = 0xffffffff;
969                         int refc2 = -1;
970                         uint32_t mmOff2 = 0xffffffff;
971                         int mms = 0;
972                         for(uint32_t j = 0; j < qlen; j++) {
973 #if 0
974                                 // Count Ns in the reference as mismatches
975                                 const int q = (int)qry[j];
976                                 const int r = (int)ref[rir + j];
977                                 assert_leq(q, 4);
978                                 assert_leq(r, 4);
979                                 if(q == 4 || r == 4 || q != r) {
980 #else
981                                 // Disallow alignments that involve an N in the
982                                 // reference
983                                 const int r = (int)ref[rir + j];
984                                 if(r & 4) {
985                                         // N in reference; bail
986                                         match = false;
987                                         break;
988                                 }
989                                 const int q = (int)qry[j];
990                                 assert_leq(q, 4);
991                                 assert_lt(r, 4);
992                                 if(q != r) {
993 #endif
994                                         // Mismatch!
995                                         if(++mms > 2) {
996                                                 // Too many; reject this alignment
997                                                 match = false;
998                                                 break;
999                                         } else if(mms == 2) {
1000                                                 // Second one; remember offset and ref char
1001                                                 refc2 = "ACGTN"[r];
1002                                                 mmOff2 = j;
1003                                         } else {
1004                                                 assert_eq(1, mms);
1005                                                 // First one; remember offset and ref char
1006                                                 refc1 = "ACGTN"[r];
1007                                                 mmOff1 = j;
1008                                         }
1009                                 }
1010                         }
1011                         if(match) {
1012                                 assert_leq(mms, 2);
1013                                 ranges.resize(ranges.size()+1);
1014                                 Range& range = ranges.back();
1015                                 range.stratum = mms;
1016                                 range.numMms = mms;
1017                                 assert_eq(0, range.mms.size());
1018                                 assert_eq(0, range.refcs.size());
1019                                 if(mms > 0) {
1020                                         assert_lt(mmOff1, qlen);
1021                                         assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1022                                         range.mms.push_back(mmOff1);
1023                                         range.refcs.push_back(refc1);
1024                                         if(mms > 1) {
1025                                                 assert_eq(2, mms);
1026                                                 assert_lt(mmOff2, qlen);
1027                                                 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1028                                                 range.mms.push_back(mmOff2);
1029                                                 range.refcs.push_back(refc2);
1030                                         }
1031                                 }
1032                                 results.push_back(ri);
1033                         }
1034                 }
1035                 return;
1036         }
1037
1038         /**
1039          * Because we're doing end-to-end exact, we don't care which end of
1040          * 'qry' is the 5' end.
1041          */
1042         virtual void anchor64Find(uint32_t numToFind,
1043                                         uint32_t tidx,
1044                                         uint8_t* ref,
1045                                         const TDna5Str& qry,
1046                                         const TCharStr& quals,
1047                                         uint32_t begin,
1048                                         uint32_t end,
1049                                         TRangeVec& ranges,
1050                                         TU32Vec& results,
1051                                         TSetPairs* pairs = NULL,
1052                                         uint32_t aoff = 0xffffffff,
1053                                         bool seedOnLeft = false) const
1054         {
1055                 assert_gt(numToFind, 0);
1056                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1057                 ASSERT_ONLY(uint32_t duplicates = 0);
1058                 ASSERT_ONLY(uint32_t r2i = 0);
1059                 const uint32_t qlen = seqan::length(qry);
1060                 assert_geq(end - begin, qlen); // caller should have checked this
1061                 assert_gt(end, begin);
1062                 assert_gt(qlen, 0);
1063 #ifndef NDEBUG
1064                 // Get results from the naive matcher for sanity-checking
1065                 TRangeVec r2; TU32Vec re2;
1066                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1067                                   re2, pairs, aoff, seedOnLeft);
1068 #endif
1069                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1070                 const int lhsShift = ((anchorBitPairs - 1) << 1);
1071                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
1072                 // anchorOverhang = # read bases not included in the anchor
1073                 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1074                 const uint32_t lim = end - qlen - begin;
1075                 const uint32_t halfway = begin + (lim >> 1);
1076                 uint64_t anchor = 0llu;
1077                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
1078                 // OR the 'diff' buffer with this so that we can always count
1079                 // 'N's as mismatches
1080                 uint64_t diffMask = 0llu;
1081                 // Set up a mask that we'll apply to the two bufs every round
1082                 // to discard bits that were rotated out of the anchor area
1083                 uint64_t clearMask = 0xffffffffffffffffllu;
1084                 bool useMask = false;
1085                 if(anchorBitPairs < 32) {
1086                         clearMask >>= ((32-anchorBitPairs) << 1);
1087                         useMask = true;
1088                 }
1089                 int nsInAnchor = 0;
1090                 uint32_t nPoss = 0;
1091                 int nPos1 = -1;
1092                 int nPos2 = -1;
1093                 uint32_t skipLeftToRights = 0;
1094                 uint32_t skipRightToLefts = 0;
1095                 // Construct the 'anchor' 64-bit buffer so that it holds all of
1096                 // the first 'anchorBitPairs' bit pairs of the query.
1097                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1098                         int c = (int)qry[i]; // next query character
1099                         int r = (int)ref[halfway - begin + i]; // next reference character
1100                         if(r & 4) {
1101                                 r = 0;
1102                                 // The right-to-left direction absorbs the candidate
1103                                 // alignment based at 'halfway'; so skipLeftToRights is
1104                                 // i, not i+1
1105                                 skipLeftToRights = max(skipLeftToRights, i);
1106                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1107                         }
1108                         assert_leq(c, 4);
1109                         assert_lt(r, 4);
1110                         // Special case: query has an 'N'
1111                         if(c == 4) {
1112                                 if(++nsInAnchor > 2) {
1113                                         // More than two 'N's in the anchor region; can't
1114                                         // possibly have a 2-mismatch hit anywhere
1115                                         return;   // can't match if query has Ns
1116                                 } else if(nsInAnchor == 2) {
1117                                         nPos2 = (int)i;
1118                                         nPoss++;
1119                                 } else {
1120                                         assert_eq(1, nsInAnchor);
1121                                         nPos1 = (int)i;
1122                                         nPoss++;
1123                                 }
1124                                 // Make it look like an 'A' in the anchor
1125                                 c = 0;
1126                                 diffMask = (diffMask << 2llu) | 1llu;
1127                         } else {
1128                                 diffMask <<= 2llu;
1129                         }
1130                         anchor  = ((anchor  << 2llu) | c);
1131                         buffw = ((buffw << 2llu) | r);
1132                 }
1133                 assert_leq(nPoss, 2);
1134                 // Check whether read is disqualified by Ns outside of the anchor
1135                 // region
1136                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1137                         if((int)qry[i] == 4) {
1138                                 if(++nsInAnchor > 2) {
1139                                         return; // can't match if query has Ns
1140                                 }
1141                         }
1142                 }
1143                 uint64_t bufbw = buffw;
1144                 // We're moving the right-hand edge of the anchor along until
1145                 // it's 'anchorOverhang' chars from the end of the target region.
1146                 // Note that we're not making a 3'/5' distinction here; if we
1147                 // were, we might need to make the 'anchorOverhang' adjustment on
1148                 // the left end of the range rather than the right.
1149                 bool hi = false;
1150                 uint32_t riHi  = halfway;
1151                 uint32_t rirHi = halfway - begin;
1152                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1153                 uint32_t riLo  = halfway + 1;
1154                 uint32_t rirLo = halfway - begin + 1;
1155                 uint32_t i;
1156                 for(i = 1; i <= lim + 1; i++) {
1157                         int r;       // new reference char
1158                         uint64_t diff;
1159                         assert_lt(skipLeftToRights, qlen);
1160                         assert_leq(skipRightToLefts, qlen);
1161                         if(hi) {
1162                                 hi = false;
1163                                 // Moving left-to-right
1164                                 riHi++;
1165                                 rirHi++;
1166                                 rirHiAnchor++;
1167                                 r = (int)ref[rirHiAnchor];
1168                                 if(r & 4) {
1169                                         r = 0;
1170                                         skipLeftToRights = anchorBitPairs;
1171                                 }
1172                                 assert_lt(r, 4);
1173                                 // Bring in new base pair at the least significant
1174                                 // position
1175                                 buffw = ((buffw << 2llu) | r);
1176                                 if(useMask) buffw &= clearMask;
1177                                 if(skipLeftToRights > 0) {
1178                                         skipLeftToRights--;
1179                                         continue;
1180                                 }
1181                                 diff = (buffw ^ anchor) | diffMask;
1182                         } else {
1183                                 hi = true;
1184                                 // Moving right-to-left
1185                                 riLo--;
1186                                 rirLo--;
1187                                 r = (int)ref[rirLo];
1188                                 if(r & 4) {
1189                                         r = 0;
1190                                         skipRightToLefts = qlen;
1191                                 }
1192                                 assert_lt(r, 4);
1193                                 if(i >= 2) {
1194                                         bufbw >>= 2llu;
1195                                         // Bring in new base pair at the most significant
1196                                         // position
1197                                         bufbw |= ((uint64_t)r << lhsShift);
1198                                 }
1199                                 if(skipRightToLefts > 0) {
1200                                         skipRightToLefts--;
1201                                         continue;
1202                                 }
1203                                 diff = (bufbw ^ anchor) | diffMask;
1204                         }
1205                         if((diff & 0xfffff00000000000llu) &&
1206                            (diff & 0x00000ffffff00000llu) &&
1207                            (diff & 0x00000000000fffffllu)) continue;
1208                         uint32_t ri  = hi ? riLo  : riHi;
1209                         uint32_t rir = hi ? rirLo : rirHi;
1210                         // Could use pop count
1211                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1212                         // As a first cut, see if there are too many mismatches in
1213                         // the first and last parts of the anchor
1214                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1215                         if(diffs > 2) continue;
1216                         diffs += u8toMms[(int)diff8[1]] +
1217                                          u8toMms[(int)diff8[2]] +
1218                                          u8toMms[(int)diff8[3]] +
1219                                          u8toMms[(int)diff8[4]] +
1220                                          u8toMms[(int)diff8[5]] +
1221                                          u8toMms[(int)diff8[6]];
1222                         uint32_t mmpos1 = 0xffffffff;
1223                         int refc1 = -1;
1224                         uint32_t mmpos2 = 0xffffffff;
1225                         int refc2 = -1;
1226                         if(diffs > 2) {
1227                                 // Too many differences
1228                                 continue;
1229                         } else if(nPoss > 1 && diffs == nPoss) {
1230                                 // There was one difference, but there was also one N,
1231                                 // so we already know where the difference is
1232                                 mmpos1 = nPos1;
1233                                 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1234                                 if(nPoss == 2) {
1235                                         mmpos2 = nPos2;
1236                                         refc2 = "ACGT"[(int)ref[rir + nPos2]];
1237                                 }
1238                         } else if(diffs > 0) {
1239                                 // Figure out which position mismatched
1240                                 uint64_t diff2 = diff;
1241                                 mmpos1 = 31;
1242                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1243                                 assert_neq(0, diff);
1244                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos1 -=  8; }
1245                                 assert_neq(0, diff);
1246                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos1 -=  4; }
1247                                 assert_neq(0, diff);
1248                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos1 -=  2; }
1249                                 assert_neq(0, diff);
1250                                 if((diff & 0x3llu) == 0)        { mmpos1--; }
1251                                 assert_neq(0, diff);
1252                                 assert_geq(mmpos1, 0);
1253                                 assert_lt(mmpos1, 32);
1254                                 mmpos1 -= anchorCushion;
1255                                 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1256                                 if(diffs > 1) {
1257                                         // Figure out the second mismatched position
1258                                         ASSERT_ONLY(uint64_t origDiff2 = diff2);
1259                                         diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1+anchorCushion) << 1));
1260                                         assert_neq(diff2, origDiff2);
1261                                         mmpos2 = 31;
1262                                         if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1263                                         assert_neq(0, diff2);
1264                                         if((diff2 & 0xffffllu) == 0)     { diff2 >>= 16llu; mmpos2 -=  8; }
1265                                         assert_neq(0, diff2);
1266                                         if((diff2 & 0xffllu) == 0)       { diff2 >>= 8llu;  mmpos2 -=  4; }
1267                                         assert_neq(0, diff2);
1268                                         if((diff2 & 0xfllu) == 0)        { diff2 >>= 4llu;  mmpos2 -=  2; }
1269                                         assert_neq(0, diff2);
1270                                         if((diff2 & 0x3llu) == 0)        { mmpos2--; }
1271                                         assert_neq(0, diff2);
1272                                         assert_geq(mmpos2, 0);
1273                                         assert_lt(mmpos2, 32);
1274                                         mmpos2 -= anchorCushion;
1275                                         assert_neq(mmpos1, mmpos2);
1276                                         refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1277                                         if(mmpos2 < mmpos1) {
1278                                                 uint32_t mmtmp = mmpos1;
1279                                                 mmpos1 = mmpos2;
1280                                                 mmpos2 = mmtmp;
1281                                                 int refctmp = refc1;
1282                                                 refc1 = refc2;
1283                                                 refc2 = refctmp;
1284                                         }
1285                                         assert_lt(mmpos1, mmpos2);
1286                                 }
1287                         }
1288                         // Now extend the anchor into a longer alignment
1289                         bool foundHit = true;
1290                         if(anchorOverhang > 0) {
1291                                 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1292                                 bool skipCandidate = false;
1293                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
1294                                         int rc = (int)ref[rir + 32 + j];
1295                                         if(rc == 4) {
1296                                                 // Oops, encountered an N in the reference in
1297                                                 // the overhang portion of the candidate
1298                                                 // alignment
1299                                                 // (Note that we inverted hi earlier)
1300                                                 if(hi) {
1301                                                         // Right-to-left
1302                                                         skipRightToLefts = anchorOverhang - j - 1;
1303                                                 } else {
1304                                                         // Left-to-right
1305                                                         skipLeftToRights = anchorBitPairs + j;
1306                                                 }
1307                                                 skipCandidate = true; // Skip this candidate
1308                                                 break;
1309                                         }
1310                                         if((int)qry[32 + j] != rc) {
1311                                                 if(++diffs > 2) {
1312                                                         foundHit = false;
1313                                                         break;
1314                                                 } else if(diffs == 2) {
1315                                                         mmpos2 = 32 + j;
1316                                                         refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1317                                                 } else {
1318                                                         assert_eq(1, diffs);
1319                                                         mmpos1 = 32 + j;
1320                                                         refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1321                                                 }
1322                                         }
1323                                 }
1324                                 if(skipCandidate) continue;
1325                         }
1326                         if(!foundHit) continue;
1327                         if(pairs != NULL) {
1328                                 TU64Pair p;
1329                                 if(ri < aoff) {
1330                                         // By convention, the upstream mate's
1331                                         // coordinates go in the 'first' field
1332                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
1333                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1334                                 } else {
1335                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1336                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1337                                 }
1338                                 if(pairs->find(p) != pairs->end()) {
1339                                         // We already found this hit!  Continue.
1340                                         ASSERT_ONLY(duplicates++);
1341                                         ASSERT_ONLY(r2i++);
1342                                         continue;
1343                                 } else {
1344                                         // Record this hit
1345                                         pairs->insert(p);
1346                                 }
1347                         }
1348                         assert_leq(diffs, 2);
1349                         assert_lt(r2i, r2.size());
1350                         assert_eq(re2[r2i], ri);
1351                         ranges.resize(ranges.size()+1);
1352                         Range& range = ranges.back();
1353                         assert_eq(diffs, r2[r2i].numMms);
1354                         range.stratum = diffs;
1355                         range.numMms = diffs;
1356                         assert_eq(0, range.mms.size());
1357                         assert_eq(0, range.refcs.size());
1358                         if(diffs > 0) {
1359                                 assert_neq(mmpos1, 0xffffffff);
1360                                 assert_eq(mmpos1, r2[r2i].mms[0]);
1361                                 assert_neq(-1, refc1);
1362                                 assert_eq(refc1, r2[r2i].refcs[0]);
1363                                 range.mms.push_back(mmpos1);
1364                                 range.refcs.push_back(refc1);
1365                                 if(diffs > 1) {
1366                                         assert_neq(mmpos2, 0xffffffff);
1367                                         assert_eq(mmpos2, r2[r2i].mms[1]);
1368                                         assert_neq(-1, refc2);
1369                                         assert_eq(refc2, r2[r2i].refcs[1]);
1370                                         range.mms.push_back(mmpos2);
1371                                         range.refcs.push_back(refc2);
1372                                 }
1373                         }
1374                         ASSERT_ONLY(r2i++);
1375                         results.push_back(ri);
1376                         if(--numToFind == 0) return;
1377                 }
1378                 assert_leq(duplicates, r2.size());
1379                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1380                 return; // no hit
1381         }
1382 };
1383
1384 /**
1385  * Concrete RefAligner for finding nearby 2-mismatch hits given an
1386  * anchor hit.
1387  */
1388 template<typename TStr>
1389 class ThreeMMRefAligner : public RefAligner<TStr> {
1390
1391         typedef seqan::String<seqan::Dna5> TDna5Str;
1392         typedef seqan::String<char> TCharStr;
1393         typedef std::vector<Range> TRangeVec;
1394         typedef std::vector<uint32_t> TU32Vec;
1395         typedef std::pair<uint64_t, uint64_t> TU64Pair;
1396         typedef std::set<TU64Pair> TSetPairs;
1397
1398 public:
1399
1400         ThreeMMRefAligner(bool color, bool verbose, bool quiet) :
1401                 RefAligner<TStr>(color, verbose, quiet) { }
1402
1403         virtual ~ThreeMMRefAligner() { }
1404
1405 protected:
1406         /**
1407          * Because we're doing end-to-end exact, we don't care which end of
1408          * 'qry' is the 5' end.
1409          */
1410         void naiveFind(uint32_t numToFind,
1411                                    uint32_t tidx,
1412                                    uint8_t* ref,
1413                                    const TDna5Str& qry,
1414                                    const TCharStr& quals,
1415                                    uint32_t begin,
1416                                    uint32_t end,
1417                                    TRangeVec& ranges,
1418                                    TU32Vec& results,
1419                                    TSetPairs* pairs,
1420                                    uint32_t aoff,
1421                                    bool seedOnLeft) const
1422         {
1423                 assert_gt(numToFind, 0);
1424                 const uint32_t qlen = seqan::length(qry);
1425                 assert_geq(end - begin, qlen); // caller should have checked this
1426                 assert_gt(end, begin);
1427                 assert_gt(qlen, 0);
1428                 uint32_t qend = end - qlen;
1429                 uint32_t lim = qend - begin;
1430                 uint32_t halfway = begin + (lim >> 1);
1431                 bool hi = false;
1432                 for(uint32_t i = 1; i <= lim+1; i++) {
1433                         uint32_t ri;  // leftmost position in candidate alignment
1434                         uint32_t rir; // same, minus begin; for indexing into ref[]
1435                         if(hi) {
1436                                 ri = halfway + (i >> 1); rir = ri - begin;
1437                                 assert_leq(ri, qend);
1438                         } else {
1439                                 ri = halfway - (i >> 1); rir = ri - begin;
1440                                 assert_geq(ri, begin);
1441                         }
1442                         hi = !hi;
1443                         // Do the naive comparison
1444                         bool match = true;
1445                         int refc1 = -1;
1446                         uint32_t mmOff1 = 0xffffffff;
1447                         int refc2 = -1;
1448                         uint32_t mmOff2 = 0xffffffff;
1449                         int refc3 = -1;
1450                         uint32_t mmOff3 = 0xffffffff;
1451                         int mms = 0;
1452                         for(uint32_t j = 0; j < qlen; j++) {
1453 #if 0
1454                                 // Count Ns in the reference as mismatches
1455                                 const int q = (int)qry[j];
1456                                 const int r = (int)ref[rir + j];
1457                                 assert_leq(q, 4);
1458                                 assert_leq(r, 4);
1459                                 if(q == 4 || r == 4 || q != r) {
1460 #else
1461                                 // Disallow alignments that involve an N in the
1462                                 // reference
1463                                 const int r = (int)ref[rir + j];
1464                                 if(r & 4) {
1465                                         // N in reference; bail
1466                                         match = false;
1467                                         break;
1468                                 }
1469                                 const int q = (int)qry[j];
1470                                 assert_leq(q, 4);
1471                                 assert_lt(r, 4);
1472                                 if(q != r) {
1473 #endif
1474                                         // Mismatch!
1475                                         if(++mms > 3) {
1476                                                 // Too many; reject this alignment
1477                                                 match = false;
1478                                                 break;
1479                                         } else if(mms == 3) {
1480                                                 // Second one; remember offset and ref char
1481                                                 refc3 = "ACGTN"[r];
1482                                                 mmOff3 = j;
1483                                         } else if(mms == 2) {
1484                                                 // Second one; remember offset and ref char
1485                                                 refc2 = "ACGTN"[r];
1486                                                 mmOff2 = j;
1487                                         } else {
1488                                                 assert_eq(1, mms);
1489                                                 // First one; remember offset and ref char
1490                                                 refc1 = "ACGTN"[r];
1491                                                 mmOff1 = j;
1492                                         }
1493                                 }
1494                         }
1495                         if(match) {
1496                                 assert_leq(mms, 3);
1497                                 ranges.resize(ranges.size()+1);
1498                                 Range& range = ranges.back();
1499                                 range.stratum = mms;
1500                                 range.numMms = mms;
1501                                 assert_eq(0, range.mms.size());
1502                                 assert_eq(0, range.refcs.size());
1503                                 if(mms > 0) {
1504                                         assert_lt(mmOff1, qlen);
1505                                         assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1506                                         range.mms.push_back(mmOff1);
1507                                         range.refcs.push_back(refc1);
1508                                         if(mms > 1) {
1509                                                 assert_lt(mmOff2, qlen);
1510                                                 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1511                                                 range.mms.push_back(mmOff2);
1512                                                 range.refcs.push_back(refc2);
1513                                                 if(mms > 2) {
1514                                                         assert_eq(3, mms);
1515                                                         assert_lt(mmOff3, qlen);
1516                                                         assert(refc3 == 'A' || refc3 == 'C' || refc3 == 'G' || refc3 == 'T');
1517                                                         range.mms.push_back(mmOff3);
1518                                                         range.refcs.push_back(refc3);
1519                                                 }
1520                                         }
1521                                 }
1522                                 results.push_back(ri);
1523                         }
1524                 }
1525                 return;
1526         }
1527
1528         /**
1529          * Because we're doing end-to-end exact, we don't care which end of
1530          * 'qry' is the 5' end.
1531          */
1532         virtual void anchor64Find(uint32_t numToFind,
1533                                         uint32_t tidx,
1534                                         uint8_t* ref,
1535                                         const TDna5Str& qry,
1536                                         const TCharStr& quals,
1537                                         uint32_t begin,
1538                                         uint32_t end,
1539                                         TRangeVec& ranges,
1540                                         TU32Vec& results,
1541                                         TSetPairs* pairs = NULL,
1542                                         uint32_t aoff = 0xffffffff,
1543                                         bool seedOnLeft = false) const
1544         {
1545                 assert_gt(numToFind, 0);
1546                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1547                 ASSERT_ONLY(uint32_t duplicates = 0);
1548                 ASSERT_ONLY(uint32_t r2i = 0);
1549                 const uint32_t qlen = seqan::length(qry);
1550                 assert_geq(end - begin, qlen); // caller should have checked this
1551                 assert_gt(end, begin);
1552                 assert_gt(qlen, 0);
1553 #ifndef NDEBUG
1554                 // Get results from the naive matcher for sanity-checking
1555                 TRangeVec r2; TU32Vec re2;
1556                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1557                                   re2, pairs, aoff, seedOnLeft);
1558 #endif
1559                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1560                 const int lhsShift = ((anchorBitPairs - 1) << 1);
1561                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
1562                 // anchorOverhang = # read bases not included in the anchor
1563                 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1564                 const uint32_t lim = end - qlen - begin;
1565                 const uint32_t halfway = begin + (lim >> 1);
1566                 uint64_t anchor = 0llu;
1567                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
1568                 // OR the 'diff' buffer with this so that we can always count
1569                 // 'N's as mismatches
1570                 uint64_t diffMask = 0llu;
1571                 // Set up a mask that we'll apply to the two bufs every round
1572                 // to discard bits that were rotated out of the anchor area
1573                 uint64_t clearMask = 0xffffffffffffffffllu;
1574                 bool useMask = false;
1575                 if(anchorBitPairs < 32) {
1576                         clearMask >>= ((32-anchorBitPairs) << 1);
1577                         useMask = true;
1578                 }
1579                 int nsInAnchor = 0;
1580                 uint32_t nPoss = 0;
1581                 int nPos1 = -1;
1582                 int nPos2 = -1;
1583                 int nPos3 = -1;
1584                 uint32_t skipLeftToRights = 0;
1585                 uint32_t skipRightToLefts = 0;
1586                 // Construct the 'anchor' 64-bit buffer so that it holds all of
1587                 // the first 'anchorBitPairs' bit pairs of the query.
1588                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1589                         int c = (int)qry[i]; // next query character
1590                         int r = (int)ref[halfway - begin + i]; // next reference character
1591                         if(r & 4) {
1592                                 r = 0;
1593                                 // The right-to-left direction absorbs the candidate
1594                                 // alignment based at 'halfway'; so skipLeftToRights is
1595                                 // i, not i+1
1596                                 skipLeftToRights = max(skipLeftToRights, i);
1597                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1598                         }
1599                         assert_leq(c, 4);
1600                         assert_lt(r, 4);
1601                         // Special case: query has an 'N'
1602                         if(c == 4) {
1603                                 if(++nsInAnchor > 3) {
1604                                         // More than two 'N's in the anchor region; can't
1605                                         // possibly have a 2-mismatch hit anywhere
1606                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
1607                                         return;   // can't match if query has Ns
1608                                 } else if(nsInAnchor == 3) {
1609                                         nPos3 = (int)i;
1610                                         nPoss++;
1611                                 } else if(nsInAnchor == 2) {
1612                                         nPos2 = (int)i;
1613                                         nPoss++;
1614                                 } else {
1615                                         assert_eq(1, nsInAnchor);
1616                                         nPos1 = (int)i;
1617                                         nPoss++;
1618                                 }
1619                                 // Make it look like an 'A' in the anchor
1620                                 c = 0;
1621                                 diffMask = (diffMask << 2llu) | 1llu;
1622                         } else {
1623                                 diffMask <<= 2llu;
1624                         }
1625                         anchor  = ((anchor  << 2llu) | c);
1626                         buffw = ((buffw << 2llu) | r);
1627                 }
1628                 assert_leq(nPoss, 3);
1629                 // Check whether read is disqualified by Ns outside of the anchor
1630                 // region
1631                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1632                         if((int)qry[i] == 4) {
1633                                 if(++nsInAnchor > 3) {
1634                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
1635                                         return; // can't match if query has Ns
1636                                 }
1637                         }
1638                 }
1639                 uint64_t bufbw = buffw;
1640                 // We're moving the right-hand edge of the anchor along until
1641                 // it's 'anchorOverhang' chars from the end of the target region.
1642                 // Note that we're not making a 3'/5' distinction here; if we
1643                 // were, we might need to make the 'anchorOverhang' adjustment on
1644                 // the left end of the range rather than the right.
1645                 bool hi = false;
1646                 uint32_t riHi  = halfway;
1647                 uint32_t rirHi = halfway - begin;
1648                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1649                 uint32_t riLo  = halfway + 1;
1650                 uint32_t rirLo = halfway - begin + 1;
1651                 for(uint32_t i = 1; i <= lim + 1; i++) {
1652                         int r;       // new reference char
1653                         uint64_t diff;
1654                         assert_lt(skipLeftToRights, qlen);
1655                         assert_leq(skipRightToLefts, qlen);
1656                         if(hi) {
1657                                 hi = false;
1658                                 // Moving left-to-right
1659                                 riHi++;
1660                                 rirHi++;
1661                                 rirHiAnchor++;
1662                                 r = (int)ref[rirHiAnchor];
1663                                 if(r & 4) {
1664                                         r = 0;
1665                                         skipLeftToRights = anchorBitPairs;
1666                                 }
1667                                 assert_lt(r, 4);
1668                                 // Bring in new base pair at the least significant
1669                                 // position
1670                                 buffw = ((buffw << 2llu) | r);
1671                                 if(useMask) buffw &= clearMask;
1672                                 if(skipLeftToRights > 0) {
1673                                         skipLeftToRights--;
1674                                         continue;
1675                                 }
1676                                 diff = (buffw ^ anchor) | diffMask;
1677                         } else {
1678                                 hi = true;
1679                                 // Moving right-to-left
1680                                 riLo--;
1681                                 rirLo--;
1682                                 r = (int)ref[rirLo];
1683                                 if(r & 4) {
1684                                         r = 0;
1685                                         skipRightToLefts = qlen;
1686                                 }
1687                                 assert_lt(r, 4);
1688                                 if(i >= 2) {
1689                                         bufbw >>= 2llu;
1690                                         // Bring in new base pair at the most significant
1691                                         // position
1692                                         bufbw |= ((uint64_t)r << lhsShift);
1693                                 }
1694                                 if(skipRightToLefts > 0) {
1695                                         skipRightToLefts--;
1696                                         continue;
1697                                 }
1698                                 diff = (bufbw ^ anchor) | diffMask;
1699                         }
1700                         if((diff & 0xffff000000000000llu) &&
1701                            (diff & 0x0000ffff00000000llu) &&
1702                            (diff & 0x00000000ffff0000llu) &&
1703                            (diff & 0x000000000000ffffllu)) continue;
1704                         uint32_t ri  = hi ? riLo  : riHi;
1705                         uint32_t rir = hi ? rirLo : rirHi;
1706                         // Could use pop count
1707                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1708                         // As a first cut, see if there are too many mismatches in
1709                         // the first and last parts of the anchor
1710                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1711                         if(diffs > 3) continue;
1712                         diffs += u8toMms[(int)diff8[1]] +
1713                                          u8toMms[(int)diff8[2]] +
1714                                          u8toMms[(int)diff8[3]] +
1715                                          u8toMms[(int)diff8[4]] +
1716                                          u8toMms[(int)diff8[5]] +
1717                                          u8toMms[(int)diff8[6]];
1718                         uint32_t mmpos1 = 0xffffffff;
1719                         int refc1 = -1;
1720                         uint32_t mmpos2 = 0xffffffff;
1721                         int refc2 = -1;
1722                         uint32_t mmpos3 = 0xffffffff;
1723                         int refc3 = -1;
1724                         if(diffs > 3) {
1725                                 // Too many differences
1726                                 continue;
1727                         } else if(nPoss > 1 && diffs == nPoss) {
1728                                 // There was one difference, but there was also one N,
1729                                 // so we already know where the difference is
1730                                 mmpos1 = nPos1;
1731                                 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1732                                 if(nPoss > 1) {
1733                                         mmpos2 = nPos2;
1734                                         refc2 = "ACGT"[(int)ref[rir + nPos2]];
1735                                         if(nPoss > 2) {
1736                                                 mmpos3 = nPos3;
1737                                                 refc3 = "ACGT"[(int)ref[rir + nPos3]];
1738                                         }
1739                                 }
1740                         } else if(diffs > 0) {
1741                                 // Figure out which position mismatched
1742                                 uint64_t diff2 = diff;
1743                                 mmpos1 = 31;
1744                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1745                                 assert_neq(0, diff);
1746                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos1 -=  8; }
1747                                 assert_neq(0, diff);
1748                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos1 -=  4; }
1749                                 assert_neq(0, diff);
1750                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos1 -=  2; }
1751                                 assert_neq(0, diff);
1752                                 if((diff & 0x3llu) == 0)        { mmpos1--; }
1753                                 assert_neq(0, diff);
1754                                 assert_geq(mmpos1, 0);
1755                                 assert_lt(mmpos1, 32);
1756                                 mmpos1 -= anchorCushion;
1757                                 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1758                                 if(diffs > 1) {
1759                                         // Figure out the second mismatched position
1760                                         diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1 + anchorCushion) << 1));
1761                                         uint64_t diff3 = diff2;
1762                                         mmpos2 = 31;
1763                                         if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1764                                         assert_neq(0, diff2);
1765                                         if((diff2 & 0xffffllu) == 0)     { diff2 >>= 16llu; mmpos2 -=  8; }
1766                                         assert_neq(0, diff2);
1767                                         if((diff2 & 0xffllu) == 0)       { diff2 >>= 8llu;  mmpos2 -=  4; }
1768                                         assert_neq(0, diff2);
1769                                         if((diff2 & 0xfllu) == 0)        { diff2 >>= 4llu;  mmpos2 -=  2; }
1770                                         assert_neq(0, diff2);
1771                                         if((diff2 & 0x3llu) == 0)        { mmpos2--; }
1772                                         assert_neq(0, diff2);
1773                                         mmpos2 -= anchorCushion;
1774                                         assert_geq(mmpos2, 0);
1775                                         assert_lt(mmpos2, 32);
1776                                         assert_neq(mmpos1, mmpos2);
1777                                         refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1778                                         uint32_t mmpos2orig = mmpos2;
1779                                         if(mmpos2 < mmpos1) {
1780                                                 uint32_t mmtmp = mmpos1;
1781                                                 mmpos1 = mmpos2;
1782                                                 mmpos2 = mmtmp;
1783                                                 int refctmp = refc1;
1784                                                 refc1 = refc2;
1785                                                 refc2 = refctmp;
1786                                         }
1787                                         assert_lt(mmpos1, mmpos2);
1788                                         if(diffs > 2) {
1789                                                 // Figure out the second mismatched position
1790                                                 diff3 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos2orig + anchorCushion) << 1));
1791                                                 mmpos3 = 31;
1792                                                 if((diff3 & 0xffffffffllu) == 0) { diff3 >>= 32llu; mmpos3 -= 16; }
1793                                                 assert_neq(0, diff3);
1794                                                 if((diff3 & 0xffffllu) == 0)     { diff3 >>= 16llu; mmpos3 -=  8; }
1795                                                 assert_neq(0, diff3);
1796                                                 if((diff3 & 0xffllu) == 0)       { diff3 >>= 8llu;  mmpos3 -=  4; }
1797                                                 assert_neq(0, diff3);
1798                                                 if((diff3 & 0xfllu) == 0)        { diff3 >>= 4llu;  mmpos3 -=  2; }
1799                                                 assert_neq(0, diff3);
1800                                                 if((diff3 & 0x3llu) == 0)        { mmpos3--; }
1801                                                 assert_neq(0, diff3);
1802                                                 mmpos3 -= anchorCushion;
1803                                                 assert_geq(mmpos3, 0);
1804                                                 assert_lt(mmpos3, 32);
1805                                                 assert_neq(mmpos1, mmpos3);
1806                                                 assert_neq(mmpos2, mmpos3);
1807                                                 refc3 = "ACGT"[(int)ref[rir + mmpos3]];
1808                                                 if(mmpos3 < mmpos1) {
1809                                                         uint32_t mmtmp = mmpos1;
1810                                                         mmpos1 = mmpos3;
1811                                                         mmpos3 = mmpos2;
1812                                                         mmpos2 = mmtmp;
1813                                                         int refctmp = refc1;
1814                                                         refc1 = refc3;
1815                                                         refc3 = refc2;
1816                                                         refc2 = refctmp;
1817                                                 } else if(mmpos3 < mmpos2) {
1818                                                         uint32_t mmtmp = mmpos2;
1819                                                         mmpos2 = mmpos3;
1820                                                         mmpos3 = mmtmp;
1821                                                         int refctmp = refc2;
1822                                                         refc2 = refc3;
1823                                                         refc3 = refctmp;
1824                                                 }
1825                                                 assert_lt(mmpos1, mmpos2);
1826                                                 assert_lt(mmpos2, mmpos3);
1827                                         }
1828                                 }
1829                         }
1830                         // Now extend the anchor into a longer alignment
1831                         bool foundHit = true;
1832                         if(anchorOverhang > 0) {
1833                                 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1834                                 bool skipCandidate = false;
1835                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
1836                                         int rc = (int)ref[rir + 32 + j];
1837                                         if(rc == 4) {
1838                                                 // Oops, encountered an N in the reference in
1839                                                 // the overhang portion of the candidate
1840                                                 // alignment
1841                                                 // (Note that we inverted hi earlier)
1842                                                 if(hi) {
1843                                                         // Right-to-left
1844                                                         skipRightToLefts = anchorOverhang - j - 1;
1845                                                 } else {
1846                                                         // Left-to-right
1847                                                         skipLeftToRights = anchorBitPairs + j;
1848                                                 }
1849                                                 skipCandidate = true; // Skip this candidate
1850                                                 break;
1851                                         }
1852                                         if((int)qry[32 + j] != rc) {
1853                                                 if(++diffs > 3) {
1854                                                         foundHit = false;
1855                                                         break;
1856                                                 } else if(diffs == 3) {
1857                                                         mmpos3 = 32 + j;
1858                                                         refc3 = "ACGT"[(int)ref[rir + 32 + j]];
1859                                                 } else if(diffs == 2) {
1860                                                         mmpos2 = 32 + j;
1861                                                         refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1862                                                 } else {
1863                                                         assert_eq(1, diffs);
1864                                                         mmpos1 = 32 + j;
1865                                                         refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1866                                                 }
1867                                         }
1868                                 }
1869                                 if(skipCandidate) continue;
1870                         }
1871                         if(!foundHit) continue;
1872                         if(pairs != NULL) {
1873                                 TU64Pair p;
1874                                 if(ri < aoff) {
1875                                         // By convention, the upstream mate's
1876                                         // coordinates go in the 'first' field
1877                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
1878                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1879                                 } else {
1880                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1881                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1882                                 }
1883                                 if(pairs->find(p) != pairs->end()) {
1884                                         // We already found this hit!  Continue.
1885                                         ASSERT_ONLY(duplicates++);
1886                                         ASSERT_ONLY(r2i++);
1887                                         continue;
1888                                 } else {
1889                                         // Record this hit
1890                                         pairs->insert(p);
1891                                 }
1892                         }
1893                         assert_leq(diffs, 3);
1894                         assert_lt(r2i, r2.size());
1895                         assert_eq(re2[r2i], ri);
1896                         ranges.resize(ranges.size()+1);
1897                         Range& range = ranges.back();
1898                         assert_eq(diffs, r2[r2i].numMms);
1899                         range.stratum = diffs;
1900                         range.numMms = diffs;
1901                         assert_eq(0, range.mms.size());
1902                         assert_eq(0, range.refcs.size());
1903                         if(diffs > 0) {
1904                                 assert_neq(mmpos1, 0xffffffff);
1905                                 assert_eq(mmpos1, r2[r2i].mms[0]);
1906                                 assert_neq(-1, refc1);
1907                                 assert_eq(refc1, r2[r2i].refcs[0]);
1908                                 range.mms.push_back(mmpos1);
1909                                 range.refcs.push_back(refc1);
1910                                 if(diffs > 1) {
1911                                         assert_neq(mmpos2, 0xffffffff);
1912                                         assert_eq(mmpos2, r2[r2i].mms[1]);
1913                                         assert_neq(-1, refc2);
1914                                         assert_eq(refc2, r2[r2i].refcs[1]);
1915                                         range.mms.push_back(mmpos2);
1916                                         range.refcs.push_back(refc2);
1917                                         if(diffs > 2) {
1918                                                 assert_neq(mmpos3, 0xffffffff);
1919                                                 assert_eq(mmpos3, r2[r2i].mms[2]);
1920                                                 assert_neq(-1, refc3);
1921                                                 assert_eq(refc3, r2[r2i].refcs[2]);
1922                                                 range.mms.push_back(mmpos3);
1923                                                 range.refcs.push_back(refc3);
1924                                         }
1925                                 }
1926                         }
1927                         ASSERT_ONLY(r2i++);
1928                         results.push_back(ri);
1929                         if(--numToFind == 0) return;
1930                 }
1931                 assert_leq(duplicates, r2.size());
1932                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1933                 return; // no hit
1934         }
1935 };
1936
1937 /**
1938  * Concrete RefAligner for finding nearby 1-mismatch hits given an
1939  * anchor hit.
1940  */
1941 template<typename TStr>
1942 class Seed0RefAligner : public RefAligner<TStr> {
1943
1944         typedef seqan::String<seqan::Dna5> TDna5Str;
1945         typedef seqan::String<char> TCharStr;
1946         typedef std::vector<Range> TRangeVec;
1947         typedef std::vector<uint32_t> TU32Vec;
1948         typedef std::pair<uint64_t, uint64_t> TU64Pair;
1949         typedef std::set<TU64Pair> TSetPairs;
1950
1951 public:
1952
1953         Seed0RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
1954                 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
1955
1956         virtual ~Seed0RefAligner() { }
1957
1958 protected:
1959         /**
1960          * This schematic shows the roles played by the begin, qbegin, end,
1961          * qend, halfway, slen, qlen, and lim variables:
1962          *
1963          * seedOnLeft == true:
1964          *
1965          * |<                   lim                   >|<     qlen       >|
1966          *  --------------------------------------------------------------
1967          * |                     | slen | qlen-slen |  | slen | qlen-slen |
1968          *  --------------------------------------------------------------
1969          * ^                     ^                     ^                  ^
1970          * begin & qbegin     halfway                qend               end
1971          *
1972          * seedOnLeft == false:
1973          *
1974          * |<     qlen       >|<                   lim                   >|
1975          *  --------------------------------------------------------------
1976          * | qlen-slen | slen |  | qlen-slen | slen |                     |
1977          *  --------------------------------------------------------------
1978          * ^                  ^                     ^                     ^
1979          * begin            qbegin             halfway           qend & end
1980          *
1981          * Note that, for seeds longer than 32 base-pairs, the seed is
1982          * further subdivided into a 32-bit anchor and a seed overhang of
1983          * length > 0.
1984          */
1985         void naiveFind(uint32_t numToFind,
1986                                    uint32_t tidx,
1987                                    uint8_t* ref,
1988                                    const TDna5Str& qry,
1989                                    const TCharStr& quals,
1990                                    uint32_t begin,
1991                                    uint32_t end,
1992                                    TRangeVec& ranges,
1993                                    TU32Vec& results,
1994                                    TSetPairs* pairs,
1995                                    uint32_t aoff,
1996                                    bool seedOnLeft) const
1997         {
1998                 assert_gt(numToFind, 0);
1999                 assert_gt(end, begin);
2000                 const uint32_t qlen = seqan::length(qry);
2001                 assert_gt(qlen, 0);
2002                 assert_geq(end - begin, qlen); // caller should have checked this
2003                 assert_gt(this->seedLen_, 0);
2004                 const uint32_t slen = min(qlen, this->seedLen_);
2005                 uint32_t qend = end;
2006                 uint32_t qbegin = begin;
2007                 // If the seed is on the left-hand side of the alignment, then
2008                 // leave a gap at the right-hand side of the interval;
2009                 // otherwise, do the opposite
2010                 if(seedOnLeft) {
2011                         // Leave gap on right-hand side of the interval
2012                         qend -= qlen;
2013                 } else {
2014                         // Leave gap on left-hand side of the interval
2015                         qbegin += qlen;
2016                 }
2017                 // lim = number of alignments to try
2018                 const uint32_t lim = qend - qbegin;
2019                 // halfway = position in the reference to start at (and then
2020                 // we work our way out to the right and to the left).
2021                 const uint32_t halfway = qbegin + (lim >> 1);
2022                 // Vectors for holding edit information
2023                 std::vector<uint32_t> nonSeedMms;
2024                 std::vector<uint8_t> nonSeedRefcs;
2025                 bool hi = false;
2026                 for(uint32_t i = 1; i <= lim+1; i++) {
2027                         uint32_t ri;  // leftmost position in candidate alignment
2028                         uint32_t rir; // same, minus begin; for indexing into ref[]
2029                         if(hi) {
2030                                 ri = halfway + (i >> 1); rir = ri - begin;
2031                                 assert_leq(ri, qend);
2032                         } else {
2033                                 ri = halfway - (i >> 1); rir = ri - begin;
2034                                 assert_geq(ri, begin);
2035                         }
2036                         hi = !hi;
2037                         // Do the naive comparison
2038                         bool match = true;
2039                         int mms = 0;
2040                         unsigned int ham = 0;
2041                         nonSeedMms.clear();
2042                         nonSeedRefcs.clear();
2043                         // Walk through each position of the alignment
2044                         for(uint32_t jj = 0; jj < qlen; jj++) {
2045                                 uint32_t j = jj;
2046                                 if(!seedOnLeft) {
2047                                         // If seed is on the right, scan right-to-left
2048                                         j = qlen - jj - 1;
2049                                 } else {
2050                                         // Go left-to-right
2051                                 }
2052                                 uint32_t rirj = rir + j;
2053                                 if(!seedOnLeft) {
2054                                         assert_geq(rir, jj);
2055                                         rirj = rir - jj - 1;
2056                                 }
2057 #if 0
2058                                 // Count Ns in the reference as mismatches
2059                                 const int q = (int)qry[j];
2060                                 const int r = (int)ref[rirj];
2061                                 assert_leq(q, 4);
2062                                 assert_leq(r, 4);
2063                                 if(q == 4 || r == 4 || q != r) {
2064 #else
2065                                 // Disallow alignments that involve an N in the
2066                                 // reference
2067                                 const int r = (int)ref[rirj];
2068                                 if(r & 4) {
2069                                         // N in reference; bail on this alignment
2070                                         match = false;
2071                                         break;
2072                                 }
2073                                 const int q = (int)qry[j];
2074                                 assert_leq(q, 4);
2075                                 assert_lt(r, 4);
2076                                 if(q != r) {
2077 #endif
2078                                         // Mismatch!
2079                                         if(jj < slen) {
2080                                                 // More than one mismatch in the anchor; reject
2081                                                 match = false;
2082                                                 break;
2083                                         }
2084                                         uint8_t qual = phredCharToPhredQual(quals[j]);
2085                                         ham += mmPenalty(this->maqPenalty_, qual);
2086                                         if(ham > this->qualMax_) {
2087                                                 // Exceeded quality ceiling; reject
2088                                                 match = false;
2089                                                 break;
2090                                         } else {
2091                                                 // Legal mismatch outside of the anchor; record it
2092                                                 mms++;
2093                                                 nonSeedMms.push_back(j);
2094                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
2095                                                 nonSeedRefcs.push_back("ACGTN"[r]);
2096                                         }
2097                                 }
2098                         }
2099                         if(match) {
2100                                 ranges.resize(ranges.size()+1);
2101                                 Range& range = ranges.back();
2102                                 range.stratum = 0;
2103                                 range.numMms = mms;
2104                                 assert_eq(0, range.mms.size());
2105                                 assert_eq(0, range.refcs.size());
2106                                 if(mms >= 1) {
2107                                         // Be careful to add edits in left-to-right order
2108                                         // with respect to the read/alignment
2109                                         const size_t nonSeedMmsSz = nonSeedMms.size();
2110                                         if(nonSeedMmsSz > 0) {
2111                                                 if(seedOnLeft) {
2112                                                         for(size_t k = 0; k < nonSeedMmsSz; k++) {
2113                                                                 range.mms.push_back(nonSeedMms[k]);
2114                                                                 range.refcs.push_back(nonSeedRefcs[k]);
2115                                                         }
2116                                                 } else {
2117                                                         for(size_t k = nonSeedMmsSz; k > 0; k--) {
2118                                                                 range.mms.push_back(nonSeedMms[k-1]);
2119                                                                 range.refcs.push_back(nonSeedRefcs[k-1]);
2120                                                         }
2121                                                 }
2122                                         }
2123                                 }
2124                                 assert_eq((size_t)mms, range.mms.size());
2125                                 assert_eq((size_t)mms, range.refcs.size());
2126                                 if(seedOnLeft) {
2127                                         results.push_back(ri);
2128                                 } else {
2129                                         results.push_back(ri - qlen);
2130                                 }
2131                         }
2132                 }
2133                 return;
2134         }
2135
2136         /**
2137          * This schematic shows the roles played by the begin, qbegin, end,
2138          * qend, halfway, slen, qlen, and lim variables:
2139          *
2140          * seedOnLeft == true:
2141          *
2142          * |<                   lim                   >|<     qlen       >|
2143          *  --------------------------------------------------------------
2144          * |                     | slen | qlen-slen |  | slen | qlen-slen |
2145          *  --------------------------------------------------------------
2146          * ^                     ^                     ^                  ^
2147          * begin & qbegin     halfway                qend               end
2148          *
2149          * seedOnLeft == false:
2150          *
2151          *             |<                   lim                   >|
2152          *  --------------------------------------------------------------
2153          * | qlen-slen |         | qlen-slen | slen |              | slen |
2154          *  --------------------------------------------------------------
2155          * ^           ^                     ^                     ^      ^
2156          * begin       qbegin             halfway                qend   end
2157          *
2158          * Note that, for seeds longer than 32 base-pairs, the seed is
2159          * further subdivided into a 32-bit anchor and a seed overhang of
2160          * length > 0.
2161          */
2162         virtual void anchor64Find(uint32_t numToFind,
2163                                         uint32_t tidx,
2164                                         uint8_t* ref,
2165                                         const TDna5Str& qry,
2166                                         const TCharStr& quals,
2167                                         uint32_t begin,
2168                                         uint32_t end,
2169                                         TRangeVec& ranges,
2170                                         TU32Vec& results,
2171                                         TSetPairs* pairs = NULL,
2172                                         uint32_t aoff = 0xffffffff,
2173                                         bool seedOnLeft = false) const
2174         {
2175                 assert_gt(numToFind, 0);
2176                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
2177                 ASSERT_ONLY(uint32_t duplicates = 0);
2178                 ASSERT_ONLY(uint32_t r2i = 0);
2179                 const uint32_t qlen = seqan::length(qry);
2180                 assert_gt(qlen, 0);
2181                 assert_gt(end, begin);
2182                 assert_geq(end - begin, qlen); // caller should have checked this
2183                 assert_gt(this->seedLen_, 0);
2184                 uint32_t slen = min(qlen, this->seedLen_);
2185 #ifndef NDEBUG
2186                 // Get results from the naive matcher for sanity-checking
2187                 TRangeVec r2; TU32Vec re2;
2188                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
2189                                   re2, pairs, aoff, seedOnLeft);
2190 #endif
2191                 const uint32_t anchorBitPairs = min<int>(slen, 32);
2192                 const int lhsShift = ((anchorBitPairs - 1) << 1);
2193                 ASSERT_ONLY(const uint32_t anchorCushion  = 32 - anchorBitPairs);
2194                 // seedAnchorOverhang = # seed bases not included in the anchor
2195                 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
2196                 // seedAnchorOverhang = # seed bases not included in the anchor
2197                 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
2198                 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
2199                 assert_eq(qlen, readSeedOverhang + slen);
2200                 uint32_t qend = end;
2201                 uint32_t qbegin = begin;
2202                 if(seedOnLeft) {
2203                         // Leave read-sized gap on right-hand side of the interval
2204                         qend -= qlen;
2205                 } else {
2206                         // Leave seed-sized gap on right-hand side and
2207                         // non-seed-sized gap on the left-hand side
2208                         qbegin += readSeedOverhang;
2209                         qend -= slen;
2210                 }
2211                 // lim = # possible alignments in the range
2212                 const uint32_t lim = qend - qbegin;
2213                 // halfway = point on the genome to radiate out from
2214                 const uint32_t halfway = qbegin + (lim >> 1);
2215                 uint64_t anchor = 0llu;
2216                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
2217                 // Set up a mask that we'll apply to the two bufs every round
2218                 // to discard bits that were rotated out of the anchor area
2219                 uint64_t clearMask = 0xffffffffffffffffllu;
2220                 bool useMask = false;
2221                 if(anchorBitPairs < 32) {
2222                         clearMask >>= ((32-anchorBitPairs) << 1);
2223                         useMask = true;
2224                 }
2225                 uint32_t skipLeftToRights = 0;
2226                 uint32_t skipRightToLefts = 0;
2227                 const uint32_t halfwayRi = halfway - begin;
2228                 // Construct the 'anchor' 64-bit buffer so that it holds all of
2229                 // the first 'anchorBitPairs' bit pairs of the query.
2230                 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
2231                         uint32_t i = ii;
2232                         if(!seedOnLeft) {
2233                                 // Fill in the anchor using characters from the right-
2234                                 // hand side of the query (but take the characters in
2235                                 // left-to-right order)
2236                                 i = qlen - slen + ii;
2237                         }
2238                         int c = (int)qry[i]; // next query character
2239                         int r = (int)ref[halfwayRi + ii]; // next reference character
2240                         if(r & 4) {
2241                                 // The reference character is an N; to mimic the
2242                                 // behavior of BW alignment, we have to skip all
2243                                 // alignments that involve an N in the reference.  Set
2244                                 // the skip* variables accordingly.
2245                                 r = 0;
2246                                 uint32_t lrSkips = ii;
2247                                 uint32_t rlSkips = qlen - ii;
2248                                 if(!seedOnLeft && readSeedOverhang) {
2249                                         lrSkips += readSeedOverhang;
2250                                         assert_geq(rlSkips, readSeedOverhang);
2251                                         rlSkips -= readSeedOverhang;
2252                                 }
2253                                 // The right-to-left direction absorbs the candidate
2254                                 // alignment based at 'halfway'
2255                                 skipLeftToRights = max(skipLeftToRights, lrSkips);
2256                                 skipRightToLefts = max(skipRightToLefts, rlSkips);
2257                                 assert_leq(skipLeftToRights, qlen);
2258                                 assert_leq(skipRightToLefts, qlen);
2259                         }
2260                         assert_leq(c, 4);
2261                         assert_lt(r, 4);
2262                         // Special case: query has an 'N'
2263                         if(c == 4) {
2264                                 // One or more 'N's in the anchor region; can't
2265                                 // possibly have a 0-mismatch hit anywhere
2266                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2267                                 return;   // can't match if query has Ns
2268                         }
2269                         anchor = ((anchor << 2llu) | c);
2270                         buffw = ((buffw << 2llu) | r);
2271                 }
2272                 // Check whether read is disqualified by Ns inside the seed
2273                 // region but outside the anchor region
2274                 if(seedAnchorOverhang) {
2275                         assert_lt(anchorBitPairs, slen);
2276                         for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
2277                                 uint32_t i = ii;
2278                                 if(!seedOnLeft) {
2279                                         i = qlen - slen + ii;
2280                                 }
2281                                 if((int)qry[i] == 4) {
2282                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
2283                                         return; // can't match if query has Ns
2284                                 }
2285                         }
2286                 } else {
2287                         assert_eq(anchorBitPairs, slen);
2288                 }
2289                 uint64_t bufbw = buffw;
2290                 // Slide the anchor out in either direction, alternating
2291                 // between right-to-left and left-to-right shifts, until all of
2292                 // the positions from qbegin to qend have been covered.
2293                 bool hi = false;
2294                 uint32_t riHi  = halfway;
2295                 uint32_t rirHi = halfway - begin;
2296                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
2297                 uint32_t riLo  = halfway + 1;
2298                 uint32_t rirLo = halfway - begin + 1;
2299                 uint32_t lrSkips = anchorBitPairs;
2300                 uint32_t rlSkips = qlen;
2301                 if(!seedOnLeft && readSeedOverhang) {
2302                         lrSkips += readSeedOverhang;
2303                         assert_geq(rlSkips, readSeedOverhang);
2304                         rlSkips -= readSeedOverhang;
2305                 }
2306                 for(uint32_t i = 1; i <= lim + 1; i++) {
2307                         int r;       // new reference char
2308                         uint64_t diff;
2309                         assert_leq(skipLeftToRights, qlen);
2310                         assert_leq(skipRightToLefts, qlen);
2311                         if(hi) {
2312                                 hi = false;
2313                                 // Moving left-to-right
2314                                 riHi++;
2315                                 rirHi++;
2316                                 rirHiAnchor++;
2317                                 r = (int)ref[rirHiAnchor];
2318                                 if(r & 4) {
2319                                         r = 0;
2320                                         skipLeftToRights = lrSkips;
2321                                 }
2322                                 assert_lt(r, 4);
2323                                 // Bring in new base pair at the least significant
2324                                 // position
2325                                 buffw = ((buffw << 2llu) | r);
2326                                 if(useMask) buffw &= clearMask;
2327                                 if(skipLeftToRights > 0) {
2328                                         skipLeftToRights--;
2329                                         continue;
2330                                 }
2331                                 diff = buffw ^ anchor;
2332                         } else {
2333                                 hi = true;
2334                                 // Moving right-to-left
2335                                 riLo--;
2336                                 rirLo--;
2337                                 r = (int)ref[rirLo];
2338                                 if(r & 4) {
2339                                         r = 0;
2340                                         skipRightToLefts = rlSkips;
2341                                 }
2342                                 assert_lt(r, 4);
2343                                 if(i >= 2) {
2344                                         bufbw >>= 2llu;
2345                                         // Bring in new base pair at the most significant
2346                                         // position
2347                                         bufbw |= ((uint64_t)r << lhsShift);
2348                                 }
2349                                 if(skipRightToLefts > 0) {
2350                                         skipRightToLefts--;
2351                                         continue;
2352                                 }
2353                                 diff = bufbw ^ anchor;
2354                         }
2355                         if(diff) continue;
2356                         uint32_t ri  = hi ? riLo  : riHi;
2357                         uint32_t rir = hi ? rirLo : rirHi;
2358                         unsigned int ham = 0;
2359                         // If the seed is longer than the anchor, then scan the
2360                         // rest of the seed characters
2361                         bool foundHit = true;
2362                         if(seedAnchorOverhang) {
2363                                 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
2364                                         int rc = (int)ref[rir + anchorBitPairs + j];
2365                                         if(rc == 4) {
2366                                                 // Oops, encountered an N in the reference in
2367                                                 // the overhang portion of the candidate
2368                                                 // alignment
2369                                                 // (Note that we inverted hi earlier)
2370                                                 if(hi) {
2371                                                         // Right-to-left
2372                                                         // Skip out of the seedAnchorOverhang
2373                                                         assert_eq(0, skipRightToLefts);
2374                                                         skipRightToLefts = seedAnchorOverhang - j - 1;
2375                                                         if(seedOnLeft) {
2376                                                                 // ...and skip out of the rest of the read
2377                                                                 skipRightToLefts += readSeedOverhang;
2378                                                         }
2379                                                 } else {
2380                                                         // Left-to-right
2381                                                         // Skip out of the seedAnchorOverhang
2382                                                         assert_eq(0, skipLeftToRights);
2383                                                         skipLeftToRights = anchorBitPairs + j;
2384                                                         if(!seedOnLeft) {
2385                                                                 // ...and skip out of the rest of the read
2386                                                                 skipLeftToRights += readSeedOverhang;
2387                                                         }
2388                                                 }
2389                                                 foundHit = false; // Skip this candidate
2390                                                 break;
2391                                         }
2392                                         uint32_t qoff = anchorBitPairs + j;
2393                                         if(!seedOnLeft) {
2394                                                 qoff += readSeedOverhang;
2395                                         }
2396                                         assert_lt(qoff, qlen);
2397                                         if((int)qry[qoff] != rc) {
2398                                                 foundHit = false;
2399                                                 break;
2400                                         }
2401                                 }
2402                                 if(!foundHit) continue;
2403                         }
2404                         // If the read is longer than the seed, then scan the rest
2405                         // of the read characters; mismatches no longer count
2406                         // toward the stratum or the 1-mm limit.
2407                         // Vectors for holding edit information
2408                         std::vector<uint32_t> nonSeedMms;
2409                         std::vector<uint8_t> nonSeedRefcs;
2410                         int mms = 0; // start counting total mismatches
2411                         if((qlen - slen) > 0) {
2412                                 // Going left-to-right
2413                                 for(uint32_t j = 0; j < readSeedOverhang; j++) {
2414                                         uint32_t roff = rir + slen + j;
2415                                         uint32_t qoff = slen + j;
2416                                         if(!seedOnLeft) {
2417                                                 assert_geq(roff, qlen);
2418                                                 roff -= qlen;
2419                                                 qoff = j;
2420                                         }
2421                                         int rc = (int)ref[roff];
2422                                         if(rc == 4) {
2423                                                 // Oops, encountered an N in the reference in
2424                                                 // the overhang portion of the candidate
2425                                                 // alignment
2426                                                 // (Note that we inverted hi earlier)
2427                                                 if(hi) {
2428                                                         // Right-to-left
2429                                                         // Skip what's left of the readSeedOverhang
2430                                                         skipRightToLefts = readSeedOverhang - j - 1;
2431                                                         if(!seedOnLeft) {
2432                                                                 // ...and skip the seed if it's on the right
2433                                                                 skipRightToLefts += slen;
2434                                                         }
2435                                                 } else {
2436                                                         // Left-to-right
2437                                                         // Skip what we've matched of the overhang
2438                                                         skipLeftToRights = j;
2439                                                         if(seedOnLeft) {
2440                                                                 // ...and skip the seed if it's on the left
2441                                                                 skipLeftToRights += slen;
2442                                                         }
2443                                                 }
2444                                                 foundHit = false; // Skip this candidate
2445                                                 break;
2446                                         }
2447                                         if((int)qry[qoff] != rc) {
2448                                                 // Calculate quality of mismatched base
2449                                                 char q = phredCharToPhredQual(quals[qoff]);
2450                                                 ham += mmPenalty(this->maqPenalty_, q);
2451                                                 if(ham > this->qualMax_) {
2452                                                         // Exceeded quality limit
2453                                                         foundHit = false;
2454                                                         break;
2455                                                 }
2456                                                 // Legal mismatch outside of the anchor; record it
2457                                                 mms++;
2458                                                 nonSeedMms.push_back(qoff);
2459                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
2460                                                 nonSeedRefcs.push_back("ACGTN"[rc]);
2461                                         }
2462                                 }
2463                                 if(!foundHit) continue;
2464                         }
2465                         assert(foundHit);
2466                         // Adjust ri if seed is on the right-hand side
2467                         if(!seedOnLeft) {
2468                                 ri -= readSeedOverhang;
2469                                 rir -= readSeedOverhang;
2470                         }
2471                         if(pairs != NULL) {
2472                                 TU64Pair p;
2473                                 if(ri < aoff) {
2474                                         // By convention, the upstream mate's
2475                                         // coordinates go in the 'first' field
2476                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
2477                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2478                                 } else {
2479                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
2480                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2481                                 }
2482                                 if(pairs->find(p) != pairs->end()) {
2483                                         // We already found this hit!  Continue.
2484                                         ASSERT_ONLY(duplicates++);
2485                                         ASSERT_ONLY(r2i++);
2486                                         continue;
2487                                 } else {
2488                                         // Record this hit
2489                                         pairs->insert(p);
2490                                 }
2491                         }
2492                         if(this->verbose_) {
2493                                 cout << "About to report seed0:" << endl;
2494                                 cout << "  ";
2495                                 for(size_t i = 0; i < qlen; i++) {
2496                                         cout << (char)qry[i];
2497                                 }
2498                                 cout << endl;
2499                                 cout << "  ";
2500                                 for(size_t i = 0; i < qlen; i++) {
2501                                         cout << "ACGT"[ref[rir+i]];
2502                                 }
2503                                 cout << endl;
2504                         }
2505                         assert_lt(r2i, r2.size());
2506                         assert_eq(re2[r2i], ri);
2507                         ranges.resize(ranges.size()+1);
2508                         Range& range = ranges.back();
2509                         assert_eq((size_t)mms, r2[r2i].numMms);
2510                         range.stratum = 0;
2511                         range.numMms = mms;
2512                         assert_eq(0, range.mms.size());
2513                         assert_eq(0, range.refcs.size());
2514                         if(mms > 0) {
2515                                 ASSERT_ONLY(size_t mmcur = 0);
2516                                 const size_t nonSeedMmsSz = nonSeedMms.size();
2517                                 for(size_t i = 0; i < nonSeedMmsSz; i++) {
2518                                         assert_neq(0xffffffff, nonSeedMms[i]);
2519                                         assert_lt(mmcur, (size_t)mms);
2520                                         assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
2521                                         range.mms.push_back(nonSeedMms[i]);
2522                                         assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
2523                                         ASSERT_ONLY(mmcur++);
2524                                         range.refcs.push_back(nonSeedRefcs[i]);
2525                                 }
2526                                 assert_eq(mmcur, r2[r2i].mms.size());
2527                         }
2528                         assert_eq((size_t)mms, range.mms.size());
2529                         assert_eq((size_t)mms, range.refcs.size());
2530                         ASSERT_ONLY(r2i++);
2531                         results.push_back(ri);
2532                         if(--numToFind == 0) return;
2533                 }
2534                 assert_leq(duplicates, r2.size());
2535                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
2536                 return; // no hit
2537         }
2538 };
2539
2540 /**
2541  * Concrete RefAligner for finding nearby 1-mismatch hits given an
2542  * anchor hit.
2543  */
2544 template<typename TStr>
2545 class Seed1RefAligner : public RefAligner<TStr> {
2546
2547         typedef seqan::String<seqan::Dna5> TDna5Str;
2548         typedef seqan::String<char> TCharStr;
2549         typedef std::vector<Range> TRangeVec;
2550         typedef std::vector<uint32_t> TU32Vec;
2551         typedef std::pair<uint64_t, uint64_t> TU64Pair;
2552         typedef std::set<TU64Pair> TSetPairs;
2553
2554 public:
2555
2556         Seed1RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
2557                 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
2558
2559         virtual ~Seed1RefAligner() { }
2560
2561 protected:
2562         /**
2563          * This schematic shows the roles played by the begin, qbegin, end,
2564          * qend, halfway, slen, qlen, and lim variables:
2565          *
2566          * seedOnLeft == true:
2567          *
2568          * |<                   lim                   >|<     qlen       >|
2569          *  --------------------------------------------------------------
2570          * |                     | slen | qlen-slen |  | slen | qlen-slen |
2571          *  --------------------------------------------------------------
2572          * ^                     ^                     ^                  ^
2573          * begin & qbegin     halfway                qend               end
2574          *
2575          * seedOnLeft == false:
2576          *
2577          * |<     qlen       >|<                   lim                   >|
2578          *  --------------------------------------------------------------
2579          * | qlen-slen | slen |  | qlen-slen | slen |                     |
2580          *  --------------------------------------------------------------
2581          * ^                  ^                     ^                     ^
2582          * begin            qbegin             halfway           qend & end
2583          *
2584          * Note that, for seeds longer than 32 base-pairs, the seed is
2585          * further subdivided into a 32-bit anchor and a seed overhang of
2586          * length > 0.
2587          */
2588         void naiveFind(uint32_t numToFind,
2589                                    uint32_t tidx,
2590                                    uint8_t* ref,
2591                                    const TDna5Str& qry,
2592                                    const TCharStr& quals,
2593                                    uint32_t begin,
2594                                    uint32_t end,
2595                                    TRangeVec& ranges,
2596                                    TU32Vec& results,
2597                                    TSetPairs* pairs,
2598                                    uint32_t aoff,
2599                                    bool seedOnLeft) const
2600         {
2601                 assert_gt(numToFind, 0);
2602                 assert_gt(end, begin);
2603                 const uint32_t qlen = seqan::length(qry);
2604                 assert_gt(qlen, 0);
2605                 assert_geq(end - begin, qlen); // caller should have checked this
2606                 assert_gt(this->seedLen_, 0);
2607                 const uint32_t slen = min(qlen, this->seedLen_);
2608                 uint32_t qend = end;
2609                 uint32_t qbegin = begin;
2610                 // If the seed is on the left-hand side of the alignment, then
2611                 // leave a gap at the right-hand side of the interval;
2612                 // otherwise, do the opposite
2613                 if(seedOnLeft) {
2614                         // Leave gap on right-hand side of the interval
2615                         qend -= qlen;
2616                 } else {
2617                         // Leave gap on left-hand side of the interval
2618                         qbegin += qlen;
2619                 }
2620                 // lim = number of alignments to try
2621                 const uint32_t lim = qend - qbegin;
2622                 // halfway = position in the reference to start at (and then
2623                 // we work our way out to the right and to the left).
2624                 const uint32_t halfway = qbegin + (lim >> 1);
2625                 // Vectors for holding edit information
2626                 std::vector<uint32_t> nonSeedMms;
2627                 assert_eq(0, nonSeedMms.size());
2628                 std::vector<uint8_t> nonSeedRefcs;
2629                 assert_eq(0, nonSeedRefcs.size());
2630                 bool hi = false;
2631                 for(uint32_t i = 1; i <= lim+1; i++) {
2632                         uint32_t ri;  // leftmost position in candidate alignment
2633                         uint32_t rir; // same, minus begin; for indexing into ref[]
2634                         if(hi) {
2635                                 ri = halfway + (i >> 1); rir = ri - begin;
2636                                 assert_leq(ri, qend);
2637                         } else {
2638                                 ri = halfway - (i >> 1); rir = ri - begin;
2639                                 assert_geq(ri, begin);
2640                         }
2641                         hi = !hi;
2642                         // Do the naive comparison
2643                         bool match = true;
2644                         int refc = -1;
2645                         uint32_t mmOff = 0xffffffff;
2646                         int mms = 0;
2647                         int seedMms = 0;
2648                         unsigned int ham = 0;
2649                         nonSeedMms.clear();
2650                         nonSeedRefcs.clear();
2651                         // Walk through each position of the alignment
2652                         for(uint32_t jj = 0; jj < qlen; jj++) {
2653                                 uint32_t j = jj;
2654                                 if(!seedOnLeft) {
2655                                         // If seed is on the right, scan right-to-left
2656                                         j = qlen - jj - 1;
2657                                 } else {
2658                                         // Go left-to-right
2659                                 }
2660                                 uint32_t rirj = rir + j;
2661                                 if(!seedOnLeft) {
2662                                         assert_geq(rir, jj);
2663                                         rirj = rir - jj - 1;
2664                                 }
2665 #if 0
2666                                 // Count Ns in the reference as mismatches
2667                                 const int q = (int)qry[j];
2668                                 const int r = (int)ref[rirj];
2669                                 assert_leq(q, 4);
2670                                 assert_leq(r, 4);
2671                                 if(q == 4 || r == 4 || q != r) {
2672 #else
2673                                 // Disallow alignments that involve an N in the
2674                                 // reference
2675                                 const int r = (int)ref[rirj];
2676                                 if(r & 4) {
2677                                         // N in reference; bail on this alignment
2678                                         match = false;
2679                                         break;
2680                                 }
2681                                 const int q = (int)qry[j];
2682                                 assert_leq(q, 4);
2683                                 assert_lt(r, 4);
2684                                 if(q != r) {
2685 #endif
2686                                         // Mismatch!
2687                                         mms++;
2688                                         if(mms > 1 && jj < slen) {
2689                                                 // More than one mismatch in the anchor; reject
2690                                                 match = false;
2691                                                 break;
2692                                         }
2693                                         uint8_t qual = phredCharToPhredQual(quals[j]);
2694                                         ham += mmPenalty(this->maqPenalty_, qual);
2695                                         if(ham > this->qualMax_) {
2696                                                 // Exceeded quality ceiling; reject
2697                                                 match = false;
2698                                                 break;
2699                                         } else if(jj < slen) {
2700                                                 // First mismatch in the anchor; remember offset
2701                                                 // and ref char
2702                                                 refc = "ACGTN"[r];
2703                                                 mmOff = j;
2704                                                 seedMms = 1;
2705                                         } else {
2706                                                 // Legal mismatch outside of the anchor; record it
2707                                                 nonSeedMms.push_back(j);
2708                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
2709                                                 nonSeedRefcs.push_back("ACGTN"[r]);
2710                                         }
2711                                 }
2712                         }
2713                         if(match) {
2714                                 ranges.resize(ranges.size()+1);
2715                                 Range& range = ranges.back();
2716                                 assert_leq(seedMms, mms);
2717                                 range.stratum = seedMms;
2718                                 range.numMms = mms;
2719                                 assert_eq(0, range.mms.size());
2720                                 assert_eq(0, range.refcs.size());
2721                                 if(mms >= 1) {
2722                                         // Be careful to add edits in left-to-right order
2723                                         // with respect to the read/alignment
2724                                         if(seedOnLeft && seedMms) {
2725                                                 assert_lt(mmOff, qlen);
2726                                                 range.mms.push_back(mmOff);
2727                                                 range.refcs.push_back(refc);
2728                                         }
2729                                         const size_t nonSeedMmsSz = nonSeedMms.size();
2730                                         if(nonSeedMmsSz > 0) {
2731                                                 if(seedOnLeft) {
2732                                                         for(size_t k = 0; k < nonSeedMmsSz; k++) {
2733                                                                 range.mms.push_back(nonSeedMms[k]);
2734                                                                 range.refcs.push_back(nonSeedRefcs[k]);
2735                                                         }
2736                                                 } else {
2737                                                         for(size_t k = nonSeedMmsSz; k > 0; k--) {
2738                                                                 range.mms.push_back(nonSeedMms[k-1]);
2739                                                                 range.refcs.push_back(nonSeedRefcs[k-1]);
2740                                                         }
2741                                                 }
2742                                         }
2743                                         if(!seedOnLeft && seedMms) {
2744                                                 assert_lt(mmOff, qlen);
2745                                                 range.mms.push_back(mmOff);
2746                                                 range.refcs.push_back(refc);
2747                                         }
2748                                 }
2749                                 assert_eq((size_t)mms, range.mms.size());
2750                                 assert_eq((size_t)mms, range.refcs.size());
2751                                 if(seedOnLeft) {
2752                                         results.push_back(ri);
2753                                 } else {
2754                                         results.push_back(ri - qlen);
2755                                 }