Commit patch to not break on spaces.
[bowtie.git] / ref_aligner.h
1 /*
2  * ref_aligner.h
3  */
4
5 #ifndef REF_ALIGNER_H_
6 #define REF_ALIGNER_H_
7
8 #include <stdint.h>
9 #include <iostream>
10 #include <vector>
11 #include <stdexcept>
12 #include "seqan/sequence.h"
13 #include "alphabet.h"
14 #include "range.h"
15 #include "reference.h"
16
17 // Let the reference-aligner buffer size be 16K by default.  If more
18 // room is required, a new buffer must be allocated from the heap.
19 const static int REF_ALIGNER_BUFSZ = 16 * 1024;
20
21 /**
22  * Abstract parent class for classes that look for alignments by
23  * matching against the reference sequence directly.  This is useful
24  * both for sanity-checking results from the Bowtie index and for
25  * finding mates when the reference location of the opposite mate is
26  * known.
27  */
28 template<typename TStr>
29 class RefAligner {
30
31         typedef seqan::String<seqan::Dna5> TDna5Str;
32         typedef seqan::String<char> TCharStr;
33         typedef std::vector<uint32_t> TU32Vec;
34         typedef std::vector<Range> TRangeVec;
35         typedef std::pair<uint64_t, uint64_t> TU64Pair;
36         typedef std::set<TU64Pair> TSetPairs;
37
38 public:
39         RefAligner(bool color,
40                    bool verbose = false,
41                    bool quiet = false,
42                    uint32_t seedLen = 0,
43                    uint32_t qualMax = 0xffffffff,
44                    bool maqPenalty = false) :
45                 color_(color), verbose_(verbose), seedLen_(seedLen),
46                 qualMax_(qualMax), maqPenalty_(maqPenalty), refbuf_(buf_),
47                 refbufSz_(REF_ALIGNER_BUFSZ), freeRefbuf_(false)
48                 { }
49
50         /**
51          * Free the reference-space alignment buffer if this object
52          * allocated it.
53          */
54         virtual ~RefAligner() {
55                 if(freeRefbuf_) {
56                         delete[] refbuf_;
57                 }
58         }
59
60         /**
61          * Find one alignment of qry:quals in the range begin-end in
62          * reference string ref.  Store the alignment details in range.
63          */
64         virtual void find(uint32_t numToFind,
65                           const uint32_t tidx,
66                           const BitPairReference *refs,
67                           const TDna5Str& qry,
68                           const TCharStr& quals,
69                           uint32_t begin,
70                           uint32_t end,
71                           TRangeVec& ranges,
72                           TU32Vec& results,
73                           TSetPairs* pairs = NULL,
74                           uint32_t aoff = 0xffffffff,
75                           bool seedOnLeft = false)
76         {
77                 assert_gt(numToFind, 0);
78                 assert_gt(end, begin);
79                 uint32_t spread = end - begin + (color_ ? 1 : 0);
80                 uint32_t spreadPlus = spread + 12;
81                 // Make sure the buffer is large enough to accommodate the spread
82                 if(spreadPlus > this->refbufSz_) {
83                         this->newBuf(spreadPlus);
84                 }
85                 // Read in the relevant stretch of the reference string
86                 int offset = refs->getStretch(this->refbuf_, tidx, begin, spread);
87                 uint8_t *buf = ((uint8_t*)this->refbuf_) + offset;
88                 if(color_) {
89                         // Colorize buffer
90                         for(size_t i = 0; i < (end-begin); i++) {
91                                 assert_leq((int)buf[i], 4);
92                                 buf[i] = dinuc2color[(int)buf[i]][(int)buf[i+1]];
93                         }
94                 }
95                 // Look for alignments
96                 ASSERT_ONLY(uint32_t irsz = ranges.size());
97                 anchor64Find(numToFind, tidx, buf, qry, quals, begin,
98                              end, ranges, results, pairs, aoff, seedOnLeft);
99 #ifndef NDEBUG
100                 for(size_t i = irsz; i < results.size(); i++) {
101                         assert_eq(ranges[i].numMms, ranges[i].mms.size());
102                         assert_eq(ranges[i].numMms, ranges[i].refcs.size());
103                 }
104 #endif
105         }
106
107         /**
108          * Find one alignment of qry:quals in the range begin-end in
109          * reference string ref.  Store the alignment details in range.
110          * Uses a combination of the anchor bases and 64-bit arithmetic to
111          * find anchors quickly.
112          */
113         virtual void anchor64Find(uint32_t numToFind,
114                         uint32_t tidx,
115                         uint8_t* ref,
116                     const TDna5Str& qry,
117                     const TCharStr& quals,
118                     uint32_t begin,
119                     uint32_t end,
120                         TRangeVec& ranges,
121                         TU32Vec& results,
122                         TSetPairs* pairs = NULL,
123                         uint32_t aoff = 0xffffffff,
124                         bool seedOnLeft = false) const = 0;
125
126         /**
127          * Set a new reference-sequence buffer.
128          */
129         void setBuf(uint32_t *newbuf, uint32_t newsz) {
130                 if(freeRefbuf_) {
131                         delete[] refbuf_;
132                         freeRefbuf_ = false;
133                 }
134                 refbuf_ = newbuf;
135                 refbufSz_ = newsz;
136         }
137
138         /**
139          * Set a new reference-sequence buffer.
140          */
141         void newBuf(uint32_t newsz) {
142                 if(freeRefbuf_) {
143                         delete[] refbuf_;
144                 }
145                 try {
146                         refbuf_ = new uint32_t[(newsz + 3) / 4];
147                         if(refbuf_ == NULL) throw std::bad_alloc();
148                 } catch(std::bad_alloc& e) {
149                         cerr << "Error: Could not allocate reference-space alignment buffer of " << newsz << "B" << endl;
150                         throw 1;
151                 }
152                 refbufSz_ = newsz;
153                 freeRefbuf_ = true;
154         }
155
156 protected:
157         bool      color_;     /// whether to colorize reference buffers before handing off
158         bool      verbose_;   /// be talkative
159         uint32_t  seedLen_;   /// length of seed region for read
160         uint32_t  qualMax_;   /// maximum sum of quality penalties
161         bool      maqPenalty_;/// whether to round as in Maq
162         uint32_t *refbuf_;    /// pointer to current reference buffer
163         uint32_t  refbufSz_;  /// size of current reference buffer
164         uint32_t  buf_[REF_ALIGNER_BUFSZ / 4]; /// built-in reference buffer (may be superseded)
165         bool      freeRefbuf_; /// whether refbuf_ points to something we should delete
166 };
167
168 /**
169  * Concrete RefAligner for finding nearby exact hits given an anchor
170  * hit.
171  */
172 template<typename TStr>
173 class ExactRefAligner : public RefAligner<TStr> {
174
175         typedef seqan::String<seqan::Dna5> TDna5Str;
176         typedef seqan::String<char> TCharStr;
177         typedef std::vector<uint32_t> TU32Vec;
178         typedef std::vector<Range> TRangeVec;
179         typedef std::pair<uint64_t, uint64_t> TU64Pair;
180         typedef std::set<TU64Pair> TSetPairs;
181
182 public:
183
184         ExactRefAligner(bool color, bool verbose, bool quiet) :
185                 RefAligner<TStr>(color, verbose, quiet) { }
186
187         virtual ~ExactRefAligner() { }
188
189 protected:
190         /**
191          * Because we're doing end-to-end exact, we don't care which end of
192          * 'qry' is the 5' end.
193          */
194         void naiveFind(uint32_t numToFind,
195                        uint32_t tidx,
196                        uint8_t* ref,
197                    const TDna5Str& qry,
198                    const TCharStr& quals,
199                    uint32_t begin,
200                    uint32_t end,
201                        TRangeVec& ranges,
202                        TU32Vec& results,
203                        TSetPairs* pairs,
204                        uint32_t aoff,
205                        bool seedOnLeft) const
206         {
207                 assert_gt(numToFind, 0);
208                 const uint32_t qlen = seqan::length(qry);
209                 assert_geq(end - begin, qlen); // caller should have checked this
210                 assert_gt(end, begin);
211                 assert_gt(qlen, 0);
212                 uint32_t qend = end - qlen;
213                 uint32_t lim = qend - begin;
214                 uint32_t halfway = begin + (lim >> 1);
215                 bool hi = false;
216                 for(uint32_t i = 1; i <= lim+1; i++) {
217                         uint32_t ri;  // leftmost position in candidate alignment
218                         uint32_t rir; // same, minus begin; for indexing into ref[]
219                         if(hi) {
220                                 ri = halfway + (i >> 1); rir = ri - begin;
221                                 assert_leq(ri, qend);
222                         } else {
223                                 ri = halfway - (i >> 1); rir = ri - begin;
224                                 assert_geq(ri, begin);
225                         }
226                         hi = !hi;
227                         // Do the naive comparison
228                         bool match = true;
229                         for(uint32_t j = 0; j < qlen; j++) {
230 #if 0
231                                 // Count Ns in the reference as mismatches
232                                 const int q = (int)qry[j];
233                                 const int r = (int)ref[rir + j];
234                                 assert_leq(q, 4);
235                                 assert_leq(r, 4);
236                                 if(q == 4 || r == 4 || q != r) {
237                                         // Mismatch
238                                         match = false;
239                                         break;
240                                 }
241                                 // Match; continue
242 #else
243                                 // Disallow alignments that involve an N in the
244                                 // reference
245                                 const int r = (int)ref[rir + j];
246                                 if(r & 4) {
247                                         // N in reference; bail
248                                         match = false;
249                                         break;
250                                 }
251                                 const int q = (int)qry[j];
252                                 assert_leq(q, 4);
253                                 assert_lt(r, 4);
254                                 if(q != r) {
255                                         // Mismatch
256                                         match = false;
257                                         break;
258                                 }
259                                 // Match; continue
260 #endif
261                         }
262                         if(match) {
263                                 ranges.resize(ranges.size()+1);
264                                 Range& range = ranges.back();
265                                 range.stratum = 0;
266                                 range.numMms = 0;
267                                 assert_eq(0, range.mms.size());
268                                 assert_eq(0, range.refcs.size());
269                                 results.push_back(ri);
270                         }
271                 }
272                 return; // no match
273         }
274
275         /**
276          * Because we're doing end-to-end exact, we don't care which end of
277          * 'qry' is the 5' end.
278          */
279         virtual void anchor64Find(uint32_t numToFind,
280                     uint32_t tidx,
281                         uint8_t *ref,
282                     const TDna5Str& qry,
283                     const TCharStr& quals,
284                     uint32_t begin,
285                     uint32_t end,
286                         TRangeVec& ranges,
287                         TU32Vec& results,
288                         TSetPairs* pairs,
289                         uint32_t aoff, // offset of anchor mate
290                         bool seedOnLeft) const
291         {
292                 assert_gt(numToFind, 0);
293                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
294                 ASSERT_ONLY(uint32_t duplicates = 0);
295                 ASSERT_ONLY(uint32_t r2i = 0);
296                 const uint32_t qlen = seqan::length(qry);
297                 assert_geq(end - begin, qlen); // caller should have checked this
298                 assert_gt(end, begin);
299                 assert_gt(qlen, 0);
300 #ifndef NDEBUG
301                 // Get all naive hits
302                 TRangeVec r2; TU32Vec re2;
303                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
304                           re2, pairs, aoff, seedOnLeft);
305 #endif
306                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
307                 // anchorOverhang = # read bases not included in the anchor
308                 const uint32_t anchorOverhang = qlen <= 32 ? 0 : qlen - 32;
309                 const uint32_t lim = end - qlen - begin;
310                 const uint32_t halfway = begin + (lim >> 1);
311                 uint64_t anchor = 0llu;
312                 uint64_t buffw = 0llu;
313                 // Set up a mask that we'll apply to the two bufs every round
314                 // to discard bits that were rotated out of the anchor area
315                 uint64_t clearMask = 0xffffffffffffffffllu;
316                 bool useMask = false;
317                 if(anchorBitPairs < 32) {
318                         clearMask >>= ((32-anchorBitPairs) << 1);
319                         useMask = true;
320                 }
321                 const int lhsShift = ((anchorBitPairs - 1) << 1);
322                 // Build the contents of the 'anchor' dword and the initial
323                 // contents of the 'buffw' dword.  If there are fewer than 32
324                 // anchorBitPairs, the content will be packed into the least
325                 // significant bits of the word.
326                 uint32_t skipLeftToRights = 0;
327                 uint32_t skipRightToLefts = 0;
328                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
329                         int c = (int)qry[i]; // next query character
330                         assert_leq(c, 4);
331                         if(c & 4) {
332                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
333                                 return; // can't match if query has Ns
334                         }
335                         int r = (int)ref[halfway - begin + i]; // next reference character
336                         if(r & 4) {
337                                 r = 0;
338                                 // The right-to-left direction absorbs the candidate
339                                 // alignment based at 'halfway'; so skipLeftToRights is
340                                 // i, not i+1
341                                 skipLeftToRights = max(skipLeftToRights, i);
342                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
343                         }
344                         assert_lt(r, 4);
345                         assert_lt(c, 4);
346                         anchor  = ((anchor  << 2llu) | c);
347                         buffw = ((buffw << 2llu) | r);
348                 }
349                 // Check whether read is disqualified by Ns outside of the anchor
350                 // region
351                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
352                         if((int)qry[i] == 4) {
353                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
354                                 return; // can't match if query has Ns
355                         }
356                 }
357                 uint64_t bufbw = buffw;
358                 // We're moving the right-hand edge of the anchor along until
359                 // it's 'anchorOverhang' chars from the end of the target region.
360                 // Note that we're not making a 3'/5' distinction here; if we
361                 // were, we might need to make the 'anchorOverhang' adjustment on
362                 // the left end of the range rather than the right.
363                 bool hi = false;
364                 uint32_t riHi  = halfway;
365                 uint32_t rirHi = halfway - begin;
366                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
367                 uint32_t riLo  = halfway + 1;
368                 uint32_t rirLo = halfway - begin + 1;
369                 for(uint32_t i = 1; i <= lim + 1; i++) {
370                         int r;       // new reference char
371                         assert_lt(skipLeftToRights, qlen);
372                         assert_leq(skipRightToLefts, qlen);
373                         if(hi) {
374                                 hi = false;
375                                 // Moving left-to-right
376                                 riHi++; rirHi++; rirHiAnchor++;
377                                 r = (int)ref[rirHiAnchor];
378                                 if(r & 4) {
379                                         r = 0;
380                                         skipLeftToRights = anchorBitPairs;
381                                 }
382                                 assert_lt(r, 4);
383                                 // Bring in new base pair at the least significant
384                                 // position
385                                 buffw = ((buffw << 2llu) | r);
386                                 if(useMask) buffw &= clearMask;
387                                 if(skipLeftToRights > 0) {
388                                         skipLeftToRights--;
389                                         continue;
390                                 }
391                                 if(buffw != anchor) {
392                                         continue;
393                                 }
394                         } else {
395                                 hi = true;
396                                 // Moving right-to-left
397                                 riLo--; rirLo--;
398                                 r = (int)ref[rirLo];
399                                 if(r & 4) {
400                                         r = 0;
401                                         skipRightToLefts = qlen;
402                                 }
403                                 assert_lt(r, 4);
404                                 if(i >= 2) {
405                                         bufbw >>= 2llu;
406                                         // Bring in new base pair at the most significant
407                                         // position
408                                         bufbw |= ((uint64_t)r << lhsShift);
409                                 }
410                                 if(skipRightToLefts > 0) {
411                                         skipRightToLefts--;
412                                         continue;
413                                 }
414                                 if(bufbw != anchor) {
415                                         continue;
416                                 }
417                         }
418                         // Seed hit!
419                         bool foundHit = true;
420                         uint32_t ri = hi ? riLo : riHi;
421                         uint32_t rir = hi ? rirLo : rirHi;
422                         if(anchorOverhang > 0) {
423                                 // Does the non-anchor part of the alignment (the
424                                 // "overhang") ruin it?
425                                 bool skipCandidate = false;
426                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
427                                         assert_lt(ri + anchorBitPairs + j, end);
428                                         int rc = (int)ref[rir + anchorBitPairs + j];
429                                         if(rc == 4) {
430                                                 // Oops, encountered an N in the reference in
431                                                 // the overhang portion of the candidate
432                                                 // alignment
433                                                 // (Note that we inverted hi earlier)
434                                                 if(hi) {
435                                                         // Right-to-left
436                                                         skipRightToLefts = anchorOverhang - j - 1;
437                                                 } else {
438                                                         // Left-to-right
439                                                         skipLeftToRights = anchorBitPairs + j;
440                                                 }
441                                                 skipCandidate = true;
442                                                 break; // Skip this candidate
443                                         }
444                                         if((int)qry[32 + j] != rc) {
445                                                 // Yes, overhang ruins it
446                                                 foundHit = false;
447                                                 break;
448                                         }
449                                 }
450                                 if(skipCandidate) continue;
451                         }
452                         if(foundHit) {
453                                 if(pairs != NULL) {
454                                         TU64Pair p;
455                                         if(ri < aoff) {
456                                                 // By convention, the upstream mate's
457                                                 // coordinates go in the 'first' field
458                                                 p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
459                                                 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
460                                         } else {
461                                                 p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
462                                                 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
463                                         }
464                                         if(pairs->find(p) != pairs->end()) {
465                                                 // We already found this hit!  Continue.
466                                                 ASSERT_ONLY(duplicates++);
467                                                 ASSERT_ONLY(r2i++);
468                                                 continue;
469                                         } else {
470                                                 // Record this hit
471                                                 pairs->insert(p);
472                                         }
473                                 }
474                                 assert_lt(r2i, r2.size());
475                                 assert_eq(re2[r2i], ri);
476                                 ranges.resize(ranges.size()+1);
477                                 Range& range = ranges.back();
478                                 range.stratum = 0;
479                                 range.numMms = 0;
480                                 assert_eq(0, range.mms.size());
481                                 assert_eq(0, range.refcs.size());
482                                 ASSERT_ONLY(r2i++);
483                                 results.push_back(ri);
484                                 if(--numToFind == 0) return;
485                         } else {
486                                 // Keep scanning
487                         }
488                 }
489                 assert_leq(duplicates, r2.size());
490                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
491                 return; // no hit
492         }
493 };
494
495 /**
496  * Defined in ref_aligner.cpp.  Maps an octet representing the XOR of
497  * two two-bit-per-base-encoded DNA sequences to the number of bases
498  * that mismatch between the two.
499  */
500 extern unsigned char u8toMms[];
501
502 /**
503  * Concrete RefAligner for finding nearby 1-mismatch hits given an
504  * anchor hit.
505  */
506 template<typename TStr>
507 class OneMMRefAligner : public RefAligner<TStr> {
508
509         typedef seqan::String<seqan::Dna5> TDna5Str;
510         typedef seqan::String<char> TCharStr;
511         typedef std::vector<Range> TRangeVec;
512         typedef std::vector<uint32_t> TU32Vec;
513         typedef std::pair<uint64_t, uint64_t> TU64Pair;
514         typedef std::set<TU64Pair> TSetPairs;
515
516 public:
517
518         OneMMRefAligner(bool color, bool verbose, bool quiet) :
519                 RefAligner<TStr>(color, verbose, quiet) { }
520
521         virtual ~OneMMRefAligner() { }
522
523 protected:
524         /**
525          * Because we're doing end-to-end exact, we don't care which end of
526          * 'qry' is the 5' end.
527          */
528         void naiveFind(uint32_t numToFind,
529                        uint32_t tidx,
530                        uint8_t* ref,
531                    const TDna5Str& qry,
532                    const TCharStr& quals,
533                    uint32_t begin,
534                    uint32_t end,
535                        TRangeVec& ranges,
536                        TU32Vec& results,
537                        TSetPairs* pairs,
538                        uint32_t aoff,
539                        bool seedOnLeft) const
540         {
541                 assert_gt(numToFind, 0);
542                 const uint32_t qlen = seqan::length(qry);
543                 assert_geq(end - begin, qlen); // caller should have checked this
544                 assert_gt(end, begin);
545                 assert_gt(qlen, 0);
546                 uint32_t qend = end - qlen;
547                 uint32_t lim = qend - begin;
548                 uint32_t halfway = begin + (lim >> 1);
549                 bool hi = false;
550                 for(uint32_t i = 1; i <= lim+1; i++) {
551                         uint32_t ri;  // leftmost position in candidate alignment
552                         uint32_t rir; // same, minus begin; for indexing into ref[]
553                         if(hi) {
554                                 ri = halfway + (i >> 1); rir = ri - begin;
555                                 assert_leq(ri, qend);
556                         } else {
557                                 ri = halfway - (i >> 1); rir = ri - begin;
558                                 assert_geq(ri, begin);
559                         }
560                         hi = !hi;
561                         // Do the naive comparison
562                         bool match = true;
563                         int refc = -1;
564                         uint32_t mmOff = 0xffffffff;
565                         int mms = 0;
566                         for(uint32_t j = 0; j < qlen; j++) {
567 #if 0
568                                 // Count Ns in the reference as mismatches
569                                 const int q = (int)qry[j];
570                                 const int r = (int)ref[rir + j];
571                                 assert_leq(q, 4);
572                                 assert_leq(r, 4);
573                                 if(q == 4 || r == 4 || q != r) {
574 #else
575                                 // Disallow alignments that involve an N in the
576                                 // reference
577                                 const int r = (int)ref[rir + j];
578                                 if(r & 4) {
579                                         // N in reference; bail
580                                         match = false;
581                                         break;
582                                 }
583                                 const int q = (int)qry[j];
584                                 assert_leq(q, 4);
585                                 assert_lt(r, 4);
586                                 if(q != r) {
587 #endif
588                                         // Mismatch!
589                                         if(++mms > 1) {
590                                                 // Too many; reject this alignment
591                                                 match = false;
592                                                 break;
593                                         } else {
594                                                 // First one; remember offset and ref char
595                                                 refc = "ACGTN"[r];
596                                                 mmOff = j;
597                                         }
598                                 }
599                         }
600                         if(match) {
601                                 assert_leq(mms, 1);
602                                 ranges.resize(ranges.size()+1);
603                                 Range& range = ranges.back();
604                                 range.stratum = mms;
605                                 range.numMms = mms;
606                                 assert_eq(0, range.mms.size());
607                                 assert_eq(0, range.refcs.size());
608                                 if(mms == 1) {
609                                         assert_lt(mmOff, qlen);
610                                         range.mms.push_back(mmOff);
611                                         range.refcs.push_back(refc);
612                                 }
613                                 results.push_back(ri);
614                         }
615                 }
616                 return;
617         }
618
619         /**
620          * Because we're doing end-to-end exact, we don't care which end of
621          * 'qry' is the 5' end.
622          */
623         virtual void anchor64Find(uint32_t numToFind,
624                         uint32_t tidx,
625                         uint8_t* ref,
626                     const TDna5Str& qry,
627                     const TCharStr& quals,
628                     uint32_t begin,
629                     uint32_t end,
630                         TRangeVec& ranges,
631                         TU32Vec& results,
632                         TSetPairs* pairs = NULL,
633                         uint32_t aoff = 0xffffffff,
634                         bool seedOnLeft = false) const
635         {
636                 assert_gt(numToFind, 0);
637                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
638                 ASSERT_ONLY(uint32_t duplicates = 0);
639                 ASSERT_ONLY(uint32_t r2i = 0);
640                 const uint32_t qlen = seqan::length(qry);
641                 assert_geq(end - begin, qlen); // caller should have checked this
642                 assert_gt(end, begin);
643                 assert_gt(qlen, 0);
644 #ifndef NDEBUG
645                 // Get results from the naive matcher for sanity-checking
646                 TRangeVec r2; TU32Vec re2;
647                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
648                           re2, pairs, aoff, seedOnLeft);
649 #endif
650                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
651                 const int lhsShift = ((anchorBitPairs - 1) << 1);
652                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
653                 // anchorOverhang = # read bases not included in the anchor
654                 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
655                 const uint32_t lim = end - qlen - begin;
656                 const uint32_t halfway = begin + (lim >> 1);
657                 uint64_t anchor = 0llu;
658                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
659                 // OR the 'diff' buffer with this so that we can always count
660                 // 'N's as mismatches
661                 uint64_t diffMask = 0llu;
662                 // Set up a mask that we'll apply to the two bufs every round
663                 // to discard bits that were rotated out of the anchor area
664                 uint64_t clearMask = 0xffffffffffffffffllu;
665                 bool useMask = false;
666                 if(anchorBitPairs < 32) {
667                         clearMask >>= ((32-anchorBitPairs) << 1);
668                         useMask = true;
669                 }
670                 int nsInAnchor = 0;
671                 int nPos = -1;
672                 uint32_t skipLeftToRights = 0;
673                 uint32_t skipRightToLefts = 0;
674                 // Construct the 'anchor' 64-bit buffer so that it holds all of
675                 // the first 'anchorBitPairs' bit pairs of the query.
676                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
677                         int c = (int)qry[i]; // next query character
678                         int r = (int)ref[halfway - begin + i]; // next reference character
679                         if(r & 4) {
680                                 r = 0;
681                                 // The right-to-left direction absorbs the candidate
682                                 // alignment based at 'halfway'; so skipLeftToRights is
683                                 // i, not i+1
684                                 skipLeftToRights = max(skipLeftToRights, i);
685                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
686                         }
687                         assert_leq(c, 4);
688                         assert_lt(r, 4);
689                         // Special case: query has an 'N'
690                         if(c == 4) {
691                                 if(++nsInAnchor > 1) {
692                                         // More than one 'N' in the anchor region; can't
693                                         // possibly have a 1-mismatch hit anywhere
694                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
695                                         return;   // can't match if query has Ns
696                                 }
697                                 nPos = (int)i;
698                                 // Make it look like an 'A' in the anchor
699                                 c = 0;
700                                 diffMask = (diffMask << 2llu) | 1llu;
701                         } else {
702                                 diffMask <<= 2llu;
703                         }
704                         anchor  = ((anchor  << 2llu) | c);
705                         buffw = ((buffw << 2llu) | r);
706                 }
707                 // Check whether read is disqualified by Ns outside of the anchor
708                 // region
709                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
710                         if((int)qry[i] == 4) {
711                                 if(++nsInAnchor > 1) {
712                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
713                                         return; // can't match if query has Ns
714                                 }
715                         }
716                 }
717                 uint64_t bufbw = buffw;
718                 // We're moving the right-hand edge of the anchor along until
719                 // it's 'anchorOverhang' chars from the end of the target region.
720                 // Note that we're not making a 3'/5' distinction here; if we
721                 // were, we might need to make the 'anchorOverhang' adjustment on
722                 // the left end of the range rather than the right.
723                 bool hi = false;
724                 uint32_t riHi  = halfway;
725                 uint32_t rirHi = halfway - begin;
726                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
727                 uint32_t riLo  = halfway + 1;
728                 uint32_t rirLo = halfway - begin + 1;
729                 for(uint32_t i = 1; i <= lim + 1; i++) {
730                         int r;       // new reference char
731                         uint64_t diff;
732                         assert_lt(skipLeftToRights, qlen);
733                         assert_leq(skipRightToLefts, qlen);
734                         if(hi) {
735                                 hi = false;
736                                 // Moving left-to-right
737                                 riHi++;
738                                 rirHi++;
739                                 rirHiAnchor++;
740                                 r = (int)ref[rirHiAnchor];
741                                 if(r & 4) {
742                                         r = 0;
743                                         skipLeftToRights = anchorBitPairs;
744                                 }
745                                 assert_lt(r, 4);
746                                 // Bring in new base pair at the least significant
747                                 // position
748                                 buffw = ((buffw << 2llu) | r);
749                                 if(useMask) buffw &= clearMask;
750                                 if(skipLeftToRights > 0) {
751                                         skipLeftToRights--;
752                                         continue;
753                                 }
754                                 diff = (buffw ^ anchor) | diffMask;
755                         } else {
756                                 hi = true;
757                                 // Moving right-to-left
758                                 riLo--;
759                                 rirLo--;
760                                 r = (int)ref[rirLo];
761                                 if(r & 4) {
762                                         r = 0;
763                                         skipRightToLefts = qlen;
764                                 }
765                                 assert_lt(r, 4);
766                                 if(i >= 2) {
767                                         bufbw >>= 2llu;
768                                         // Bring in new base pair at the most significant
769                                         // position
770                                         bufbw |= ((uint64_t)r << lhsShift);
771                                 }
772                                 if(skipRightToLefts > 0) {
773                                         skipRightToLefts--;
774                                         continue;
775                                 }
776                                 diff = (bufbw ^ anchor) | diffMask;
777                         }
778                         if((diff & 0xffffffff00000000llu) &&
779                            (diff & 0x00000000ffffffffllu)) continue;
780                         uint32_t ri  = hi ? riLo  : riHi;
781                         uint32_t rir = hi ? rirLo : rirHi;
782                         // Could use pop count
783                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
784                         // As a first cut, see if there are too many mismatches in
785                         // the first and last parts of the anchor
786                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
787                         if(diffs > 1) continue;
788                         diffs += u8toMms[(int)diff8[1]] +
789                                  u8toMms[(int)diff8[2]] +
790                                  u8toMms[(int)diff8[3]] +
791                                  u8toMms[(int)diff8[4]] +
792                                  u8toMms[(int)diff8[5]] +
793                                  u8toMms[(int)diff8[6]];
794                         uint32_t mmpos = 0xffffffff;
795                         int refc = -1;
796                         if(diffs > 1) {
797                                 // Too many differences
798                                 continue;
799                         } else if(diffs == 1 && nPos != -1) {
800                                 // There was one difference, but there was also one N,
801                                 // so we already know where the difference is
802                                 mmpos = nPos;
803                                 refc = "ACGT"[(int)ref[rir + nPos]];
804                         } else if(diffs == 1) {
805                                 // Figure out which position mismatched
806                                 mmpos = 31;
807                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos -= 16; }
808                                 assert_neq(0, diff);
809                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos -=  8; }
810                                 assert_neq(0, diff);
811                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos -=  4; }
812                                 assert_neq(0, diff);
813                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos -=  2; }
814                                 assert_neq(0, diff);
815                                 if((diff & 0x3llu) == 0)        { mmpos--; }
816                                 assert_neq(0, diff);
817                                 assert_geq(mmpos, 0);
818                                 assert_lt(mmpos, 32);
819                                 mmpos -= anchorCushion;
820                                 refc = "ACGT"[(int)ref[rir + mmpos]];
821                         }
822                         // Now extend the anchor into a longer alignment
823                         bool foundHit = true;
824                         if(anchorOverhang > 0) {
825                                 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
826                                 bool skipCandidate = false;
827                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
828                                         int rc = (int)ref[rir + 32 + j];
829                                         if(rc == 4) {
830                                                 // Oops, encountered an N in the reference in
831                                                 // the overhang portion of the candidate
832                                                 // alignment
833                                                 // (Note that we inverted hi earlier)
834                                                 if(hi) {
835                                                         // Right-to-left
836                                                         skipRightToLefts = anchorOverhang - j - 1;
837                                                 } else {
838                                                         // Left-to-right
839                                                         skipLeftToRights = anchorBitPairs + j;
840                                                 }
841                                                 skipCandidate = true; // Skip this candidate
842                                                 break;
843                                         }
844                                         if((int)qry[32 + j] != rc) {
845                                                 if(++diffs > 1) {
846                                                         foundHit = false;
847                                                         break;
848                                                 } else {
849                                                         mmpos = 32 + j;
850                                                         refc = "ACGT"[(int)ref[rir + 32 + j]];
851                                                 }
852                                         }
853                                 }
854                                 if(skipCandidate) continue;
855                         }
856                         if(!foundHit) continue;
857                         if(pairs != NULL) {
858                                 TU64Pair p;
859                                 if(ri < aoff) {
860                                         // By convention, the upstream mate's
861                                         // coordinates go in the 'first' field
862                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
863                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
864                                 } else {
865                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
866                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
867                                 }
868                                 if(pairs->find(p) != pairs->end()) {
869                                         // We already found this hit!  Continue.
870                                         ASSERT_ONLY(duplicates++);
871                                         ASSERT_ONLY(r2i++);
872                                         continue;
873                                 } else {
874                                         // Record this hit
875                                         pairs->insert(p);
876                                 }
877                         }
878                         assert_leq(diffs, 1);
879                         assert_lt(r2i, r2.size());
880                         assert_eq(re2[r2i], ri);
881                         ranges.resize(ranges.size()+1);
882                         Range& range = ranges.back();
883                         assert_eq(diffs, r2[r2i].numMms);
884                         range.stratum = diffs;
885                         range.numMms = diffs;
886                         assert_eq(0, range.mms.size());
887                         assert_eq(0, range.refcs.size());
888                         if(diffs == 1) {
889                                 assert_neq(mmpos, 0xffffffff);
890                                 assert_eq(mmpos, r2[r2i].mms[0]);
891                                 assert_neq(-1, refc);
892                                 assert_eq(refc, r2[r2i].refcs[0]);
893                                 range.mms.push_back(mmpos);
894                                 range.refcs.push_back(refc);
895                         }
896                         ASSERT_ONLY(r2i++);
897                         results.push_back(ri);
898                         if(--numToFind == 0) return;
899                 }
900                 assert_leq(duplicates, r2.size());
901                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
902                 return; // no hit
903         }
904 };
905
906 /**
907  * Concrete RefAligner for finding nearby 2-mismatch hits given an
908  * anchor hit.
909  */
910 template<typename TStr>
911 class TwoMMRefAligner : public RefAligner<TStr> {
912
913         typedef seqan::String<seqan::Dna5> TDna5Str;
914         typedef seqan::String<char> TCharStr;
915         typedef std::vector<Range> TRangeVec;
916         typedef std::vector<uint32_t> TU32Vec;
917         typedef std::pair<uint64_t, uint64_t> TU64Pair;
918         typedef std::set<TU64Pair> TSetPairs;
919
920 public:
921
922         TwoMMRefAligner(bool color, bool verbose, bool quiet) :
923                 RefAligner<TStr>(color, verbose, quiet) { }
924
925         virtual ~TwoMMRefAligner() { }
926
927 protected:
928         /**
929          * Because we're doing end-to-end exact, we don't care which end of
930          * 'qry' is the 5' end.
931          */
932         void naiveFind(uint32_t numToFind,
933                                    uint32_t tidx,
934                                    uint8_t* ref,
935                                    const TDna5Str& qry,
936                                    const TCharStr& quals,
937                                    uint32_t begin,
938                                    uint32_t end,
939                                    TRangeVec& ranges,
940                                    TU32Vec& results,
941                                    TSetPairs* pairs,
942                                    uint32_t aoff,
943                                    bool seedOnLeft) const
944         {
945                 assert_gt(numToFind, 0);
946                 const uint32_t qlen = seqan::length(qry);
947                 assert_geq(end - begin, qlen); // caller should have checked this
948                 assert_gt(end, begin);
949                 assert_gt(qlen, 0);
950                 uint32_t qend = end - qlen;
951                 uint32_t lim = qend - begin;
952                 uint32_t halfway = begin + (lim >> 1);
953                 bool hi = false;
954                 for(uint32_t i = 1; i <= lim+1; i++) {
955                         uint32_t ri;  // leftmost position in candidate alignment
956                         uint32_t rir; // same, minus begin; for indexing into ref[]
957                         if(hi) {
958                                 ri = halfway + (i >> 1); rir = ri - begin;
959                                 assert_leq(ri, qend);
960                         } else {
961                                 ri = halfway - (i >> 1); rir = ri - begin;
962                                 assert_geq(ri, begin);
963                         }
964                         hi = !hi;
965                         // Do the naive comparison
966                         bool match = true;
967                         int refc1 = -1;
968                         uint32_t mmOff1 = 0xffffffff;
969                         int refc2 = -1;
970                         uint32_t mmOff2 = 0xffffffff;
971                         int mms = 0;
972                         for(uint32_t j = 0; j < qlen; j++) {
973 #if 0
974                                 // Count Ns in the reference as mismatches
975                                 const int q = (int)qry[j];
976                                 const int r = (int)ref[rir + j];
977                                 assert_leq(q, 4);
978                                 assert_leq(r, 4);
979                                 if(q == 4 || r == 4 || q != r) {
980 #else
981                                 // Disallow alignments that involve an N in the
982                                 // reference
983                                 const int r = (int)ref[rir + j];
984                                 if(r & 4) {
985                                         // N in reference; bail
986                                         match = false;
987                                         break;
988                                 }
989                                 const int q = (int)qry[j];
990                                 assert_leq(q, 4);
991                                 assert_lt(r, 4);
992                                 if(q != r) {
993 #endif
994                                         // Mismatch!
995                                         if(++mms > 2) {
996                                                 // Too many; reject this alignment
997                                                 match = false;
998                                                 break;
999                                         } else if(mms == 2) {
1000                                                 // Second one; remember offset and ref char
1001                                                 refc2 = "ACGTN"[r];
1002                                                 mmOff2 = j;
1003                                         } else {
1004                                                 assert_eq(1, mms);
1005                                                 // First one; remember offset and ref char
1006                                                 refc1 = "ACGTN"[r];
1007                                                 mmOff1 = j;
1008                                         }
1009                                 }
1010                         }
1011                         if(match) {
1012                                 assert_leq(mms, 2);
1013                                 ranges.resize(ranges.size()+1);
1014                                 Range& range = ranges.back();
1015                                 range.stratum = mms;
1016                                 range.numMms = mms;
1017                                 assert_eq(0, range.mms.size());
1018                                 assert_eq(0, range.refcs.size());
1019                                 if(mms > 0) {
1020                                         assert_lt(mmOff1, qlen);
1021                                         assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1022                                         range.mms.push_back(mmOff1);
1023                                         range.refcs.push_back(refc1);
1024                                         if(mms > 1) {
1025                                                 assert_eq(2, mms);
1026                                                 assert_lt(mmOff2, qlen);
1027                                                 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1028                                                 range.mms.push_back(mmOff2);
1029                                                 range.refcs.push_back(refc2);
1030                                         }
1031                                 }
1032                                 results.push_back(ri);
1033                         }
1034                 }
1035                 return;
1036         }
1037
1038         /**
1039          * Because we're doing end-to-end exact, we don't care which end of
1040          * 'qry' is the 5' end.
1041          */
1042         virtual void anchor64Find(uint32_t numToFind,
1043                                         uint32_t tidx,
1044                                         uint8_t* ref,
1045                                         const TDna5Str& qry,
1046                                         const TCharStr& quals,
1047                                         uint32_t begin,
1048                                         uint32_t end,
1049                                         TRangeVec& ranges,
1050                                         TU32Vec& results,
1051                                         TSetPairs* pairs = NULL,
1052                                         uint32_t aoff = 0xffffffff,
1053                                         bool seedOnLeft = false) const
1054         {
1055                 assert_gt(numToFind, 0);
1056                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1057                 ASSERT_ONLY(uint32_t duplicates = 0);
1058                 ASSERT_ONLY(uint32_t r2i = 0);
1059                 const uint32_t qlen = seqan::length(qry);
1060                 assert_geq(end - begin, qlen); // caller should have checked this
1061                 assert_gt(end, begin);
1062                 assert_gt(qlen, 0);
1063 #ifndef NDEBUG
1064                 // Get results from the naive matcher for sanity-checking
1065                 TRangeVec r2; TU32Vec re2;
1066                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1067                                   re2, pairs, aoff, seedOnLeft);
1068 #endif
1069                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1070                 const int lhsShift = ((anchorBitPairs - 1) << 1);
1071                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
1072                 // anchorOverhang = # read bases not included in the anchor
1073                 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1074                 const uint32_t lim = end - qlen - begin;
1075                 const uint32_t halfway = begin + (lim >> 1);
1076                 uint64_t anchor = 0llu;
1077                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
1078                 // OR the 'diff' buffer with this so that we can always count
1079                 // 'N's as mismatches
1080                 uint64_t diffMask = 0llu;
1081                 // Set up a mask that we'll apply to the two bufs every round
1082                 // to discard bits that were rotated out of the anchor area
1083                 uint64_t clearMask = 0xffffffffffffffffllu;
1084                 bool useMask = false;
1085                 if(anchorBitPairs < 32) {
1086                         clearMask >>= ((32-anchorBitPairs) << 1);
1087                         useMask = true;
1088                 }
1089                 int nsInAnchor = 0;
1090                 uint32_t nPoss = 0;
1091                 int nPos1 = -1;
1092                 int nPos2 = -1;
1093                 uint32_t skipLeftToRights = 0;
1094                 uint32_t skipRightToLefts = 0;
1095                 // Construct the 'anchor' 64-bit buffer so that it holds all of
1096                 // the first 'anchorBitPairs' bit pairs of the query.
1097                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1098                         int c = (int)qry[i]; // next query character
1099                         int r = (int)ref[halfway - begin + i]; // next reference character
1100                         if(r & 4) {
1101                                 r = 0;
1102                                 // The right-to-left direction absorbs the candidate
1103                                 // alignment based at 'halfway'; so skipLeftToRights is
1104                                 // i, not i+1
1105                                 skipLeftToRights = max(skipLeftToRights, i);
1106                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1107                         }
1108                         assert_leq(c, 4);
1109                         assert_lt(r, 4);
1110                         // Special case: query has an 'N'
1111                         if(c == 4) {
1112                                 if(++nsInAnchor > 2) {
1113                                         // More than two 'N's in the anchor region; can't
1114                                         // possibly have a 2-mismatch hit anywhere
1115                                         return;   // can't match if query has Ns
1116                                 } else if(nsInAnchor == 2) {
1117                                         nPos2 = (int)i;
1118                                         nPoss++;
1119                                 } else {
1120                                         assert_eq(1, nsInAnchor);
1121                                         nPos1 = (int)i;
1122                                         nPoss++;
1123                                 }
1124                                 // Make it look like an 'A' in the anchor
1125                                 c = 0;
1126                                 diffMask = (diffMask << 2llu) | 1llu;
1127                         } else {
1128                                 diffMask <<= 2llu;
1129                         }
1130                         anchor  = ((anchor  << 2llu) | c);
1131                         buffw = ((buffw << 2llu) | r);
1132                 }
1133                 assert_leq(nPoss, 2);
1134                 // Check whether read is disqualified by Ns outside of the anchor
1135                 // region
1136                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1137                         if((int)qry[i] == 4) {
1138                                 if(++nsInAnchor > 2) {
1139                                         return; // can't match if query has Ns
1140                                 }
1141                         }
1142                 }
1143                 uint64_t bufbw = buffw;
1144                 // We're moving the right-hand edge of the anchor along until
1145                 // it's 'anchorOverhang' chars from the end of the target region.
1146                 // Note that we're not making a 3'/5' distinction here; if we
1147                 // were, we might need to make the 'anchorOverhang' adjustment on
1148                 // the left end of the range rather than the right.
1149                 bool hi = false;
1150                 uint32_t riHi  = halfway;
1151                 uint32_t rirHi = halfway - begin;
1152                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1153                 uint32_t riLo  = halfway + 1;
1154                 uint32_t rirLo = halfway - begin + 1;
1155                 uint32_t i;
1156                 for(i = 1; i <= lim + 1; i++) {
1157                         int r;       // new reference char
1158                         uint64_t diff;
1159                         assert_lt(skipLeftToRights, qlen);
1160                         assert_leq(skipRightToLefts, qlen);
1161                         if(hi) {
1162                                 hi = false;
1163                                 // Moving left-to-right
1164                                 riHi++;
1165                                 rirHi++;
1166                                 rirHiAnchor++;
1167                                 r = (int)ref[rirHiAnchor];
1168                                 if(r & 4) {
1169                                         r = 0;
1170                                         skipLeftToRights = anchorBitPairs;
1171                                 }
1172                                 assert_lt(r, 4);
1173                                 // Bring in new base pair at the least significant
1174                                 // position
1175                                 buffw = ((buffw << 2llu) | r);
1176                                 if(useMask) buffw &= clearMask;
1177                                 if(skipLeftToRights > 0) {
1178                                         skipLeftToRights--;
1179                                         continue;
1180                                 }
1181                                 diff = (buffw ^ anchor) | diffMask;
1182                         } else {
1183                                 hi = true;
1184                                 // Moving right-to-left
1185                                 riLo--;
1186                                 rirLo--;
1187                                 r = (int)ref[rirLo];
1188                                 if(r & 4) {
1189                                         r = 0;
1190                                         skipRightToLefts = qlen;
1191                                 }
1192                                 assert_lt(r, 4);
1193                                 if(i >= 2) {
1194                                         bufbw >>= 2llu;
1195                                         // Bring in new base pair at the most significant
1196                                         // position
1197                                         bufbw |= ((uint64_t)r << lhsShift);
1198                                 }
1199                                 if(skipRightToLefts > 0) {
1200                                         skipRightToLefts--;
1201                                         continue;
1202                                 }
1203                                 diff = (bufbw ^ anchor) | diffMask;
1204                         }
1205                         if((diff & 0xfffff00000000000llu) &&
1206                            (diff & 0x00000ffffff00000llu) &&
1207                            (diff & 0x00000000000fffffllu)) continue;
1208                         uint32_t ri  = hi ? riLo  : riHi;
1209                         uint32_t rir = hi ? rirLo : rirHi;
1210                         // Could use pop count
1211                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1212                         // As a first cut, see if there are too many mismatches in
1213                         // the first and last parts of the anchor
1214                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1215                         if(diffs > 2) continue;
1216                         diffs += u8toMms[(int)diff8[1]] +
1217                                          u8toMms[(int)diff8[2]] +
1218                                          u8toMms[(int)diff8[3]] +
1219                                          u8toMms[(int)diff8[4]] +
1220                                          u8toMms[(int)diff8[5]] +
1221                                          u8toMms[(int)diff8[6]];
1222                         uint32_t mmpos1 = 0xffffffff;
1223                         int refc1 = -1;
1224                         uint32_t mmpos2 = 0xffffffff;
1225                         int refc2 = -1;
1226                         if(diffs > 2) {
1227                                 // Too many differences
1228                                 continue;
1229                         } else if(nPoss > 1 && diffs == nPoss) {
1230                                 // There was one difference, but there was also one N,
1231                                 // so we already know where the difference is
1232                                 mmpos1 = nPos1;
1233                                 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1234                                 if(nPoss == 2) {
1235                                         mmpos2 = nPos2;
1236                                         refc2 = "ACGT"[(int)ref[rir + nPos2]];
1237                                 }
1238                         } else if(diffs > 0) {
1239                                 // Figure out which position mismatched
1240                                 uint64_t diff2 = diff;
1241                                 mmpos1 = 31;
1242                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1243                                 assert_neq(0, diff);
1244                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos1 -=  8; }
1245                                 assert_neq(0, diff);
1246                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos1 -=  4; }
1247                                 assert_neq(0, diff);
1248                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos1 -=  2; }
1249                                 assert_neq(0, diff);
1250                                 if((diff & 0x3llu) == 0)        { mmpos1--; }
1251                                 assert_neq(0, diff);
1252                                 assert_geq(mmpos1, 0);
1253                                 assert_lt(mmpos1, 32);
1254                                 mmpos1 -= anchorCushion;
1255                                 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1256                                 if(diffs > 1) {
1257                                         // Figure out the second mismatched position
1258                                         ASSERT_ONLY(uint64_t origDiff2 = diff2);
1259                                         diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1+anchorCushion) << 1));
1260                                         assert_neq(diff2, origDiff2);
1261                                         mmpos2 = 31;
1262                                         if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1263                                         assert_neq(0, diff2);
1264                                         if((diff2 & 0xffffllu) == 0)     { diff2 >>= 16llu; mmpos2 -=  8; }
1265                                         assert_neq(0, diff2);
1266                                         if((diff2 & 0xffllu) == 0)       { diff2 >>= 8llu;  mmpos2 -=  4; }
1267                                         assert_neq(0, diff2);
1268                                         if((diff2 & 0xfllu) == 0)        { diff2 >>= 4llu;  mmpos2 -=  2; }
1269                                         assert_neq(0, diff2);
1270                                         if((diff2 & 0x3llu) == 0)        { mmpos2--; }
1271                                         assert_neq(0, diff2);
1272                                         assert_geq(mmpos2, 0);
1273                                         assert_lt(mmpos2, 32);
1274                                         mmpos2 -= anchorCushion;
1275                                         assert_neq(mmpos1, mmpos2);
1276                                         refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1277                                         if(mmpos2 < mmpos1) {
1278                                                 uint32_t mmtmp = mmpos1;
1279                                                 mmpos1 = mmpos2;
1280                                                 mmpos2 = mmtmp;
1281                                                 int refctmp = refc1;
1282                                                 refc1 = refc2;
1283                                                 refc2 = refctmp;
1284                                         }
1285                                         assert_lt(mmpos1, mmpos2);
1286                                 }
1287                         }
1288                         // Now extend the anchor into a longer alignment
1289                         bool foundHit = true;
1290                         if(anchorOverhang > 0) {
1291                                 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1292                                 bool skipCandidate = false;
1293                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
1294                                         int rc = (int)ref[rir + 32 + j];
1295                                         if(rc == 4) {
1296                                                 // Oops, encountered an N in the reference in
1297                                                 // the overhang portion of the candidate
1298                                                 // alignment
1299                                                 // (Note that we inverted hi earlier)
1300                                                 if(hi) {
1301                                                         // Right-to-left
1302                                                         skipRightToLefts = anchorOverhang - j - 1;
1303                                                 } else {
1304                                                         // Left-to-right
1305                                                         skipLeftToRights = anchorBitPairs + j;
1306                                                 }
1307                                                 skipCandidate = true; // Skip this candidate
1308                                                 break;
1309                                         }
1310                                         if((int)qry[32 + j] != rc) {
1311                                                 if(++diffs > 2) {
1312                                                         foundHit = false;
1313                                                         break;
1314                                                 } else if(diffs == 2) {
1315                                                         mmpos2 = 32 + j;
1316                                                         refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1317                                                 } else {
1318                                                         assert_eq(1, diffs);
1319                                                         mmpos1 = 32 + j;
1320                                                         refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1321                                                 }
1322                                         }
1323                                 }
1324                                 if(skipCandidate) continue;
1325                         }
1326                         if(!foundHit) continue;
1327                         if(pairs != NULL) {
1328                                 TU64Pair p;
1329                                 if(ri < aoff) {
1330                                         // By convention, the upstream mate's
1331                                         // coordinates go in the 'first' field
1332                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
1333                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1334                                 } else {
1335                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1336                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1337                                 }
1338                                 if(pairs->find(p) != pairs->end()) {
1339                                         // We already found this hit!  Continue.
1340                                         ASSERT_ONLY(duplicates++);
1341                                         ASSERT_ONLY(r2i++);
1342                                         continue;
1343                                 } else {
1344                                         // Record this hit
1345                                         pairs->insert(p);
1346                                 }
1347                         }
1348                         assert_leq(diffs, 2);
1349                         assert_lt(r2i, r2.size());
1350                         assert_eq(re2[r2i], ri);
1351                         ranges.resize(ranges.size()+1);
1352                         Range& range = ranges.back();
1353                         assert_eq(diffs, r2[r2i].numMms);
1354                         range.stratum = diffs;
1355                         range.numMms = diffs;
1356                         assert_eq(0, range.mms.size());
1357                         assert_eq(0, range.refcs.size());
1358                         if(diffs > 0) {
1359                                 assert_neq(mmpos1, 0xffffffff);
1360                                 assert_eq(mmpos1, r2[r2i].mms[0]);
1361                                 assert_neq(-1, refc1);
1362                                 assert_eq(refc1, r2[r2i].refcs[0]);
1363                                 range.mms.push_back(mmpos1);
1364                                 range.refcs.push_back(refc1);
1365                                 if(diffs > 1) {
1366                                         assert_neq(mmpos2, 0xffffffff);
1367                                         assert_eq(mmpos2, r2[r2i].mms[1]);
1368                                         assert_neq(-1, refc2);
1369                                         assert_eq(refc2, r2[r2i].refcs[1]);
1370                                         range.mms.push_back(mmpos2);
1371                                         range.refcs.push_back(refc2);
1372                                 }
1373                         }
1374                         ASSERT_ONLY(r2i++);
1375                         results.push_back(ri);
1376                         if(--numToFind == 0) return;
1377                 }
1378                 assert_leq(duplicates, r2.size());
1379                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1380                 return; // no hit
1381         }
1382 };
1383
1384 /**
1385  * Concrete RefAligner for finding nearby 2-mismatch hits given an
1386  * anchor hit.
1387  */
1388 template<typename TStr>
1389 class ThreeMMRefAligner : public RefAligner<TStr> {
1390
1391         typedef seqan::String<seqan::Dna5> TDna5Str;
1392         typedef seqan::String<char> TCharStr;
1393         typedef std::vector<Range> TRangeVec;
1394         typedef std::vector<uint32_t> TU32Vec;
1395         typedef std::pair<uint64_t, uint64_t> TU64Pair;
1396         typedef std::set<TU64Pair> TSetPairs;
1397
1398 public:
1399
1400         ThreeMMRefAligner(bool color, bool verbose, bool quiet) :
1401                 RefAligner<TStr>(color, verbose, quiet) { }
1402
1403         virtual ~ThreeMMRefAligner() { }
1404
1405 protected:
1406         /**
1407          * Because we're doing end-to-end exact, we don't care which end of
1408          * 'qry' is the 5' end.
1409          */
1410         void naiveFind(uint32_t numToFind,
1411                                    uint32_t tidx,
1412                                    uint8_t* ref,
1413                                    const TDna5Str& qry,
1414                                    const TCharStr& quals,
1415                                    uint32_t begin,
1416                                    uint32_t end,
1417                                    TRangeVec& ranges,
1418                                    TU32Vec& results,
1419                                    TSetPairs* pairs,
1420                                    uint32_t aoff,
1421                                    bool seedOnLeft) const
1422         {
1423                 assert_gt(numToFind, 0);
1424                 const uint32_t qlen = seqan::length(qry);
1425                 assert_geq(end - begin, qlen); // caller should have checked this
1426                 assert_gt(end, begin);
1427                 assert_gt(qlen, 0);
1428                 uint32_t qend = end - qlen;
1429                 uint32_t lim = qend - begin;
1430                 uint32_t halfway = begin + (lim >> 1);
1431                 bool hi = false;
1432                 for(uint32_t i = 1; i <= lim+1; i++) {
1433                         uint32_t ri;  // leftmost position in candidate alignment
1434                         uint32_t rir; // same, minus begin; for indexing into ref[]
1435                         if(hi) {
1436                                 ri = halfway + (i >> 1); rir = ri - begin;
1437                                 assert_leq(ri, qend);
1438                         } else {
1439                                 ri = halfway - (i >> 1); rir = ri - begin;
1440                                 assert_geq(ri, begin);
1441                         }
1442                         hi = !hi;
1443                         // Do the naive comparison
1444                         bool match = true;
1445                         int refc1 = -1;
1446                         uint32_t mmOff1 = 0xffffffff;
1447                         int refc2 = -1;
1448                         uint32_t mmOff2 = 0xffffffff;
1449                         int refc3 = -1;
1450                         uint32_t mmOff3 = 0xffffffff;
1451                         int mms = 0;
1452                         for(uint32_t j = 0; j < qlen; j++) {
1453 #if 0
1454                                 // Count Ns in the reference as mismatches
1455                                 const int q = (int)qry[j];
1456                                 const int r = (int)ref[rir + j];
1457                                 assert_leq(q, 4);
1458                                 assert_leq(r, 4);
1459                                 if(q == 4 || r == 4 || q != r) {
1460 #else
1461                                 // Disallow alignments that involve an N in the
1462                                 // reference
1463                                 const int r = (int)ref[rir + j];
1464                                 if(r & 4) {
1465                                         // N in reference; bail
1466                                         match = false;
1467                                         break;
1468                                 }
1469                                 const int q = (int)qry[j];
1470                                 assert_leq(q, 4);
1471                                 assert_lt(r, 4);
1472                                 if(q != r) {
1473 #endif
1474                                         // Mismatch!
1475                                         if(++mms > 3) {
1476                                                 // Too many; reject this alignment
1477                                                 match = false;
1478                                                 break;
1479                                         } else if(mms == 3) {
1480                                                 // Second one; remember offset and ref char
1481                                                 refc3 = "ACGTN"[r];
1482                                                 mmOff3 = j;
1483                                         } else if(mms == 2) {
1484                                                 // Second one; remember offset and ref char
1485                                                 refc2 = "ACGTN"[r];
1486                                                 mmOff2 = j;
1487                                         } else {
1488                                                 assert_eq(1, mms);
1489                                                 // First one; remember offset and ref char
1490                                                 refc1 = "ACGTN"[r];
1491                                                 mmOff1 = j;
1492                                         }
1493                                 }
1494                         }
1495                         if(match) {
1496                                 assert_leq(mms, 3);
1497                                 ranges.resize(ranges.size()+1);
1498                                 Range& range = ranges.back();
1499                                 range.stratum = mms;
1500                                 range.numMms = mms;
1501                                 assert_eq(0, range.mms.size());
1502                                 assert_eq(0, range.refcs.size());
1503                                 if(mms > 0) {
1504                                         assert_lt(mmOff1, qlen);
1505                                         assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1506                                         range.mms.push_back(mmOff1);
1507                                         range.refcs.push_back(refc1);
1508                                         if(mms > 1) {
1509                                                 assert_lt(mmOff2, qlen);
1510                                                 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1511                                                 range.mms.push_back(mmOff2);
1512                                                 range.refcs.push_back(refc2);
1513                                                 if(mms > 2) {
1514                                                         assert_eq(3, mms);
1515                                                         assert_lt(mmOff3, qlen);
1516                                                         assert(refc3 == 'A' || refc3 == 'C' || refc3 == 'G' || refc3 == 'T');
1517                                                         range.mms.push_back(mmOff3);
1518                                                         range.refcs.push_back(refc3);
1519                                                 }
1520                                         }
1521                                 }
1522                                 results.push_back(ri);
1523                         }
1524                 }
1525                 return;
1526         }
1527
1528         /**
1529          * Because we're doing end-to-end exact, we don't care which end of
1530          * 'qry' is the 5' end.
1531          */
1532         virtual void anchor64Find(uint32_t numToFind,
1533                                         uint32_t tidx,
1534                                         uint8_t* ref,
1535                                         const TDna5Str& qry,
1536                                         const TCharStr& quals,
1537                                         uint32_t begin,
1538                                         uint32_t end,
1539                                         TRangeVec& ranges,
1540                                         TU32Vec& results,
1541                                         TSetPairs* pairs = NULL,
1542                                         uint32_t aoff = 0xffffffff,
1543                                         bool seedOnLeft = false) const
1544         {
1545                 assert_gt(numToFind, 0);
1546                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1547                 ASSERT_ONLY(uint32_t duplicates = 0);
1548                 ASSERT_ONLY(uint32_t r2i = 0);
1549                 const uint32_t qlen = seqan::length(qry);
1550                 assert_geq(end - begin, qlen); // caller should have checked this
1551                 assert_gt(end, begin);
1552                 assert_gt(qlen, 0);
1553 #ifndef NDEBUG
1554                 // Get results from the naive matcher for sanity-checking
1555                 TRangeVec r2; TU32Vec re2;
1556                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1557                                   re2, pairs, aoff, seedOnLeft);
1558 #endif
1559                 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1560                 const int lhsShift = ((anchorBitPairs - 1) << 1);
1561                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
1562                 // anchorOverhang = # read bases not included in the anchor
1563                 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1564                 const uint32_t lim = end - qlen - begin;
1565                 const uint32_t halfway = begin + (lim >> 1);
1566                 uint64_t anchor = 0llu;
1567                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
1568                 // OR the 'diff' buffer with this so that we can always count
1569                 // 'N's as mismatches
1570                 uint64_t diffMask = 0llu;
1571                 // Set up a mask that we'll apply to the two bufs every round
1572                 // to discard bits that were rotated out of the anchor area
1573                 uint64_t clearMask = 0xffffffffffffffffllu;
1574                 bool useMask = false;
1575                 if(anchorBitPairs < 32) {
1576                         clearMask >>= ((32-anchorBitPairs) << 1);
1577                         useMask = true;
1578                 }
1579                 int nsInAnchor = 0;
1580                 uint32_t nPoss = 0;
1581                 int nPos1 = -1;
1582                 int nPos2 = -1;
1583                 int nPos3 = -1;
1584                 uint32_t skipLeftToRights = 0;
1585                 uint32_t skipRightToLefts = 0;
1586                 // Construct the 'anchor' 64-bit buffer so that it holds all of
1587                 // the first 'anchorBitPairs' bit pairs of the query.
1588                 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1589                         int c = (int)qry[i]; // next query character
1590                         int r = (int)ref[halfway - begin + i]; // next reference character
1591                         if(r & 4) {
1592                                 r = 0;
1593                                 // The right-to-left direction absorbs the candidate
1594                                 // alignment based at 'halfway'; so skipLeftToRights is
1595                                 // i, not i+1
1596                                 skipLeftToRights = max(skipLeftToRights, i);
1597                                 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1598                         }
1599                         assert_leq(c, 4);
1600                         assert_lt(r, 4);
1601                         // Special case: query has an 'N'
1602                         if(c == 4) {
1603                                 if(++nsInAnchor > 3) {
1604                                         // More than two 'N's in the anchor region; can't
1605                                         // possibly have a 2-mismatch hit anywhere
1606                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
1607                                         return;   // can't match if query has Ns
1608                                 } else if(nsInAnchor == 3) {
1609                                         nPos3 = (int)i;
1610                                         nPoss++;
1611                                 } else if(nsInAnchor == 2) {
1612                                         nPos2 = (int)i;
1613                                         nPoss++;
1614                                 } else {
1615                                         assert_eq(1, nsInAnchor);
1616                                         nPos1 = (int)i;
1617                                         nPoss++;
1618                                 }
1619                                 // Make it look like an 'A' in the anchor
1620                                 c = 0;
1621                                 diffMask = (diffMask << 2llu) | 1llu;
1622                         } else {
1623                                 diffMask <<= 2llu;
1624                         }
1625                         anchor  = ((anchor  << 2llu) | c);
1626                         buffw = ((buffw << 2llu) | r);
1627                 }
1628                 assert_leq(nPoss, 3);
1629                 // Check whether read is disqualified by Ns outside of the anchor
1630                 // region
1631                 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1632                         if((int)qry[i] == 4) {
1633                                 if(++nsInAnchor > 3) {
1634                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
1635                                         return; // can't match if query has Ns
1636                                 }
1637                         }
1638                 }
1639                 uint64_t bufbw = buffw;
1640                 // We're moving the right-hand edge of the anchor along until
1641                 // it's 'anchorOverhang' chars from the end of the target region.
1642                 // Note that we're not making a 3'/5' distinction here; if we
1643                 // were, we might need to make the 'anchorOverhang' adjustment on
1644                 // the left end of the range rather than the right.
1645                 bool hi = false;
1646                 uint32_t riHi  = halfway;
1647                 uint32_t rirHi = halfway - begin;
1648                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1649                 uint32_t riLo  = halfway + 1;
1650                 uint32_t rirLo = halfway - begin + 1;
1651                 for(uint32_t i = 1; i <= lim + 1; i++) {
1652                         int r;       // new reference char
1653                         uint64_t diff;
1654                         assert_lt(skipLeftToRights, qlen);
1655                         assert_leq(skipRightToLefts, qlen);
1656                         if(hi) {
1657                                 hi = false;
1658                                 // Moving left-to-right
1659                                 riHi++;
1660                                 rirHi++;
1661                                 rirHiAnchor++;
1662                                 r = (int)ref[rirHiAnchor];
1663                                 if(r & 4) {
1664                                         r = 0;
1665                                         skipLeftToRights = anchorBitPairs;
1666                                 }
1667                                 assert_lt(r, 4);
1668                                 // Bring in new base pair at the least significant
1669                                 // position
1670                                 buffw = ((buffw << 2llu) | r);
1671                                 if(useMask) buffw &= clearMask;
1672                                 if(skipLeftToRights > 0) {
1673                                         skipLeftToRights--;
1674                                         continue;
1675                                 }
1676                                 diff = (buffw ^ anchor) | diffMask;
1677                         } else {
1678                                 hi = true;
1679                                 // Moving right-to-left
1680                                 riLo--;
1681                                 rirLo--;
1682                                 r = (int)ref[rirLo];
1683                                 if(r & 4) {
1684                                         r = 0;
1685                                         skipRightToLefts = qlen;
1686                                 }
1687                                 assert_lt(r, 4);
1688                                 if(i >= 2) {
1689                                         bufbw >>= 2llu;
1690                                         // Bring in new base pair at the most significant
1691                                         // position
1692                                         bufbw |= ((uint64_t)r << lhsShift);
1693                                 }
1694                                 if(skipRightToLefts > 0) {
1695                                         skipRightToLefts--;
1696                                         continue;
1697                                 }
1698                                 diff = (bufbw ^ anchor) | diffMask;
1699                         }
1700                         if((diff & 0xffff000000000000llu) &&
1701                            (diff & 0x0000ffff00000000llu) &&
1702                            (diff & 0x00000000ffff0000llu) &&
1703                            (diff & 0x000000000000ffffllu)) continue;
1704                         uint32_t ri  = hi ? riLo  : riHi;
1705                         uint32_t rir = hi ? rirLo : rirHi;
1706                         // Could use pop count
1707                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1708                         // As a first cut, see if there are too many mismatches in
1709                         // the first and last parts of the anchor
1710                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1711                         if(diffs > 3) continue;
1712                         diffs += u8toMms[(int)diff8[1]] +
1713                                          u8toMms[(int)diff8[2]] +
1714                                          u8toMms[(int)diff8[3]] +
1715                                          u8toMms[(int)diff8[4]] +
1716                                          u8toMms[(int)diff8[5]] +
1717                                          u8toMms[(int)diff8[6]];
1718                         uint32_t mmpos1 = 0xffffffff;
1719                         int refc1 = -1;
1720                         uint32_t mmpos2 = 0xffffffff;
1721                         int refc2 = -1;
1722                         uint32_t mmpos3 = 0xffffffff;
1723                         int refc3 = -1;
1724                         if(diffs > 3) {
1725                                 // Too many differences
1726                                 continue;
1727                         } else if(nPoss > 1 && diffs == nPoss) {
1728                                 // There was one difference, but there was also one N,
1729                                 // so we already know where the difference is
1730                                 mmpos1 = nPos1;
1731                                 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1732                                 if(nPoss > 1) {
1733                                         mmpos2 = nPos2;
1734                                         refc2 = "ACGT"[(int)ref[rir + nPos2]];
1735                                         if(nPoss > 2) {
1736                                                 mmpos3 = nPos3;
1737                                                 refc3 = "ACGT"[(int)ref[rir + nPos3]];
1738                                         }
1739                                 }
1740                         } else if(diffs > 0) {
1741                                 // Figure out which position mismatched
1742                                 uint64_t diff2 = diff;
1743                                 mmpos1 = 31;
1744                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1745                                 assert_neq(0, diff);
1746                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos1 -=  8; }
1747                                 assert_neq(0, diff);
1748                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos1 -=  4; }
1749                                 assert_neq(0, diff);
1750                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos1 -=  2; }
1751                                 assert_neq(0, diff);
1752                                 if((diff & 0x3llu) == 0)        { mmpos1--; }
1753                                 assert_neq(0, diff);
1754                                 assert_geq(mmpos1, 0);
1755                                 assert_lt(mmpos1, 32);
1756                                 mmpos1 -= anchorCushion;
1757                                 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1758                                 if(diffs > 1) {
1759                                         // Figure out the second mismatched position
1760                                         diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1 + anchorCushion) << 1));
1761                                         uint64_t diff3 = diff2;
1762                                         mmpos2 = 31;
1763                                         if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1764                                         assert_neq(0, diff2);
1765                                         if((diff2 & 0xffffllu) == 0)     { diff2 >>= 16llu; mmpos2 -=  8; }
1766                                         assert_neq(0, diff2);
1767                                         if((diff2 & 0xffllu) == 0)       { diff2 >>= 8llu;  mmpos2 -=  4; }
1768                                         assert_neq(0, diff2);
1769                                         if((diff2 & 0xfllu) == 0)        { diff2 >>= 4llu;  mmpos2 -=  2; }
1770                                         assert_neq(0, diff2);
1771                                         if((diff2 & 0x3llu) == 0)        { mmpos2--; }
1772                                         assert_neq(0, diff2);
1773                                         mmpos2 -= anchorCushion;
1774                                         assert_geq(mmpos2, 0);
1775                                         assert_lt(mmpos2, 32);
1776                                         assert_neq(mmpos1, mmpos2);
1777                                         refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1778                                         uint32_t mmpos2orig = mmpos2;
1779                                         if(mmpos2 < mmpos1) {
1780                                                 uint32_t mmtmp = mmpos1;
1781                                                 mmpos1 = mmpos2;
1782                                                 mmpos2 = mmtmp;
1783                                                 int refctmp = refc1;
1784                                                 refc1 = refc2;
1785                                                 refc2 = refctmp;
1786                                         }
1787                                         assert_lt(mmpos1, mmpos2);
1788                                         if(diffs > 2) {
1789                                                 // Figure out the second mismatched position
1790                                                 diff3 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos2orig + anchorCushion) << 1));
1791                                                 mmpos3 = 31;
1792                                                 if((diff3 & 0xffffffffllu) == 0) { diff3 >>= 32llu; mmpos3 -= 16; }
1793                                                 assert_neq(0, diff3);
1794                                                 if((diff3 & 0xffffllu) == 0)     { diff3 >>= 16llu; mmpos3 -=  8; }
1795                                                 assert_neq(0, diff3);
1796                                                 if((diff3 & 0xffllu) == 0)       { diff3 >>= 8llu;  mmpos3 -=  4; }
1797                                                 assert_neq(0, diff3);
1798                                                 if((diff3 & 0xfllu) == 0)        { diff3 >>= 4llu;  mmpos3 -=  2; }
1799                                                 assert_neq(0, diff3);
1800                                                 if((diff3 & 0x3llu) == 0)        { mmpos3--; }
1801                                                 assert_neq(0, diff3);
1802                                                 mmpos3 -= anchorCushion;
1803                                                 assert_geq(mmpos3, 0);
1804                                                 assert_lt(mmpos3, 32);
1805                                                 assert_neq(mmpos1, mmpos3);
1806                                                 assert_neq(mmpos2, mmpos3);
1807                                                 refc3 = "ACGT"[(int)ref[rir + mmpos3]];
1808                                                 if(mmpos3 < mmpos1) {
1809                                                         uint32_t mmtmp = mmpos1;
1810                                                         mmpos1 = mmpos3;
1811                                                         mmpos3 = mmpos2;
1812                                                         mmpos2 = mmtmp;
1813                                                         int refctmp = refc1;
1814                                                         refc1 = refc3;
1815                                                         refc3 = refc2;
1816                                                         refc2 = refctmp;
1817                                                 } else if(mmpos3 < mmpos2) {
1818                                                         uint32_t mmtmp = mmpos2;
1819                                                         mmpos2 = mmpos3;
1820                                                         mmpos3 = mmtmp;
1821                                                         int refctmp = refc2;
1822                                                         refc2 = refc3;
1823                                                         refc3 = refctmp;
1824                                                 }
1825                                                 assert_lt(mmpos1, mmpos2);
1826                                                 assert_lt(mmpos2, mmpos3);
1827                                         }
1828                                 }
1829                         }
1830                         // Now extend the anchor into a longer alignment
1831                         bool foundHit = true;
1832                         if(anchorOverhang > 0) {
1833                                 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1834                                 bool skipCandidate = false;
1835                                 for(uint32_t j = 0; j < anchorOverhang; j++) {
1836                                         int rc = (int)ref[rir + 32 + j];
1837                                         if(rc == 4) {
1838                                                 // Oops, encountered an N in the reference in
1839                                                 // the overhang portion of the candidate
1840                                                 // alignment
1841                                                 // (Note that we inverted hi earlier)
1842                                                 if(hi) {
1843                                                         // Right-to-left
1844                                                         skipRightToLefts = anchorOverhang - j - 1;
1845                                                 } else {
1846                                                         // Left-to-right
1847                                                         skipLeftToRights = anchorBitPairs + j;
1848                                                 }
1849                                                 skipCandidate = true; // Skip this candidate
1850                                                 break;
1851                                         }
1852                                         if((int)qry[32 + j] != rc) {
1853                                                 if(++diffs > 3) {
1854                                                         foundHit = false;
1855                                                         break;
1856                                                 } else if(diffs == 3) {
1857                                                         mmpos3 = 32 + j;
1858                                                         refc3 = "ACGT"[(int)ref[rir + 32 + j]];
1859                                                 } else if(diffs == 2) {
1860                                                         mmpos2 = 32 + j;
1861                                                         refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1862                                                 } else {
1863                                                         assert_eq(1, diffs);
1864                                                         mmpos1 = 32 + j;
1865                                                         refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1866                                                 }
1867                                         }
1868                                 }
1869                                 if(skipCandidate) continue;
1870                         }
1871                         if(!foundHit) continue;
1872                         if(pairs != NULL) {
1873                                 TU64Pair p;
1874                                 if(ri < aoff) {
1875                                         // By convention, the upstream mate's
1876                                         // coordinates go in the 'first' field
1877                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
1878                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1879                                 } else {
1880                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1881                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1882                                 }
1883                                 if(pairs->find(p) != pairs->end()) {
1884                                         // We already found this hit!  Continue.
1885                                         ASSERT_ONLY(duplicates++);
1886                                         ASSERT_ONLY(r2i++);
1887                                         continue;
1888                                 } else {
1889                                         // Record this hit
1890                                         pairs->insert(p);
1891                                 }
1892                         }
1893                         assert_leq(diffs, 3);
1894                         assert_lt(r2i, r2.size());
1895                         assert_eq(re2[r2i], ri);
1896                         ranges.resize(ranges.size()+1);
1897                         Range& range = ranges.back();
1898                         assert_eq(diffs, r2[r2i].numMms);
1899                         range.stratum = diffs;
1900                         range.numMms = diffs;
1901                         assert_eq(0, range.mms.size());
1902                         assert_eq(0, range.refcs.size());
1903                         if(diffs > 0) {
1904                                 assert_neq(mmpos1, 0xffffffff);
1905                                 assert_eq(mmpos1, r2[r2i].mms[0]);
1906                                 assert_neq(-1, refc1);
1907                                 assert_eq(refc1, r2[r2i].refcs[0]);
1908                                 range.mms.push_back(mmpos1);
1909                                 range.refcs.push_back(refc1);
1910                                 if(diffs > 1) {
1911                                         assert_neq(mmpos2, 0xffffffff);
1912                                         assert_eq(mmpos2, r2[r2i].mms[1]);
1913                                         assert_neq(-1, refc2);
1914                                         assert_eq(refc2, r2[r2i].refcs[1]);
1915                                         range.mms.push_back(mmpos2);
1916                                         range.refcs.push_back(refc2);
1917                                         if(diffs > 2) {
1918                                                 assert_neq(mmpos3, 0xffffffff);
1919                                                 assert_eq(mmpos3, r2[r2i].mms[2]);
1920                                                 assert_neq(-1, refc3);
1921                                                 assert_eq(refc3, r2[r2i].refcs[2]);
1922                                                 range.mms.push_back(mmpos3);
1923                                                 range.refcs.push_back(refc3);
1924                                         }
1925                                 }
1926                         }
1927                         ASSERT_ONLY(r2i++);
1928                         results.push_back(ri);
1929                         if(--numToFind == 0) return;
1930                 }
1931                 assert_leq(duplicates, r2.size());
1932                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1933                 return; // no hit
1934         }
1935 };
1936
1937 /**
1938  * Concrete RefAligner for finding nearby 1-mismatch hits given an
1939  * anchor hit.
1940  */
1941 template<typename TStr>
1942 class Seed0RefAligner : public RefAligner<TStr> {
1943
1944         typedef seqan::String<seqan::Dna5> TDna5Str;
1945         typedef seqan::String<char> TCharStr;
1946         typedef std::vector<Range> TRangeVec;
1947         typedef std::vector<uint32_t> TU32Vec;
1948         typedef std::pair<uint64_t, uint64_t> TU64Pair;
1949         typedef std::set<TU64Pair> TSetPairs;
1950
1951 public:
1952
1953         Seed0RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
1954                 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
1955
1956         virtual ~Seed0RefAligner() { }
1957
1958 protected:
1959         /**
1960          * This schematic shows the roles played by the begin, qbegin, end,
1961          * qend, halfway, slen, qlen, and lim variables:
1962          *
1963          * seedOnLeft == true:
1964          *
1965          * |<                   lim                   >|<     qlen       >|
1966          *  --------------------------------------------------------------
1967          * |                     | slen | qlen-slen |  | slen | qlen-slen |
1968          *  --------------------------------------------------------------
1969          * ^                     ^                     ^                  ^
1970          * begin & qbegin     halfway                qend               end
1971          *
1972          * seedOnLeft == false:
1973          *
1974          * |<     qlen       >|<                   lim                   >|
1975          *  --------------------------------------------------------------
1976          * | qlen-slen | slen |  | qlen-slen | slen |                     |
1977          *  --------------------------------------------------------------
1978          * ^                  ^                     ^                     ^
1979          * begin            qbegin             halfway           qend & end
1980          *
1981          * Note that, for seeds longer than 32 base-pairs, the seed is
1982          * further subdivided into a 32-bit anchor and a seed overhang of
1983          * length > 0.
1984          */
1985         void naiveFind(uint32_t numToFind,
1986                                    uint32_t tidx,
1987                                    uint8_t* ref,
1988                                    const TDna5Str& qry,
1989                                    const TCharStr& quals,
1990                                    uint32_t begin,
1991                                    uint32_t end,
1992                                    TRangeVec& ranges,
1993                                    TU32Vec& results,
1994                                    TSetPairs* pairs,
1995                                    uint32_t aoff,
1996                                    bool seedOnLeft) const
1997         {
1998                 assert_gt(numToFind, 0);
1999                 assert_gt(end, begin);
2000                 const uint32_t qlen = seqan::length(qry);
2001                 assert_gt(qlen, 0);
2002                 assert_geq(end - begin, qlen); // caller should have checked this
2003                 assert_gt(this->seedLen_, 0);
2004                 const uint32_t slen = min(qlen, this->seedLen_);
2005                 uint32_t qend = end;
2006                 uint32_t qbegin = begin;
2007                 // If the seed is on the left-hand side of the alignment, then
2008                 // leave a gap at the right-hand side of the interval;
2009                 // otherwise, do the opposite
2010                 if(seedOnLeft) {
2011                         // Leave gap on right-hand side of the interval
2012                         qend -= qlen;
2013                 } else {
2014                         // Leave gap on left-hand side of the interval
2015                         qbegin += qlen;
2016                 }
2017                 // lim = number of alignments to try
2018                 const uint32_t lim = qend - qbegin;
2019                 // halfway = position in the reference to start at (and then
2020                 // we work our way out to the right and to the left).
2021                 const uint32_t halfway = qbegin + (lim >> 1);
2022                 // Vectors for holding edit information
2023                 std::vector<uint32_t> nonSeedMms;
2024                 std::vector<uint8_t> nonSeedRefcs;
2025                 bool hi = false;
2026                 for(uint32_t i = 1; i <= lim+1; i++) {
2027                         uint32_t ri;  // leftmost position in candidate alignment
2028                         uint32_t rir; // same, minus begin; for indexing into ref[]
2029                         if(hi) {
2030                                 ri = halfway + (i >> 1); rir = ri - begin;
2031                                 assert_leq(ri, qend);
2032                         } else {
2033                                 ri = halfway - (i >> 1); rir = ri - begin;
2034                                 assert_geq(ri, begin);
2035                         }
2036                         hi = !hi;
2037                         // Do the naive comparison
2038                         bool match = true;
2039                         int mms = 0;
2040                         unsigned int ham = 0;
2041                         nonSeedMms.clear();
2042                         nonSeedRefcs.clear();
2043                         // Walk through each position of the alignment
2044                         for(uint32_t jj = 0; jj < qlen; jj++) {
2045                                 uint32_t j = jj;
2046                                 if(!seedOnLeft) {
2047                                         // If seed is on the right, scan right-to-left
2048                                         j = qlen - jj - 1;
2049                                 } else {
2050                                         // Go left-to-right
2051                                 }
2052                                 uint32_t rirj = rir + j;
2053                                 if(!seedOnLeft) {
2054                                         assert_geq(rir, jj);
2055                                         rirj = rir - jj - 1;
2056                                 }
2057 #if 0
2058                                 // Count Ns in the reference as mismatches
2059                                 const int q = (int)qry[j];
2060                                 const int r = (int)ref[rirj];
2061                                 assert_leq(q, 4);
2062                                 assert_leq(r, 4);
2063                                 if(q == 4 || r == 4 || q != r) {
2064 #else
2065                                 // Disallow alignments that involve an N in the
2066                                 // reference
2067                                 const int r = (int)ref[rirj];
2068                                 if(r & 4) {
2069                                         // N in reference; bail on this alignment
2070                                         match = false;
2071                                         break;
2072                                 }
2073                                 const int q = (int)qry[j];
2074                                 assert_leq(q, 4);
2075                                 assert_lt(r, 4);
2076                                 if(q != r) {
2077 #endif
2078                                         // Mismatch!
2079                                         if(jj < slen) {
2080                                                 // More than one mismatch in the anchor; reject
2081                                                 match = false;
2082                                                 break;
2083                                         }
2084                                         uint8_t qual = phredCharToPhredQual(quals[j]);
2085                                         ham += mmPenalty(this->maqPenalty_, qual);
2086                                         if(ham > this->qualMax_) {
2087                                                 // Exceeded quality ceiling; reject
2088                                                 match = false;
2089                                                 break;
2090                                         } else {
2091                                                 // Legal mismatch outside of the anchor; record it
2092                                                 mms++;
2093                                                 nonSeedMms.push_back(j);
2094                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
2095                                                 nonSeedRefcs.push_back("ACGTN"[r]);
2096                                         }
2097                                 }
2098                         }
2099                         if(match) {
2100                                 ranges.resize(ranges.size()+1);
2101                                 Range& range = ranges.back();
2102                                 range.stratum = 0;
2103                                 range.numMms = mms;
2104                                 assert_eq(0, range.mms.size());
2105                                 assert_eq(0, range.refcs.size());
2106                                 if(mms >= 1) {
2107                                         // Be careful to add edits in left-to-right order
2108                                         // with respect to the read/alignment
2109                                         const size_t nonSeedMmsSz = nonSeedMms.size();
2110                                         if(nonSeedMmsSz > 0) {
2111                                                 if(seedOnLeft) {
2112                                                         for(size_t k = 0; k < nonSeedMmsSz; k++) {
2113                                                                 range.mms.push_back(nonSeedMms[k]);
2114                                                                 range.refcs.push_back(nonSeedRefcs[k]);
2115                                                         }
2116                                                 } else {
2117                                                         for(size_t k = nonSeedMmsSz; k > 0; k--) {
2118                                                                 range.mms.push_back(nonSeedMms[k-1]);
2119                                                                 range.refcs.push_back(nonSeedRefcs[k-1]);
2120                                                         }
2121                                                 }
2122                                         }
2123                                 }
2124                                 assert_eq((size_t)mms, range.mms.size());
2125                                 assert_eq((size_t)mms, range.refcs.size());
2126                                 if(seedOnLeft) {
2127                                         results.push_back(ri);
2128                                 } else {
2129                                         results.push_back(ri - qlen);
2130                                 }
2131                         }
2132                 }
2133                 return;
2134         }
2135
2136         /**
2137          * This schematic shows the roles played by the begin, qbegin, end,
2138          * qend, halfway, slen, qlen, and lim variables:
2139          *
2140          * seedOnLeft == true:
2141          *
2142          * |<                   lim                   >|<     qlen       >|
2143          *  --------------------------------------------------------------
2144          * |                     | slen | qlen-slen |  | slen | qlen-slen |
2145          *  --------------------------------------------------------------
2146          * ^                     ^                     ^                  ^
2147          * begin & qbegin     halfway                qend               end
2148          *
2149          * seedOnLeft == false:
2150          *
2151          *             |<                   lim                   >|
2152          *  --------------------------------------------------------------
2153          * | qlen-slen |         | qlen-slen | slen |              | slen |
2154          *  --------------------------------------------------------------
2155          * ^           ^                     ^                     ^      ^
2156          * begin       qbegin             halfway                qend   end
2157          *
2158          * Note that, for seeds longer than 32 base-pairs, the seed is
2159          * further subdivided into a 32-bit anchor and a seed overhang of
2160          * length > 0.
2161          */
2162         virtual void anchor64Find(uint32_t numToFind,
2163                                         uint32_t tidx,
2164                                         uint8_t* ref,
2165                                         const TDna5Str& qry,
2166                                         const TCharStr& quals,
2167                                         uint32_t begin,
2168                                         uint32_t end,
2169                                         TRangeVec& ranges,
2170                                         TU32Vec& results,
2171                                         TSetPairs* pairs = NULL,
2172                                         uint32_t aoff = 0xffffffff,
2173                                         bool seedOnLeft = false) const
2174         {
2175                 assert_gt(numToFind, 0);
2176                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
2177                 ASSERT_ONLY(uint32_t duplicates = 0);
2178                 ASSERT_ONLY(uint32_t r2i = 0);
2179                 const uint32_t qlen = seqan::length(qry);
2180                 assert_gt(qlen, 0);
2181                 assert_gt(end, begin);
2182                 assert_geq(end - begin, qlen); // caller should have checked this
2183                 assert_gt(this->seedLen_, 0);
2184                 uint32_t slen = min(qlen, this->seedLen_);
2185 #ifndef NDEBUG
2186                 // Get results from the naive matcher for sanity-checking
2187                 TRangeVec r2; TU32Vec re2;
2188                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
2189                                   re2, pairs, aoff, seedOnLeft);
2190 #endif
2191                 const uint32_t anchorBitPairs = min<int>(slen, 32);
2192                 const int lhsShift = ((anchorBitPairs - 1) << 1);
2193                 ASSERT_ONLY(const uint32_t anchorCushion  = 32 - anchorBitPairs);
2194                 // seedAnchorOverhang = # seed bases not included in the anchor
2195                 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
2196                 // seedAnchorOverhang = # seed bases not included in the anchor
2197                 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
2198                 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
2199                 assert_eq(qlen, readSeedOverhang + slen);
2200                 uint32_t qend = end;
2201                 uint32_t qbegin = begin;
2202                 if(seedOnLeft) {
2203                         // Leave read-sized gap on right-hand side of the interval
2204                         qend -= qlen;
2205                 } else {
2206                         // Leave seed-sized gap on right-hand side and
2207                         // non-seed-sized gap on the left-hand side
2208                         qbegin += readSeedOverhang;
2209                         qend -= slen;
2210                 }
2211                 // lim = # possible alignments in the range
2212                 const uint32_t lim = qend - qbegin;
2213                 // halfway = point on the genome to radiate out from
2214                 const uint32_t halfway = qbegin + (lim >> 1);
2215                 uint64_t anchor = 0llu;
2216                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
2217                 // Set up a mask that we'll apply to the two bufs every round
2218                 // to discard bits that were rotated out of the anchor area
2219                 uint64_t clearMask = 0xffffffffffffffffllu;
2220                 bool useMask = false;
2221                 if(anchorBitPairs < 32) {
2222                         clearMask >>= ((32-anchorBitPairs) << 1);
2223                         useMask = true;
2224                 }
2225                 uint32_t skipLeftToRights = 0;
2226                 uint32_t skipRightToLefts = 0;
2227                 const uint32_t halfwayRi = halfway - begin;
2228                 // Construct the 'anchor' 64-bit buffer so that it holds all of
2229                 // the first 'anchorBitPairs' bit pairs of the query.
2230                 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
2231                         uint32_t i = ii;
2232                         if(!seedOnLeft) {
2233                                 // Fill in the anchor using characters from the right-
2234                                 // hand side of the query (but take the characters in
2235                                 // left-to-right order)
2236                                 i = qlen - slen + ii;
2237                         }
2238                         int c = (int)qry[i]; // next query character
2239                         int r = (int)ref[halfwayRi + ii]; // next reference character
2240                         if(r & 4) {
2241                                 // The reference character is an N; to mimic the
2242                                 // behavior of BW alignment, we have to skip all
2243                                 // alignments that involve an N in the reference.  Set
2244                                 // the skip* variables accordingly.
2245                                 r = 0;
2246                                 uint32_t lrSkips = ii;
2247                                 uint32_t rlSkips = qlen - ii;
2248                                 if(!seedOnLeft && readSeedOverhang) {
2249                                         lrSkips += readSeedOverhang;
2250                                         assert_geq(rlSkips, readSeedOverhang);
2251                                         rlSkips -= readSeedOverhang;
2252                                 }
2253                                 // The right-to-left direction absorbs the candidate
2254                                 // alignment based at 'halfway'
2255                                 skipLeftToRights = max(skipLeftToRights, lrSkips);
2256                                 skipRightToLefts = max(skipRightToLefts, rlSkips);
2257                                 assert_leq(skipLeftToRights, qlen);
2258                                 assert_leq(skipRightToLefts, qlen);
2259                         }
2260                         assert_leq(c, 4);
2261                         assert_lt(r, 4);
2262                         // Special case: query has an 'N'
2263                         if(c == 4) {
2264                                 // One or more 'N's in the anchor region; can't
2265                                 // possibly have a 0-mismatch hit anywhere
2266                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2267                                 return;   // can't match if query has Ns
2268                         }
2269                         anchor = ((anchor << 2llu) | c);
2270                         buffw = ((buffw << 2llu) | r);
2271                 }
2272                 // Check whether read is disqualified by Ns inside the seed
2273                 // region but outside the anchor region
2274                 if(seedAnchorOverhang) {
2275                         assert_lt(anchorBitPairs, slen);
2276                         for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
2277                                 uint32_t i = ii;
2278                                 if(!seedOnLeft) {
2279                                         i = qlen - slen + ii;
2280                                 }
2281                                 if((int)qry[i] == 4) {
2282                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
2283                                         return; // can't match if query has Ns
2284                                 }
2285                         }
2286                 } else {
2287                         assert_eq(anchorBitPairs, slen);
2288                 }
2289                 uint64_t bufbw = buffw;
2290                 // Slide the anchor out in either direction, alternating
2291                 // between right-to-left and left-to-right shifts, until all of
2292                 // the positions from qbegin to qend have been covered.
2293                 bool hi = false;
2294                 uint32_t riHi  = halfway;
2295                 uint32_t rirHi = halfway - begin;
2296                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
2297                 uint32_t riLo  = halfway + 1;
2298                 uint32_t rirLo = halfway - begin + 1;
2299                 uint32_t lrSkips = anchorBitPairs;
2300                 uint32_t rlSkips = qlen;
2301                 if(!seedOnLeft && readSeedOverhang) {
2302                         lrSkips += readSeedOverhang;
2303                         assert_geq(rlSkips, readSeedOverhang);
2304                         rlSkips -= readSeedOverhang;
2305                 }
2306                 for(uint32_t i = 1; i <= lim + 1; i++) {
2307                         int r;       // new reference char
2308                         uint64_t diff;
2309                         assert_leq(skipLeftToRights, qlen);
2310                         assert_leq(skipRightToLefts, qlen);
2311                         if(hi) {
2312                                 hi = false;
2313                                 // Moving left-to-right
2314                                 riHi++;
2315                                 rirHi++;
2316                                 rirHiAnchor++;
2317                                 r = (int)ref[rirHiAnchor];
2318                                 if(r & 4) {
2319                                         r = 0;
2320                                         skipLeftToRights = lrSkips;
2321                                 }
2322                                 assert_lt(r, 4);
2323                                 // Bring in new base pair at the least significant
2324                                 // position
2325                                 buffw = ((buffw << 2llu) | r);
2326                                 if(useMask) buffw &= clearMask;
2327                                 if(skipLeftToRights > 0) {
2328                                         skipLeftToRights--;
2329                                         continue;
2330                                 }
2331                                 diff = buffw ^ anchor;
2332                         } else {
2333                                 hi = true;
2334                                 // Moving right-to-left
2335                                 riLo--;
2336                                 rirLo--;
2337                                 r = (int)ref[rirLo];
2338                                 if(r & 4) {
2339                                         r = 0;
2340                                         skipRightToLefts = rlSkips;
2341                                 }
2342                                 assert_lt(r, 4);
2343                                 if(i >= 2) {
2344                                         bufbw >>= 2llu;
2345                                         // Bring in new base pair at the most significant
2346                                         // position
2347                                         bufbw |= ((uint64_t)r << lhsShift);
2348                                 }
2349                                 if(skipRightToLefts > 0) {
2350                                         skipRightToLefts--;
2351                                         continue;
2352                                 }
2353                                 diff = bufbw ^ anchor;
2354                         }
2355                         if(diff) continue;
2356                         uint32_t ri  = hi ? riLo  : riHi;
2357                         uint32_t rir = hi ? rirLo : rirHi;
2358                         unsigned int ham = 0;
2359                         // If the seed is longer than the anchor, then scan the
2360                         // rest of the seed characters
2361                         bool foundHit = true;
2362                         if(seedAnchorOverhang) {
2363                                 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
2364                                         int rc = (int)ref[rir + anchorBitPairs + j];
2365                                         if(rc == 4) {
2366                                                 // Oops, encountered an N in the reference in
2367                                                 // the overhang portion of the candidate
2368                                                 // alignment
2369                                                 // (Note that we inverted hi earlier)
2370                                                 if(hi) {
2371                                                         // Right-to-left
2372                                                         // Skip out of the seedAnchorOverhang
2373                                                         assert_eq(0, skipRightToLefts);
2374                                                         skipRightToLefts = seedAnchorOverhang - j - 1;
2375                                                         if(seedOnLeft) {
2376                                                                 // ...and skip out of the rest of the read
2377                                                                 skipRightToLefts += readSeedOverhang;
2378                                                         }
2379                                                 } else {
2380                                                         // Left-to-right
2381                                                         // Skip out of the seedAnchorOverhang
2382                                                         assert_eq(0, skipLeftToRights);
2383                                                         skipLeftToRights = anchorBitPairs + j;
2384                                                         if(!seedOnLeft) {
2385                                                                 // ...and skip out of the rest of the read
2386                                                                 skipLeftToRights += readSeedOverhang;
2387                                                         }
2388                                                 }
2389                                                 foundHit = false; // Skip this candidate
2390                                                 break;
2391                                         }
2392                                         uint32_t qoff = anchorBitPairs + j;
2393                                         if(!seedOnLeft) {
2394                                                 qoff += readSeedOverhang;
2395                                         }
2396                                         assert_lt(qoff, qlen);
2397                                         if((int)qry[qoff] != rc) {
2398                                                 foundHit = false;
2399                                                 break;
2400                                         }
2401                                 }
2402                                 if(!foundHit) continue;
2403                         }
2404                         // If the read is longer than the seed, then scan the rest
2405                         // of the read characters; mismatches no longer count
2406                         // toward the stratum or the 1-mm limit.
2407                         // Vectors for holding edit information
2408                         std::vector<uint32_t> nonSeedMms;
2409                         std::vector<uint8_t> nonSeedRefcs;
2410                         int mms = 0; // start counting total mismatches
2411                         if((qlen - slen) > 0) {
2412                                 // Going left-to-right
2413                                 for(uint32_t j = 0; j < readSeedOverhang; j++) {
2414                                         uint32_t roff = rir + slen + j;
2415                                         uint32_t qoff = slen + j;
2416                                         if(!seedOnLeft) {
2417                                                 assert_geq(roff, qlen);
2418                                                 roff -= qlen;
2419                                                 qoff = j;
2420                                         }
2421                                         int rc = (int)ref[roff];
2422                                         if(rc == 4) {
2423                                                 // Oops, encountered an N in the reference in
2424                                                 // the overhang portion of the candidate
2425                                                 // alignment
2426                                                 // (Note that we inverted hi earlier)
2427                                                 if(hi) {
2428                                                         // Right-to-left
2429                                                         // Skip what's left of the readSeedOverhang
2430                                                         skipRightToLefts = readSeedOverhang - j - 1;
2431                                                         if(!seedOnLeft) {
2432                                                                 // ...and skip the seed if it's on the right
2433                                                                 skipRightToLefts += slen;
2434                                                         }
2435                                                 } else {
2436                                                         // Left-to-right
2437                                                         // Skip what we've matched of the overhang
2438                                                         skipLeftToRights = j;
2439                                                         if(seedOnLeft) {
2440                                                                 // ...and skip the seed if it's on the left
2441                                                                 skipLeftToRights += slen;
2442                                                         }
2443                                                 }
2444                                                 foundHit = false; // Skip this candidate
2445                                                 break;
2446                                         }
2447                                         if((int)qry[qoff] != rc) {
2448                                                 // Calculate quality of mismatched base
2449                                                 char q = phredCharToPhredQual(quals[qoff]);
2450                                                 ham += mmPenalty(this->maqPenalty_, q);
2451                                                 if(ham > this->qualMax_) {
2452                                                         // Exceeded quality limit
2453                                                         foundHit = false;
2454                                                         break;
2455                                                 }
2456                                                 // Legal mismatch outside of the anchor; record it
2457                                                 mms++;
2458                                                 nonSeedMms.push_back(qoff);
2459                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
2460                                                 nonSeedRefcs.push_back("ACGTN"[rc]);
2461                                         }
2462                                 }
2463                                 if(!foundHit) continue;
2464                         }
2465                         assert(foundHit);
2466                         // Adjust ri if seed is on the right-hand side
2467                         if(!seedOnLeft) {
2468                                 ri -= readSeedOverhang;
2469                                 rir -= readSeedOverhang;
2470                         }
2471                         if(pairs != NULL) {
2472                                 TU64Pair p;
2473                                 if(ri < aoff) {
2474                                         // By convention, the upstream mate's
2475                                         // coordinates go in the 'first' field
2476                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
2477                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2478                                 } else {
2479                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
2480                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2481                                 }
2482                                 if(pairs->find(p) != pairs->end()) {
2483                                         // We already found this hit!  Continue.
2484                                         ASSERT_ONLY(duplicates++);
2485                                         ASSERT_ONLY(r2i++);
2486                                         continue;
2487                                 } else {
2488                                         // Record this hit
2489                                         pairs->insert(p);
2490                                 }
2491                         }
2492                         if(this->verbose_) {
2493                                 cout << "About to report seed0:" << endl;
2494                                 cout << "  ";
2495                                 for(size_t i = 0; i < qlen; i++) {
2496                                         cout << (char)qry[i];
2497                                 }
2498                                 cout << endl;
2499                                 cout << "  ";
2500                                 for(size_t i = 0; i < qlen; i++) {
2501                                         cout << "ACGT"[ref[rir+i]];
2502                                 }
2503                                 cout << endl;
2504                         }
2505                         assert_lt(r2i, r2.size());
2506                         assert_eq(re2[r2i], ri);
2507                         ranges.resize(ranges.size()+1);
2508                         Range& range = ranges.back();
2509                         assert_eq((size_t)mms, r2[r2i].numMms);
2510                         range.stratum = 0;
2511                         range.numMms = mms;
2512                         assert_eq(0, range.mms.size());
2513                         assert_eq(0, range.refcs.size());
2514                         if(mms > 0) {
2515                                 ASSERT_ONLY(size_t mmcur = 0);
2516                                 const size_t nonSeedMmsSz = nonSeedMms.size();
2517                                 for(size_t i = 0; i < nonSeedMmsSz; i++) {
2518                                         assert_neq(0xffffffff, nonSeedMms[i]);
2519                                         assert_lt(mmcur, (size_t)mms);
2520                                         assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
2521                                         range.mms.push_back(nonSeedMms[i]);
2522                                         assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
2523                                         ASSERT_ONLY(mmcur++);
2524                                         range.refcs.push_back(nonSeedRefcs[i]);
2525                                 }
2526                                 assert_eq(mmcur, r2[r2i].mms.size());
2527                         }
2528                         assert_eq((size_t)mms, range.mms.size());
2529                         assert_eq((size_t)mms, range.refcs.size());
2530                         ASSERT_ONLY(r2i++);
2531                         results.push_back(ri);
2532                         if(--numToFind == 0) return;
2533                 }
2534                 assert_leq(duplicates, r2.size());
2535                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
2536                 return; // no hit
2537         }
2538 };
2539
2540 /**
2541  * Concrete RefAligner for finding nearby 1-mismatch hits given an
2542  * anchor hit.
2543  */
2544 template<typename TStr>
2545 class Seed1RefAligner : public RefAligner<TStr> {
2546
2547         typedef seqan::String<seqan::Dna5> TDna5Str;
2548         typedef seqan::String<char> TCharStr;
2549         typedef std::vector<Range> TRangeVec;
2550         typedef std::vector<uint32_t> TU32Vec;
2551         typedef std::pair<uint64_t, uint64_t> TU64Pair;
2552         typedef std::set<TU64Pair> TSetPairs;
2553
2554 public:
2555
2556         Seed1RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
2557                 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
2558
2559         virtual ~Seed1RefAligner() { }
2560
2561 protected:
2562         /**
2563          * This schematic shows the roles played by the begin, qbegin, end,
2564          * qend, halfway, slen, qlen, and lim variables:
2565          *
2566          * seedOnLeft == true:
2567          *
2568          * |<                   lim                   >|<     qlen       >|
2569          *  --------------------------------------------------------------
2570          * |                     | slen | qlen-slen |  | slen | qlen-slen |
2571          *  --------------------------------------------------------------
2572          * ^                     ^                     ^                  ^
2573          * begin & qbegin     halfway                qend               end
2574          *
2575          * seedOnLeft == false:
2576          *
2577          * |<     qlen       >|<                   lim                   >|
2578          *  --------------------------------------------------------------
2579          * | qlen-slen | slen |  | qlen-slen | slen |                     |
2580          *  --------------------------------------------------------------
2581          * ^                  ^                     ^                     ^
2582          * begin            qbegin             halfway           qend & end
2583          *
2584          * Note that, for seeds longer than 32 base-pairs, the seed is
2585          * further subdivided into a 32-bit anchor and a seed overhang of
2586          * length > 0.
2587          */
2588         void naiveFind(uint32_t numToFind,
2589                                    uint32_t tidx,
2590                                    uint8_t* ref,
2591                                    const TDna5Str& qry,
2592                                    const TCharStr& quals,
2593                                    uint32_t begin,
2594                                    uint32_t end,
2595                                    TRangeVec& ranges,
2596                                    TU32Vec& results,
2597                                    TSetPairs* pairs,
2598                                    uint32_t aoff,
2599                                    bool seedOnLeft) const
2600         {
2601                 assert_gt(numToFind, 0);
2602                 assert_gt(end, begin);
2603                 const uint32_t qlen = seqan::length(qry);
2604                 assert_gt(qlen, 0);
2605                 assert_geq(end - begin, qlen); // caller should have checked this
2606                 assert_gt(this->seedLen_, 0);
2607                 const uint32_t slen = min(qlen, this->seedLen_);
2608                 uint32_t qend = end;
2609                 uint32_t qbegin = begin;
2610                 // If the seed is on the left-hand side of the alignment, then
2611                 // leave a gap at the right-hand side of the interval;
2612                 // otherwise, do the opposite
2613                 if(seedOnLeft) {
2614                         // Leave gap on right-hand side of the interval
2615                         qend -= qlen;
2616                 } else {
2617                         // Leave gap on left-hand side of the interval
2618                         qbegin += qlen;
2619                 }
2620                 // lim = number of alignments to try
2621                 const uint32_t lim = qend - qbegin;
2622                 // halfway = position in the reference to start at (and then
2623                 // we work our way out to the right and to the left).
2624                 const uint32_t halfway = qbegin + (lim >> 1);
2625                 // Vectors for holding edit information
2626                 std::vector<uint32_t> nonSeedMms;
2627                 assert_eq(0, nonSeedMms.size());
2628                 std::vector<uint8_t> nonSeedRefcs;
2629                 assert_eq(0, nonSeedRefcs.size());
2630                 bool hi = false;
2631                 for(uint32_t i = 1; i <= lim+1; i++) {
2632                         uint32_t ri;  // leftmost position in candidate alignment
2633                         uint32_t rir; // same, minus begin; for indexing into ref[]
2634                         if(hi) {
2635                                 ri = halfway + (i >> 1); rir = ri - begin;
2636                                 assert_leq(ri, qend);
2637                         } else {
2638                                 ri = halfway - (i >> 1); rir = ri - begin;
2639                                 assert_geq(ri, begin);
2640                         }
2641                         hi = !hi;
2642                         // Do the naive comparison
2643                         bool match = true;
2644                         int refc = -1;
2645                         uint32_t mmOff = 0xffffffff;
2646                         int mms = 0;
2647                         int seedMms = 0;
2648                         unsigned int ham = 0;
2649                         nonSeedMms.clear();
2650                         nonSeedRefcs.clear();
2651                         // Walk through each position of the alignment
2652                         for(uint32_t jj = 0; jj < qlen; jj++) {
2653                                 uint32_t j = jj;
2654                                 if(!seedOnLeft) {
2655                                         // If seed is on the right, scan right-to-left
2656                                         j = qlen - jj - 1;
2657                                 } else {
2658                                         // Go left-to-right
2659                                 }
2660                                 uint32_t rirj = rir + j;
2661                                 if(!seedOnLeft) {
2662                                         assert_geq(rir, jj);
2663                                         rirj = rir - jj - 1;
2664                                 }
2665 #if 0
2666                                 // Count Ns in the reference as mismatches
2667                                 const int q = (int)qry[j];
2668                                 const int r = (int)ref[rirj];
2669                                 assert_leq(q, 4);
2670                                 assert_leq(r, 4);
2671                                 if(q == 4 || r == 4 || q != r) {
2672 #else
2673                                 // Disallow alignments that involve an N in the
2674                                 // reference
2675                                 const int r = (int)ref[rirj];
2676                                 if(r & 4) {
2677                                         // N in reference; bail on this alignment
2678                                         match = false;
2679                                         break;
2680                                 }
2681                                 const int q = (int)qry[j];
2682                                 assert_leq(q, 4);
2683                                 assert_lt(r, 4);
2684                                 if(q != r) {
2685 #endif
2686                                         // Mismatch!
2687                                         mms++;
2688                                         if(mms > 1 && jj < slen) {
2689                                                 // More than one mismatch in the anchor; reject
2690                                                 match = false;
2691                                                 break;
2692                                         }
2693                                         uint8_t qual = phredCharToPhredQual(quals[j]);
2694                                         ham += mmPenalty(this->maqPenalty_, qual);
2695                                         if(ham > this->qualMax_) {
2696                                                 // Exceeded quality ceiling; reject
2697                                                 match = false;
2698                                                 break;
2699                                         } else if(jj < slen) {
2700                                                 // First mismatch in the anchor; remember offset
2701                                                 // and ref char
2702                                                 refc = "ACGTN"[r];
2703                                                 mmOff = j;
2704                                                 seedMms = 1;
2705                                         } else {
2706                                                 // Legal mismatch outside of the anchor; record it
2707                                                 nonSeedMms.push_back(j);
2708                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
2709                                                 nonSeedRefcs.push_back("ACGTN"[r]);
2710                                         }
2711                                 }
2712                         }
2713                         if(match) {
2714                                 ranges.resize(ranges.size()+1);
2715                                 Range& range = ranges.back();
2716                                 assert_leq(seedMms, mms);
2717                                 range.stratum = seedMms;
2718                                 range.numMms = mms;
2719                                 assert_eq(0, range.mms.size());
2720                                 assert_eq(0, range.refcs.size());
2721                                 if(mms >= 1) {
2722                                         // Be careful to add edits in left-to-right order
2723                                         // with respect to the read/alignment
2724                                         if(seedOnLeft && seedMms) {
2725                                                 assert_lt(mmOff, qlen);
2726                                                 range.mms.push_back(mmOff);
2727                                                 range.refcs.push_back(refc);
2728                                         }
2729                                         const size_t nonSeedMmsSz = nonSeedMms.size();
2730                                         if(nonSeedMmsSz > 0) {
2731                                                 if(seedOnLeft) {
2732                                                         for(size_t k = 0; k < nonSeedMmsSz; k++) {
2733                                                                 range.mms.push_back(nonSeedMms[k]);
2734                                                                 range.refcs.push_back(nonSeedRefcs[k]);
2735                                                         }
2736                                                 } else {
2737                                                         for(size_t k = nonSeedMmsSz; k > 0; k--) {
2738                                                                 range.mms.push_back(nonSeedMms[k-1]);
2739                                                                 range.refcs.push_back(nonSeedRefcs[k-1]);
2740                                                         }
2741                                                 }
2742                                         }
2743                                         if(!seedOnLeft && seedMms) {
2744                                                 assert_lt(mmOff, qlen);
2745                                                 range.mms.push_back(mmOff);
2746                                                 range.refcs.push_back(refc);
2747                                         }
2748                                 }
2749                                 assert_eq((size_t)mms, range.mms.size());
2750                                 assert_eq((size_t)mms, range.refcs.size());
2751                                 if(seedOnLeft) {
2752                                         results.push_back(ri);
2753                                 } else {
2754                                         results.push_back(ri - qlen);
2755                                 }
2756                         }
2757                 }
2758                 return;
2759         }
2760
2761         /**
2762          * This schematic shows the roles played by the begin, qbegin, end,
2763          * qend, halfway, slen, qlen, and lim variables:
2764          *
2765          * seedOnLeft == true:
2766          *
2767          * |<                   lim                   >|<     qlen       >|
2768          *  --------------------------------------------------------------
2769          * |                     | slen | qlen-slen |  | slen | qlen-slen |
2770          *  --------------------------------------------------------------
2771          * ^                     ^                     ^                  ^
2772          * begin & qbegin     halfway                qend               end
2773          *
2774          * seedOnLeft == false:
2775          *
2776          *             |<                   lim                   >|
2777          *  --------------------------------------------------------------
2778          * | qlen-slen |         | qlen-slen | slen |              | slen |
2779          *  --------------------------------------------------------------
2780          * ^           ^                     ^                     ^      ^
2781          * begin       qbegin             halfway                qend   end
2782          *
2783          * Note that, for seeds longer than 32 base-pairs, the seed is
2784          * further subdivided into a 32-bit anchor and a seed overhang of
2785          * length > 0.
2786          */
2787         virtual void anchor64Find(uint32_t numToFind,
2788                                         uint32_t tidx,
2789                                         uint8_t* ref,
2790                                         const TDna5Str& qry,
2791                                         const TCharStr& quals,
2792                                         uint32_t begin,
2793                                         uint32_t end,
2794                                         TRangeVec& ranges,
2795                                         TU32Vec& results,
2796                                         TSetPairs* pairs = NULL,
2797                                         uint32_t aoff = 0xffffffff,
2798                                         bool seedOnLeft = false) const
2799         {
2800                 assert_gt(numToFind, 0);
2801                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
2802                 ASSERT_ONLY(uint32_t duplicates = 0);
2803                 ASSERT_ONLY(uint32_t r2i = 0);
2804                 const uint32_t qlen = seqan::length(qry);
2805                 assert_gt(qlen, 0);
2806                 assert_gt(end, begin);
2807                 assert_geq(end - begin, qlen); // caller should have checked this
2808                 assert_gt(this->seedLen_, 0);
2809                 uint32_t slen = min(qlen, this->seedLen_);
2810 #ifndef NDEBUG
2811                 // Get results from the naive matcher for sanity-checking
2812                 TRangeVec r2; TU32Vec re2;
2813                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
2814                                   re2, pairs, aoff, seedOnLeft);
2815 #endif
2816                 const uint32_t anchorBitPairs = min<int>(slen, 32);
2817                 const int lhsShift = ((anchorBitPairs - 1) << 1);
2818                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
2819                 // seedAnchorOverhang = # seed bases not included in the anchor
2820                 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
2821                 // seedAnchorOverhang = # seed bases not included in the anchor
2822                 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
2823                 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
2824                 assert_eq(qlen, readSeedOverhang + slen);
2825                 uint32_t qend = end;
2826                 uint32_t qbegin = begin;
2827                 if(seedOnLeft) {
2828                         // Leave read-sized gap on right-hand side of the interval
2829                         qend -= qlen;
2830                 } else {
2831                         // Leave seed-sized gap on right-hand side and
2832                         // non-seed-sized gap on the left-hand side
2833                         qbegin += readSeedOverhang;
2834                         qend -= slen;
2835                 }
2836                 // lim = # possible alignments in the range
2837                 const uint32_t lim = qend - qbegin;
2838                 // halfway = point on the genome to radiate out from
2839                 const uint32_t halfway = qbegin + (lim >> 1);
2840                 uint64_t anchor = 0llu;
2841                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
2842                 // OR the 'diff' buffer with this so that we can always count
2843                 // 'N's as mismatches
2844                 uint64_t diffMask = 0llu;
2845                 // Set up a mask that we'll apply to the two bufs every round
2846                 // to discard bits that were rotated out of the anchor area
2847                 uint64_t clearMask = 0xffffffffffffffffllu;
2848                 bool useMask = false;
2849                 if(anchorBitPairs < 32) {
2850                         clearMask >>= ((32-anchorBitPairs) << 1);
2851                         useMask = true;
2852                 }
2853                 int nsInSeed = 0;
2854                 int nPos = -1;
2855                 uint32_t skipLeftToRights = 0;
2856                 uint32_t skipRightToLefts = 0;
2857                 const uint32_t halfwayRi = halfway - begin;
2858                 // Construct the 'anchor' 64-bit buffer so that it holds all of
2859                 // the first 'anchorBitPairs' bit pairs of the query.
2860                 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
2861                         uint32_t i = ii;
2862                         if(!seedOnLeft) {
2863                                 // Fill in the anchor using characters from the right-
2864                                 // hand side of the query (but take the characters in
2865                                 // left-to-right order)
2866                                 i = qlen - slen + ii;
2867                         }
2868                         int c = (int)qry[i]; // next query character
2869                         int r = (int)ref[halfwayRi + ii]; // next reference character
2870                         if(r & 4) {
2871                                 // The reference character is an N; to mimic the
2872                                 // behavior of BW alignment, we have to skip all
2873                                 // alignments that involve an N in the reference.  Set
2874                                 // the skip* variables accordingly.
2875                                 r = 0;
2876                                 uint32_t lrSkips = ii;
2877                                 uint32_t rlSkips = qlen - ii;
2878                                 if(!seedOnLeft && readSeedOverhang) {
2879                                         lrSkips += readSeedOverhang;
2880                                         assert_geq(rlSkips, readSeedOverhang);
2881                                         rlSkips -= readSeedOverhang;
2882                                 }
2883                                 // The right-to-left direction absorbs the candidate
2884                                 // alignment based at 'halfway'
2885                                 skipLeftToRights = max(skipLeftToRights, lrSkips);
2886                                 skipRightToLefts = max(skipRightToLefts, rlSkips);
2887                                 assert_leq(skipLeftToRights, qlen);
2888                                 assert_leq(skipRightToLefts, qlen);
2889                         }
2890                         assert_leq(c, 4);
2891                         assert_lt(r, 4);
2892                         // Special case: query has an 'N'
2893                         if(c == 4) {
2894                                 if(++nsInSeed > 1) {
2895                                         // More than one 'N' in the anchor region; can't
2896                                         // possibly have a 1-mismatch hit anywhere
2897                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
2898                                         return;   // can't match if query has Ns
2899                                 }
2900                                 nPos = (int)ii; // w/r/t LHS of anchor
2901                                 // Make it look like an 'A' in the anchor
2902                                 c = 0;
2903                                 diffMask = (diffMask << 2llu) | 1llu;
2904                         } else {
2905                                 diffMask <<= 2llu;
2906                         }
2907                         anchor = ((anchor << 2llu) | c);
2908                         buffw = ((buffw << 2llu) | r);
2909                 }
2910                 // Check whether read is disqualified by Ns inside the seed
2911                 // region but outside the anchor region
2912                 if(seedAnchorOverhang) {
2913                         assert_lt(anchorBitPairs, slen);
2914                         for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
2915                                 uint32_t i = ii;
2916                                 if(!seedOnLeft) {
2917                                         i = qlen - slen + ii;
2918                                 }
2919                                 if((int)qry[i] == 4) {
2920                                         if(++nsInSeed > 1) {
2921                                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2922                                                 return; // can't match if query has Ns
2923                                         }
2924                                 }
2925                         }
2926                 } else {
2927                         assert_eq(anchorBitPairs, slen);
2928                 }
2929                 uint64_t bufbw = buffw;
2930                 // Slide the anchor out in either direction, alternating
2931                 // between right-to-left and left-to-right shifts, until all of
2932                 // the positions from qbegin to qend have been covered.
2933                 bool hi = false;
2934                 uint32_t riHi  = halfway;
2935                 uint32_t rirHi = halfway - begin;
2936                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
2937                 uint32_t riLo  = halfway + 1;
2938                 uint32_t rirLo = halfway - begin + 1;
2939                 uint32_t lrSkips = anchorBitPairs;
2940                 uint32_t rlSkips = qlen;
2941                 if(!seedOnLeft && readSeedOverhang) {
2942                         lrSkips += readSeedOverhang;
2943                         assert_geq(rlSkips, readSeedOverhang);
2944                         rlSkips -= readSeedOverhang;
2945                 }
2946                 for(uint32_t i = 1; i <= lim + 1; i++) {
2947                         int r;       // new reference char
2948                         uint64_t diff;
2949                         assert_leq(skipLeftToRights, qlen);
2950                         assert_leq(skipRightToLefts, qlen);
2951                         if(hi) {
2952                                 hi = false;
2953                                 // Moving left-to-right
2954                                 riHi++;
2955                                 rirHi++;
2956                                 rirHiAnchor++;
2957                                 r = (int)ref[rirHiAnchor];
2958                                 if(r & 4) {
2959                                         r = 0;
2960                                         skipLeftToRights = lrSkips;
2961                                 }
2962                                 assert_lt(r, 4);
2963                                 // Bring in new base pair at the least significant
2964                                 // position
2965                                 buffw = ((buffw << 2llu) | r);
2966                                 if(useMask) buffw &= clearMask;
2967                                 if(skipLeftToRights > 0) {
2968                                         skipLeftToRights--;
2969                                         continue;
2970                                 }
2971                                 diff = (buffw ^ anchor) | diffMask;
2972                         } else {
2973                                 hi = true;
2974                                 // Moving right-to-left
2975                                 riLo--;
2976                                 rirLo--;
2977                                 r = (int)ref[rirLo];
2978                                 if(r & 4) {
2979                                         r = 0;
2980                                         skipRightToLefts = rlSkips;
2981                                 }
2982                                 assert_lt(r, 4);
2983                                 if(i >= 2) {
2984                                         bufbw >>= 2llu;
2985                                         // Bring in new base pair at the most significant
2986                                         // position
2987                                         bufbw |= ((uint64_t)r << lhsShift);
2988                                 }
2989                                 if(skipRightToLefts > 0) {
2990                                         skipRightToLefts--;
2991                                         continue;
2992                                 }
2993                                 diff = (bufbw ^ anchor) | diffMask;
2994                         }
2995                         if((diff & 0xffffffff00000000llu) &&
2996                            (diff & 0x00000000ffffffffllu)) continue;
2997                         uint32_t ri  = hi ? riLo  : riHi;
2998                         uint32_t rir = hi ? rirLo : rirHi;
2999                         // Could use pop count
3000                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
3001                         // As a first cut, see if there are too many mismatches in
3002                         // the first and last parts of the anchor
3003                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
3004                         if(diffs > 1) continue;
3005                         diffs += u8toMms[(int)diff8[1]] +
3006                                          u8toMms[(int)diff8[2]] +
3007                                          u8toMms[(int)diff8[3]] +
3008                                          u8toMms[(int)diff8[4]] +
3009                                          u8toMms[(int)diff8[5]] +
3010                                          u8toMms[(int)diff8[6]];
3011                         uint32_t mmpos = 0xffffffff;
3012                         int refc = -1;
3013                         unsigned int ham = 0;
3014                         if(diffs > 1) {
3015                                 // Too many differences in the seed; stop
3016                                 continue;
3017                         } else if(diffs == 1 && nPos != -1) {
3018                                 // There was one difference, but there was also one N,
3019                                 // so we already know where the difference is
3020                                 mmpos = nPos;
3021                                 assert_lt(mmpos, anchorBitPairs);
3022                                 refc = "ACGT"[(int)ref[rir + nPos]];
3023                                 if(!seedOnLeft) {
3024                                         mmpos += readSeedOverhang;
3025                                 }
3026                                 char q = quals[mmpos];
3027                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3028                                 if(ham > this->qualMax_) {
3029                                         // Exceeded quality limit
3030                                         continue;
3031                                 }
3032                         } else if(diffs == 1) {
3033                                 // Figure out which position mismatched
3034                                 mmpos = 31;
3035                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos -= 16; }
3036                                 assert_neq(0, diff);
3037                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos -=  8; }
3038                                 assert_neq(0, diff);
3039                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos -=  4; }
3040                                 assert_neq(0, diff);
3041                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos -=  2; }
3042                                 assert_neq(0, diff);
3043                                 if((diff & 0x3llu) == 0)        { mmpos--; }
3044                                 assert_neq(0, diff);
3045                                 assert_geq(mmpos, 0);
3046                                 assert_lt(mmpos, 32);
3047                                 mmpos -= anchorCushion;
3048                                 assert_lt(mmpos, anchorBitPairs);
3049                                 refc = "ACGT"[(int)ref[rir + mmpos]];
3050                                 if(!seedOnLeft) {
3051                                         mmpos += readSeedOverhang;
3052                                 }
3053                                 char q = quals[mmpos];
3054                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3055                                 if(ham > this->qualMax_) {
3056                                         // Exceeded quality limit
3057                                         continue;
3058                                 }
3059                         }
3060                         // If the seed is longer than the anchor, then scan the
3061                         // rest of the seed characters
3062                         bool foundHit = true;
3063                         if(seedAnchorOverhang) {
3064                                 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
3065                                         int rc = (int)ref[rir + anchorBitPairs + j];
3066                                         if(rc == 4) {
3067                                                 // Oops, encountered an N in the reference in
3068                                                 // the overhang portion of the candidate
3069                                                 // alignment
3070                                                 // (Note that we inverted hi earlier)
3071                                                 if(hi) {
3072                                                         // Right-to-left
3073                                                         // Skip out of the seedAnchorOverhang
3074                                                         assert_eq(0, skipRightToLefts);
3075                                                         skipRightToLefts = seedAnchorOverhang - j - 1;
3076                                                         if(seedOnLeft) {
3077                                                                 // ...and skip out of the rest of the read
3078                                                                 skipRightToLefts += readSeedOverhang;
3079                                                         }
3080                                                 } else {
3081                                                         // Left-to-right
3082                                                         // Skip out of the seedAnchorOverhang
3083                                                         assert_eq(0, skipLeftToRights);
3084                                                         skipLeftToRights = anchorBitPairs + j;
3085                                                         if(!seedOnLeft) {
3086                                                                 // ...and skip out of the rest of the read
3087                                                                 skipLeftToRights += readSeedOverhang;
3088                                                         }
3089                                                 }
3090                                                 foundHit = false; // Skip this candidate
3091                                                 break;
3092                                         }
3093                                         uint32_t qoff = anchorBitPairs + j;
3094                                         if(!seedOnLeft) {
3095                                                 qoff += readSeedOverhang;
3096                                         }
3097                                         assert_lt(qoff, qlen);
3098                                         if((int)qry[qoff] != rc) {
3099                                                 if(++diffs > 1) {
3100                                                         foundHit = false;
3101                                                         break;
3102                                                 } else {
3103                                                         assert_eq(0xffffffff, mmpos);
3104                                                         mmpos = qoff;
3105                                                         assert_eq(-1, refc);
3106                                                         refc = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
3107                                                         char q = phredCharToPhredQual(quals[qoff]);
3108                                                         ham += mmPenalty(this->maqPenalty_, q);
3109                                                         if(ham > this->qualMax_) {
3110                                                                 // Exceeded quality limit
3111                                                                 foundHit = false;
3112                                                                 break;
3113                                                         }
3114                                                 }
3115                                         }
3116                                 }
3117                                 if(!foundHit) continue;
3118                         }
3119                         // If the read is longer than the seed, then scan the rest
3120                         // of the read characters; mismatches no longer count
3121                         // toward the stratum or the 1-mm limit.
3122                         // Vectors for holding edit information
3123                         std::vector<uint32_t> nonSeedMms;
3124                         std::vector<uint8_t> nonSeedRefcs;
3125                         int mms = diffs; // start counting total mismatches
3126                         if((qlen - slen) > 0) {
3127                                 // Going left-to-right
3128                                 for(uint32_t j = 0; j < readSeedOverhang; j++) {
3129                                         uint32_t roff = rir + slen + j;
3130                                         uint32_t qoff = slen + j;
3131                                         if(!seedOnLeft) {
3132                                                 assert_geq(roff, qlen);
3133                                                 roff -= qlen;
3134                                                 qoff = j;
3135                                         }
3136                                         int rc = (int)ref[roff];
3137                                         if(rc == 4) {
3138                                                 // Oops, encountered an N in the reference in
3139                                                 // the overhang portion of the candidate
3140                                                 // alignment
3141                                                 // (Note that we inverted hi earlier)
3142                                                 if(hi) {
3143                                                         // Right-to-left
3144                                                         // Skip what's left of the readSeedOverhang
3145                                                         skipRightToLefts = readSeedOverhang - j - 1;
3146                                                         if(!seedOnLeft) {
3147                                                                 // ...and skip the seed if it's on the right
3148                                                                 skipRightToLefts += slen;
3149                                                         }
3150                                                 } else {
3151                                                         // Left-to-right
3152                                                         // Skip what we've matched of the overhang
3153                                                         skipLeftToRights = j;
3154                                                         if(seedOnLeft) {
3155                                                                 // ...and skip the seed if it's on the left
3156                                                                 skipLeftToRights += slen;
3157                                                         }
3158                                                 }
3159                                                 foundHit = false; // Skip this candidate
3160                                                 break;
3161                                         }
3162                                         if((int)qry[qoff] != rc) {
3163                                                 // Calculate quality of mismatched base
3164                                                 char q = phredCharToPhredQual(quals[qoff]);
3165                                                 ham += mmPenalty(this->maqPenalty_, q);
3166                                                 if(ham > this->qualMax_) {
3167                                                         // Exceeded quality limit
3168                                                         foundHit = false;
3169                                                         break;
3170                                                 }
3171                                                 // Legal mismatch outside of the anchor; record it
3172                                                 mms++;
3173                                                 nonSeedMms.push_back(qoff);
3174                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
3175                                                 nonSeedRefcs.push_back("ACGTN"[rc]);
3176                                         }
3177                                 }
3178                                 if(!foundHit) continue;
3179                         }
3180                         assert(foundHit);
3181                         // Adjust ri if seed is on the right-hand side
3182                         if(!seedOnLeft) {
3183                                 ri -= readSeedOverhang;
3184                                 rir -= readSeedOverhang;
3185                         }
3186                         if(pairs != NULL) {
3187                                 TU64Pair p;
3188                                 if(ri < aoff) {
3189                                         // By convention, the upstream mate's
3190                                         // coordinates go in the 'first' field
3191                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
3192                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
3193                                 } else {
3194                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
3195                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
3196                                 }
3197                                 if(pairs->find(p) != pairs->end()) {
3198                                         // We already found this hit!  Continue.
3199                                         ASSERT_ONLY(duplicates++);
3200                                         ASSERT_ONLY(r2i++);
3201                                         continue;
3202                                 } else {
3203                                         // Record this hit
3204                                         pairs->insert(p);
3205                                 }
3206                         }
3207                         if(this->verbose_) {
3208                                 cout << "About to report:" << endl;
3209                                 cout << "  ";
3210                                 for(size_t i = 0; i < qlen; i++) {
3211                                         cout << (char)qry[i];
3212                                 }
3213                                 cout << endl;
3214                                 cout << "  ";
3215                                 for(size_t i = 0; i < qlen; i++) {
3216                                         cout << "ACGT"[ref[rir+i]];
3217                                 }
3218                                 cout << endl;
3219                         }
3220                         assert_leq(diffs, 1);
3221                         assert_geq((size_t)mms, diffs);
3222                         assert_lt(r2i, r2.size());
3223                         assert_eq(re2[r2i], ri);
3224                         ranges.resize(ranges.size()+1);
3225                         Range& range = ranges.back();
3226                         assert_eq((size_t)mms, r2[r2i].numMms);
3227                         range.stratum = diffs;
3228                         range.numMms = mms;
3229                         assert_eq(0, range.mms.size());
3230                         assert_eq(0, range.refcs.size());
3231                         if(mms > 0) {
3232                                 ASSERT_ONLY(size_t mmcur = 0);
3233                                 if(seedOnLeft && diffs > 0) {
3234                                         assert_neq(mmpos, 0xffffffff);
3235                                         assert_lt(mmpos, qlen);
3236                                         assert_lt(mmcur, (size_t)mms);
3237                                         assert_eq(mmpos, r2[r2i].mms[mmcur]);
3238                                         assert_neq(-1, refc);
3239                                         assert_eq(refc, r2[r2i].refcs[mmcur]);
3240                                         ASSERT_ONLY(mmcur++);
3241                                         range.mms.push_back(mmpos);
3242                                         range.refcs.push_back(refc);
3243                                 }
3244                                 const size_t nonSeedMmsSz = nonSeedMms.size();
3245                                 for(size_t i = 0; i < nonSeedMmsSz; i++) {
3246                                         assert_neq(0xffffffff, nonSeedMms[i]);
3247                                         assert_lt(mmcur, (size_t)mms);
3248                                         assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
3249                                         range.mms.push_back(nonSeedMms[i]);
3250                                         assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
3251                                         ASSERT_ONLY(mmcur++);
3252                                         range.refcs.push_back(nonSeedRefcs[i]);
3253                                 }
3254                                 if(!seedOnLeft && diffs > 0) {
3255                                         assert_neq(mmpos, 0xffffffff);
3256                                         assert_lt(mmpos, qlen);
3257                                         assert_lt(mmcur, (size_t)mms);
3258                                         assert_eq(mmpos, r2[r2i].mms[mmcur]);
3259                                         assert_neq(-1, refc);
3260                                         assert_eq(refc, r2[r2i].refcs[mmcur]);
3261                                         ASSERT_ONLY(mmcur++);
3262                                         range.mms.push_back(mmpos);
3263                                         range.refcs.push_back(refc);
3264                                 }
3265                                 assert_eq(mmcur, r2[r2i].mms.size());
3266                         }
3267                         assert_eq((size_t)mms, range.mms.size());
3268                         assert_eq((size_t)mms, range.refcs.size());
3269                         ASSERT_ONLY(r2i++);
3270                         results.push_back(ri);
3271                         if(--numToFind == 0) return;
3272                 }
3273                 assert_leq(duplicates, r2.size());
3274                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
3275                 return; // no hit
3276         }
3277 };
3278
3279 /**
3280  * Concrete RefAligner for finding nearby 2-mismatch hits given an
3281  * anchor hit.
3282  */
3283 template<typename TStr>
3284 class Seed2RefAligner : public RefAligner<TStr> {
3285
3286         typedef seqan::String<seqan::Dna5> TDna5Str;
3287         typedef seqan::String<char> TCharStr;
3288         typedef std::vector<Range> TRangeVec;
3289         typedef std::vector<uint32_t> TU32Vec;
3290         typedef std::pair<uint64_t, uint64_t> TU64Pair;
3291         typedef std::set<TU64Pair> TSetPairs;
3292
3293 public:
3294
3295         Seed2RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
3296                 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
3297
3298         virtual ~Seed2RefAligner() { }
3299
3300 protected:
3301         /**
3302          * This schematic shows the roles played by the begin, qbegin, end,
3303          * qend, halfway, slen, qlen, and lim variables:
3304          *
3305          * seedOnLeft == true:
3306          *
3307          * |<                   lim                   >|<     qlen       >|
3308          *  --------------------------------------------------------------
3309          * |                     | slen | qlen-slen |  | slen | qlen-slen |
3310          *  --------------------------------------------------------------
3311          * ^                     ^                     ^                  ^
3312          * begin & qbegin     halfway                qend               end
3313          *
3314          * seedOnLeft == false:
3315          *
3316          * |<     qlen       >|<                   lim                   >|
3317          *  --------------------------------------------------------------
3318          * | qlen-slen | slen |  | qlen-slen | slen |                     |
3319          *  --------------------------------------------------------------
3320          * ^                  ^                     ^                     ^
3321          * begin            qbegin             halfway           qend & end
3322          *
3323          * Note that, for seeds longer than 32 base-pairs, the seed is
3324          * further subdivided into a 32-bit anchor and a seed overhang of
3325          * length > 0.
3326          */
3327         void naiveFind(uint32_t numToFind,
3328                                    uint32_t tidx,
3329                                    uint8_t* ref,
3330                                    const TDna5Str& qry,
3331                                    const TCharStr& quals,
3332                                    uint32_t begin,
3333                                    uint32_t end,
3334                                    TRangeVec& ranges,
3335                                    TU32Vec& results,
3336                                    TSetPairs* pairs,
3337                                    uint32_t aoff,
3338                                    bool seedOnLeft) const
3339         {
3340                 assert_gt(numToFind, 0);
3341                 assert_gt(end, begin);
3342                 const uint32_t qlen = seqan::length(qry);
3343                 assert_gt(qlen, 0);
3344                 assert_geq(end - begin, qlen); // caller should have checked this
3345                 assert_gt(this->seedLen_, 0);
3346                 const uint32_t slen = min(qlen, this->seedLen_);
3347                 uint32_t qend = end;
3348                 uint32_t qbegin = begin;
3349                 // If the seed is on the left-hand side of the alignment, then
3350                 // leave a gap at the right-hand side of the interval;
3351                 // otherwise, do the opposite
3352                 if(seedOnLeft) {
3353                         // Leave gap on right-hand side of the interval
3354                         qend -= qlen;
3355                 } else {
3356                         // Leave gap on left-hand side of the interval
3357                         qbegin += qlen;
3358                 }
3359                 // lim = number of alignments to try
3360                 const uint32_t lim = qend - qbegin;
3361                 // halfway = position in the reference to start at (and then
3362                 // we work our way out to the right and to the left).
3363                 const uint32_t halfway = qbegin + (lim >> 1);
3364                 // Vectors for holding edit information
3365                 std::vector<uint32_t> nonSeedMms;
3366                 std::vector<uint8_t> nonSeedRefcs;
3367                 bool hi = false;
3368                 for(uint32_t i = 1; i <= lim+1; i++) {
3369                         uint32_t ri;  // leftmost position in candidate alignment
3370                         uint32_t rir; // same, minus begin; for indexing into ref[]
3371                         if(hi) {
3372                                 ri = halfway + (i >> 1); rir = ri - begin;
3373                                 assert_leq(ri, qend);
3374                         } else {
3375                                 ri = halfway - (i >> 1); rir = ri - begin;
3376                                 assert_geq(ri, begin);
3377                         }
3378                         hi = !hi;
3379                         // Do the naive comparison
3380                         bool match = true;
3381                         int refc1 = -1;
3382                         uint32_t mmOff1 = 0xffffffff;
3383                         int refc2 = -1;
3384                         uint32_t mmOff2 = 0xffffffff;
3385                         int mms = 0;
3386                         int seedMms = 0;
3387                         unsigned int ham = 0;
3388                         nonSeedMms.clear();
3389                         nonSeedRefcs.clear();
3390                         // Walk through each position of the alignment
3391                         for(uint32_t jj = 0; jj < qlen; jj++) {
3392                                 uint32_t j = jj;
3393                                 if(!seedOnLeft) {
3394                                         // If seed is on the right, scan right-to-left
3395                                         j = qlen - jj - 1;
3396                                 } else {
3397                                         // Go left-to-right
3398                                 }
3399                                 uint32_t rirj = rir + j;
3400                                 if(!seedOnLeft) {
3401                                         assert_geq(rir, jj);
3402                                         rirj = rir - jj - 1;
3403                                 }
3404 #if 0
3405                                 // Count Ns in the reference as mismatches
3406                                 const int q = (int)qry[j];
3407                                 const int r = (int)ref[rirj];
3408                                 assert_leq(q, 4);
3409                                 assert_leq(r, 4);
3410                                 if(q == 4 || r == 4 || q != r) {
3411 #else
3412                                 // Disallow alignments that involve an N in the
3413                                 // reference
3414                                 const int r = (int)ref[rirj];
3415                                 if(r & 4) {
3416                                         // N in reference; bail on this alignment
3417                                         match = false;
3418                                         break;
3419                                 }
3420                                 const int q = (int)qry[j];
3421                                 assert_leq(q, 4);
3422                                 assert_lt(r, 4);
3423                                 if(q != r) {
3424 #endif
3425                                         // Mismatch!
3426                                         mms++;
3427                                         if(mms > 2 && jj < slen) {
3428                                                 // More than one mismatch in the anchor; reject
3429                                                 match = false;
3430                                                 break;
3431                                         }
3432                                         uint8_t qual = phredCharToPhredQual(quals[j]);
3433                                         ham += mmPenalty(this->maqPenalty_, qual);
3434                                         if(ham > this->qualMax_) {
3435                                                 // Exceeded quality ceiling; reject
3436                                                 match = false;
3437                                                 break;
3438                                         } else if(mms == 1 && jj < slen) {
3439                                                 // First mismatch in the anchor; remember offset
3440                                                 // and ref char
3441                                                 refc1 = "ACGTN"[r];
3442                                                 mmOff1 = j;
3443                                                 seedMms = 1;
3444                                         } else if(mms == 2 && jj < slen) {
3445                                                 // Second mismatch in the anchor; remember offset
3446                                                 // and ref char
3447                                                 refc2 = "ACGTN"[r];
3448                                                 mmOff2 = j;
3449                                                 seedMms = 2;
3450                                         } else {
3451                                                 // Legal mismatch outside of the anchor; record it
3452                                                 nonSeedMms.push_back(j);
3453                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
3454                                                 nonSeedRefcs.push_back("ACGTN"[r]);
3455                                         }
3456                                 }
3457                         }
3458                         if(match) {
3459                                 ranges.resize(ranges.size()+1);
3460                                 Range& range = ranges.back();
3461                                 assert_leq(seedMms, mms);
3462                                 range.stratum = seedMms;
3463                                 range.numMms = mms;
3464                                 assert_eq(0, range.mms.size());
3465                                 assert_eq(0, range.refcs.size());
3466                                 if(mms >= 1) {
3467                                         // Be careful to add edits in left-to-right order
3468                                         // with respect to the read/alignment
3469                                         if(seedOnLeft && seedMms) {
3470                                                 assert_lt(mmOff1, qlen);
3471                                                 range.mms.push_back(mmOff1);
3472                                                 range.refcs.push_back(refc1);
3473                                                 if(seedMms > 1) {
3474                                                         assert_lt(mmOff1, mmOff2);
3475                                                         assert_lt(mmOff2, qlen);
3476                                                         range.mms.push_back(mmOff2);
3477                                                         range.refcs.push_back(refc2);
3478                                                 }
3479                                         }
3480                                         const size_t nonSeedMmsSz = nonSeedMms.size();
3481                                         if(nonSeedMmsSz > 0) {
3482                                                 if(seedOnLeft) {
3483                                                         for(size_t k = 0; k < nonSeedMmsSz; k++) {
3484                                                                 range.mms.push_back(nonSeedMms[k]);
3485                                                                 range.refcs.push_back(nonSeedRefcs[k]);
3486                                                         }
3487                                                 } else {
3488                                                         for(size_t k = nonSeedMmsSz; k > 0; k--) {
3489                                                                 range.mms.push_back(nonSeedMms[k-1]);
3490                                                                 range.refcs.push_back(nonSeedRefcs[k-1]);
3491                                                         }
3492                                                 }
3493                                         }
3494                                         if(!seedOnLeft && seedMms) {
3495                                                 if(seedMms > 1) {
3496                                                         assert_lt(mmOff2, mmOff1);
3497                                                         assert_lt(mmOff2, qlen);
3498                                                         range.mms.push_back(mmOff2);
3499                                                         range.refcs.push_back(refc2);
3500                                                 }
3501                                                 assert_lt(mmOff1, qlen);
3502                                                 range.mms.push_back(mmOff1);
3503                                                 range.refcs.push_back(refc1);
3504                                         }
3505                                 }
3506                                 assert_eq((size_t)mms, range.mms.size());
3507                                 assert_eq((size_t)mms, range.refcs.size());
3508                                 if(seedOnLeft) {
3509                                         results.push_back(ri);
3510                                 } else {
3511                                         results.push_back(ri - qlen);
3512                                 }
3513                         }
3514                 }
3515                 return;
3516         }
3517
3518         /**
3519          * This schematic shows the roles played by the begin, qbegin, end,
3520          * qend, halfway, slen, qlen, and lim variables:
3521          *
3522          * seedOnLeft == true:
3523          *
3524          * |<                   lim                   >|<     qlen       >|
3525          *  --------------------------------------------------------------
3526          * |                     | slen | qlen-slen |  | slen | qlen-slen |
3527          *  --------------------------------------------------------------
3528          * ^                     ^                     ^                  ^
3529          * begin & qbegin     halfway                qend               end
3530          *
3531          * seedOnLeft == false:
3532          *
3533          *             |<                   lim                   >|
3534          *  --------------------------------------------------------------
3535          * | qlen-slen |         | qlen-slen | slen |              | slen |
3536          *  --------------------------------------------------------------
3537          * ^           ^                     ^                     ^      ^
3538          * begin       qbegin             halfway                qend   end
3539          *
3540          * Note that, for seeds longer than 32 base-pairs, the seed is
3541          * further subdivided into a 32-bit anchor and a seed overhang of
3542          * length > 0.
3543          */
3544         virtual void anchor64Find(uint32_t numToFind,
3545                                         uint32_t tidx,
3546                                         uint8_t* ref,
3547                                         const TDna5Str& qry,
3548                                         const TCharStr& quals,
3549                                         uint32_t begin,
3550                                         uint32_t end,
3551                                         TRangeVec& ranges,
3552                                         TU32Vec& results,
3553                                         TSetPairs* pairs = NULL,
3554                                         uint32_t aoff = 0xffffffff,
3555                                         bool seedOnLeft = false) const
3556         {
3557                 assert_gt(numToFind, 0);
3558                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
3559                 ASSERT_ONLY(uint32_t duplicates = 0);
3560                 ASSERT_ONLY(uint32_t r2i = 0);
3561                 const uint32_t qlen = seqan::length(qry);
3562                 assert_gt(qlen, 0);
3563                 assert_gt(end, begin);
3564                 assert_geq(end - begin, qlen); // caller should have checked this
3565                 assert_gt(this->seedLen_, 0);
3566                 uint32_t slen = min(qlen, this->seedLen_);
3567 #ifndef NDEBUG
3568                 // Get results from the naive matcher for sanity-checking
3569                 TRangeVec r2; TU32Vec re2;
3570                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
3571                                   re2, pairs, aoff, seedOnLeft);
3572 #endif
3573                 const uint32_t anchorBitPairs = min<int>(slen, 32);
3574                 const int lhsShift = ((anchorBitPairs - 1) << 1);
3575                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
3576                 // seedAnchorOverhang = # seed bases not included in the anchor
3577                 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
3578                 // seedAnchorOverhang = # seed bases not included in the anchor
3579                 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
3580                 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
3581                 assert_eq(qlen, readSeedOverhang + slen);
3582                 uint32_t qend = end;
3583                 uint32_t qbegin = begin;
3584                 if(seedOnLeft) {
3585                         // Leave read-sized gap on right-hand side of the interval
3586                         qend -= qlen;
3587                 } else {
3588                         // Leave seed-sized gap on right-hand side and
3589                         // non-seed-sized gap on the left-hand side
3590                         qbegin += readSeedOverhang;
3591                         qend -= slen;
3592                 }
3593                 // lim = # possible alignments in the range
3594                 const uint32_t lim = qend - qbegin;
3595                 // halfway = point on the genome to radiate out from
3596                 const uint32_t halfway = qbegin + (lim >> 1);
3597                 uint64_t anchor = 0llu;
3598                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
3599                 // OR the 'diff' buffer with this so that we can always count
3600                 // 'N's as mismatches
3601                 uint64_t diffMask = 0llu;
3602                 // Set up a mask that we'll apply to the two bufs every round
3603                 // to discard bits that were rotated out of the anchor area
3604                 uint64_t clearMask = 0xffffffffffffffffllu;
3605                 bool useMask = false;
3606                 if(anchorBitPairs < 32) {
3607                         clearMask >>= ((32-anchorBitPairs) << 1);
3608                         useMask = true;
3609                 }
3610                 int nsInSeed = 0;
3611                 uint32_t nPoss = 0;
3612                 int nPos1 = -1;
3613                 int nPos2 = -1;
3614                 uint32_t skipLeftToRights = 0;
3615                 uint32_t skipRightToLefts = 0;
3616                 const uint32_t halfwayRi = halfway - begin;
3617                 assert_leq(anchorBitPairs, slen);
3618                 // Construct the 'anchor' 64-bit buffer so that it holds all of
3619                 // the first 'anchorBitPairs' bit pairs of the query.
3620                 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
3621                         uint32_t i = ii;
3622                         if(!seedOnLeft) {
3623                                 // Fill in the anchor using characters from the seed
3624                                 // portion of the read, starting at the left.  Note
3625                                 // that we're subtracting by slen rather than
3626                                 // anchorBitPairs because we want the seed anchor
3627                                 // overhang to be on the right-hand side
3628                                 i = qlen - slen + ii;
3629                         }
3630                         int c = (int)qry[i]; // next query character
3631                         int r = (int)ref[halfwayRi + ii]; // next reference character
3632                         if(r & 4) {
3633                                 // The reference character is an N; to mimic the
3634                                 // behavior of BW alignment, we have to skip all
3635                                 // alignments that involve an N in the reference.  Set
3636                                 // the skip* variables accordingly.
3637                                 r = 0;
3638                                 uint32_t lrSkips = ii;
3639                                 uint32_t rlSkips = qlen - ii;
3640                                 if(!seedOnLeft && readSeedOverhang) {
3641                                         lrSkips += readSeedOverhang;
3642                                         assert_geq(rlSkips, readSeedOverhang);
3643                                         rlSkips -= readSeedOverhang;
3644                                 }
3645                                 // The right-to-left direction absorbs the candidate
3646                                 // alignment based at 'halfway'
3647                                 skipLeftToRights = max(skipLeftToRights, lrSkips);
3648                                 skipRightToLefts = max(skipRightToLefts, rlSkips);
3649                                 assert_leq(skipLeftToRights, qlen);
3650                                 assert_leq(skipRightToLefts, qlen);
3651                         }
3652                         assert_leq(c, 4);
3653                         assert_lt(r, 4);
3654                         // Special case: query has an 'N'
3655                         if(c == 4) {
3656                                 if(++nsInSeed > 2) {
3657                                         // More than one 'N' in the anchor region; can't
3658                                         // possibly have a 1-mismatch hit anywhere
3659                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
3660                                         return;   // can't match if query has Ns
3661                                 }
3662                                 if(nsInSeed == 1) {
3663                                         nPos1 = (int)ii; // w/r/t LHS of anchor
3664                                 } else {
3665                                         assert_eq(2, nsInSeed);
3666                                         nPos2 = (int)ii; // w/r/t LHS of anchor
3667                                         assert_gt(nPos2, nPos1);
3668                                 }
3669                                 // Make it look like an 'A' in the anchor
3670                                 c = 0;
3671                                 diffMask = (diffMask << 2llu) | 1llu;
3672                         } else {
3673                                 diffMask <<= 2llu;
3674                         }
3675                         anchor = ((anchor << 2llu) | c);
3676                         buffw = ((buffw << 2llu) | r);
3677                 }
3678                 // Check whether read is disqualified by Ns inside the seed
3679                 // region but outside the anchor region
3680                 if(seedAnchorOverhang) {
3681                         assert_lt(anchorBitPairs, slen);
3682                         for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
3683                                 uint32_t i = ii;
3684                                 if(!seedOnLeft) {
3685                                         i = qlen - slen + ii;
3686                                 }
3687                                 if((int)qry[i] == 4) {
3688                                         if(++nsInSeed > 2) {
3689                                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
3690                                                 return; // can't match if query has Ns
3691                                         }
3692                                 }
3693                         }
3694                 } else {
3695                         assert_eq(anchorBitPairs, slen);
3696                 }
3697                 uint64_t bufbw = buffw;
3698                 // Slide the anchor out in either direction, alternating
3699                 // between right-to-left and left-to-right shifts, until all of
3700                 // the positions from qbegin to qend have been covered.
3701                 bool hi = false;
3702                 uint32_t riHi  = halfway;
3703                 uint32_t rirHi = halfway - begin;
3704                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
3705                 uint32_t riLo  = halfway + 1;
3706                 uint32_t rirLo = halfway - begin + 1;
3707                 uint32_t lrSkips = anchorBitPairs;
3708                 uint32_t rlSkips = qlen;
3709                 if(!seedOnLeft && readSeedOverhang) {
3710                         lrSkips += readSeedOverhang;
3711                         assert_geq(rlSkips, readSeedOverhang);
3712                         rlSkips -= readSeedOverhang;
3713                 }
3714                 for(uint32_t i = 1; i <= lim + 1; i++) {
3715                         int r;       // new reference char
3716                         uint64_t diff;
3717                         assert_leq(skipLeftToRights, qlen);
3718                         assert_leq(skipRightToLefts, qlen);
3719                         if(hi) {
3720                                 hi = false;
3721                                 // Moving left-to-right
3722                                 riHi++;
3723                                 rirHi++;
3724                                 rirHiAnchor++;
3725                                 r = (int)ref[rirHiAnchor];
3726                                 if(r & 4) {
3727                                         r = 0;
3728                                         skipLeftToRights = lrSkips;
3729                                 }
3730                                 assert_lt(r, 4);
3731                                 // Bring in new base pair at the least significant
3732                                 // position
3733                                 buffw = ((buffw << 2llu) | r);
3734                                 if(useMask) buffw &= clearMask;
3735                                 if(skipLeftToRights > 0) {
3736                                         skipLeftToRights--;
3737                                         continue;
3738                                 }
3739                                 diff = (buffw ^ anchor) | diffMask;
3740                         } else {
3741                                 hi = true;
3742                                 // Moving right-to-left
3743                                 riLo--;
3744                                 rirLo--;
3745                                 r = (int)ref[rirLo];
3746                                 if(r & 4) {
3747                                         r = 0;
3748                                         skipRightToLefts = rlSkips;
3749                                 }
3750                                 assert_lt(r, 4);
3751                                 if(i >= 2) {
3752                                         bufbw >>= 2llu;
3753                                         // Bring in new base pair at the most significant
3754                                         // position
3755                                         bufbw |= ((uint64_t)r << lhsShift);
3756                                 }
3757                                 if(skipRightToLefts > 0) {
3758                                         skipRightToLefts--;
3759                                         continue;
3760                                 }
3761                                 diff = (bufbw ^ anchor) | diffMask;
3762                         }
3763                         if((diff & 0xf00f00f00f00f00fllu) &&
3764                            (diff & 0x0f00f00f00f00f00llu) &&
3765                            (diff & 0x00f00f00f00f00f0llu)) continue;
3766                         if((diff & 0xc30c30c30c30c30cllu) &&
3767                            (diff & 0x30c30c30c30c30c3llu) &&
3768                            (diff & 0x0c30c30c30c30c30llu)) continue;
3769                         uint32_t ri  = hi ? riLo  : riHi;
3770                         uint32_t rir = hi ? rirLo : rirHi;
3771                         // Could use pop count
3772                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
3773                         // As a first cut, see if there are too many mismatches in
3774                         // the first and last parts of the anchor
3775                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
3776                         if(diffs > 2) continue;
3777                         diffs += u8toMms[(int)diff8[1]] +
3778                                          u8toMms[(int)diff8[2]] +
3779                                          u8toMms[(int)diff8[3]] +
3780                                          u8toMms[(int)diff8[4]] +
3781                                          u8toMms[(int)diff8[5]] +
3782                                          u8toMms[(int)diff8[6]];
3783                         uint32_t mmpos1 = 0xffffffff;
3784                         int refc1 = -1;
3785                         uint32_t mmpos2 = 0xffffffff;
3786                         int refc2 = -1;
3787                         unsigned int ham = 0;
3788                         if(diffs > 2) {
3789                                 // Too many differences in the seed; stop
3790                                 continue;
3791                         } else if(nPoss > 1 && diffs == nPoss) {
3792                                 // There was one difference, but there was also one N,
3793                                 // so we already know where the difference is
3794                                 mmpos1 = nPos1;
3795                                 refc1 = "ACGT"[(int)ref[rir + nPos1]];
3796                                 if(!seedOnLeft) {
3797                                         mmpos1 += readSeedOverhang;
3798                                 }
3799                                 char q = quals[mmpos1];
3800                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3801                                 if(ham > this->qualMax_) {
3802                                         // Exceeded quality limit
3803                                         continue;
3804                                 }
3805                                 if(nPoss == 2) {
3806                                         mmpos2 = nPos2;
3807                                         refc2 = "ACGT"[(int)ref[rir + nPos2]];
3808                                         if(!seedOnLeft) {
3809                                                 mmpos2 += readSeedOverhang;
3810                                         }
3811                                         q = quals[mmpos2];
3812                                         ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3813                                         if(ham > this->qualMax_) {
3814                                                 // Exceeded quality limit
3815                                                 continue;
3816                                         }
3817                                 }
3818
3819                         } else if(diffs > 0) {
3820                                 // Figure out which position mismatched
3821                                 uint64_t diff2 = diff;
3822                                 mmpos1 = 31;
3823                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
3824                                 assert_neq(0, diff);
3825                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos1 -=  8; }
3826                                 assert_neq(0, diff);
3827                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos1 -=  4; }
3828                                 assert_neq(0, diff);
3829                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos1 -=  2; }
3830                                 assert_neq(0, diff);
3831                                 if((diff & 0x3llu) == 0)        { mmpos1--; }
3832                                 assert_neq(0, diff);
3833                                 assert_geq(mmpos1, 0);
3834                                 assert_lt(mmpos1, 32);
3835                                 uint32_t savedMmpos1 = mmpos1;
3836                                 mmpos1 -= anchorCushion;
3837                                 assert_lt(mmpos1, anchorBitPairs);
3838                                 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
3839                                 if(!seedOnLeft) {
3840                                         mmpos1 += readSeedOverhang;
3841                                 }
3842                                 char q = quals[mmpos1];
3843                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3844                                 if(ham > this->qualMax_) {
3845                                         // Exceeded quality limit
3846                                         continue;
3847                                 }
3848                                 if(diffs > 1) {
3849                                         // Figure out the second mismatched position
3850                                         ASSERT_ONLY(uint64_t origDiff2 = diff2);
3851                                         diff2 &= ~(0xc000000000000000llu >> (uint64_t)((savedMmpos1) << 1));
3852                                         assert_neq(diff2, origDiff2);
3853                                         mmpos2 = 31;
3854                                         if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
3855                                         assert_neq(0, diff2);
3856                                         if((diff2 & 0xffffllu) == 0)     { diff2 >>= 16llu; mmpos2 -=  8; }
3857                                         assert_neq(0, diff2);
3858                                         if((diff2 & 0xffllu) == 0)       { diff2 >>= 8llu;  mmpos2 -=  4; }
3859                                         assert_neq(0, diff2);
3860                                         if((diff2 & 0xfllu) == 0)        { diff2 >>= 4llu;  mmpos2 -=  2; }
3861                                         assert_neq(0, diff2);
3862                                         if((diff2 & 0x3llu) == 0)        { mmpos2--; }
3863                                         assert_neq(0, diff2);
3864                                         assert_geq(mmpos2, 0);
3865                                         assert_lt(mmpos2, 32);
3866                                         mmpos2 -= anchorCushion;
3867                                         assert_neq(mmpos1, mmpos2);
3868                                         refc2 = "ACGT"[(int)ref[rir + mmpos2]];
3869                                         if(!seedOnLeft) {
3870                                                 mmpos2 += readSeedOverhang;
3871                                         }
3872                                         q = quals[mmpos2];
3873                                         ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3874                                         if(ham > this->qualMax_) {
3875                                                 // Exceeded quality limit
3876                                                 continue;
3877                                         }
3878                                         if(mmpos2 < mmpos1) {
3879                                                 uint32_t mmtmp = mmpos1;
3880                                                 mmpos1 = mmpos2;
3881                                                 mmpos2 = mmtmp;
3882                                                 int refctmp = refc1;
3883                                                 refc1 = refc2;
3884                                                 refc2 = refctmp;
3885                                         }
3886                                         assert_lt(mmpos1, mmpos2);
3887                                 }
3888                         }
3889                         // If the seed is longer than the anchor, then scan the
3890                         // rest of the seed characters
3891                         bool foundHit = true;
3892                         if(seedAnchorOverhang) {
3893                                 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
3894                                         int rc = (int)ref[rir + anchorBitPairs + j];
3895                                         if(rc == 4) {
3896                                                 // Oops, encountered an N in the reference in
3897                                                 // the overhang portion of the candidate
3898                                                 // alignment
3899                                                 // (Note that we inverted hi earlier)
3900                                                 if(hi) {
3901                                                         // Right-to-left
3902                                                         // Skip out of the seedAnchorOverhang
3903                                                         assert_eq(0, skipRightToLefts);
3904                                                         skipRightToLefts = seedAnchorOverhang - j - 1;
3905                                                         if(seedOnLeft) {
3906                                                                 // ...and skip out of the rest of the read
3907                                                                 skipRightToLefts += readSeedOverhang;
3908                                                         }
3909                                                 } else {
3910                                                         // Left-to-right
3911                                                         // Skip out of the seedAnchorOverhang
3912                                                         assert_eq(0, skipLeftToRights);
3913                                                         skipLeftToRights = anchorBitPairs + j;
3914                                                         if(!seedOnLeft) {
3915                                                                 // ...and skip out of the rest of the read
3916                                                                 skipLeftToRights += readSeedOverhang;
3917                                                         }
3918                                                 }
3919                                                 foundHit = false; // Skip this candidate
3920                                                 break;
3921                                         }
3922                                         uint32_t qoff = anchorBitPairs + j;
3923                                         if(!seedOnLeft) {
3924                                                 qoff += readSeedOverhang;
3925                                         }
3926                                         assert_lt(qoff, qlen);
3927                                         if((int)qry[qoff] != rc) {
3928                                                 diffs++;
3929                                                 if(diffs > 2) {
3930                                                         foundHit = false;
3931                                                         break;
3932                                                 } else if(diffs == 2) {
3933                                                         assert_eq(0xffffffff, mmpos2);
3934                                                         mmpos2 = qoff;
3935                                                         assert_eq(-1, refc2);
3936                                                         refc2 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
3937                                                 } else {
3938                                                         assert_eq(1, diffs);
3939                                                         assert_eq(0xffffffff, mmpos1);
3940                                                         mmpos1 = qoff;
3941                                                         assert_eq(-1, refc1);
3942                                                         refc1 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
3943                                                 }
3944                                                 char q = phredCharToPhredQual(quals[qoff]);
3945                                                 ham += mmPenalty(this->maqPenalty_, q);
3946                                                 if(ham > this->qualMax_) {
3947                                                         // Exceeded quality limit
3948                                                         foundHit = false;
3949                                                         break;
3950                                                 }
3951                                         }
3952                                 }
3953                                 if(!foundHit) continue;
3954                         }
3955                         // If the read is longer than the seed, then scan the rest
3956                         // of the read characters; mismatches no longer count
3957                         // toward the stratum or the 1-mm limit.
3958                         // Vectors for holding edit information
3959                         std::vector<uint32_t> nonSeedMms;
3960                         std::vector<uint8_t> nonSeedRefcs;
3961                         int mms = diffs; // start counting total mismatches
3962                         if((qlen - slen) > 0) {
3963                                 // Going left-to-right
3964                                 for(uint32_t j = 0; j < readSeedOverhang; j++) {
3965                                         uint32_t roff = rir + slen + j;
3966                                         uint32_t qoff = slen + j;
3967                                         if(!seedOnLeft) {
3968                                                 assert_geq(roff, qlen);
3969                                                 roff -= qlen;
3970                                                 qoff = j;
3971                                         }
3972                                         int rc = (int)ref[roff];
3973                                         if(rc == 4) {
3974                                                 // Oops, encountered an N in the reference in
3975                                                 // the overhang portion of the candidate
3976                                                 // alignment
3977                                                 // (Note that we inverted hi earlier)
3978                                                 if(hi) {
3979                                                         // Right-to-left
3980                                                         // Skip what's left of the readSeedOverhang
3981                                                         skipRightToLefts = readSeedOverhang - j - 1;
3982                                                         if(!seedOnLeft) {
3983                                                                 // ...and skip the seed if it's on the right
3984                                                                 skipRightToLefts += slen;
3985                                                         }
3986                                                 } else {
3987                                                         // Left-to-right
3988                                                         // Skip what we've matched of the overhang
3989                                                         skipLeftToRights = j;
3990                                                         if(seedOnLeft) {
3991                                                                 // ...and skip the seed if it's on the left
3992                                                                 skipLeftToRights += slen;
3993                                                         }
3994                                                 }
3995                                                 foundHit = false; // Skip this candidate
3996                                                 break;
3997                                         }
3998                                         if((int)qry[qoff] != rc) {
3999                                                 // Calculate quality of mismatched base
4000                                                 char q = phredCharToPhredQual(quals[qoff]);
4001                                                 ham += mmPenalty(this->maqPenalty_, q);
4002                                                 if(ham > this->qualMax_) {
4003                                                         // Exceeded quality limit
4004                                                         foundHit = false;
4005                                                         break;
4006                                                 }
4007                                                 // Legal mismatch outside of the anchor; record it
4008                                                 mms++;
4009                                                 nonSeedMms.push_back(qoff);
4010                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
4011                                                 nonSeedRefcs.push_back("ACGTN"[rc]);
4012                                         }
4013                                 }
4014                                 if(!foundHit) continue;
4015                         }
4016                         assert(foundHit);
4017                         // Adjust ri if seed is on the right-hand side
4018                         if(!seedOnLeft) {
4019                                 ri -= readSeedOverhang;
4020                                 rir -= readSeedOverhang;
4021                         }
4022                         if(pairs != NULL) {
4023                                 TU64Pair p;
4024                                 if(ri < aoff) {
4025                                         // By convention, the upstream mate's
4026                                         // coordinates go in the 'first' field
4027                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
4028                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4029                                 } else {
4030                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
4031                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4032                                 }
4033                                 if(pairs->find(p) != pairs->end()) {
4034                                         // We already found this hit!  Continue.
4035                                         ASSERT_ONLY(duplicates++);
4036                                         ASSERT_ONLY(r2i++);
4037                                         continue;
4038                                 } else {
4039                                         // Record this hit
4040                                         pairs->insert(p);
4041                                 }
4042                         }
4043                         if(this->verbose_) {
4044                                 cout << "About to report:" << endl;
4045                                 cout << "  ";
4046                                 for(size_t i = 0; i < qlen; i++) {
4047                                         cout << (char)qry[i];
4048                                 }
4049                                 cout << endl;
4050                                 cout << "  ";
4051                                 for(size_t i = 0; i < qlen; i++) {
4052                                         cout << "ACGT"[ref[rir+i]];
4053                                 }
4054                                 cout << endl;
4055                         }
4056                         assert_leq(diffs, 2);
4057                         assert_geq((size_t)mms, diffs);
4058                         assert_lt(r2i, r2.size());
4059                         assert_eq(re2[r2i], ri);
4060                         ranges.resize(ranges.size()+1);
4061                         Range& range = ranges.back();
4062                         assert_eq((size_t)mms, r2[r2i].numMms);
4063                         range.stratum = diffs;
4064                         range.numMms = mms;
4065                         assert_eq(0, range.mms.size());
4066                         assert_eq(0, range.refcs.size());
4067                         if(mms > 0) {
4068                                 ASSERT_ONLY(size_t mmcur = 0);
4069                                 if(seedOnLeft && diffs > 0) {
4070                                         assert_neq(mmpos1, 0xffffffff);
4071                                         assert_lt(mmpos1, qlen);
4072                                         assert_lt(mmcur, (size_t)mms);
4073                                         assert_eq(mmpos1, r2[r2i].mms[mmcur]);
4074                                         assert_neq(-1, refc1);
4075                                         assert_eq(refc1, r2[r2i].refcs[mmcur]);
4076                                         ASSERT_ONLY(mmcur++);
4077                                         range.mms.push_back(mmpos1);
4078                                         range.refcs.push_back(refc1);
4079                                         if(diffs > 1) {
4080                                                 assert_eq(2, diffs);
4081                                                 assert_neq(mmpos2, 0xffffffff);
4082                                                 assert_lt(mmpos2, qlen);
4083                                                 assert_lt(mmcur, (size_t)mms);
4084                                                 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
4085                                                 assert_neq(-1, refc2);
4086                                                 assert_eq(refc2, r2[r2i].refcs[mmcur]);
4087                                                 ASSERT_ONLY(mmcur++);
4088                                                 range.mms.push_back(mmpos2);
4089                                                 range.refcs.push_back(refc2);
4090                                         }
4091                                 }
4092                                 const size_t nonSeedMmsSz = nonSeedMms.size();
4093                                 for(size_t i = 0; i < nonSeedMmsSz; i++) {
4094                                         assert_neq(0xffffffff, nonSeedMms[i]);
4095                                         assert_lt(mmcur, (size_t)mms);
4096                                         assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
4097                                         range.mms.push_back(nonSeedMms[i]);
4098                                         assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
4099                                         ASSERT_ONLY(mmcur++);
4100                                         range.refcs.push_back(nonSeedRefcs[i]);
4101                                 }
4102                                 if(!seedOnLeft && diffs > 0) {
4103                                         assert_neq(mmpos1, 0xffffffff);
4104                                         assert_lt(mmpos1, qlen);
4105                                         assert_lt(mmcur, (size_t)mms);
4106                                         assert_eq(mmpos1, r2[r2i].mms[mmcur]);
4107                                         assert_neq(-1, refc1);
4108                                         assert_eq(refc1, r2[r2i].refcs[mmcur]);
4109                                         ASSERT_ONLY(mmcur++);
4110                                         range.mms.push_back(mmpos1);
4111                                         range.refcs.push_back(refc1);
4112                                         if(diffs > 1) {
4113                                                 assert_eq(2, diffs);
4114                                                 assert_neq(mmpos2, 0xffffffff);
4115                                                 assert_lt(mmpos2, qlen);
4116                                                 assert_lt(mmcur, (size_t)mms);
4117                                                 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
4118                                                 assert_neq(-1, refc2);
4119                                                 assert_eq(refc2, r2[r2i].refcs[mmcur]);
4120                                                 ASSERT_ONLY(mmcur++);
4121                                                 range.mms.push_back(mmpos2);
4122                                                 range.refcs.push_back(refc2);
4123                                         }
4124                                 }
4125                                 assert_eq(mmcur, r2[r2i].mms.size());
4126                         }
4127                         assert_eq((size_t)mms, range.mms.size());
4128                         assert_eq((size_t)mms, range.refcs.size());
4129                         ASSERT_ONLY(r2i++);
4130                         results.push_back(ri);
4131                         if(--numToFind == 0) return;
4132                 }
4133                 assert_leq(duplicates, r2.size());
4134                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
4135                 return; // no hit
4136         }
4137 };
4138
4139 /**
4140  * Concrete RefAligner for finding nearby 3-mismatch hits given an
4141  * anchor hit.
4142  */
4143 template<typename TStr>
4144 class Seed3RefAligner : public RefAligner<TStr> {
4145
4146         typedef seqan::String<seqan::Dna5> TDna5Str;
4147         typedef seqan::String<char> TCharStr;
4148         typedef std::vector<Range> TRangeVec;
4149         typedef std::vector<uint32_t> TU32Vec;
4150         typedef std::pair<uint64_t, uint64_t> TU64Pair;
4151         typedef std::set<TU64Pair> TSetPairs;
4152
4153 public:
4154
4155         Seed3RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
4156                 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
4157
4158         virtual ~Seed3RefAligner() { }
4159
4160 protected:
4161         /**
4162          * This schematic shows the roles played by the begin, qbegin, end,
4163          * qend, halfway, slen, qlen, and lim variables:
4164          *
4165          * seedOnLeft == true:
4166          *
4167          * |<                   lim                   >|<     qlen       >|
4168          *  --------------------------------------------------------------
4169          * |                     | slen | qlen-slen |  | slen | qlen-slen |
4170          *  --------------------------------------------------------------
4171          * ^                     ^                     ^                  ^
4172          * begin & qbegin     halfway                qend               end
4173          *
4174          * seedOnLeft == false:
4175          *
4176          * |<     qlen       >|<                   lim                   >|
4177          *  --------------------------------------------------------------
4178          * | qlen-slen | slen |  | qlen-slen | slen |                     |
4179          *  --------------------------------------------------------------
4180          * ^                  ^                     ^                     ^
4181          * begin            qbegin             halfway           qend & end
4182          *
4183          * Note that, for seeds longer than 32 base-pairs, the seed is
4184          * further subdivided into a 32-bit anchor and a seed overhang of
4185          * length > 0.
4186          */
4187         void naiveFind(uint32_t numToFind,
4188                                    uint32_t tidx,
4189                                    uint8_t* ref,
4190                                    const TDna5Str& qry,
4191                                    const TCharStr& quals,
4192                                    uint32_t begin,
4193                                    uint32_t end,
4194                                    TRangeVec& ranges,
4195                                    TU32Vec& results,
4196                                    TSetPairs* pairs,
4197                                    uint32_t aoff,
4198                                    bool seedOnLeft) const
4199         {
4200                 assert_gt(numToFind, 0);
4201                 assert_gt(end, begin);
4202                 const uint32_t qlen = seqan::length(qry);
4203                 assert_gt(qlen, 0);
4204                 assert_geq(end - begin, qlen); // caller should have checked this
4205                 assert_gt(this->seedLen_, 0);
4206                 const uint32_t slen = min(qlen, this->seedLen_);
4207                 uint32_t qend = end;
4208                 uint32_t qbegin = begin;
4209                 // If the seed is on the left-hand side of the alignment, then
4210                 // leave a gap at the right-hand side of the interval;
4211                 // otherwise, do the opposite
4212                 if(seedOnLeft) {
4213                         // Leave gap on right-hand side of the interval
4214                         qend -= qlen;
4215                 } else {
4216                         // Leave gap on left-hand side of the interval
4217                         qbegin += qlen;
4218                 }
4219                 // lim = number of alignments to try
4220                 const uint32_t lim = qend - qbegin;
4221                 // halfway = position in the reference to start at (and then
4222                 // we work our way out to the right and to the left).
4223                 const uint32_t halfway = qbegin + (lim >> 1);
4224                 // Vectors for holding edit information
4225                 std::vector<uint32_t> nonSeedMms;
4226                 std::vector<uint8_t> nonSeedRefcs;
4227                 bool hi = false;
4228                 for(uint32_t i = 1; i <= lim+1; i++) {
4229                         uint32_t ri;  // leftmost position in candidate alignment
4230                         uint32_t rir; // same, minus begin; for indexing into ref[]
4231                         if(hi) {
4232                                 ri = halfway + (i >> 1); rir = ri - begin;
4233                                 assert_leq(ri, qend);
4234                         } else {
4235                                 ri = halfway - (i >> 1); rir = ri - begin;
4236                                 assert_geq(ri, begin);
4237                         }
4238                         hi = !hi;
4239                         // Do the naive comparison
4240                         bool match = true;
4241                         int refc1 = -1;
4242                         uint32_t mmOff1 = 0xffffffff;
4243                         int refc2 = -1;
4244                         uint32_t mmOff2 = 0xffffffff;
4245                         int refc3 = -1;
4246                         uint32_t mmOff3 = 0xffffffff;
4247                         int mms = 0;
4248                         int seedMms = 0;
4249                         unsigned int ham = 0;
4250                         nonSeedMms.clear();
4251                         nonSeedRefcs.clear();
4252                         // Walk through each position of the alignment
4253                         for(uint32_t jj = 0; jj < qlen; jj++) {
4254                                 uint32_t j = jj;
4255                                 if(!seedOnLeft) {
4256                                         // If seed is on the right, scan right-to-left
4257                                         j = qlen - jj - 1;
4258                                 } else {
4259                                         // Go left-to-right
4260                                 }
4261                                 uint32_t rirj = rir + j;
4262                                 if(!seedOnLeft) {
4263                                         assert_geq(rir, jj);
4264                                         rirj = rir - jj - 1;
4265                                 }
4266 #if 0
4267                                 // Count Ns in the reference as mismatches
4268                                 const int q = (int)qry[j];
4269                                 const int r = (int)ref[rirj];
4270                                 assert_leq(q, 4);
4271                                 assert_leq(r, 4);
4272                                 if(q == 4 || r == 4 || q != r) {
4273 #else
4274                                 // Disallow alignments that involve an N in the
4275                                 // reference
4276                                 const int r = (int)ref[rirj];
4277                                 if(r & 4) {
4278                                         // N in reference; bail on this alignment
4279                                         match = false;
4280                                         break;
4281                                 }
4282                                 const int q = (int)qry[j];
4283                                 assert_leq(q, 4);
4284                                 assert_lt(r, 4);
4285                                 if(q != r) {
4286 #endif
4287                                         // Mismatch!
4288                                         mms++;
4289                                         if(mms > 3 && jj < slen) {
4290                                                 // More than one mismatch in the anchor; reject
4291                                                 match = false;
4292                                                 break;
4293                                         }
4294                                         uint8_t qual = phredCharToPhredQual(quals[j]);
4295                                         ham += mmPenalty(this->maqPenalty_, qual);
4296                                         if(ham > this->qualMax_) {
4297                                                 // Exceeded quality ceiling; reject
4298                                                 match = false;
4299                                                 break;
4300                                         } else if(mms == 1 && jj < slen) {
4301                                                 // First mismatch in the anchor; remember offset
4302                                                 // and ref char
4303                                                 refc1 = "ACGTN"[r];
4304                                                 mmOff1 = j;
4305                                                 seedMms = 1;
4306                                         } else if(mms == 2 && jj < slen) {
4307                                                 // Second mismatch in the anchor; remember offset
4308                                                 // and ref char
4309                                                 refc2 = "ACGTN"[r];
4310                                                 mmOff2 = j;
4311                                                 seedMms = 2;
4312                                         } else if(mms == 3 && jj < slen) {
4313                                                 // Third mismatch in the anchor; remember offset
4314                                                 // and ref char
4315                                                 refc3 = "ACGTN"[r];
4316                                                 mmOff3 = j;
4317                                                 seedMms = 3;
4318                                         } else {
4319                                                 // Legal mismatch outside of the anchor; record it
4320                                                 nonSeedMms.push_back(j);
4321                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
4322                                                 nonSeedRefcs.push_back("ACGTN"[r]);
4323                                         }
4324                                 }
4325                         }
4326                         if(match) {
4327                                 ranges.resize(ranges.size()+1);
4328                                 Range& range = ranges.back();
4329                                 assert_leq(seedMms, mms);
4330                                 range.stratum = seedMms;
4331                                 range.numMms = mms;
4332                                 assert_eq(0, range.mms.size());
4333                                 assert_eq(0, range.refcs.size());
4334                                 if(mms >= 1) {
4335                                         // Be careful to add edits in left-to-right order
4336                                         // with respect to the read/alignment
4337                                         if(seedOnLeft && seedMms) {
4338                                                 assert_lt(mmOff1, qlen);
4339                                                 range.mms.push_back(mmOff1);
4340                                                 range.refcs.push_back(refc1);
4341                                                 if(seedMms > 1) {
4342                                                         assert_lt(mmOff1, mmOff2);
4343                                                         assert_lt(mmOff2, qlen);
4344                                                         range.mms.push_back(mmOff2);
4345                                                         range.refcs.push_back(refc2);
4346                                                         if(seedMms > 2) {
4347                                                                 assert_lt(mmOff2, mmOff3);
4348                                                                 assert_lt(mmOff3, qlen);
4349                                                                 range.mms.push_back(mmOff3);
4350                                                                 range.refcs.push_back(refc3);
4351                                                         }
4352                                                 }
4353                                         }
4354                                         const size_t nonSeedMmsSz = nonSeedMms.size();
4355                                         if(nonSeedMmsSz > 0) {
4356                                                 if(seedOnLeft) {
4357                                                         for(size_t k = 0; k < nonSeedMmsSz; k++) {
4358                                                                 range.mms.push_back(nonSeedMms[k]);
4359                                                                 range.refcs.push_back(nonSeedRefcs[k]);
4360                                                         }
4361                                                 } else {
4362                                                         for(size_t k = nonSeedMmsSz; k > 0; k--) {
4363                                                                 range.mms.push_back(nonSeedMms[k-1]);
4364                                                                 range.refcs.push_back(nonSeedRefcs[k-1]);
4365                                                         }
4366                                                 }
4367                                         }
4368                                         if(!seedOnLeft && seedMms) {
4369                                                 if(seedMms > 1) {
4370                                                         if(seedMms > 2) {
4371                                                                 assert_lt(mmOff3, mmOff2);
4372                                                                 assert_lt(mmOff3, qlen);
4373                                                                 range.mms.push_back(mmOff3);
4374                                                                 range.refcs.push_back(refc3);
4375                                                         }
4376                                                         assert_lt(mmOff2, mmOff1);
4377                                                         assert_lt(mmOff2, qlen);
4378                                                         range.mms.push_back(mmOff2);
4379                                                         range.refcs.push_back(refc2);
4380                                                 }
4381                                                 assert_lt(mmOff1, qlen);
4382                                                 range.mms.push_back(mmOff1);
4383                                                 range.refcs.push_back(refc1);
4384                                         }
4385                                 }
4386                                 assert_eq((size_t)mms, range.mms.size());
4387                                 assert_eq((size_t)mms, range.refcs.size());
4388                                 if(seedOnLeft) {
4389                                         results.push_back(ri);
4390                                 } else {
4391                                         results.push_back(ri - qlen);
4392                                 }
4393                         }
4394                 }
4395                 return;
4396         }
4397
4398         /**
4399          * This schematic shows the roles played by the begin, qbegin, end,
4400          * qend, halfway, slen, qlen, and lim variables:
4401          *
4402          * seedOnLeft == true:
4403          *
4404          * |<                   lim                   >|<     qlen       >|
4405          *  --------------------------------------------------------------
4406          * |                     | slen | qlen-slen |  | slen | qlen-slen |
4407          *  --------------------------------------------------------------
4408          * ^                     ^                     ^                  ^
4409          * begin & qbegin     halfway                qend               end
4410          *
4411          * seedOnLeft == false:
4412          *
4413          *             |<                   lim                   >|
4414          *  --------------------------------------------------------------
4415          * | qlen-slen |         | qlen-slen | slen |              | slen |
4416          *  --------------------------------------------------------------
4417          * ^           ^                     ^                     ^      ^
4418          * begin       qbegin             halfway                qend   end
4419          *
4420          * Note that, for seeds longer than 32 base-pairs, the seed is
4421          * further subdivided into a 32-bit anchor and a seed overhang of
4422          * length > 0.
4423          */
4424         virtual void anchor64Find(uint32_t numToFind,
4425                                         uint32_t tidx,
4426                                         uint8_t* ref,
4427                                         const TDna5Str& qry,
4428                                         const TCharStr& quals,
4429                                         uint32_t begin,
4430                                         uint32_t end,
4431                                         TRangeVec& ranges,
4432                                         TU32Vec& results,
4433                                         TSetPairs* pairs = NULL,
4434                                         uint32_t aoff = 0xffffffff,
4435                                         bool seedOnLeft = false) const
4436         {
4437                 assert_gt(numToFind, 0);
4438                 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
4439                 ASSERT_ONLY(uint32_t duplicates = 0);
4440                 ASSERT_ONLY(uint32_t r2i = 0);
4441                 const uint32_t qlen = seqan::length(qry);
4442                 assert_gt(qlen, 0);
4443                 assert_gt(end, begin);
4444                 assert_geq(end - begin, qlen); // caller should have checked this
4445                 assert_gt(this->seedLen_, 0);
4446                 uint32_t slen = min(qlen, this->seedLen_);
4447 #ifndef NDEBUG
4448                 // Get results from the naive matcher for sanity-checking
4449                 TRangeVec r2; TU32Vec re2;
4450                 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
4451                                   re2, pairs, aoff, seedOnLeft);
4452 #endif
4453                 const uint32_t anchorBitPairs = min<int>(slen, 32);
4454                 const int lhsShift = ((anchorBitPairs - 1) << 1);
4455                 const uint32_t anchorCushion  = 32 - anchorBitPairs;
4456                 // seedAnchorOverhang = # seed bases not included in the anchor
4457                 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
4458                 // seedAnchorOverhang = # seed bases not included in the anchor
4459                 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
4460                 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
4461                 assert_eq(qlen, readSeedOverhang + slen);
4462                 uint32_t qend = end;
4463                 uint32_t qbegin = begin;
4464                 if(seedOnLeft) {
4465                         // Leave read-sized gap on right-hand side of the interval
4466                         qend -= qlen;
4467                 } else {
4468                         // Leave seed-sized gap on right-hand side and
4469                         // non-seed-sized gap on the left-hand side
4470                         qbegin += readSeedOverhang;
4471                         qend -= slen;
4472                 }
4473                 // lim = # possible alignments in the range
4474                 const uint32_t lim = qend - qbegin;
4475                 // halfway = point on the genome to radiate out from
4476                 const uint32_t halfway = qbegin + (lim >> 1);
4477                 uint64_t anchor = 0llu;
4478                 uint64_t buffw = 0llu;  // rotating ref sequence buffer
4479                 // OR the 'diff' buffer with this so that we can always count
4480                 // 'N's as mismatches
4481                 uint64_t diffMask = 0llu;
4482                 // Set up a mask that we'll apply to the two bufs every round
4483                 // to discard bits that were rotated out of the anchor area
4484                 uint64_t clearMask = 0xffffffffffffffffllu;
4485                 bool useMask = false;
4486                 if(anchorBitPairs < 32) {
4487                         clearMask >>= ((32-anchorBitPairs) << 1);
4488                         useMask = true;
4489                 }
4490                 int nsInSeed = 0;
4491                 uint32_t nPoss = 0;
4492                 int nPos1 = -1;
4493                 int nPos2 = -1;
4494                 int nPos3 = -1;
4495                 uint32_t skipLeftToRights = 0;
4496                 uint32_t skipRightToLefts = 0;
4497                 const uint32_t halfwayRi = halfway - begin;
4498                 // Construct the 'anchor' 64-bit buffer so that it holds all of
4499                 // the first 'anchorBitPairs' bit pairs of the query.
4500                 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
4501                         uint32_t i = ii;
4502                         if(!seedOnLeft) {
4503                                 // Fill in the anchor using characters from the right-
4504                                 // hand side of the query (but take the characters in
4505                                 // left-to-right order).  Be sure to subtract slen from
4506                                 // qlen; not anchorBitPairs from qlen.  We want the
4507                                 // characters in the seedAnchorOverhang region to be to
4508                                 // the right of the characters in the anchor.
4509                                 i = qlen - slen + ii;
4510                         }
4511                         int c = (int)qry[i]; // next query character
4512                         int r = (int)ref[halfwayRi + ii]; // next reference character
4513                         if(r & 4) {
4514                                 // The reference character is an N; to mimic the
4515                                 // behavior of BW alignment, we have to skip all
4516                                 // alignments that involve an N in the reference.  Set
4517                                 // the skip* variables accordingly.
4518                                 r = 0;
4519                                 uint32_t lrSkips = ii;
4520                                 uint32_t rlSkips = qlen - ii;
4521                                 if(!seedOnLeft && readSeedOverhang) {
4522                                         lrSkips += readSeedOverhang;
4523                                         assert_geq(rlSkips, readSeedOverhang);
4524                                         rlSkips -= readSeedOverhang;
4525                                 }
4526                                 // The right-to-left direction absorbs the candidate
4527                                 // alignment based at 'halfway'
4528                                 skipLeftToRights = max(skipLeftToRights, lrSkips);
4529                                 skipRightToLefts = max(skipRightToLefts, rlSkips);
4530                                 assert_leq(skipLeftToRights, qlen);
4531                                 assert_leq(skipRightToLefts, qlen);
4532                         }
4533                         assert_leq(c, 4);
4534                         assert_lt(r, 4);
4535                         // Special case: query has an 'N'
4536                         if(c == 4) {
4537                                 if(++nsInSeed > 3) {
4538                                         // More than one 'N' in the anchor region; can't
4539                                         // possibly have a 1-mismatch hit anywhere
4540                                         assert_eq(r2.size(), ranges.size() - rangesInitSz);
4541                                         return;   // can't match if query has Ns
4542                                 }
4543                                 if(nsInSeed == 1) {
4544                                         nPos1 = (int)ii; // w/r/t LHS of anchor
4545                                 } else if(nsInSeed == 2) {
4546                                         nPos2 = (int)ii; // w/r/t LHS of anchor
4547                                         assert_gt(nPos2, nPos1);
4548                                 } else {
4549                                         assert_eq(3, nsInSeed);
4550                                         nPos3 = (int)ii; // w/r/t LHS of anchor
4551                                         assert_gt(nPos3, nPos2);
4552                                 }
4553                                 // Make it look like an 'A' in the anchor
4554                                 c = 0;
4555                                 diffMask = (diffMask << 2llu) | 1llu;
4556                         } else {
4557                                 diffMask <<= 2llu;
4558                         }
4559                         anchor = ((anchor << 2llu) | c);
4560                         buffw = ((buffw << 2llu) | r);
4561                 }
4562                 // Check whether read is disqualified by Ns inside the seed
4563                 // region but outside the anchor region
4564                 if(seedAnchorOverhang) {
4565                         assert_lt(anchorBitPairs, slen);
4566                         for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
4567                                 uint32_t i = ii;
4568                                 if(!seedOnLeft) {
4569                                         i = qlen - slen + ii;
4570                                 }
4571                                 if((int)qry[i] == 4) {
4572                                         if(++nsInSeed > 3) {
4573                                                 assert_eq(r2.size(), ranges.size() - rangesInitSz);
4574                                                 return; // can't match if query has Ns
4575                                         }
4576                                 }
4577                         }
4578                 } else {
4579                         assert_eq(anchorBitPairs, slen);
4580                 }
4581                 uint64_t bufbw = buffw;
4582                 // Slide the anchor out in either direction, alternating
4583                 // between right-to-left and left-to-right shifts, until all of
4584                 // the positions from qbegin to qend have been covered.
4585                 bool hi = false;
4586                 uint32_t riHi  = halfway;
4587                 uint32_t rirHi = halfway - begin;
4588                 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
4589                 uint32_t riLo  = halfway + 1;
4590                 uint32_t rirLo = halfway - begin + 1;
4591                 uint32_t lrSkips = anchorBitPairs;
4592                 uint32_t rlSkips = qlen;
4593                 if(!seedOnLeft && readSeedOverhang) {
4594                         lrSkips += readSeedOverhang;
4595                         assert_geq(rlSkips, readSeedOverhang);
4596                         rlSkips -= readSeedOverhang;
4597                 }
4598                 for(uint32_t i = 1; i <= lim + 1; i++) {
4599                         int r;       // new reference char
4600                         uint64_t diff;
4601                         assert_leq(skipLeftToRights, qlen);
4602                         assert_leq(skipRightToLefts, qlen);
4603                         if(hi) {
4604                                 hi = false;
4605                                 // Moving left-to-right
4606                                 riHi++;
4607                                 rirHi++;
4608                                 rirHiAnchor++;
4609                                 r = (int)ref[rirHiAnchor];
4610                                 if(r & 4) {
4611                                         r = 0;
4612                                         skipLeftToRights = lrSkips;
4613                                 }
4614                                 assert_lt(r, 4);
4615                                 // Bring in new base pair at the least significant
4616                                 // position
4617                                 buffw = ((buffw << 2llu) | r);
4618                                 if(useMask) buffw &= clearMask;
4619                                 if(skipLeftToRights > 0) {
4620                                         skipLeftToRights--;
4621                                         continue;
4622                                 }
4623                                 diff = (buffw ^ anchor) | diffMask;
4624                         } else {
4625                                 hi = true;
4626                                 // Moving right-to-left
4627                                 riLo--;
4628                                 rirLo--;
4629                                 r = (int)ref[rirLo];
4630                                 if(r & 4) {
4631                                         r = 0;
4632                                         skipRightToLefts = rlSkips;
4633                                 }
4634                                 assert_lt(r, 4);
4635                                 if(i >= 2) {
4636                                         bufbw >>= 2llu;
4637                                         // Bring in new base pair at the most significant
4638                                         // position
4639                                         bufbw |= ((uint64_t)r << lhsShift);
4640                                 }
4641                                 if(skipRightToLefts > 0) {
4642                                         skipRightToLefts--;
4643                                         continue;
4644                                 }
4645                                 diff = (bufbw ^ anchor) | diffMask;
4646                         }
4647                         if((diff & 0xf000f000f000f000llu) &&
4648                            (diff & 0x0f000f000f000f00llu) &&
4649                            (diff & 0x00f000f000f000f0llu) &&
4650                            (diff & 0x000f000f000f000fllu)) continue;
4651                         if((diff & 0xc003c003c003c003llu) &&
4652                            (diff & 0x3c003c003c003c00llu) &&
4653                            (diff & 0x03c003c003c003c0llu) &&
4654                            (diff & 0x003c003c003c003cllu)) continue;
4655                         uint32_t ri  = hi ? riLo  : riHi;
4656                         uint32_t rir = hi ? rirLo : rirHi;
4657                         // Could use pop count
4658                         uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
4659                         // As a first cut, see if there are too many mismatches in
4660                         // the first and last parts of the anchor
4661                         uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
4662                         if(diffs > 3) continue;
4663                         diffs += u8toMms[(int)diff8[1]] +
4664                                          u8toMms[(int)diff8[2]] +
4665                                          u8toMms[(int)diff8[3]] +
4666                                          u8toMms[(int)diff8[4]] +
4667                                          u8toMms[(int)diff8[5]] +
4668                                          u8toMms[(int)diff8[6]];
4669                         uint32_t mmpos1 = 0xffffffff;
4670                         int refc1 = -1;
4671                         uint32_t mmpos2 = 0xffffffff;
4672                         int refc2 = -1;
4673                         uint32_t mmpos3 = 0xffffffff;
4674                         int refc3 = -1;
4675                         unsigned int ham = 0;
4676                         if(diffs > 3) {
4677                                 // Too many differences in the seed; stop
4678                                 continue;
4679                         } else if(nPoss > 1 && diffs == nPoss) {
4680                                 // There was one difference, but there was also one N,
4681                                 // so we already know where the difference is
4682                                 mmpos1 = nPos1;
4683                                 refc1 = "ACGT"[(int)ref[rir + nPos1]];
4684                                 if(!seedOnLeft) {
4685                                         mmpos1 += readSeedOverhang;
4686                                 }
4687                                 char q = quals[mmpos1];
4688                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4689                                 if(ham > this->qualMax_) {
4690                                         // Exceeded quality limit
4691                                         continue;
4692                                 }
4693                                 if(nPoss > 1) {
4694                                         mmpos2 = nPos2;
4695                                         refc2 = "ACGT"[(int)ref[rir + nPos2]];
4696                                         if(!seedOnLeft) {
4697                                                 mmpos2 += readSeedOverhang;
4698                                         }
4699                                         q = quals[mmpos2];
4700                                         ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4701                                         if(ham > this->qualMax_) {
4702                                                 // Exceeded quality limit
4703                                                 continue;
4704                                         }
4705                                         if(nPoss > 3) {
4706                                                 mmpos3 = nPos3;
4707                                                 refc3 = "ACGT"[(int)ref[rir + nPos3]];
4708                                                 if(!seedOnLeft) {
4709                                                         mmpos3 += readSeedOverhang;
4710                                                 }
4711                                                 q = quals[mmpos3];
4712                                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4713                                                 if(ham > this->qualMax_) {
4714                                                         // Exceeded quality limit
4715                                                         continue;
4716                                                 }
4717                                         }
4718                                 }
4719                         } else if(diffs > 0) {
4720                                 // Figure out which position mismatched
4721                                 uint64_t diff2 = diff;
4722                                 mmpos1 = 31;
4723                                 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
4724                                 assert_neq(0, diff);
4725                                 if((diff & 0xffffllu) == 0)     { diff >>= 16llu; mmpos1 -=  8; }
4726                                 assert_neq(0, diff);
4727                                 if((diff & 0xffllu) == 0)       { diff >>= 8llu;  mmpos1 -=  4; }
4728                                 assert_neq(0, diff);
4729                                 if((diff & 0xfllu) == 0)        { diff >>= 4llu;  mmpos1 -=  2; }
4730                                 assert_neq(0, diff);
4731                                 if((diff & 0x3llu) == 0)        { mmpos1--; }
4732                                 assert_neq(0, diff);
4733                                 assert_geq(mmpos1, 0);
4734                                 assert_lt(mmpos1, 32);
4735                                 uint32_t savedMmpos1 = mmpos1;
4736                                 mmpos1 -= anchorCushion;
4737                                 assert_lt(mmpos1, anchorBitPairs);
4738                                 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
4739                                 if(!seedOnLeft) {
4740                                         mmpos1 += readSeedOverhang;
4741                                 }
4742                                 char q = quals[mmpos1];
4743                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4744                                 if(ham > this->qualMax_) {
4745                                         // Exceeded quality limit
4746                                         continue;
4747                                 }
4748                                 if(diffs > 1) {
4749                                         // Figure out the second mismatched position
4750                                         ASSERT_ONLY(uint64_t origDiff2 = diff2);
4751                                         diff2 &= ~(0xc000000000000000llu >> (uint64_t)((savedMmpos1) << 1));
4752                                         uint64_t diff3 = diff2;
4753                                         assert_neq(diff2, origDiff2);
4754                                         mmpos2 = 31;
4755                                         if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
4756                                         assert_neq(0, diff2);
4757                                         if((diff2 & 0xffffllu) == 0)     { diff2 >>= 16llu; mmpos2 -=  8; }
4758                                         assert_neq(0, diff2);
4759                                         if((diff2 & 0xffllu) == 0)       { diff2 >>= 8llu;  mmpos2 -=  4; }
4760                                         assert_neq(0, diff2);
4761                                         if((diff2 & 0xfllu) == 0)        { diff2 >>= 4llu;  mmpos2 -=  2; }
4762                                         assert_neq(0, diff2);
4763                                         if((diff2 & 0x3llu) == 0)        { mmpos2--; }
4764                                         assert_neq(0, diff2);
4765                                         assert_geq(mmpos2, 0);
4766                                         assert_lt(mmpos2, 32);
4767                                         uint32_t savedMmpos2 = mmpos2;
4768                                         mmpos2 -= anchorCushion;
4769                                         assert_neq(mmpos1, mmpos2);
4770                                         refc2 = "ACGT"[(int)ref[rir + mmpos2]];
4771                                         if(!seedOnLeft) {
4772                                                 mmpos2 += readSeedOverhang;
4773                                         }
4774                                         q = quals[mmpos2];
4775                                         ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4776                                         if(ham > this->qualMax_) {
4777                                                 // Exceeded quality limit
4778                                                 continue;
4779                                         }
4780                                         if(mmpos2 < mmpos1) {
4781                                                 uint32_t mmtmp = mmpos1;
4782                                                 mmpos1 = mmpos2;
4783                                                 mmpos2 = mmtmp;
4784                                                 int refctmp = refc1;
4785                                                 refc1 = refc2;
4786                                                 refc2 = refctmp;
4787                                         }
4788                                         assert_lt(mmpos1, mmpos2);
4789                                         if(diffs > 2) {
4790                                                 // Figure out the second mismatched position
4791                                                 ASSERT_ONLY(uint32_t origDiff3 = diff3);
4792                                                 diff3 &= ~(0xc000000000000000llu >> (uint64_t)((savedMmpos2) << 1));
4793                                                 assert_neq(diff3, origDiff3);
4794                                                 mmpos3 = 31;
4795                                                 if((diff3 & 0xffffffffllu) == 0) { diff3 >>= 32llu; mmpos3 -= 16; }
4796                                                 assert_neq(0, diff3);
4797                                                 if((diff3 & 0xffffllu) == 0)     { diff3 >>= 16llu; mmpos3 -=  8; }
4798                                                 assert_neq(0, diff3);
4799                                                 if((diff3 & 0xffllu) == 0)       { diff3 >>= 8llu;  mmpos3 -=  4; }
4800                                                 assert_neq(0, diff3);
4801                                                 if((diff3 & 0xfllu) == 0)        { diff3 >>= 4llu;  mmpos3 -=  2; }
4802                                                 assert_neq(0, diff3);
4803                                                 if((diff3 & 0x3llu) == 0)        { mmpos3--; }
4804                                                 assert_neq(0, diff3);
4805                                                 assert_geq(mmpos3, 0);
4806                                                 assert_lt(mmpos3, 32);
4807                                                 mmpos3 -= anchorCushion;
4808                                                 assert_neq(mmpos2, mmpos3);
4809                                                 assert_neq(mmpos1, mmpos3);
4810                                                 refc3 = "ACGT"[(int)ref[rir + mmpos3]];
4811                                                 if(!seedOnLeft) {
4812                                                         mmpos3 += readSeedOverhang;
4813                                                 }
4814                                                 q = quals[mmpos3];
4815                                                 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4816                                                 if(ham > this->qualMax_) {
4817                                                         // Exceeded quality limit
4818                                                         continue;
4819                                                 }
4820                                                 if(mmpos3 < mmpos1) {
4821                                                         uint32_t mmtmp = mmpos1;
4822                                                         mmpos1 = mmpos3;
4823                                                         mmpos3 = mmpos2;
4824                                                         mmpos2 = mmtmp;
4825                                                         int refctmp = refc1;
4826                                                         refc1 = refc3;
4827                                                         refc3 = refc2;
4828                                                         refc2 = refctmp;
4829                                                 } else if(mmpos3 < mmpos2) {
4830                                                         uint32_t mmtmp = mmpos2;
4831                                                         mmpos2 = mmpos3;
4832                                                         mmpos3 = mmtmp;
4833                                                         int refctmp = refc2;
4834                                                         refc2 = refc3;
4835                                                         refc3 = refctmp;
4836                                                 }
4837                                                 assert_lt(mmpos1, mmpos2);
4838                                                 assert_lt(mmpos2, mmpos3);
4839                                         }
4840                                 }
4841                         }
4842                         // If the seed is longer than the anchor, then scan the
4843                         // rest of the seed characters
4844                         bool foundHit = true;
4845                         if(seedAnchorOverhang) {
4846                                 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
4847                                         int rc = (int)ref[rir + anchorBitPairs + j];
4848                                         if(rc == 4) {
4849                                                 // Oops, encountered an N in the reference in
4850                                                 // the overhang portion of the candidate
4851                                                 // alignment
4852                                                 // (Note that we inverted hi earlier)
4853                                                 if(hi) {
4854                                                         // Right-to-left
4855                                                         // Skip out of the seedAnchorOverhang
4856                                                         assert_eq(0, skipRightToLefts);
4857                                                         skipRightToLefts = seedAnchorOverhang - j - 1;
4858                                                         if(seedOnLeft) {
4859                                                                 // ...and skip out of the rest of the read
4860                                                                 skipRightToLefts += readSeedOverhang;
4861                                                         }
4862                                                 } else {
4863                                                         // Left-to-right
4864                                                         // Skip out of the seedAnchorOverhang
4865                                                         assert_eq(0, skipLeftToRights);
4866                                                         skipLeftToRights = anchorBitPairs + j;
4867                                                         if(!seedOnLeft) {
4868                                                                 // ...and skip out of the rest of the read
4869                                                                 skipLeftToRights += readSeedOverhang;
4870                                                         }
4871                                                 }
4872                                                 foundHit = false; // Skip this candidate
4873                                                 break;
4874                                         }
4875                                         uint32_t qoff = anchorBitPairs + j;
4876                                         if(!seedOnLeft) {
4877                                                 qoff += readSeedOverhang;
4878                                         }
4879                                         assert_lt(qoff, qlen);
4880                                         if((int)qry[qoff] != rc) {
4881                                                 diffs++;
4882                                                 if(diffs > 3) {
4883                                                         foundHit = false;
4884                                                         break;
4885                                                 } else if(diffs == 3) {
4886                                                         assert_eq(0xffffffff, mmpos3);
4887                                                         mmpos3 = qoff;
4888                                                         assert_eq(-1, refc3);
4889                                                         refc3 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
4890                                                 } else if(diffs == 2) {
4891                                                         assert_eq(0xffffffff, mmpos2);
4892                                                         mmpos2 = qoff;
4893                                                         assert_eq(-1, refc2);
4894                                                         refc2 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
4895                                                 } else {
4896                                                         assert_eq(1, diffs);
4897                                                         assert_eq(0xffffffff, mmpos1);
4898                                                         mmpos1 = qoff;
4899                                                         assert_eq(-1, refc1);
4900                                                         refc1 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
4901                                                 }
4902                                                 char q = phredCharToPhredQual(quals[qoff]);
4903                                                 ham += mmPenalty(this->maqPenalty_, q);
4904                                                 if(ham > this->qualMax_) {
4905                                                         // Exceeded quality limit
4906                                                         foundHit = false;
4907                                                         break;
4908                                                 }
4909                                         }
4910                                 }
4911                                 if(!foundHit) continue;
4912                         }
4913                         // If the read is longer than the seed, then scan the rest
4914                         // of the read characters; mismatches no longer count
4915                         // toward the stratum or the 1-mm limit.
4916                         // Vectors for holding edit information
4917                         std::vector<uint32_t> nonSeedMms;
4918                         std::vector<uint8_t> nonSeedRefcs;
4919                         int mms = diffs; // start counting total mismatches
4920                         if((qlen - slen) > 0) {
4921                                 // Going left-to-right
4922                                 for(uint32_t j = 0; j < readSeedOverhang; j++) {
4923                                         uint32_t roff = rir + slen + j;
4924                                         uint32_t qoff = slen + j;
4925                                         if(!seedOnLeft) {
4926                                                 assert_geq(roff, qlen);
4927                                                 roff -= qlen;
4928                                                 qoff = j;
4929                                         }
4930                                         int rc = (int)ref[roff];
4931                                         if(rc == 4) {
4932                                                 // Oops, encountered an N in the reference in
4933                                                 // the overhang portion of the candidate
4934                                                 // alignment
4935                                                 // (Note that we inverted hi earlier)
4936                                                 if(hi) {
4937                                                         // Right-to-left
4938                                                         // Skip what's left of the readSeedOverhang
4939                                                         skipRightToLefts = readSeedOverhang - j - 1;
4940                                                         if(!seedOnLeft) {
4941                                                                 // ...and skip the seed if it's on the right
4942                                                                 skipRightToLefts += slen;
4943                                                         }
4944                                                 } else {
4945                                                         // Left-to-right
4946                                                         // Skip what we've matched of the overhang
4947                                                         skipLeftToRights = j;
4948                                                         if(seedOnLeft) {
4949                                                                 // ...and skip the seed if it's on the left
4950                                                                 skipLeftToRights += slen;
4951                                                         }
4952                                                 }
4953                                                 foundHit = false; // Skip this candidate
4954                                                 break;
4955                                         }
4956                                         if((int)qry[qoff] != rc) {
4957                                                 // Calculate quality of mismatched base
4958                                                 char q = phredCharToPhredQual(quals[qoff]);
4959                                                 ham += mmPenalty(this->maqPenalty_, q);
4960                                                 if(ham > this->qualMax_) {
4961                                                         // Exceeded quality limit
4962                                                         foundHit = false;
4963                                                         break;
4964                                                 }
4965                                                 // Legal mismatch outside of the anchor; record it
4966                                                 mms++;
4967                                                 nonSeedMms.push_back(qoff);
4968                                                 assert_leq(nonSeedMms.size(), (size_t)mms);
4969                                                 nonSeedRefcs.push_back("ACGTN"[rc]);
4970                                         }
4971                                 }
4972                                 if(!foundHit) continue;
4973                         }
4974                         assert(foundHit);
4975                         // Adjust ri if seed is on the right-hand side
4976                         if(!seedOnLeft) {
4977                                 ri -= readSeedOverhang;
4978                                 rir -= readSeedOverhang;
4979                         }
4980                         if(pairs != NULL) {
4981                                 TU64Pair p;
4982                                 if(ri < aoff) {
4983                                         // By convention, the upstream mate's
4984                                         // coordinates go in the 'first' field
4985                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)ri;
4986                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4987                                 } else {
4988                                         p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
4989                                         p.first  = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4990                                 }
4991                                 if(pairs->find(p) != pairs->end()) {
4992                                         // We already found this hit!  Continue.
4993                                         ASSERT_ONLY(duplicates++);
4994                                         ASSERT_ONLY(r2i++);
4995                                         continue;
4996                                 } else {
4997                                         // Record this hit
4998                                         pairs->insert(p);
4999                                 }
5000                         }
5001                         if(this->verbose_) {
5002                                 cout << "About to report:" << endl;
5003                                 cout << "  ";
5004                                 for(size_t i = 0; i < qlen; i++) {
5005                                         cout << (char)qry[i];
5006                                 }
5007                                 cout << endl;
5008                                 cout << "  ";
5009                                 for(size_t i = 0; i < qlen; i++) {
5010                                         cout << "ACGT"[ref[rir+i]];
5011                                 }
5012                                 cout << endl;
5013                         }
5014                         assert_leq(diffs, 3);
5015                         assert_geq((size_t)mms, diffs);
5016                         assert_lt(r2i, r2.size());
5017                         assert_eq(re2[r2i], ri);
5018                         ranges.resize(ranges.size()+1);
5019                         Range& range = ranges.back();
5020                         assert_eq((size_t)mms, r2[r2i].numMms);
5021                         range.stratum = diffs;
5022                         range.numMms = mms;
5023                         assert_eq(0, range.mms.size());
5024                         assert_eq(0, range.refcs.size());
5025                         if(mms > 0) {
5026                                 ASSERT_ONLY(size_t mmcur = 0);
5027                                 if(seedOnLeft && diffs > 0) {
5028                                         assert_neq(mmpos1, 0xffffffff);
5029                                         assert_lt(mmpos1, qlen);
5030                                         assert_lt(mmcur, (size_t)mms);
5031                                         assert_eq(mmpos1, r2[r2i].mms[mmcur]);
5032                                         assert_neq(-1, refc1);
5033                                         assert_eq(refc1, r2[r2i].refcs[mmcur]);
5034                                         ASSERT_ONLY(mmcur++);
5035                                         range.mms.push_back(mmpos1);
5036                                         range.refcs.push_back(refc1);
5037                                         if(diffs > 1) {
5038                                                 assert_neq(mmpos2, 0xffffffff);
5039                                                 assert_lt(mmpos2, qlen);
5040                                                 assert_lt(mmcur, (size_t)mms);
5041                                                 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
5042                                                 assert_neq(-1, refc2);
5043                                                 assert_eq(refc2, r2[r2i].refcs[mmcur]);
5044                                                 ASSERT_ONLY(mmcur++);
5045                                                 range.mms.push_back(mmpos2);
5046                                                 range.refcs.push_back(refc2);
5047                                                 if(diffs > 2) {
5048                                                         assert_eq(3, diffs);
5049                                                         assert_neq(mmpos3, 0xffffffff);
5050                                                         assert_lt(mmpos3, qlen);
5051                                                         assert_lt(mmcur, (size_t)mms);
5052                                                         assert_eq(mmpos3, r2[r2i].mms[mmcur]);
5053                                                         assert_neq(-1, refc3);
5054                                                         assert_eq(refc3, r2[r2i].refcs[mmcur]);
5055                                                         ASSERT_ONLY(mmcur++);
5056                                                         range.mms.push_back(mmpos3);
5057                                                         range.refcs.push_back(refc3);
5058                                                 }
5059                                         }
5060                                 }
5061                                 const size_t nonSeedMmsSz = nonSeedMms.size();
5062                                 for(size_t i = 0; i < nonSeedMmsSz; i++) {
5063                                         assert_neq(0xffffffff, nonSeedMms[i]);
5064                                         assert_lt(mmcur, (size_t)mms);
5065                                         assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
5066                                         range.mms.push_back(nonSeedMms[i]);
5067                                         assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
5068                                         ASSERT_ONLY(mmcur++);
5069                                         range.refcs.push_back(nonSeedRefcs[i]);
5070                                 }
5071                                 if(!seedOnLeft && diffs > 0) {
5072                                         assert_neq(mmpos1, 0xffffffff);
5073                                         assert_lt(mmpos1, qlen);
5074                                         assert_lt(mmcur, (size_t)mms);
5075                                         assert_eq(mmpos1, r2[r2i].mms[mmcur]);
5076                                         assert_neq(-1, refc1);
5077                                         assert_eq(refc1, r2[r2i].refcs[mmcur]);
5078                                         ASSERT_ONLY(mmcur++);
5079                                         range.mms.push_back(mmpos1);
5080                                         range.refcs.push_back(refc1);
5081                                         if(diffs > 1) {
5082                                                 assert_neq(mmpos2, 0xffffffff);
5083                                                 assert_lt(mmpos2, qlen);
5084                                                 assert_lt(mmcur, (size_t)mms);
5085                                                 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
5086                                                 assert_neq(-1, refc2);
5087                                                 assert_eq(refc2, r2[r2i].refcs[mmcur]);
5088                                                 ASSERT_ONLY(mmcur++);
5089                                                 range.mms.push_back(mmpos2);
5090                                                 range.refcs.push_back(refc2);
5091                                                 if(diffs > 2) {
5092                                                         assert_eq(3, diffs);
5093                                                         assert_neq(mmpos3, 0xffffffff);
5094                                                         assert_lt(mmpos3, qlen);
5095                                                         assert_lt(mmcur, (size_t)mms);
5096                                                         assert_eq(mmpos3, r2[r2i].mms[mmcur]);
5097                                                         assert_neq(-1, refc3);
5098                                                         assert_eq(refc3, r2[r2i].refcs[mmcur]);
5099                                                         ASSERT_ONLY(mmcur++);
5100                                                         range.mms.push_back(mmpos3);
5101                                                         range.refcs.push_back(refc3);
5102                                                 }
5103                                         }
5104                                 }
5105                                 assert_eq(mmcur, r2[r2i].mms.size());
5106                         }
5107                         assert_eq((size_t)mms, range.mms.size());
5108                         assert_eq((size_t)mms, range.refcs.size());
5109                         ASSERT_ONLY(r2i++);
5110                         results.push_back(ri);
5111                         if(--numToFind == 0) return;
5112                 }
5113                 assert_leq(duplicates, r2.size());
5114                 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
5115                 return; // no hit
5116         }
5117 };
5118
5119 #endif /* REF_ALIGNER_H_ */