12 #include "seqan/sequence.h"
15 #include "reference.h"
17 // Let the reference-aligner buffer size be 16K by default. If more
18 // room is required, a new buffer must be allocated from the heap.
19 const static int REF_ALIGNER_BUFSZ = 16 * 1024;
22 * Abstract parent class for classes that look for alignments by
23 * matching against the reference sequence directly. This is useful
24 * both for sanity-checking results from the Bowtie index and for
25 * finding mates when the reference location of the opposite mate is
28 template<typename TStr>
31 typedef seqan::String<seqan::Dna5> TDna5Str;
32 typedef seqan::String<char> TCharStr;
33 typedef std::vector<uint32_t> TU32Vec;
34 typedef std::vector<Range> TRangeVec;
35 typedef std::pair<uint64_t, uint64_t> TU64Pair;
36 typedef std::set<TU64Pair> TSetPairs;
39 RefAligner(bool color,
43 uint32_t qualMax = 0xffffffff,
44 bool maqPenalty = false) :
45 color_(color), verbose_(verbose), seedLen_(seedLen),
46 qualMax_(qualMax), maqPenalty_(maqPenalty), refbuf_(buf_),
47 refbufSz_(REF_ALIGNER_BUFSZ), freeRefbuf_(false)
51 * Free the reference-space alignment buffer if this object
54 virtual ~RefAligner() {
61 * Find one alignment of qry:quals in the range begin-end in
62 * reference string ref. Store the alignment details in range.
64 virtual void find(uint32_t numToFind,
66 const BitPairReference *refs,
68 const TCharStr& quals,
73 TSetPairs* pairs = NULL,
74 uint32_t aoff = 0xffffffff,
75 bool seedOnLeft = false)
77 assert_gt(numToFind, 0);
78 assert_gt(end, begin);
79 uint32_t spread = end - begin + (color_ ? 1 : 0);
80 uint32_t spreadPlus = spread + 12;
81 // Make sure the buffer is large enough to accommodate the spread
82 if(spreadPlus > this->refbufSz_) {
83 this->newBuf(spreadPlus);
85 // Read in the relevant stretch of the reference string
86 int offset = refs->getStretch(this->refbuf_, tidx, begin, spread);
87 uint8_t *buf = ((uint8_t*)this->refbuf_) + offset;
90 for(size_t i = 0; i < (end-begin); i++) {
91 assert_leq((int)buf[i], 4);
92 buf[i] = dinuc2color[(int)buf[i]][(int)buf[i+1]];
95 // Look for alignments
96 ASSERT_ONLY(uint32_t irsz = ranges.size());
97 anchor64Find(numToFind, tidx, buf, qry, quals, begin,
98 end, ranges, results, pairs, aoff, seedOnLeft);
100 for(size_t i = irsz; i < results.size(); i++) {
101 assert_eq(ranges[i].numMms, ranges[i].mms.size());
102 assert_eq(ranges[i].numMms, ranges[i].refcs.size());
108 * Find one alignment of qry:quals in the range begin-end in
109 * reference string ref. Store the alignment details in range.
110 * Uses a combination of the anchor bases and 64-bit arithmetic to
111 * find anchors quickly.
113 virtual void anchor64Find(uint32_t numToFind,
117 const TCharStr& quals,
122 TSetPairs* pairs = NULL,
123 uint32_t aoff = 0xffffffff,
124 bool seedOnLeft = false) const = 0;
127 * Set a new reference-sequence buffer.
129 void setBuf(uint32_t *newbuf, uint32_t newsz) {
139 * Set a new reference-sequence buffer.
141 void newBuf(uint32_t newsz) {
146 refbuf_ = new uint32_t[(newsz + 3) / 4];
147 if(refbuf_ == NULL) throw std::bad_alloc();
148 } catch(std::bad_alloc& e) {
149 cerr << "Error: Could not allocate reference-space alignment buffer of " << newsz << "B" << endl;
157 bool color_; /// whether to colorize reference buffers before handing off
158 bool verbose_; /// be talkative
159 uint32_t seedLen_; /// length of seed region for read
160 uint32_t qualMax_; /// maximum sum of quality penalties
161 bool maqPenalty_;/// whether to round as in Maq
162 uint32_t *refbuf_; /// pointer to current reference buffer
163 uint32_t refbufSz_; /// size of current reference buffer
164 uint32_t buf_[REF_ALIGNER_BUFSZ / 4]; /// built-in reference buffer (may be superseded)
165 bool freeRefbuf_; /// whether refbuf_ points to something we should delete
169 * Concrete RefAligner for finding nearby exact hits given an anchor
172 template<typename TStr>
173 class ExactRefAligner : public RefAligner<TStr> {
175 typedef seqan::String<seqan::Dna5> TDna5Str;
176 typedef seqan::String<char> TCharStr;
177 typedef std::vector<uint32_t> TU32Vec;
178 typedef std::vector<Range> TRangeVec;
179 typedef std::pair<uint64_t, uint64_t> TU64Pair;
180 typedef std::set<TU64Pair> TSetPairs;
184 ExactRefAligner(bool color, bool verbose, bool quiet) :
185 RefAligner<TStr>(color, verbose, quiet) { }
187 virtual ~ExactRefAligner() { }
191 * Because we're doing end-to-end exact, we don't care which end of
192 * 'qry' is the 5' end.
194 void naiveFind(uint32_t numToFind,
198 const TCharStr& quals,
205 bool seedOnLeft) const
207 assert_gt(numToFind, 0);
208 const uint32_t qlen = seqan::length(qry);
209 assert_geq(end - begin, qlen); // caller should have checked this
210 assert_gt(end, begin);
212 uint32_t qend = end - qlen;
213 uint32_t lim = qend - begin;
214 uint32_t halfway = begin + (lim >> 1);
216 for(uint32_t i = 1; i <= lim+1; i++) {
217 uint32_t ri; // leftmost position in candidate alignment
218 uint32_t rir; // same, minus begin; for indexing into ref[]
220 ri = halfway + (i >> 1); rir = ri - begin;
221 assert_leq(ri, qend);
223 ri = halfway - (i >> 1); rir = ri - begin;
224 assert_geq(ri, begin);
227 // Do the naive comparison
229 for(uint32_t j = 0; j < qlen; j++) {
231 // Count Ns in the reference as mismatches
232 const int q = (int)qry[j];
233 const int r = (int)ref[rir + j];
236 if(q == 4 || r == 4 || q != r) {
243 // Disallow alignments that involve an N in the
245 const int r = (int)ref[rir + j];
247 // N in reference; bail
251 const int q = (int)qry[j];
263 ranges.resize(ranges.size()+1);
264 Range& range = ranges.back();
267 assert_eq(0, range.mms.size());
268 assert_eq(0, range.refcs.size());
269 results.push_back(ri);
276 * Because we're doing end-to-end exact, we don't care which end of
277 * 'qry' is the 5' end.
279 virtual void anchor64Find(uint32_t numToFind,
283 const TCharStr& quals,
289 uint32_t aoff, // offset of anchor mate
290 bool seedOnLeft) const
292 assert_gt(numToFind, 0);
293 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
294 ASSERT_ONLY(uint32_t duplicates = 0);
295 ASSERT_ONLY(uint32_t r2i = 0);
296 const uint32_t qlen = seqan::length(qry);
297 assert_geq(end - begin, qlen); // caller should have checked this
298 assert_gt(end, begin);
301 // Get all naive hits
302 TRangeVec r2; TU32Vec re2;
303 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
304 re2, pairs, aoff, seedOnLeft);
306 const uint32_t anchorBitPairs = min<int>(qlen, 32);
307 // anchorOverhang = # read bases not included in the anchor
308 const uint32_t anchorOverhang = qlen <= 32 ? 0 : qlen - 32;
309 const uint32_t lim = end - qlen - begin;
310 const uint32_t halfway = begin + (lim >> 1);
311 uint64_t anchor = 0llu;
312 uint64_t buffw = 0llu;
313 // Set up a mask that we'll apply to the two bufs every round
314 // to discard bits that were rotated out of the anchor area
315 uint64_t clearMask = 0xffffffffffffffffllu;
316 bool useMask = false;
317 if(anchorBitPairs < 32) {
318 clearMask >>= ((32-anchorBitPairs) << 1);
321 const int lhsShift = ((anchorBitPairs - 1) << 1);
322 // Build the contents of the 'anchor' dword and the initial
323 // contents of the 'buffw' dword. If there are fewer than 32
324 // anchorBitPairs, the content will be packed into the least
325 // significant bits of the word.
326 uint32_t skipLeftToRights = 0;
327 uint32_t skipRightToLefts = 0;
328 for(uint32_t i = 0; i < anchorBitPairs; i++) {
329 int c = (int)qry[i]; // next query character
332 assert_eq(r2.size(), ranges.size() - rangesInitSz);
333 return; // can't match if query has Ns
335 int r = (int)ref[halfway - begin + i]; // next reference character
338 // The right-to-left direction absorbs the candidate
339 // alignment based at 'halfway'; so skipLeftToRights is
341 skipLeftToRights = max(skipLeftToRights, i);
342 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
346 anchor = ((anchor << 2llu) | c);
347 buffw = ((buffw << 2llu) | r);
349 // Check whether read is disqualified by Ns outside of the anchor
351 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
352 if((int)qry[i] == 4) {
353 assert_eq(r2.size(), ranges.size() - rangesInitSz);
354 return; // can't match if query has Ns
357 uint64_t bufbw = buffw;
358 // We're moving the right-hand edge of the anchor along until
359 // it's 'anchorOverhang' chars from the end of the target region.
360 // Note that we're not making a 3'/5' distinction here; if we
361 // were, we might need to make the 'anchorOverhang' adjustment on
362 // the left end of the range rather than the right.
364 uint32_t riHi = halfway;
365 uint32_t rirHi = halfway - begin;
366 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
367 uint32_t riLo = halfway + 1;
368 uint32_t rirLo = halfway - begin + 1;
369 for(uint32_t i = 1; i <= lim + 1; i++) {
370 int r; // new reference char
371 assert_lt(skipLeftToRights, qlen);
372 assert_leq(skipRightToLefts, qlen);
375 // Moving left-to-right
376 riHi++; rirHi++; rirHiAnchor++;
377 r = (int)ref[rirHiAnchor];
380 skipLeftToRights = anchorBitPairs;
383 // Bring in new base pair at the least significant
385 buffw = ((buffw << 2llu) | r);
386 if(useMask) buffw &= clearMask;
387 if(skipLeftToRights > 0) {
391 if(buffw != anchor) {
396 // Moving right-to-left
401 skipRightToLefts = qlen;
406 // Bring in new base pair at the most significant
408 bufbw |= ((uint64_t)r << lhsShift);
410 if(skipRightToLefts > 0) {
414 if(bufbw != anchor) {
419 bool foundHit = true;
420 uint32_t ri = hi ? riLo : riHi;
421 uint32_t rir = hi ? rirLo : rirHi;
422 if(anchorOverhang > 0) {
423 // Does the non-anchor part of the alignment (the
424 // "overhang") ruin it?
425 bool skipCandidate = false;
426 for(uint32_t j = 0; j < anchorOverhang; j++) {
427 assert_lt(ri + anchorBitPairs + j, end);
428 int rc = (int)ref[rir + anchorBitPairs + j];
430 // Oops, encountered an N in the reference in
431 // the overhang portion of the candidate
433 // (Note that we inverted hi earlier)
436 skipRightToLefts = anchorOverhang - j - 1;
439 skipLeftToRights = anchorBitPairs + j;
441 skipCandidate = true;
442 break; // Skip this candidate
444 if((int)qry[32 + j] != rc) {
445 // Yes, overhang ruins it
450 if(skipCandidate) continue;
456 // By convention, the upstream mate's
457 // coordinates go in the 'first' field
458 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
459 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
461 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
462 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
464 if(pairs->find(p) != pairs->end()) {
465 // We already found this hit! Continue.
466 ASSERT_ONLY(duplicates++);
474 assert_lt(r2i, r2.size());
475 assert_eq(re2[r2i], ri);
476 ranges.resize(ranges.size()+1);
477 Range& range = ranges.back();
480 assert_eq(0, range.mms.size());
481 assert_eq(0, range.refcs.size());
483 results.push_back(ri);
484 if(--numToFind == 0) return;
489 assert_leq(duplicates, r2.size());
490 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
496 * Defined in ref_aligner.cpp. Maps an octet representing the XOR of
497 * two two-bit-per-base-encoded DNA sequences to the number of bases
498 * that mismatch between the two.
500 extern unsigned char u8toMms[];
503 * Concrete RefAligner for finding nearby 1-mismatch hits given an
506 template<typename TStr>
507 class OneMMRefAligner : public RefAligner<TStr> {
509 typedef seqan::String<seqan::Dna5> TDna5Str;
510 typedef seqan::String<char> TCharStr;
511 typedef std::vector<Range> TRangeVec;
512 typedef std::vector<uint32_t> TU32Vec;
513 typedef std::pair<uint64_t, uint64_t> TU64Pair;
514 typedef std::set<TU64Pair> TSetPairs;
518 OneMMRefAligner(bool color, bool verbose, bool quiet) :
519 RefAligner<TStr>(color, verbose, quiet) { }
521 virtual ~OneMMRefAligner() { }
525 * Because we're doing end-to-end exact, we don't care which end of
526 * 'qry' is the 5' end.
528 void naiveFind(uint32_t numToFind,
532 const TCharStr& quals,
539 bool seedOnLeft) const
541 assert_gt(numToFind, 0);
542 const uint32_t qlen = seqan::length(qry);
543 assert_geq(end - begin, qlen); // caller should have checked this
544 assert_gt(end, begin);
546 uint32_t qend = end - qlen;
547 uint32_t lim = qend - begin;
548 uint32_t halfway = begin + (lim >> 1);
550 for(uint32_t i = 1; i <= lim+1; i++) {
551 uint32_t ri; // leftmost position in candidate alignment
552 uint32_t rir; // same, minus begin; for indexing into ref[]
554 ri = halfway + (i >> 1); rir = ri - begin;
555 assert_leq(ri, qend);
557 ri = halfway - (i >> 1); rir = ri - begin;
558 assert_geq(ri, begin);
561 // Do the naive comparison
564 uint32_t mmOff = 0xffffffff;
566 for(uint32_t j = 0; j < qlen; j++) {
568 // Count Ns in the reference as mismatches
569 const int q = (int)qry[j];
570 const int r = (int)ref[rir + j];
573 if(q == 4 || r == 4 || q != r) {
575 // Disallow alignments that involve an N in the
577 const int r = (int)ref[rir + j];
579 // N in reference; bail
583 const int q = (int)qry[j];
590 // Too many; reject this alignment
594 // First one; remember offset and ref char
602 ranges.resize(ranges.size()+1);
603 Range& range = ranges.back();
606 assert_eq(0, range.mms.size());
607 assert_eq(0, range.refcs.size());
609 assert_lt(mmOff, qlen);
610 range.mms.push_back(mmOff);
611 range.refcs.push_back(refc);
613 results.push_back(ri);
620 * Because we're doing end-to-end exact, we don't care which end of
621 * 'qry' is the 5' end.
623 virtual void anchor64Find(uint32_t numToFind,
627 const TCharStr& quals,
632 TSetPairs* pairs = NULL,
633 uint32_t aoff = 0xffffffff,
634 bool seedOnLeft = false) const
636 assert_gt(numToFind, 0);
637 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
638 ASSERT_ONLY(uint32_t duplicates = 0);
639 ASSERT_ONLY(uint32_t r2i = 0);
640 const uint32_t qlen = seqan::length(qry);
641 assert_geq(end - begin, qlen); // caller should have checked this
642 assert_gt(end, begin);
645 // Get results from the naive matcher for sanity-checking
646 TRangeVec r2; TU32Vec re2;
647 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
648 re2, pairs, aoff, seedOnLeft);
650 const uint32_t anchorBitPairs = min<int>(qlen, 32);
651 const int lhsShift = ((anchorBitPairs - 1) << 1);
652 const uint32_t anchorCushion = 32 - anchorBitPairs;
653 // anchorOverhang = # read bases not included in the anchor
654 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
655 const uint32_t lim = end - qlen - begin;
656 const uint32_t halfway = begin + (lim >> 1);
657 uint64_t anchor = 0llu;
658 uint64_t buffw = 0llu; // rotating ref sequence buffer
659 // OR the 'diff' buffer with this so that we can always count
660 // 'N's as mismatches
661 uint64_t diffMask = 0llu;
662 // Set up a mask that we'll apply to the two bufs every round
663 // to discard bits that were rotated out of the anchor area
664 uint64_t clearMask = 0xffffffffffffffffllu;
665 bool useMask = false;
666 if(anchorBitPairs < 32) {
667 clearMask >>= ((32-anchorBitPairs) << 1);
672 uint32_t skipLeftToRights = 0;
673 uint32_t skipRightToLefts = 0;
674 // Construct the 'anchor' 64-bit buffer so that it holds all of
675 // the first 'anchorBitPairs' bit pairs of the query.
676 for(uint32_t i = 0; i < anchorBitPairs; i++) {
677 int c = (int)qry[i]; // next query character
678 int r = (int)ref[halfway - begin + i]; // next reference character
681 // The right-to-left direction absorbs the candidate
682 // alignment based at 'halfway'; so skipLeftToRights is
684 skipLeftToRights = max(skipLeftToRights, i);
685 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
689 // Special case: query has an 'N'
691 if(++nsInAnchor > 1) {
692 // More than one 'N' in the anchor region; can't
693 // possibly have a 1-mismatch hit anywhere
694 assert_eq(r2.size(), ranges.size() - rangesInitSz);
695 return; // can't match if query has Ns
698 // Make it look like an 'A' in the anchor
700 diffMask = (diffMask << 2llu) | 1llu;
704 anchor = ((anchor << 2llu) | c);
705 buffw = ((buffw << 2llu) | r);
707 // Check whether read is disqualified by Ns outside of the anchor
709 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
710 if((int)qry[i] == 4) {
711 if(++nsInAnchor > 1) {
712 assert_eq(r2.size(), ranges.size() - rangesInitSz);
713 return; // can't match if query has Ns
717 uint64_t bufbw = buffw;
718 // We're moving the right-hand edge of the anchor along until
719 // it's 'anchorOverhang' chars from the end of the target region.
720 // Note that we're not making a 3'/5' distinction here; if we
721 // were, we might need to make the 'anchorOverhang' adjustment on
722 // the left end of the range rather than the right.
724 uint32_t riHi = halfway;
725 uint32_t rirHi = halfway - begin;
726 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
727 uint32_t riLo = halfway + 1;
728 uint32_t rirLo = halfway - begin + 1;
729 for(uint32_t i = 1; i <= lim + 1; i++) {
730 int r; // new reference char
732 assert_lt(skipLeftToRights, qlen);
733 assert_leq(skipRightToLefts, qlen);
736 // Moving left-to-right
740 r = (int)ref[rirHiAnchor];
743 skipLeftToRights = anchorBitPairs;
746 // Bring in new base pair at the least significant
748 buffw = ((buffw << 2llu) | r);
749 if(useMask) buffw &= clearMask;
750 if(skipLeftToRights > 0) {
754 diff = (buffw ^ anchor) | diffMask;
757 // Moving right-to-left
763 skipRightToLefts = qlen;
768 // Bring in new base pair at the most significant
770 bufbw |= ((uint64_t)r << lhsShift);
772 if(skipRightToLefts > 0) {
776 diff = (bufbw ^ anchor) | diffMask;
778 if((diff & 0xffffffff00000000llu) &&
779 (diff & 0x00000000ffffffffllu)) continue;
780 uint32_t ri = hi ? riLo : riHi;
781 uint32_t rir = hi ? rirLo : rirHi;
782 // Could use pop count
783 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
784 // As a first cut, see if there are too many mismatches in
785 // the first and last parts of the anchor
786 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
787 if(diffs > 1) continue;
788 diffs += u8toMms[(int)diff8[1]] +
789 u8toMms[(int)diff8[2]] +
790 u8toMms[(int)diff8[3]] +
791 u8toMms[(int)diff8[4]] +
792 u8toMms[(int)diff8[5]] +
793 u8toMms[(int)diff8[6]];
794 uint32_t mmpos = 0xffffffff;
797 // Too many differences
799 } else if(diffs == 1 && nPos != -1) {
800 // There was one difference, but there was also one N,
801 // so we already know where the difference is
803 refc = "ACGT"[(int)ref[rir + nPos]];
804 } else if(diffs == 1) {
805 // Figure out which position mismatched
807 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos -= 16; }
809 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos -= 8; }
811 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos -= 4; }
813 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos -= 2; }
815 if((diff & 0x3llu) == 0) { mmpos--; }
817 assert_geq(mmpos, 0);
818 assert_lt(mmpos, 32);
819 mmpos -= anchorCushion;
820 refc = "ACGT"[(int)ref[rir + mmpos]];
822 // Now extend the anchor into a longer alignment
823 bool foundHit = true;
824 if(anchorOverhang > 0) {
825 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
826 bool skipCandidate = false;
827 for(uint32_t j = 0; j < anchorOverhang; j++) {
828 int rc = (int)ref[rir + 32 + j];
830 // Oops, encountered an N in the reference in
831 // the overhang portion of the candidate
833 // (Note that we inverted hi earlier)
836 skipRightToLefts = anchorOverhang - j - 1;
839 skipLeftToRights = anchorBitPairs + j;
841 skipCandidate = true; // Skip this candidate
844 if((int)qry[32 + j] != rc) {
850 refc = "ACGT"[(int)ref[rir + 32 + j]];
854 if(skipCandidate) continue;
856 if(!foundHit) continue;
860 // By convention, the upstream mate's
861 // coordinates go in the 'first' field
862 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
863 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
865 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
866 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
868 if(pairs->find(p) != pairs->end()) {
869 // We already found this hit! Continue.
870 ASSERT_ONLY(duplicates++);
878 assert_leq(diffs, 1);
879 assert_lt(r2i, r2.size());
880 assert_eq(re2[r2i], ri);
881 ranges.resize(ranges.size()+1);
882 Range& range = ranges.back();
883 assert_eq(diffs, r2[r2i].numMms);
884 range.stratum = diffs;
885 range.numMms = diffs;
886 assert_eq(0, range.mms.size());
887 assert_eq(0, range.refcs.size());
889 assert_neq(mmpos, 0xffffffff);
890 assert_eq(mmpos, r2[r2i].mms[0]);
891 assert_neq(-1, refc);
892 assert_eq(refc, r2[r2i].refcs[0]);
893 range.mms.push_back(mmpos);
894 range.refcs.push_back(refc);
897 results.push_back(ri);
898 if(--numToFind == 0) return;
900 assert_leq(duplicates, r2.size());
901 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
907 * Concrete RefAligner for finding nearby 2-mismatch hits given an
910 template<typename TStr>
911 class TwoMMRefAligner : public RefAligner<TStr> {
913 typedef seqan::String<seqan::Dna5> TDna5Str;
914 typedef seqan::String<char> TCharStr;
915 typedef std::vector<Range> TRangeVec;
916 typedef std::vector<uint32_t> TU32Vec;
917 typedef std::pair<uint64_t, uint64_t> TU64Pair;
918 typedef std::set<TU64Pair> TSetPairs;
922 TwoMMRefAligner(bool color, bool verbose, bool quiet) :
923 RefAligner<TStr>(color, verbose, quiet) { }
925 virtual ~TwoMMRefAligner() { }
929 * Because we're doing end-to-end exact, we don't care which end of
930 * 'qry' is the 5' end.
932 void naiveFind(uint32_t numToFind,
936 const TCharStr& quals,
943 bool seedOnLeft) const
945 assert_gt(numToFind, 0);
946 const uint32_t qlen = seqan::length(qry);
947 assert_geq(end - begin, qlen); // caller should have checked this
948 assert_gt(end, begin);
950 uint32_t qend = end - qlen;
951 uint32_t lim = qend - begin;
952 uint32_t halfway = begin + (lim >> 1);
954 for(uint32_t i = 1; i <= lim+1; i++) {
955 uint32_t ri; // leftmost position in candidate alignment
956 uint32_t rir; // same, minus begin; for indexing into ref[]
958 ri = halfway + (i >> 1); rir = ri - begin;
959 assert_leq(ri, qend);
961 ri = halfway - (i >> 1); rir = ri - begin;
962 assert_geq(ri, begin);
965 // Do the naive comparison
968 uint32_t mmOff1 = 0xffffffff;
970 uint32_t mmOff2 = 0xffffffff;
972 for(uint32_t j = 0; j < qlen; j++) {
974 // Count Ns in the reference as mismatches
975 const int q = (int)qry[j];
976 const int r = (int)ref[rir + j];
979 if(q == 4 || r == 4 || q != r) {
981 // Disallow alignments that involve an N in the
983 const int r = (int)ref[rir + j];
985 // N in reference; bail
989 const int q = (int)qry[j];
996 // Too many; reject this alignment
999 } else if(mms == 2) {
1000 // Second one; remember offset and ref char
1005 // First one; remember offset and ref char
1013 ranges.resize(ranges.size()+1);
1014 Range& range = ranges.back();
1015 range.stratum = mms;
1017 assert_eq(0, range.mms.size());
1018 assert_eq(0, range.refcs.size());
1020 assert_lt(mmOff1, qlen);
1021 assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1022 range.mms.push_back(mmOff1);
1023 range.refcs.push_back(refc1);
1026 assert_lt(mmOff2, qlen);
1027 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1028 range.mms.push_back(mmOff2);
1029 range.refcs.push_back(refc2);
1032 results.push_back(ri);
1039 * Because we're doing end-to-end exact, we don't care which end of
1040 * 'qry' is the 5' end.
1042 virtual void anchor64Find(uint32_t numToFind,
1045 const TDna5Str& qry,
1046 const TCharStr& quals,
1051 TSetPairs* pairs = NULL,
1052 uint32_t aoff = 0xffffffff,
1053 bool seedOnLeft = false) const
1055 assert_gt(numToFind, 0);
1056 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1057 ASSERT_ONLY(uint32_t duplicates = 0);
1058 ASSERT_ONLY(uint32_t r2i = 0);
1059 const uint32_t qlen = seqan::length(qry);
1060 assert_geq(end - begin, qlen); // caller should have checked this
1061 assert_gt(end, begin);
1064 // Get results from the naive matcher for sanity-checking
1065 TRangeVec r2; TU32Vec re2;
1066 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1067 re2, pairs, aoff, seedOnLeft);
1069 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1070 const int lhsShift = ((anchorBitPairs - 1) << 1);
1071 const uint32_t anchorCushion = 32 - anchorBitPairs;
1072 // anchorOverhang = # read bases not included in the anchor
1073 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1074 const uint32_t lim = end - qlen - begin;
1075 const uint32_t halfway = begin + (lim >> 1);
1076 uint64_t anchor = 0llu;
1077 uint64_t buffw = 0llu; // rotating ref sequence buffer
1078 // OR the 'diff' buffer with this so that we can always count
1079 // 'N's as mismatches
1080 uint64_t diffMask = 0llu;
1081 // Set up a mask that we'll apply to the two bufs every round
1082 // to discard bits that were rotated out of the anchor area
1083 uint64_t clearMask = 0xffffffffffffffffllu;
1084 bool useMask = false;
1085 if(anchorBitPairs < 32) {
1086 clearMask >>= ((32-anchorBitPairs) << 1);
1093 uint32_t skipLeftToRights = 0;
1094 uint32_t skipRightToLefts = 0;
1095 // Construct the 'anchor' 64-bit buffer so that it holds all of
1096 // the first 'anchorBitPairs' bit pairs of the query.
1097 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1098 int c = (int)qry[i]; // next query character
1099 int r = (int)ref[halfway - begin + i]; // next reference character
1102 // The right-to-left direction absorbs the candidate
1103 // alignment based at 'halfway'; so skipLeftToRights is
1105 skipLeftToRights = max(skipLeftToRights, i);
1106 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1110 // Special case: query has an 'N'
1112 if(++nsInAnchor > 2) {
1113 // More than two 'N's in the anchor region; can't
1114 // possibly have a 2-mismatch hit anywhere
1115 return; // can't match if query has Ns
1116 } else if(nsInAnchor == 2) {
1120 assert_eq(1, nsInAnchor);
1124 // Make it look like an 'A' in the anchor
1126 diffMask = (diffMask << 2llu) | 1llu;
1130 anchor = ((anchor << 2llu) | c);
1131 buffw = ((buffw << 2llu) | r);
1133 assert_leq(nPoss, 2);
1134 // Check whether read is disqualified by Ns outside of the anchor
1136 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1137 if((int)qry[i] == 4) {
1138 if(++nsInAnchor > 2) {
1139 return; // can't match if query has Ns
1143 uint64_t bufbw = buffw;
1144 // We're moving the right-hand edge of the anchor along until
1145 // it's 'anchorOverhang' chars from the end of the target region.
1146 // Note that we're not making a 3'/5' distinction here; if we
1147 // were, we might need to make the 'anchorOverhang' adjustment on
1148 // the left end of the range rather than the right.
1150 uint32_t riHi = halfway;
1151 uint32_t rirHi = halfway - begin;
1152 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1153 uint32_t riLo = halfway + 1;
1154 uint32_t rirLo = halfway - begin + 1;
1156 for(i = 1; i <= lim + 1; i++) {
1157 int r; // new reference char
1159 assert_lt(skipLeftToRights, qlen);
1160 assert_leq(skipRightToLefts, qlen);
1163 // Moving left-to-right
1167 r = (int)ref[rirHiAnchor];
1170 skipLeftToRights = anchorBitPairs;
1173 // Bring in new base pair at the least significant
1175 buffw = ((buffw << 2llu) | r);
1176 if(useMask) buffw &= clearMask;
1177 if(skipLeftToRights > 0) {
1181 diff = (buffw ^ anchor) | diffMask;
1184 // Moving right-to-left
1187 r = (int)ref[rirLo];
1190 skipRightToLefts = qlen;
1195 // Bring in new base pair at the most significant
1197 bufbw |= ((uint64_t)r << lhsShift);
1199 if(skipRightToLefts > 0) {
1203 diff = (bufbw ^ anchor) | diffMask;
1205 if((diff & 0xfffff00000000000llu) &&
1206 (diff & 0x00000ffffff00000llu) &&
1207 (diff & 0x00000000000fffffllu)) continue;
1208 uint32_t ri = hi ? riLo : riHi;
1209 uint32_t rir = hi ? rirLo : rirHi;
1210 // Could use pop count
1211 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1212 // As a first cut, see if there are too many mismatches in
1213 // the first and last parts of the anchor
1214 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1215 if(diffs > 2) continue;
1216 diffs += u8toMms[(int)diff8[1]] +
1217 u8toMms[(int)diff8[2]] +
1218 u8toMms[(int)diff8[3]] +
1219 u8toMms[(int)diff8[4]] +
1220 u8toMms[(int)diff8[5]] +
1221 u8toMms[(int)diff8[6]];
1222 uint32_t mmpos1 = 0xffffffff;
1224 uint32_t mmpos2 = 0xffffffff;
1227 // Too many differences
1229 } else if(nPoss > 1 && diffs == nPoss) {
1230 // There was one difference, but there was also one N,
1231 // so we already know where the difference is
1233 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1236 refc2 = "ACGT"[(int)ref[rir + nPos2]];
1238 } else if(diffs > 0) {
1239 // Figure out which position mismatched
1240 uint64_t diff2 = diff;
1242 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1243 assert_neq(0, diff);
1244 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos1 -= 8; }
1245 assert_neq(0, diff);
1246 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos1 -= 4; }
1247 assert_neq(0, diff);
1248 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos1 -= 2; }
1249 assert_neq(0, diff);
1250 if((diff & 0x3llu) == 0) { mmpos1--; }
1251 assert_neq(0, diff);
1252 assert_geq(mmpos1, 0);
1253 assert_lt(mmpos1, 32);
1254 mmpos1 -= anchorCushion;
1255 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1257 // Figure out the second mismatched position
1258 ASSERT_ONLY(uint64_t origDiff2 = diff2);
1259 diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1+anchorCushion) << 1));
1260 assert_neq(diff2, origDiff2);
1262 if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1263 assert_neq(0, diff2);
1264 if((diff2 & 0xffffllu) == 0) { diff2 >>= 16llu; mmpos2 -= 8; }
1265 assert_neq(0, diff2);
1266 if((diff2 & 0xffllu) == 0) { diff2 >>= 8llu; mmpos2 -= 4; }
1267 assert_neq(0, diff2);
1268 if((diff2 & 0xfllu) == 0) { diff2 >>= 4llu; mmpos2 -= 2; }
1269 assert_neq(0, diff2);
1270 if((diff2 & 0x3llu) == 0) { mmpos2--; }
1271 assert_neq(0, diff2);
1272 assert_geq(mmpos2, 0);
1273 assert_lt(mmpos2, 32);
1274 mmpos2 -= anchorCushion;
1275 assert_neq(mmpos1, mmpos2);
1276 refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1277 if(mmpos2 < mmpos1) {
1278 uint32_t mmtmp = mmpos1;
1281 int refctmp = refc1;
1285 assert_lt(mmpos1, mmpos2);
1288 // Now extend the anchor into a longer alignment
1289 bool foundHit = true;
1290 if(anchorOverhang > 0) {
1291 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1292 bool skipCandidate = false;
1293 for(uint32_t j = 0; j < anchorOverhang; j++) {
1294 int rc = (int)ref[rir + 32 + j];
1296 // Oops, encountered an N in the reference in
1297 // the overhang portion of the candidate
1299 // (Note that we inverted hi earlier)
1302 skipRightToLefts = anchorOverhang - j - 1;
1305 skipLeftToRights = anchorBitPairs + j;
1307 skipCandidate = true; // Skip this candidate
1310 if((int)qry[32 + j] != rc) {
1314 } else if(diffs == 2) {
1316 refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1318 assert_eq(1, diffs);
1320 refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1324 if(skipCandidate) continue;
1326 if(!foundHit) continue;
1330 // By convention, the upstream mate's
1331 // coordinates go in the 'first' field
1332 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
1333 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1335 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1336 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1338 if(pairs->find(p) != pairs->end()) {
1339 // We already found this hit! Continue.
1340 ASSERT_ONLY(duplicates++);
1348 assert_leq(diffs, 2);
1349 assert_lt(r2i, r2.size());
1350 assert_eq(re2[r2i], ri);
1351 ranges.resize(ranges.size()+1);
1352 Range& range = ranges.back();
1353 assert_eq(diffs, r2[r2i].numMms);
1354 range.stratum = diffs;
1355 range.numMms = diffs;
1356 assert_eq(0, range.mms.size());
1357 assert_eq(0, range.refcs.size());
1359 assert_neq(mmpos1, 0xffffffff);
1360 assert_eq(mmpos1, r2[r2i].mms[0]);
1361 assert_neq(-1, refc1);
1362 assert_eq(refc1, r2[r2i].refcs[0]);
1363 range.mms.push_back(mmpos1);
1364 range.refcs.push_back(refc1);
1366 assert_neq(mmpos2, 0xffffffff);
1367 assert_eq(mmpos2, r2[r2i].mms[1]);
1368 assert_neq(-1, refc2);
1369 assert_eq(refc2, r2[r2i].refcs[1]);
1370 range.mms.push_back(mmpos2);
1371 range.refcs.push_back(refc2);
1375 results.push_back(ri);
1376 if(--numToFind == 0) return;
1378 assert_leq(duplicates, r2.size());
1379 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1385 * Concrete RefAligner for finding nearby 2-mismatch hits given an
1388 template<typename TStr>
1389 class ThreeMMRefAligner : public RefAligner<TStr> {
1391 typedef seqan::String<seqan::Dna5> TDna5Str;
1392 typedef seqan::String<char> TCharStr;
1393 typedef std::vector<Range> TRangeVec;
1394 typedef std::vector<uint32_t> TU32Vec;
1395 typedef std::pair<uint64_t, uint64_t> TU64Pair;
1396 typedef std::set<TU64Pair> TSetPairs;
1400 ThreeMMRefAligner(bool color, bool verbose, bool quiet) :
1401 RefAligner<TStr>(color, verbose, quiet) { }
1403 virtual ~ThreeMMRefAligner() { }
1407 * Because we're doing end-to-end exact, we don't care which end of
1408 * 'qry' is the 5' end.
1410 void naiveFind(uint32_t numToFind,
1413 const TDna5Str& qry,
1414 const TCharStr& quals,
1421 bool seedOnLeft) const
1423 assert_gt(numToFind, 0);
1424 const uint32_t qlen = seqan::length(qry);
1425 assert_geq(end - begin, qlen); // caller should have checked this
1426 assert_gt(end, begin);
1428 uint32_t qend = end - qlen;
1429 uint32_t lim = qend - begin;
1430 uint32_t halfway = begin + (lim >> 1);
1432 for(uint32_t i = 1; i <= lim+1; i++) {
1433 uint32_t ri; // leftmost position in candidate alignment
1434 uint32_t rir; // same, minus begin; for indexing into ref[]
1436 ri = halfway + (i >> 1); rir = ri - begin;
1437 assert_leq(ri, qend);
1439 ri = halfway - (i >> 1); rir = ri - begin;
1440 assert_geq(ri, begin);
1443 // Do the naive comparison
1446 uint32_t mmOff1 = 0xffffffff;
1448 uint32_t mmOff2 = 0xffffffff;
1450 uint32_t mmOff3 = 0xffffffff;
1452 for(uint32_t j = 0; j < qlen; j++) {
1454 // Count Ns in the reference as mismatches
1455 const int q = (int)qry[j];
1456 const int r = (int)ref[rir + j];
1459 if(q == 4 || r == 4 || q != r) {
1461 // Disallow alignments that involve an N in the
1463 const int r = (int)ref[rir + j];
1465 // N in reference; bail
1469 const int q = (int)qry[j];
1476 // Too many; reject this alignment
1479 } else if(mms == 3) {
1480 // Second one; remember offset and ref char
1483 } else if(mms == 2) {
1484 // Second one; remember offset and ref char
1489 // First one; remember offset and ref char
1497 ranges.resize(ranges.size()+1);
1498 Range& range = ranges.back();
1499 range.stratum = mms;
1501 assert_eq(0, range.mms.size());
1502 assert_eq(0, range.refcs.size());
1504 assert_lt(mmOff1, qlen);
1505 assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1506 range.mms.push_back(mmOff1);
1507 range.refcs.push_back(refc1);
1509 assert_lt(mmOff2, qlen);
1510 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1511 range.mms.push_back(mmOff2);
1512 range.refcs.push_back(refc2);
1515 assert_lt(mmOff3, qlen);
1516 assert(refc3 == 'A' || refc3 == 'C' || refc3 == 'G' || refc3 == 'T');
1517 range.mms.push_back(mmOff3);
1518 range.refcs.push_back(refc3);
1522 results.push_back(ri);
1529 * Because we're doing end-to-end exact, we don't care which end of
1530 * 'qry' is the 5' end.
1532 virtual void anchor64Find(uint32_t numToFind,
1535 const TDna5Str& qry,
1536 const TCharStr& quals,
1541 TSetPairs* pairs = NULL,
1542 uint32_t aoff = 0xffffffff,
1543 bool seedOnLeft = false) const
1545 assert_gt(numToFind, 0);
1546 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1547 ASSERT_ONLY(uint32_t duplicates = 0);
1548 ASSERT_ONLY(uint32_t r2i = 0);
1549 const uint32_t qlen = seqan::length(qry);
1550 assert_geq(end - begin, qlen); // caller should have checked this
1551 assert_gt(end, begin);
1554 // Get results from the naive matcher for sanity-checking
1555 TRangeVec r2; TU32Vec re2;
1556 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1557 re2, pairs, aoff, seedOnLeft);
1559 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1560 const int lhsShift = ((anchorBitPairs - 1) << 1);
1561 const uint32_t anchorCushion = 32 - anchorBitPairs;
1562 // anchorOverhang = # read bases not included in the anchor
1563 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1564 const uint32_t lim = end - qlen - begin;
1565 const uint32_t halfway = begin + (lim >> 1);
1566 uint64_t anchor = 0llu;
1567 uint64_t buffw = 0llu; // rotating ref sequence buffer
1568 // OR the 'diff' buffer with this so that we can always count
1569 // 'N's as mismatches
1570 uint64_t diffMask = 0llu;
1571 // Set up a mask that we'll apply to the two bufs every round
1572 // to discard bits that were rotated out of the anchor area
1573 uint64_t clearMask = 0xffffffffffffffffllu;
1574 bool useMask = false;
1575 if(anchorBitPairs < 32) {
1576 clearMask >>= ((32-anchorBitPairs) << 1);
1584 uint32_t skipLeftToRights = 0;
1585 uint32_t skipRightToLefts = 0;
1586 // Construct the 'anchor' 64-bit buffer so that it holds all of
1587 // the first 'anchorBitPairs' bit pairs of the query.
1588 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1589 int c = (int)qry[i]; // next query character
1590 int r = (int)ref[halfway - begin + i]; // next reference character
1593 // The right-to-left direction absorbs the candidate
1594 // alignment based at 'halfway'; so skipLeftToRights is
1596 skipLeftToRights = max(skipLeftToRights, i);
1597 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1601 // Special case: query has an 'N'
1603 if(++nsInAnchor > 3) {
1604 // More than two 'N's in the anchor region; can't
1605 // possibly have a 2-mismatch hit anywhere
1606 assert_eq(r2.size(), ranges.size() - rangesInitSz);
1607 return; // can't match if query has Ns
1608 } else if(nsInAnchor == 3) {
1611 } else if(nsInAnchor == 2) {
1615 assert_eq(1, nsInAnchor);
1619 // Make it look like an 'A' in the anchor
1621 diffMask = (diffMask << 2llu) | 1llu;
1625 anchor = ((anchor << 2llu) | c);
1626 buffw = ((buffw << 2llu) | r);
1628 assert_leq(nPoss, 3);
1629 // Check whether read is disqualified by Ns outside of the anchor
1631 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1632 if((int)qry[i] == 4) {
1633 if(++nsInAnchor > 3) {
1634 assert_eq(r2.size(), ranges.size() - rangesInitSz);
1635 return; // can't match if query has Ns
1639 uint64_t bufbw = buffw;
1640 // We're moving the right-hand edge of the anchor along until
1641 // it's 'anchorOverhang' chars from the end of the target region.
1642 // Note that we're not making a 3'/5' distinction here; if we
1643 // were, we might need to make the 'anchorOverhang' adjustment on
1644 // the left end of the range rather than the right.
1646 uint32_t riHi = halfway;
1647 uint32_t rirHi = halfway - begin;
1648 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1649 uint32_t riLo = halfway + 1;
1650 uint32_t rirLo = halfway - begin + 1;
1651 for(uint32_t i = 1; i <= lim + 1; i++) {
1652 int r; // new reference char
1654 assert_lt(skipLeftToRights, qlen);
1655 assert_leq(skipRightToLefts, qlen);
1658 // Moving left-to-right
1662 r = (int)ref[rirHiAnchor];
1665 skipLeftToRights = anchorBitPairs;
1668 // Bring in new base pair at the least significant
1670 buffw = ((buffw << 2llu) | r);
1671 if(useMask) buffw &= clearMask;
1672 if(skipLeftToRights > 0) {
1676 diff = (buffw ^ anchor) | diffMask;
1679 // Moving right-to-left
1682 r = (int)ref[rirLo];
1685 skipRightToLefts = qlen;
1690 // Bring in new base pair at the most significant
1692 bufbw |= ((uint64_t)r << lhsShift);
1694 if(skipRightToLefts > 0) {
1698 diff = (bufbw ^ anchor) | diffMask;
1700 if((diff & 0xffff000000000000llu) &&
1701 (diff & 0x0000ffff00000000llu) &&
1702 (diff & 0x00000000ffff0000llu) &&
1703 (diff & 0x000000000000ffffllu)) continue;
1704 uint32_t ri = hi ? riLo : riHi;
1705 uint32_t rir = hi ? rirLo : rirHi;
1706 // Could use pop count
1707 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1708 // As a first cut, see if there are too many mismatches in
1709 // the first and last parts of the anchor
1710 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1711 if(diffs > 3) continue;
1712 diffs += u8toMms[(int)diff8[1]] +
1713 u8toMms[(int)diff8[2]] +
1714 u8toMms[(int)diff8[3]] +
1715 u8toMms[(int)diff8[4]] +
1716 u8toMms[(int)diff8[5]] +
1717 u8toMms[(int)diff8[6]];
1718 uint32_t mmpos1 = 0xffffffff;
1720 uint32_t mmpos2 = 0xffffffff;
1722 uint32_t mmpos3 = 0xffffffff;
1725 // Too many differences
1727 } else if(nPoss > 1 && diffs == nPoss) {
1728 // There was one difference, but there was also one N,
1729 // so we already know where the difference is
1731 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1734 refc2 = "ACGT"[(int)ref[rir + nPos2]];
1737 refc3 = "ACGT"[(int)ref[rir + nPos3]];
1740 } else if(diffs > 0) {
1741 // Figure out which position mismatched
1742 uint64_t diff2 = diff;
1744 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1745 assert_neq(0, diff);
1746 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos1 -= 8; }
1747 assert_neq(0, diff);
1748 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos1 -= 4; }
1749 assert_neq(0, diff);
1750 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos1 -= 2; }
1751 assert_neq(0, diff);
1752 if((diff & 0x3llu) == 0) { mmpos1--; }
1753 assert_neq(0, diff);
1754 assert_geq(mmpos1, 0);
1755 assert_lt(mmpos1, 32);
1756 mmpos1 -= anchorCushion;
1757 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1759 // Figure out the second mismatched position
1760 diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1 + anchorCushion) << 1));
1761 uint64_t diff3 = diff2;
1763 if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1764 assert_neq(0, diff2);
1765 if((diff2 & 0xffffllu) == 0) { diff2 >>= 16llu; mmpos2 -= 8; }
1766 assert_neq(0, diff2);
1767 if((diff2 & 0xffllu) == 0) { diff2 >>= 8llu; mmpos2 -= 4; }
1768 assert_neq(0, diff2);
1769 if((diff2 & 0xfllu) == 0) { diff2 >>= 4llu; mmpos2 -= 2; }
1770 assert_neq(0, diff2);
1771 if((diff2 & 0x3llu) == 0) { mmpos2--; }
1772 assert_neq(0, diff2);
1773 mmpos2 -= anchorCushion;
1774 assert_geq(mmpos2, 0);
1775 assert_lt(mmpos2, 32);
1776 assert_neq(mmpos1, mmpos2);
1777 refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1778 uint32_t mmpos2orig = mmpos2;
1779 if(mmpos2 < mmpos1) {
1780 uint32_t mmtmp = mmpos1;
1783 int refctmp = refc1;
1787 assert_lt(mmpos1, mmpos2);
1789 // Figure out the second mismatched position
1790 diff3 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos2orig + anchorCushion) << 1));
1792 if((diff3 & 0xffffffffllu) == 0) { diff3 >>= 32llu; mmpos3 -= 16; }
1793 assert_neq(0, diff3);
1794 if((diff3 & 0xffffllu) == 0) { diff3 >>= 16llu; mmpos3 -= 8; }
1795 assert_neq(0, diff3);
1796 if((diff3 & 0xffllu) == 0) { diff3 >>= 8llu; mmpos3 -= 4; }
1797 assert_neq(0, diff3);
1798 if((diff3 & 0xfllu) == 0) { diff3 >>= 4llu; mmpos3 -= 2; }
1799 assert_neq(0, diff3);
1800 if((diff3 & 0x3llu) == 0) { mmpos3--; }
1801 assert_neq(0, diff3);
1802 mmpos3 -= anchorCushion;
1803 assert_geq(mmpos3, 0);
1804 assert_lt(mmpos3, 32);
1805 assert_neq(mmpos1, mmpos3);
1806 assert_neq(mmpos2, mmpos3);
1807 refc3 = "ACGT"[(int)ref[rir + mmpos3]];
1808 if(mmpos3 < mmpos1) {
1809 uint32_t mmtmp = mmpos1;
1813 int refctmp = refc1;
1817 } else if(mmpos3 < mmpos2) {
1818 uint32_t mmtmp = mmpos2;
1821 int refctmp = refc2;
1825 assert_lt(mmpos1, mmpos2);
1826 assert_lt(mmpos2, mmpos3);
1830 // Now extend the anchor into a longer alignment
1831 bool foundHit = true;
1832 if(anchorOverhang > 0) {
1833 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1834 bool skipCandidate = false;
1835 for(uint32_t j = 0; j < anchorOverhang; j++) {
1836 int rc = (int)ref[rir + 32 + j];
1838 // Oops, encountered an N in the reference in
1839 // the overhang portion of the candidate
1841 // (Note that we inverted hi earlier)
1844 skipRightToLefts = anchorOverhang - j - 1;
1847 skipLeftToRights = anchorBitPairs + j;
1849 skipCandidate = true; // Skip this candidate
1852 if((int)qry[32 + j] != rc) {
1856 } else if(diffs == 3) {
1858 refc3 = "ACGT"[(int)ref[rir + 32 + j]];
1859 } else if(diffs == 2) {
1861 refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1863 assert_eq(1, diffs);
1865 refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1869 if(skipCandidate) continue;
1871 if(!foundHit) continue;
1875 // By convention, the upstream mate's
1876 // coordinates go in the 'first' field
1877 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
1878 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1880 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1881 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1883 if(pairs->find(p) != pairs->end()) {
1884 // We already found this hit! Continue.
1885 ASSERT_ONLY(duplicates++);
1893 assert_leq(diffs, 3);
1894 assert_lt(r2i, r2.size());
1895 assert_eq(re2[r2i], ri);
1896 ranges.resize(ranges.size()+1);
1897 Range& range = ranges.back();
1898 assert_eq(diffs, r2[r2i].numMms);
1899 range.stratum = diffs;
1900 range.numMms = diffs;
1901 assert_eq(0, range.mms.size());
1902 assert_eq(0, range.refcs.size());
1904 assert_neq(mmpos1, 0xffffffff);
1905 assert_eq(mmpos1, r2[r2i].mms[0]);
1906 assert_neq(-1, refc1);
1907 assert_eq(refc1, r2[r2i].refcs[0]);
1908 range.mms.push_back(mmpos1);
1909 range.refcs.push_back(refc1);
1911 assert_neq(mmpos2, 0xffffffff);
1912 assert_eq(mmpos2, r2[r2i].mms[1]);
1913 assert_neq(-1, refc2);
1914 assert_eq(refc2, r2[r2i].refcs[1]);
1915 range.mms.push_back(mmpos2);
1916 range.refcs.push_back(refc2);
1918 assert_neq(mmpos3, 0xffffffff);
1919 assert_eq(mmpos3, r2[r2i].mms[2]);
1920 assert_neq(-1, refc3);
1921 assert_eq(refc3, r2[r2i].refcs[2]);
1922 range.mms.push_back(mmpos3);
1923 range.refcs.push_back(refc3);
1928 results.push_back(ri);
1929 if(--numToFind == 0) return;
1931 assert_leq(duplicates, r2.size());
1932 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1938 * Concrete RefAligner for finding nearby 1-mismatch hits given an
1941 template<typename TStr>
1942 class Seed0RefAligner : public RefAligner<TStr> {
1944 typedef seqan::String<seqan::Dna5> TDna5Str;
1945 typedef seqan::String<char> TCharStr;
1946 typedef std::vector<Range> TRangeVec;
1947 typedef std::vector<uint32_t> TU32Vec;
1948 typedef std::pair<uint64_t, uint64_t> TU64Pair;
1949 typedef std::set<TU64Pair> TSetPairs;
1953 Seed0RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
1954 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
1956 virtual ~Seed0RefAligner() { }
1960 * This schematic shows the roles played by the begin, qbegin, end,
1961 * qend, halfway, slen, qlen, and lim variables:
1963 * seedOnLeft == true:
1965 * |< lim >|< qlen >|
1966 * --------------------------------------------------------------
1967 * | | slen | qlen-slen | | slen | qlen-slen |
1968 * --------------------------------------------------------------
1970 * begin & qbegin halfway qend end
1972 * seedOnLeft == false:
1974 * |< qlen >|< lim >|
1975 * --------------------------------------------------------------
1976 * | qlen-slen | slen | | qlen-slen | slen | |
1977 * --------------------------------------------------------------
1979 * begin qbegin halfway qend & end
1981 * Note that, for seeds longer than 32 base-pairs, the seed is
1982 * further subdivided into a 32-bit anchor and a seed overhang of
1985 void naiveFind(uint32_t numToFind,
1988 const TDna5Str& qry,
1989 const TCharStr& quals,
1996 bool seedOnLeft) const
1998 assert_gt(numToFind, 0);
1999 assert_gt(end, begin);
2000 const uint32_t qlen = seqan::length(qry);
2002 assert_geq(end - begin, qlen); // caller should have checked this
2003 assert_gt(this->seedLen_, 0);
2004 const uint32_t slen = min(qlen, this->seedLen_);
2005 uint32_t qend = end;
2006 uint32_t qbegin = begin;
2007 // If the seed is on the left-hand side of the alignment, then
2008 // leave a gap at the right-hand side of the interval;
2009 // otherwise, do the opposite
2011 // Leave gap on right-hand side of the interval
2014 // Leave gap on left-hand side of the interval
2017 // lim = number of alignments to try
2018 const uint32_t lim = qend - qbegin;
2019 // halfway = position in the reference to start at (and then
2020 // we work our way out to the right and to the left).
2021 const uint32_t halfway = qbegin + (lim >> 1);
2022 // Vectors for holding edit information
2023 std::vector<uint32_t> nonSeedMms;
2024 std::vector<uint8_t> nonSeedRefcs;
2026 for(uint32_t i = 1; i <= lim+1; i++) {
2027 uint32_t ri; // leftmost position in candidate alignment
2028 uint32_t rir; // same, minus begin; for indexing into ref[]
2030 ri = halfway + (i >> 1); rir = ri - begin;
2031 assert_leq(ri, qend);
2033 ri = halfway - (i >> 1); rir = ri - begin;
2034 assert_geq(ri, begin);
2037 // Do the naive comparison
2040 unsigned int ham = 0;
2042 nonSeedRefcs.clear();
2043 // Walk through each position of the alignment
2044 for(uint32_t jj = 0; jj < qlen; jj++) {
2047 // If seed is on the right, scan right-to-left
2052 uint32_t rirj = rir + j;
2054 assert_geq(rir, jj);
2055 rirj = rir - jj - 1;
2058 // Count Ns in the reference as mismatches
2059 const int q = (int)qry[j];
2060 const int r = (int)ref[rirj];
2063 if(q == 4 || r == 4 || q != r) {
2065 // Disallow alignments that involve an N in the
2067 const int r = (int)ref[rirj];
2069 // N in reference; bail on this alignment
2073 const int q = (int)qry[j];
2080 // More than one mismatch in the anchor; reject
2084 uint8_t qual = phredCharToPhredQual(quals[j]);
2085 ham += mmPenalty(this->maqPenalty_, qual);
2086 if(ham > this->qualMax_) {
2087 // Exceeded quality ceiling; reject
2091 // Legal mismatch outside of the anchor; record it
2093 nonSeedMms.push_back(j);
2094 assert_leq(nonSeedMms.size(), (size_t)mms);
2095 nonSeedRefcs.push_back("ACGTN"[r]);
2100 ranges.resize(ranges.size()+1);
2101 Range& range = ranges.back();
2104 assert_eq(0, range.mms.size());
2105 assert_eq(0, range.refcs.size());
2107 // Be careful to add edits in left-to-right order
2108 // with respect to the read/alignment
2109 const size_t nonSeedMmsSz = nonSeedMms.size();
2110 if(nonSeedMmsSz > 0) {
2112 for(size_t k = 0; k < nonSeedMmsSz; k++) {
2113 range.mms.push_back(nonSeedMms[k]);
2114 range.refcs.push_back(nonSeedRefcs[k]);
2117 for(size_t k = nonSeedMmsSz; k > 0; k--) {
2118 range.mms.push_back(nonSeedMms[k-1]);
2119 range.refcs.push_back(nonSeedRefcs[k-1]);
2124 assert_eq((size_t)mms, range.mms.size());
2125 assert_eq((size_t)mms, range.refcs.size());
2127 results.push_back(ri);
2129 results.push_back(ri - qlen);
2137 * This schematic shows the roles played by the begin, qbegin, end,
2138 * qend, halfway, slen, qlen, and lim variables:
2140 * seedOnLeft == true:
2142 * |< lim >|< qlen >|
2143 * --------------------------------------------------------------
2144 * | | slen | qlen-slen | | slen | qlen-slen |
2145 * --------------------------------------------------------------
2147 * begin & qbegin halfway qend end
2149 * seedOnLeft == false:
2152 * --------------------------------------------------------------
2153 * | qlen-slen | | qlen-slen | slen | | slen |
2154 * --------------------------------------------------------------
2156 * begin qbegin halfway qend end
2158 * Note that, for seeds longer than 32 base-pairs, the seed is
2159 * further subdivided into a 32-bit anchor and a seed overhang of
2162 virtual void anchor64Find(uint32_t numToFind,
2165 const TDna5Str& qry,
2166 const TCharStr& quals,
2171 TSetPairs* pairs = NULL,
2172 uint32_t aoff = 0xffffffff,
2173 bool seedOnLeft = false) const
2175 assert_gt(numToFind, 0);
2176 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
2177 ASSERT_ONLY(uint32_t duplicates = 0);
2178 ASSERT_ONLY(uint32_t r2i = 0);
2179 const uint32_t qlen = seqan::length(qry);
2181 assert_gt(end, begin);
2182 assert_geq(end - begin, qlen); // caller should have checked this
2183 assert_gt(this->seedLen_, 0);
2184 uint32_t slen = min(qlen, this->seedLen_);
2186 // Get results from the naive matcher for sanity-checking
2187 TRangeVec r2; TU32Vec re2;
2188 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
2189 re2, pairs, aoff, seedOnLeft);
2191 const uint32_t anchorBitPairs = min<int>(slen, 32);
2192 const int lhsShift = ((anchorBitPairs - 1) << 1);
2193 ASSERT_ONLY(const uint32_t anchorCushion = 32 - anchorBitPairs);
2194 // seedAnchorOverhang = # seed bases not included in the anchor
2195 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
2196 // seedAnchorOverhang = # seed bases not included in the anchor
2197 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
2198 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
2199 assert_eq(qlen, readSeedOverhang + slen);
2200 uint32_t qend = end;
2201 uint32_t qbegin = begin;
2203 // Leave read-sized gap on right-hand side of the interval
2206 // Leave seed-sized gap on right-hand side and
2207 // non-seed-sized gap on the left-hand side
2208 qbegin += readSeedOverhang;
2211 // lim = # possible alignments in the range
2212 const uint32_t lim = qend - qbegin;
2213 // halfway = point on the genome to radiate out from
2214 const uint32_t halfway = qbegin + (lim >> 1);
2215 uint64_t anchor = 0llu;
2216 uint64_t buffw = 0llu; // rotating ref sequence buffer
2217 // Set up a mask that we'll apply to the two bufs every round
2218 // to discard bits that were rotated out of the anchor area
2219 uint64_t clearMask = 0xffffffffffffffffllu;
2220 bool useMask = false;
2221 if(anchorBitPairs < 32) {
2222 clearMask >>= ((32-anchorBitPairs) << 1);
2225 uint32_t skipLeftToRights = 0;
2226 uint32_t skipRightToLefts = 0;
2227 const uint32_t halfwayRi = halfway - begin;
2228 // Construct the 'anchor' 64-bit buffer so that it holds all of
2229 // the first 'anchorBitPairs' bit pairs of the query.
2230 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
2233 // Fill in the anchor using characters from the right-
2234 // hand side of the query (but take the characters in
2235 // left-to-right order)
2236 i = qlen - slen + ii;
2238 int c = (int)qry[i]; // next query character
2239 int r = (int)ref[halfwayRi + ii]; // next reference character
2241 // The reference character is an N; to mimic the
2242 // behavior of BW alignment, we have to skip all
2243 // alignments that involve an N in the reference. Set
2244 // the skip* variables accordingly.
2246 uint32_t lrSkips = ii;
2247 uint32_t rlSkips = qlen - ii;
2248 if(!seedOnLeft && readSeedOverhang) {
2249 lrSkips += readSeedOverhang;
2250 assert_geq(rlSkips, readSeedOverhang);
2251 rlSkips -= readSeedOverhang;
2253 // The right-to-left direction absorbs the candidate
2254 // alignment based at 'halfway'
2255 skipLeftToRights = max(skipLeftToRights, lrSkips);
2256 skipRightToLefts = max(skipRightToLefts, rlSkips);
2257 assert_leq(skipLeftToRights, qlen);
2258 assert_leq(skipRightToLefts, qlen);
2262 // Special case: query has an 'N'
2264 // One or more 'N's in the anchor region; can't
2265 // possibly have a 0-mismatch hit anywhere
2266 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2267 return; // can't match if query has Ns
2269 anchor = ((anchor << 2llu) | c);
2270 buffw = ((buffw << 2llu) | r);
2272 // Check whether read is disqualified by Ns inside the seed
2273 // region but outside the anchor region
2274 if(seedAnchorOverhang) {
2275 assert_lt(anchorBitPairs, slen);
2276 for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
2279 i = qlen - slen + ii;
2281 if((int)qry[i] == 4) {
2282 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2283 return; // can't match if query has Ns
2287 assert_eq(anchorBitPairs, slen);
2289 uint64_t bufbw = buffw;
2290 // Slide the anchor out in either direction, alternating
2291 // between right-to-left and left-to-right shifts, until all of
2292 // the positions from qbegin to qend have been covered.
2294 uint32_t riHi = halfway;
2295 uint32_t rirHi = halfway - begin;
2296 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
2297 uint32_t riLo = halfway + 1;
2298 uint32_t rirLo = halfway - begin + 1;
2299 uint32_t lrSkips = anchorBitPairs;
2300 uint32_t rlSkips = qlen;
2301 if(!seedOnLeft && readSeedOverhang) {
2302 lrSkips += readSeedOverhang;
2303 assert_geq(rlSkips, readSeedOverhang);
2304 rlSkips -= readSeedOverhang;
2306 for(uint32_t i = 1; i <= lim + 1; i++) {
2307 int r; // new reference char
2309 assert_leq(skipLeftToRights, qlen);
2310 assert_leq(skipRightToLefts, qlen);
2313 // Moving left-to-right
2317 r = (int)ref[rirHiAnchor];
2320 skipLeftToRights = lrSkips;
2323 // Bring in new base pair at the least significant
2325 buffw = ((buffw << 2llu) | r);
2326 if(useMask) buffw &= clearMask;
2327 if(skipLeftToRights > 0) {
2331 diff = buffw ^ anchor;
2334 // Moving right-to-left
2337 r = (int)ref[rirLo];
2340 skipRightToLefts = rlSkips;
2345 // Bring in new base pair at the most significant
2347 bufbw |= ((uint64_t)r << lhsShift);
2349 if(skipRightToLefts > 0) {
2353 diff = bufbw ^ anchor;
2356 uint32_t ri = hi ? riLo : riHi;
2357 uint32_t rir = hi ? rirLo : rirHi;
2358 unsigned int ham = 0;
2359 // If the seed is longer than the anchor, then scan the
2360 // rest of the seed characters
2361 bool foundHit = true;
2362 if(seedAnchorOverhang) {
2363 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
2364 int rc = (int)ref[rir + anchorBitPairs + j];
2366 // Oops, encountered an N in the reference in
2367 // the overhang portion of the candidate
2369 // (Note that we inverted hi earlier)
2372 // Skip out of the seedAnchorOverhang
2373 assert_eq(0, skipRightToLefts);
2374 skipRightToLefts = seedAnchorOverhang - j - 1;
2376 // ...and skip out of the rest of the read
2377 skipRightToLefts += readSeedOverhang;
2381 // Skip out of the seedAnchorOverhang
2382 assert_eq(0, skipLeftToRights);
2383 skipLeftToRights = anchorBitPairs + j;
2385 // ...and skip out of the rest of the read
2386 skipLeftToRights += readSeedOverhang;
2389 foundHit = false; // Skip this candidate
2392 uint32_t qoff = anchorBitPairs + j;
2394 qoff += readSeedOverhang;
2396 assert_lt(qoff, qlen);
2397 if((int)qry[qoff] != rc) {
2402 if(!foundHit) continue;
2404 // If the read is longer than the seed, then scan the rest
2405 // of the read characters; mismatches no longer count
2406 // toward the stratum or the 1-mm limit.
2407 // Vectors for holding edit information
2408 std::vector<uint32_t> nonSeedMms;
2409 std::vector<uint8_t> nonSeedRefcs;
2410 int mms = 0; // start counting total mismatches
2411 if((qlen - slen) > 0) {
2412 // Going left-to-right
2413 for(uint32_t j = 0; j < readSeedOverhang; j++) {
2414 uint32_t roff = rir + slen + j;
2415 uint32_t qoff = slen + j;
2417 assert_geq(roff, qlen);
2421 int rc = (int)ref[roff];
2423 // Oops, encountered an N in the reference in
2424 // the overhang portion of the candidate
2426 // (Note that we inverted hi earlier)
2429 // Skip what's left of the readSeedOverhang
2430 skipRightToLefts = readSeedOverhang - j - 1;
2432 // ...and skip the seed if it's on the right
2433 skipRightToLefts += slen;
2437 // Skip what we've matched of the overhang
2438 skipLeftToRights = j;
2440 // ...and skip the seed if it's on the left
2441 skipLeftToRights += slen;
2444 foundHit = false; // Skip this candidate
2447 if((int)qry[qoff] != rc) {
2448 // Calculate quality of mismatched base
2449 char q = phredCharToPhredQual(quals[qoff]);
2450 ham += mmPenalty(this->maqPenalty_, q);
2451 if(ham > this->qualMax_) {
2452 // Exceeded quality limit
2456 // Legal mismatch outside of the anchor; record it
2458 nonSeedMms.push_back(qoff);
2459 assert_leq(nonSeedMms.size(), (size_t)mms);
2460 nonSeedRefcs.push_back("ACGTN"[rc]);
2463 if(!foundHit) continue;
2466 // Adjust ri if seed is on the right-hand side
2468 ri -= readSeedOverhang;
2469 rir -= readSeedOverhang;
2474 // By convention, the upstream mate's
2475 // coordinates go in the 'first' field
2476 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
2477 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2479 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
2480 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2482 if(pairs->find(p) != pairs->end()) {
2483 // We already found this hit! Continue.
2484 ASSERT_ONLY(duplicates++);
2492 if(this->verbose_) {
2493 cout << "About to report seed0:" << endl;
2495 for(size_t i = 0; i < qlen; i++) {
2496 cout << (char)qry[i];
2500 for(size_t i = 0; i < qlen; i++) {
2501 cout << "ACGT"[ref[rir+i]];
2505 assert_lt(r2i, r2.size());
2506 assert_eq(re2[r2i], ri);
2507 ranges.resize(ranges.size()+1);
2508 Range& range = ranges.back();
2509 assert_eq((size_t)mms, r2[r2i].numMms);
2512 assert_eq(0, range.mms.size());
2513 assert_eq(0, range.refcs.size());
2515 ASSERT_ONLY(size_t mmcur = 0);
2516 const size_t nonSeedMmsSz = nonSeedMms.size();
2517 for(size_t i = 0; i < nonSeedMmsSz; i++) {
2518 assert_neq(0xffffffff, nonSeedMms[i]);
2519 assert_lt(mmcur, (size_t)mms);
2520 assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
2521 range.mms.push_back(nonSeedMms[i]);
2522 assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
2523 ASSERT_ONLY(mmcur++);
2524 range.refcs.push_back(nonSeedRefcs[i]);
2526 assert_eq(mmcur, r2[r2i].mms.size());
2528 assert_eq((size_t)mms, range.mms.size());
2529 assert_eq((size_t)mms, range.refcs.size());
2531 results.push_back(ri);
2532 if(--numToFind == 0) return;
2534 assert_leq(duplicates, r2.size());
2535 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
2541 * Concrete RefAligner for finding nearby 1-mismatch hits given an
2544 template<typename TStr>
2545 class Seed1RefAligner : public RefAligner<TStr> {
2547 typedef seqan::String<seqan::Dna5> TDna5Str;
2548 typedef seqan::String<char> TCharStr;
2549 typedef std::vector<Range> TRangeVec;
2550 typedef std::vector<uint32_t> TU32Vec;
2551 typedef std::pair<uint64_t, uint64_t> TU64Pair;
2552 typedef std::set<TU64Pair> TSetPairs;
2556 Seed1RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
2557 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
2559 virtual ~Seed1RefAligner() { }
2563 * This schematic shows the roles played by the begin, qbegin, end,
2564 * qend, halfway, slen, qlen, and lim variables:
2566 * seedOnLeft == true:
2568 * |< lim >|< qlen >|
2569 * --------------------------------------------------------------
2570 * | | slen | qlen-slen | | slen | qlen-slen |
2571 * --------------------------------------------------------------
2573 * begin & qbegin halfway qend end
2575 * seedOnLeft == false:
2577 * |< qlen >|< lim >|
2578 * --------------------------------------------------------------
2579 * | qlen-slen | slen | | qlen-slen | slen | |
2580 * --------------------------------------------------------------
2582 * begin qbegin halfway qend & end
2584 * Note that, for seeds longer than 32 base-pairs, the seed is
2585 * further subdivided into a 32-bit anchor and a seed overhang of
2588 void naiveFind(uint32_t numToFind,
2591 const TDna5Str& qry,
2592 const TCharStr& quals,
2599 bool seedOnLeft) const
2601 assert_gt(numToFind, 0);
2602 assert_gt(end, begin);
2603 const uint32_t qlen = seqan::length(qry);
2605 assert_geq(end - begin, qlen); // caller should have checked this
2606 assert_gt(this->seedLen_, 0);
2607 const uint32_t slen = min(qlen, this->seedLen_);
2608 uint32_t qend = end;
2609 uint32_t qbegin = begin;
2610 // If the seed is on the left-hand side of the alignment, then
2611 // leave a gap at the right-hand side of the interval;
2612 // otherwise, do the opposite
2614 // Leave gap on right-hand side of the interval
2617 // Leave gap on left-hand side of the interval
2620 // lim = number of alignments to try
2621 const uint32_t lim = qend - qbegin;
2622 // halfway = position in the reference to start at (and then
2623 // we work our way out to the right and to the left).
2624 const uint32_t halfway = qbegin + (lim >> 1);
2625 // Vectors for holding edit information
2626 std::vector<uint32_t> nonSeedMms;
2627 assert_eq(0, nonSeedMms.size());
2628 std::vector<uint8_t> nonSeedRefcs;
2629 assert_eq(0, nonSeedRefcs.size());
2631 for(uint32_t i = 1; i <= lim+1; i++) {
2632 uint32_t ri; // leftmost position in candidate alignment
2633 uint32_t rir; // same, minus begin; for indexing into ref[]
2635 ri = halfway + (i >> 1); rir = ri - begin;
2636 assert_leq(ri, qend);
2638 ri = halfway - (i >> 1); rir = ri - begin;
2639 assert_geq(ri, begin);
2642 // Do the naive comparison
2645 uint32_t mmOff = 0xffffffff;
2648 unsigned int ham = 0;
2650 nonSeedRefcs.clear();
2651 // Walk through each position of the alignment
2652 for(uint32_t jj = 0; jj < qlen; jj++) {
2655 // If seed is on the right, scan right-to-left
2660 uint32_t rirj = rir + j;
2662 assert_geq(rir, jj);
2663 rirj = rir - jj - 1;
2666 // Count Ns in the reference as mismatches
2667 const int q = (int)qry[j];
2668 const int r = (int)ref[rirj];
2671 if(q == 4 || r == 4 || q != r) {
2673 // Disallow alignments that involve an N in the
2675 const int r = (int)ref[rirj];
2677 // N in reference; bail on this alignment
2681 const int q = (int)qry[j];
2688 if(mms > 1 && jj < slen) {
2689 // More than one mismatch in the anchor; reject
2693 uint8_t qual = phredCharToPhredQual(quals[j]);
2694 ham += mmPenalty(this->maqPenalty_, qual);
2695 if(ham > this->qualMax_) {
2696 // Exceeded quality ceiling; reject
2699 } else if(jj < slen) {
2700 // First mismatch in the anchor; remember offset
2706 // Legal mismatch outside of the anchor; record it
2707 nonSeedMms.push_back(j);
2708 assert_leq(nonSeedMms.size(), (size_t)mms);
2709 nonSeedRefcs.push_back("ACGTN"[r]);
2714 ranges.resize(ranges.size()+1);
2715 Range& range = ranges.back();
2716 assert_leq(seedMms, mms);
2717 range.stratum = seedMms;
2719 assert_eq(0, range.mms.size());
2720 assert_eq(0, range.refcs.size());
2722 // Be careful to add edits in left-to-right order
2723 // with respect to the read/alignment
2724 if(seedOnLeft && seedMms) {
2725 assert_lt(mmOff, qlen);
2726 range.mms.push_back(mmOff);
2727 range.refcs.push_back(refc);
2729 const size_t nonSeedMmsSz = nonSeedMms.size();
2730 if(nonSeedMmsSz > 0) {
2732 for(size_t k = 0; k < nonSeedMmsSz; k++) {
2733 range.mms.push_back(nonSeedMms[k]);
2734 range.refcs.push_back(nonSeedRefcs[k]);
2737 for(size_t k = nonSeedMmsSz; k > 0; k--) {
2738 range.mms.push_back(nonSeedMms[k-1]);
2739 range.refcs.push_back(nonSeedRefcs[k-1]);
2743 if(!seedOnLeft && seedMms) {
2744 assert_lt(mmOff, qlen);
2745 range.mms.push_back(mmOff);
2746 range.refcs.push_back(refc);
2749 assert_eq((size_t)mms, range.mms.size());
2750 assert_eq((size_t)mms, range.refcs.size());
2752 results.push_back(ri);
2754 results.push_back(ri - qlen);