12 #include "seqan/sequence.h"
15 #include "reference.h"
17 // Let the reference-aligner buffer size be 16K by default. If more
18 // room is required, a new buffer must be allocated from the heap.
19 const static int REF_ALIGNER_BUFSZ = 16 * 1024;
22 * Abstract parent class for classes that look for alignments by
23 * matching against the reference sequence directly. This is useful
24 * both for sanity-checking results from the Bowtie index and for
25 * finding mates when the reference location of the opposite mate is
28 template<typename TStr>
31 typedef seqan::String<seqan::Dna5> TDna5Str;
32 typedef seqan::String<char> TCharStr;
33 typedef std::vector<uint32_t> TU32Vec;
34 typedef std::vector<Range> TRangeVec;
35 typedef std::pair<uint64_t, uint64_t> TU64Pair;
36 typedef std::set<TU64Pair> TSetPairs;
39 RefAligner(bool color,
43 uint32_t qualMax = 0xffffffff,
44 bool maqPenalty = false) :
45 color_(color), verbose_(verbose), seedLen_(seedLen),
46 qualMax_(qualMax), maqPenalty_(maqPenalty), refbuf_(buf_),
47 refbufSz_(REF_ALIGNER_BUFSZ), freeRefbuf_(false)
51 * Free the reference-space alignment buffer if this object
54 virtual ~RefAligner() {
61 * Find one alignment of qry:quals in the range begin-end in
62 * reference string ref. Store the alignment details in range.
64 virtual void find(uint32_t numToFind,
66 const BitPairReference *refs,
68 const TCharStr& quals,
73 TSetPairs* pairs = NULL,
74 uint32_t aoff = 0xffffffff,
75 bool seedOnLeft = false)
77 assert_gt(numToFind, 0);
78 assert_gt(end, begin);
79 uint32_t spread = end - begin + (color_ ? 1 : 0);
80 uint32_t spreadPlus = spread + 12;
81 // Make sure the buffer is large enough to accommodate the spread
82 if(spreadPlus > this->refbufSz_) {
83 this->newBuf(spreadPlus);
85 // Read in the relevant stretch of the reference string
86 int offset = refs->getStretch(this->refbuf_, tidx, begin, spread);
87 uint8_t *buf = ((uint8_t*)this->refbuf_) + offset;
90 for(size_t i = 0; i < (end-begin); i++) {
91 assert_leq((int)buf[i], 4);
92 buf[i] = dinuc2color[(int)buf[i]][(int)buf[i+1]];
95 // Look for alignments
96 ASSERT_ONLY(uint32_t irsz = ranges.size());
97 anchor64Find(numToFind, tidx, buf, qry, quals, begin,
98 end, ranges, results, pairs, aoff, seedOnLeft);
100 for(size_t i = irsz; i < results.size(); i++) {
101 assert_eq(ranges[i].numMms, ranges[i].mms.size());
102 assert_eq(ranges[i].numMms, ranges[i].refcs.size());
108 * Find one alignment of qry:quals in the range begin-end in
109 * reference string ref. Store the alignment details in range.
110 * Uses a combination of the anchor bases and 64-bit arithmetic to
111 * find anchors quickly.
113 virtual void anchor64Find(uint32_t numToFind,
117 const TCharStr& quals,
122 TSetPairs* pairs = NULL,
123 uint32_t aoff = 0xffffffff,
124 bool seedOnLeft = false) const = 0;
127 * Set a new reference-sequence buffer.
129 void setBuf(uint32_t *newbuf, uint32_t newsz) {
139 * Set a new reference-sequence buffer.
141 void newBuf(uint32_t newsz) {
146 refbuf_ = new uint32_t[(newsz + 3) / 4];
147 if(refbuf_ == NULL) throw std::bad_alloc();
148 } catch(std::bad_alloc& e) {
149 cerr << "Error: Could not allocate reference-space alignment buffer of " << newsz << "B" << endl;
157 bool color_; /// whether to colorize reference buffers before handing off
158 bool verbose_; /// be talkative
159 uint32_t seedLen_; /// length of seed region for read
160 uint32_t qualMax_; /// maximum sum of quality penalties
161 bool maqPenalty_;/// whether to round as in Maq
162 uint32_t *refbuf_; /// pointer to current reference buffer
163 uint32_t refbufSz_; /// size of current reference buffer
164 uint32_t buf_[REF_ALIGNER_BUFSZ / 4]; /// built-in reference buffer (may be superseded)
165 bool freeRefbuf_; /// whether refbuf_ points to something we should delete
169 * Concrete RefAligner for finding nearby exact hits given an anchor
172 template<typename TStr>
173 class ExactRefAligner : public RefAligner<TStr> {
175 typedef seqan::String<seqan::Dna5> TDna5Str;
176 typedef seqan::String<char> TCharStr;
177 typedef std::vector<uint32_t> TU32Vec;
178 typedef std::vector<Range> TRangeVec;
179 typedef std::pair<uint64_t, uint64_t> TU64Pair;
180 typedef std::set<TU64Pair> TSetPairs;
184 ExactRefAligner(bool color, bool verbose, bool quiet) :
185 RefAligner<TStr>(color, verbose, quiet) { }
187 virtual ~ExactRefAligner() { }
191 * Because we're doing end-to-end exact, we don't care which end of
192 * 'qry' is the 5' end.
194 void naiveFind(uint32_t numToFind,
198 const TCharStr& quals,
205 bool seedOnLeft) const
207 assert_gt(numToFind, 0);
208 const uint32_t qlen = seqan::length(qry);
209 assert_geq(end - begin, qlen); // caller should have checked this
210 assert_gt(end, begin);
212 uint32_t qend = end - qlen;
213 uint32_t lim = qend - begin;
214 uint32_t halfway = begin + (lim >> 1);
216 for(uint32_t i = 1; i <= lim+1; i++) {
217 uint32_t ri; // leftmost position in candidate alignment
218 uint32_t rir; // same, minus begin; for indexing into ref[]
220 ri = halfway + (i >> 1); rir = ri - begin;
221 assert_leq(ri, qend);
223 ri = halfway - (i >> 1); rir = ri - begin;
224 assert_geq(ri, begin);
227 // Do the naive comparison
229 for(uint32_t j = 0; j < qlen; j++) {
231 // Count Ns in the reference as mismatches
232 const int q = (int)qry[j];
233 const int r = (int)ref[rir + j];
236 if(q == 4 || r == 4 || q != r) {
243 // Disallow alignments that involve an N in the
245 const int r = (int)ref[rir + j];
247 // N in reference; bail
251 const int q = (int)qry[j];
263 ranges.resize(ranges.size()+1);
264 Range& range = ranges.back();
267 assert_eq(0, range.mms.size());
268 assert_eq(0, range.refcs.size());
269 results.push_back(ri);
276 * Because we're doing end-to-end exact, we don't care which end of
277 * 'qry' is the 5' end.
279 virtual void anchor64Find(uint32_t numToFind,
283 const TCharStr& quals,
289 uint32_t aoff, // offset of anchor mate
290 bool seedOnLeft) const
292 assert_gt(numToFind, 0);
293 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
294 ASSERT_ONLY(uint32_t duplicates = 0);
295 ASSERT_ONLY(uint32_t r2i = 0);
296 const uint32_t qlen = seqan::length(qry);
297 assert_geq(end - begin, qlen); // caller should have checked this
298 assert_gt(end, begin);
301 // Get all naive hits
302 TRangeVec r2; TU32Vec re2;
303 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
304 re2, pairs, aoff, seedOnLeft);
306 const uint32_t anchorBitPairs = min<int>(qlen, 32);
307 // anchorOverhang = # read bases not included in the anchor
308 const uint32_t anchorOverhang = qlen <= 32 ? 0 : qlen - 32;
309 const uint32_t lim = end - qlen - begin;
310 const uint32_t halfway = begin + (lim >> 1);
311 uint64_t anchor = 0llu;
312 uint64_t buffw = 0llu;
313 // Set up a mask that we'll apply to the two bufs every round
314 // to discard bits that were rotated out of the anchor area
315 uint64_t clearMask = 0xffffffffffffffffllu;
316 bool useMask = false;
317 if(anchorBitPairs < 32) {
318 clearMask >>= ((32-anchorBitPairs) << 1);
321 const int lhsShift = ((anchorBitPairs - 1) << 1);
322 // Build the contents of the 'anchor' dword and the initial
323 // contents of the 'buffw' dword. If there are fewer than 32
324 // anchorBitPairs, the content will be packed into the least
325 // significant bits of the word.
326 uint32_t skipLeftToRights = 0;
327 uint32_t skipRightToLefts = 0;
328 for(uint32_t i = 0; i < anchorBitPairs; i++) {
329 int c = (int)qry[i]; // next query character
332 assert_eq(r2.size(), ranges.size() - rangesInitSz);
333 return; // can't match if query has Ns
335 int r = (int)ref[halfway - begin + i]; // next reference character
338 // The right-to-left direction absorbs the candidate
339 // alignment based at 'halfway'; so skipLeftToRights is
341 skipLeftToRights = max(skipLeftToRights, i);
342 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
346 anchor = ((anchor << 2llu) | c);
347 buffw = ((buffw << 2llu) | r);
349 // Check whether read is disqualified by Ns outside of the anchor
351 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
352 if((int)qry[i] == 4) {
353 assert_eq(r2.size(), ranges.size() - rangesInitSz);
354 return; // can't match if query has Ns
357 uint64_t bufbw = buffw;
358 // We're moving the right-hand edge of the anchor along until
359 // it's 'anchorOverhang' chars from the end of the target region.
360 // Note that we're not making a 3'/5' distinction here; if we
361 // were, we might need to make the 'anchorOverhang' adjustment on
362 // the left end of the range rather than the right.
364 uint32_t riHi = halfway;
365 uint32_t rirHi = halfway - begin;
366 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
367 uint32_t riLo = halfway + 1;
368 uint32_t rirLo = halfway - begin + 1;
369 for(uint32_t i = 1; i <= lim + 1; i++) {
370 int r; // new reference char
371 assert_lt(skipLeftToRights, qlen);
372 assert_leq(skipRightToLefts, qlen);
375 // Moving left-to-right
376 riHi++; rirHi++; rirHiAnchor++;
377 r = (int)ref[rirHiAnchor];
380 skipLeftToRights = anchorBitPairs;
383 // Bring in new base pair at the least significant
385 buffw = ((buffw << 2llu) | r);
386 if(useMask) buffw &= clearMask;
387 if(skipLeftToRights > 0) {
391 if(buffw != anchor) {
396 // Moving right-to-left
401 skipRightToLefts = qlen;
406 // Bring in new base pair at the most significant
408 bufbw |= ((uint64_t)r << lhsShift);
410 if(skipRightToLefts > 0) {
414 if(bufbw != anchor) {
419 bool foundHit = true;
420 uint32_t ri = hi ? riLo : riHi;
421 uint32_t rir = hi ? rirLo : rirHi;
422 if(anchorOverhang > 0) {
423 // Does the non-anchor part of the alignment (the
424 // "overhang") ruin it?
425 bool skipCandidate = false;
426 for(uint32_t j = 0; j < anchorOverhang; j++) {
427 assert_lt(ri + anchorBitPairs + j, end);
428 int rc = (int)ref[rir + anchorBitPairs + j];
430 // Oops, encountered an N in the reference in
431 // the overhang portion of the candidate
433 // (Note that we inverted hi earlier)
436 skipRightToLefts = anchorOverhang - j - 1;
439 skipLeftToRights = anchorBitPairs + j;
441 skipCandidate = true;
442 break; // Skip this candidate
444 if((int)qry[32 + j] != rc) {
445 // Yes, overhang ruins it
450 if(skipCandidate) continue;
456 // By convention, the upstream mate's
457 // coordinates go in the 'first' field
458 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
459 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
461 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
462 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
464 if(pairs->find(p) != pairs->end()) {
465 // We already found this hit! Continue.
466 ASSERT_ONLY(duplicates++);
474 assert_lt(r2i, r2.size());
475 assert_eq(re2[r2i], ri);
476 ranges.resize(ranges.size()+1);
477 Range& range = ranges.back();
480 assert_eq(0, range.mms.size());
481 assert_eq(0, range.refcs.size());
483 results.push_back(ri);
484 if(--numToFind == 0) return;
489 assert_leq(duplicates, r2.size());
490 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
496 * Defined in ref_aligner.cpp. Maps an octet representing the XOR of
497 * two two-bit-per-base-encoded DNA sequences to the number of bases
498 * that mismatch between the two.
500 extern unsigned char u8toMms[];
503 * Concrete RefAligner for finding nearby 1-mismatch hits given an
506 template<typename TStr>
507 class OneMMRefAligner : public RefAligner<TStr> {
509 typedef seqan::String<seqan::Dna5> TDna5Str;
510 typedef seqan::String<char> TCharStr;
511 typedef std::vector<Range> TRangeVec;
512 typedef std::vector<uint32_t> TU32Vec;
513 typedef std::pair<uint64_t, uint64_t> TU64Pair;
514 typedef std::set<TU64Pair> TSetPairs;
518 OneMMRefAligner(bool color, bool verbose, bool quiet) :
519 RefAligner<TStr>(color, verbose, quiet) { }
521 virtual ~OneMMRefAligner() { }
525 * Because we're doing end-to-end exact, we don't care which end of
526 * 'qry' is the 5' end.
528 void naiveFind(uint32_t numToFind,
532 const TCharStr& quals,
539 bool seedOnLeft) const
541 assert_gt(numToFind, 0);
542 const uint32_t qlen = seqan::length(qry);
543 assert_geq(end - begin, qlen); // caller should have checked this
544 assert_gt(end, begin);
546 uint32_t qend = end - qlen;
547 uint32_t lim = qend - begin;
548 uint32_t halfway = begin + (lim >> 1);
550 for(uint32_t i = 1; i <= lim+1; i++) {
551 uint32_t ri; // leftmost position in candidate alignment
552 uint32_t rir; // same, minus begin; for indexing into ref[]
554 ri = halfway + (i >> 1); rir = ri - begin;
555 assert_leq(ri, qend);
557 ri = halfway - (i >> 1); rir = ri - begin;
558 assert_geq(ri, begin);
561 // Do the naive comparison
564 uint32_t mmOff = 0xffffffff;
566 for(uint32_t j = 0; j < qlen; j++) {
568 // Count Ns in the reference as mismatches
569 const int q = (int)qry[j];
570 const int r = (int)ref[rir + j];
573 if(q == 4 || r == 4 || q != r) {
575 // Disallow alignments that involve an N in the
577 const int r = (int)ref[rir + j];
579 // N in reference; bail
583 const int q = (int)qry[j];
590 // Too many; reject this alignment
594 // First one; remember offset and ref char
602 ranges.resize(ranges.size()+1);
603 Range& range = ranges.back();
606 assert_eq(0, range.mms.size());
607 assert_eq(0, range.refcs.size());
609 assert_lt(mmOff, qlen);
610 range.mms.push_back(mmOff);
611 range.refcs.push_back(refc);
613 results.push_back(ri);
620 * Because we're doing end-to-end exact, we don't care which end of
621 * 'qry' is the 5' end.
623 virtual void anchor64Find(uint32_t numToFind,
627 const TCharStr& quals,
632 TSetPairs* pairs = NULL,
633 uint32_t aoff = 0xffffffff,
634 bool seedOnLeft = false) const
636 assert_gt(numToFind, 0);
637 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
638 ASSERT_ONLY(uint32_t duplicates = 0);
639 ASSERT_ONLY(uint32_t r2i = 0);
640 const uint32_t qlen = seqan::length(qry);
641 assert_geq(end - begin, qlen); // caller should have checked this
642 assert_gt(end, begin);
645 // Get results from the naive matcher for sanity-checking
646 TRangeVec r2; TU32Vec re2;
647 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
648 re2, pairs, aoff, seedOnLeft);
650 const uint32_t anchorBitPairs = min<int>(qlen, 32);
651 const int lhsShift = ((anchorBitPairs - 1) << 1);
652 const uint32_t anchorCushion = 32 - anchorBitPairs;
653 // anchorOverhang = # read bases not included in the anchor
654 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
655 const uint32_t lim = end - qlen - begin;
656 const uint32_t halfway = begin + (lim >> 1);
657 uint64_t anchor = 0llu;
658 uint64_t buffw = 0llu; // rotating ref sequence buffer
659 // OR the 'diff' buffer with this so that we can always count
660 // 'N's as mismatches
661 uint64_t diffMask = 0llu;
662 // Set up a mask that we'll apply to the two bufs every round
663 // to discard bits that were rotated out of the anchor area
664 uint64_t clearMask = 0xffffffffffffffffllu;
665 bool useMask = false;
666 if(anchorBitPairs < 32) {
667 clearMask >>= ((32-anchorBitPairs) << 1);
672 uint32_t skipLeftToRights = 0;
673 uint32_t skipRightToLefts = 0;
674 // Construct the 'anchor' 64-bit buffer so that it holds all of
675 // the first 'anchorBitPairs' bit pairs of the query.
676 for(uint32_t i = 0; i < anchorBitPairs; i++) {
677 int c = (int)qry[i]; // next query character
678 int r = (int)ref[halfway - begin + i]; // next reference character
681 // The right-to-left direction absorbs the candidate
682 // alignment based at 'halfway'; so skipLeftToRights is
684 skipLeftToRights = max(skipLeftToRights, i);
685 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
689 // Special case: query has an 'N'
691 if(++nsInAnchor > 1) {
692 // More than one 'N' in the anchor region; can't
693 // possibly have a 1-mismatch hit anywhere
694 assert_eq(r2.size(), ranges.size() - rangesInitSz);
695 return; // can't match if query has Ns
698 // Make it look like an 'A' in the anchor
700 diffMask = (diffMask << 2llu) | 1llu;
704 anchor = ((anchor << 2llu) | c);
705 buffw = ((buffw << 2llu) | r);
707 // Check whether read is disqualified by Ns outside of the anchor
709 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
710 if((int)qry[i] == 4) {
711 if(++nsInAnchor > 1) {
712 assert_eq(r2.size(), ranges.size() - rangesInitSz);
713 return; // can't match if query has Ns
717 uint64_t bufbw = buffw;
718 // We're moving the right-hand edge of the anchor along until
719 // it's 'anchorOverhang' chars from the end of the target region.
720 // Note that we're not making a 3'/5' distinction here; if we
721 // were, we might need to make the 'anchorOverhang' adjustment on
722 // the left end of the range rather than the right.
724 uint32_t riHi = halfway;
725 uint32_t rirHi = halfway - begin;
726 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
727 uint32_t riLo = halfway + 1;
728 uint32_t rirLo = halfway - begin + 1;
729 for(uint32_t i = 1; i <= lim + 1; i++) {
730 int r; // new reference char
732 assert_lt(skipLeftToRights, qlen);
733 assert_leq(skipRightToLefts, qlen);
736 // Moving left-to-right
740 r = (int)ref[rirHiAnchor];
743 skipLeftToRights = anchorBitPairs;
746 // Bring in new base pair at the least significant
748 buffw = ((buffw << 2llu) | r);
749 if(useMask) buffw &= clearMask;
750 if(skipLeftToRights > 0) {
754 diff = (buffw ^ anchor) | diffMask;
757 // Moving right-to-left
763 skipRightToLefts = qlen;
768 // Bring in new base pair at the most significant
770 bufbw |= ((uint64_t)r << lhsShift);
772 if(skipRightToLefts > 0) {
776 diff = (bufbw ^ anchor) | diffMask;
778 if((diff & 0xffffffff00000000llu) &&
779 (diff & 0x00000000ffffffffllu)) continue;
780 uint32_t ri = hi ? riLo : riHi;
781 uint32_t rir = hi ? rirLo : rirHi;
782 // Could use pop count
783 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
784 // As a first cut, see if there are too many mismatches in
785 // the first and last parts of the anchor
786 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
787 if(diffs > 1) continue;
788 diffs += u8toMms[(int)diff8[1]] +
789 u8toMms[(int)diff8[2]] +
790 u8toMms[(int)diff8[3]] +
791 u8toMms[(int)diff8[4]] +
792 u8toMms[(int)diff8[5]] +
793 u8toMms[(int)diff8[6]];
794 uint32_t mmpos = 0xffffffff;
797 // Too many differences
799 } else if(diffs == 1 && nPos != -1) {
800 // There was one difference, but there was also one N,
801 // so we already know where the difference is
803 refc = "ACGT"[(int)ref[rir + nPos]];
804 } else if(diffs == 1) {
805 // Figure out which position mismatched
807 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos -= 16; }
809 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos -= 8; }
811 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos -= 4; }
813 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos -= 2; }
815 if((diff & 0x3llu) == 0) { mmpos--; }
817 assert_geq(mmpos, 0);
818 assert_lt(mmpos, 32);
819 mmpos -= anchorCushion;
820 refc = "ACGT"[(int)ref[rir + mmpos]];
822 // Now extend the anchor into a longer alignment
823 bool foundHit = true;
824 if(anchorOverhang > 0) {
825 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
826 bool skipCandidate = false;
827 for(uint32_t j = 0; j < anchorOverhang; j++) {
828 int rc = (int)ref[rir + 32 + j];
830 // Oops, encountered an N in the reference in
831 // the overhang portion of the candidate
833 // (Note that we inverted hi earlier)
836 skipRightToLefts = anchorOverhang - j - 1;
839 skipLeftToRights = anchorBitPairs + j;
841 skipCandidate = true; // Skip this candidate
844 if((int)qry[32 + j] != rc) {
850 refc = "ACGT"[(int)ref[rir + 32 + j]];
854 if(skipCandidate) continue;
856 if(!foundHit) continue;
860 // By convention, the upstream mate's
861 // coordinates go in the 'first' field
862 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
863 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
865 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
866 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
868 if(pairs->find(p) != pairs->end()) {
869 // We already found this hit! Continue.
870 ASSERT_ONLY(duplicates++);
878 assert_leq(diffs, 1);
879 assert_lt(r2i, r2.size());
880 assert_eq(re2[r2i], ri);
881 ranges.resize(ranges.size()+1);
882 Range& range = ranges.back();
883 assert_eq(diffs, r2[r2i].numMms);
884 range.stratum = diffs;
885 range.numMms = diffs;
886 assert_eq(0, range.mms.size());
887 assert_eq(0, range.refcs.size());
889 assert_neq(mmpos, 0xffffffff);
890 assert_eq(mmpos, r2[r2i].mms[0]);
891 assert_neq(-1, refc);
892 assert_eq(refc, r2[r2i].refcs[0]);
893 range.mms.push_back(mmpos);
894 range.refcs.push_back(refc);
897 results.push_back(ri);
898 if(--numToFind == 0) return;
900 assert_leq(duplicates, r2.size());
901 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
907 * Concrete RefAligner for finding nearby 2-mismatch hits given an
910 template<typename TStr>
911 class TwoMMRefAligner : public RefAligner<TStr> {
913 typedef seqan::String<seqan::Dna5> TDna5Str;
914 typedef seqan::String<char> TCharStr;
915 typedef std::vector<Range> TRangeVec;
916 typedef std::vector<uint32_t> TU32Vec;
917 typedef std::pair<uint64_t, uint64_t> TU64Pair;
918 typedef std::set<TU64Pair> TSetPairs;
922 TwoMMRefAligner(bool color, bool verbose, bool quiet) :
923 RefAligner<TStr>(color, verbose, quiet) { }
925 virtual ~TwoMMRefAligner() { }
929 * Because we're doing end-to-end exact, we don't care which end of
930 * 'qry' is the 5' end.
932 void naiveFind(uint32_t numToFind,
936 const TCharStr& quals,
943 bool seedOnLeft) const
945 assert_gt(numToFind, 0);
946 const uint32_t qlen = seqan::length(qry);
947 assert_geq(end - begin, qlen); // caller should have checked this
948 assert_gt(end, begin);
950 uint32_t qend = end - qlen;
951 uint32_t lim = qend - begin;
952 uint32_t halfway = begin + (lim >> 1);
954 for(uint32_t i = 1; i <= lim+1; i++) {
955 uint32_t ri; // leftmost position in candidate alignment
956 uint32_t rir; // same, minus begin; for indexing into ref[]
958 ri = halfway + (i >> 1); rir = ri - begin;
959 assert_leq(ri, qend);
961 ri = halfway - (i >> 1); rir = ri - begin;
962 assert_geq(ri, begin);
965 // Do the naive comparison
968 uint32_t mmOff1 = 0xffffffff;
970 uint32_t mmOff2 = 0xffffffff;
972 for(uint32_t j = 0; j < qlen; j++) {
974 // Count Ns in the reference as mismatches
975 const int q = (int)qry[j];
976 const int r = (int)ref[rir + j];
979 if(q == 4 || r == 4 || q != r) {
981 // Disallow alignments that involve an N in the
983 const int r = (int)ref[rir + j];
985 // N in reference; bail
989 const int q = (int)qry[j];
996 // Too many; reject this alignment
999 } else if(mms == 2) {
1000 // Second one; remember offset and ref char
1005 // First one; remember offset and ref char
1013 ranges.resize(ranges.size()+1);
1014 Range& range = ranges.back();
1015 range.stratum = mms;
1017 assert_eq(0, range.mms.size());
1018 assert_eq(0, range.refcs.size());
1020 assert_lt(mmOff1, qlen);
1021 assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1022 range.mms.push_back(mmOff1);
1023 range.refcs.push_back(refc1);
1026 assert_lt(mmOff2, qlen);
1027 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1028 range.mms.push_back(mmOff2);
1029 range.refcs.push_back(refc2);
1032 results.push_back(ri);
1039 * Because we're doing end-to-end exact, we don't care which end of
1040 * 'qry' is the 5' end.
1042 virtual void anchor64Find(uint32_t numToFind,
1045 const TDna5Str& qry,
1046 const TCharStr& quals,
1051 TSetPairs* pairs = NULL,
1052 uint32_t aoff = 0xffffffff,
1053 bool seedOnLeft = false) const
1055 assert_gt(numToFind, 0);
1056 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1057 ASSERT_ONLY(uint32_t duplicates = 0);
1058 ASSERT_ONLY(uint32_t r2i = 0);
1059 const uint32_t qlen = seqan::length(qry);
1060 assert_geq(end - begin, qlen); // caller should have checked this
1061 assert_gt(end, begin);
1064 // Get results from the naive matcher for sanity-checking
1065 TRangeVec r2; TU32Vec re2;
1066 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1067 re2, pairs, aoff, seedOnLeft);
1069 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1070 const int lhsShift = ((anchorBitPairs - 1) << 1);
1071 const uint32_t anchorCushion = 32 - anchorBitPairs;
1072 // anchorOverhang = # read bases not included in the anchor
1073 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1074 const uint32_t lim = end - qlen - begin;
1075 const uint32_t halfway = begin + (lim >> 1);
1076 uint64_t anchor = 0llu;
1077 uint64_t buffw = 0llu; // rotating ref sequence buffer
1078 // OR the 'diff' buffer with this so that we can always count
1079 // 'N's as mismatches
1080 uint64_t diffMask = 0llu;
1081 // Set up a mask that we'll apply to the two bufs every round
1082 // to discard bits that were rotated out of the anchor area
1083 uint64_t clearMask = 0xffffffffffffffffllu;
1084 bool useMask = false;
1085 if(anchorBitPairs < 32) {
1086 clearMask >>= ((32-anchorBitPairs) << 1);
1093 uint32_t skipLeftToRights = 0;
1094 uint32_t skipRightToLefts = 0;
1095 // Construct the 'anchor' 64-bit buffer so that it holds all of
1096 // the first 'anchorBitPairs' bit pairs of the query.
1097 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1098 int c = (int)qry[i]; // next query character
1099 int r = (int)ref[halfway - begin + i]; // next reference character
1102 // The right-to-left direction absorbs the candidate
1103 // alignment based at 'halfway'; so skipLeftToRights is
1105 skipLeftToRights = max(skipLeftToRights, i);
1106 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1110 // Special case: query has an 'N'
1112 if(++nsInAnchor > 2) {
1113 // More than two 'N's in the anchor region; can't
1114 // possibly have a 2-mismatch hit anywhere
1115 return; // can't match if query has Ns
1116 } else if(nsInAnchor == 2) {
1120 assert_eq(1, nsInAnchor);
1124 // Make it look like an 'A' in the anchor
1126 diffMask = (diffMask << 2llu) | 1llu;
1130 anchor = ((anchor << 2llu) | c);
1131 buffw = ((buffw << 2llu) | r);
1133 assert_leq(nPoss, 2);
1134 // Check whether read is disqualified by Ns outside of the anchor
1136 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1137 if((int)qry[i] == 4) {
1138 if(++nsInAnchor > 2) {
1139 return; // can't match if query has Ns
1143 uint64_t bufbw = buffw;
1144 // We're moving the right-hand edge of the anchor along until
1145 // it's 'anchorOverhang' chars from the end of the target region.
1146 // Note that we're not making a 3'/5' distinction here; if we
1147 // were, we might need to make the 'anchorOverhang' adjustment on
1148 // the left end of the range rather than the right.
1150 uint32_t riHi = halfway;
1151 uint32_t rirHi = halfway - begin;
1152 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1153 uint32_t riLo = halfway + 1;
1154 uint32_t rirLo = halfway - begin + 1;
1156 for(i = 1; i <= lim + 1; i++) {
1157 int r; // new reference char
1159 assert_lt(skipLeftToRights, qlen);
1160 assert_leq(skipRightToLefts, qlen);
1163 // Moving left-to-right
1167 r = (int)ref[rirHiAnchor];
1170 skipLeftToRights = anchorBitPairs;
1173 // Bring in new base pair at the least significant
1175 buffw = ((buffw << 2llu) | r);
1176 if(useMask) buffw &= clearMask;
1177 if(skipLeftToRights > 0) {
1181 diff = (buffw ^ anchor) | diffMask;
1184 // Moving right-to-left
1187 r = (int)ref[rirLo];
1190 skipRightToLefts = qlen;
1195 // Bring in new base pair at the most significant
1197 bufbw |= ((uint64_t)r << lhsShift);
1199 if(skipRightToLefts > 0) {
1203 diff = (bufbw ^ anchor) | diffMask;
1205 if((diff & 0xfffff00000000000llu) &&
1206 (diff & 0x00000ffffff00000llu) &&
1207 (diff & 0x00000000000fffffllu)) continue;
1208 uint32_t ri = hi ? riLo : riHi;
1209 uint32_t rir = hi ? rirLo : rirHi;
1210 // Could use pop count
1211 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1212 // As a first cut, see if there are too many mismatches in
1213 // the first and last parts of the anchor
1214 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1215 if(diffs > 2) continue;
1216 diffs += u8toMms[(int)diff8[1]] +
1217 u8toMms[(int)diff8[2]] +
1218 u8toMms[(int)diff8[3]] +
1219 u8toMms[(int)diff8[4]] +
1220 u8toMms[(int)diff8[5]] +
1221 u8toMms[(int)diff8[6]];
1222 uint32_t mmpos1 = 0xffffffff;
1224 uint32_t mmpos2 = 0xffffffff;
1227 // Too many differences
1229 } else if(nPoss > 1 && diffs == nPoss) {
1230 // There was one difference, but there was also one N,
1231 // so we already know where the difference is
1233 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1236 refc2 = "ACGT"[(int)ref[rir + nPos2]];
1238 } else if(diffs > 0) {
1239 // Figure out which position mismatched
1240 uint64_t diff2 = diff;
1242 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1243 assert_neq(0, diff);
1244 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos1 -= 8; }
1245 assert_neq(0, diff);
1246 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos1 -= 4; }
1247 assert_neq(0, diff);
1248 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos1 -= 2; }
1249 assert_neq(0, diff);
1250 if((diff & 0x3llu) == 0) { mmpos1--; }
1251 assert_neq(0, diff);
1252 assert_geq(mmpos1, 0);
1253 assert_lt(mmpos1, 32);
1254 mmpos1 -= anchorCushion;
1255 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1257 // Figure out the second mismatched position
1258 ASSERT_ONLY(uint64_t origDiff2 = diff2);
1259 diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1+anchorCushion) << 1));
1260 assert_neq(diff2, origDiff2);
1262 if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1263 assert_neq(0, diff2);
1264 if((diff2 & 0xffffllu) == 0) { diff2 >>= 16llu; mmpos2 -= 8; }
1265 assert_neq(0, diff2);
1266 if((diff2 & 0xffllu) == 0) { diff2 >>= 8llu; mmpos2 -= 4; }
1267 assert_neq(0, diff2);
1268 if((diff2 & 0xfllu) == 0) { diff2 >>= 4llu; mmpos2 -= 2; }
1269 assert_neq(0, diff2);
1270 if((diff2 & 0x3llu) == 0) { mmpos2--; }
1271 assert_neq(0, diff2);
1272 assert_geq(mmpos2, 0);
1273 assert_lt(mmpos2, 32);
1274 mmpos2 -= anchorCushion;
1275 assert_neq(mmpos1, mmpos2);
1276 refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1277 if(mmpos2 < mmpos1) {
1278 uint32_t mmtmp = mmpos1;
1281 int refctmp = refc1;
1285 assert_lt(mmpos1, mmpos2);
1288 // Now extend the anchor into a longer alignment
1289 bool foundHit = true;
1290 if(anchorOverhang > 0) {
1291 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1292 bool skipCandidate = false;
1293 for(uint32_t j = 0; j < anchorOverhang; j++) {
1294 int rc = (int)ref[rir + 32 + j];
1296 // Oops, encountered an N in the reference in
1297 // the overhang portion of the candidate
1299 // (Note that we inverted hi earlier)
1302 skipRightToLefts = anchorOverhang - j - 1;
1305 skipLeftToRights = anchorBitPairs + j;
1307 skipCandidate = true; // Skip this candidate
1310 if((int)qry[32 + j] != rc) {
1314 } else if(diffs == 2) {
1316 refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1318 assert_eq(1, diffs);
1320 refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1324 if(skipCandidate) continue;
1326 if(!foundHit) continue;
1330 // By convention, the upstream mate's
1331 // coordinates go in the 'first' field
1332 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
1333 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1335 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1336 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1338 if(pairs->find(p) != pairs->end()) {
1339 // We already found this hit! Continue.
1340 ASSERT_ONLY(duplicates++);
1348 assert_leq(diffs, 2);
1349 assert_lt(r2i, r2.size());
1350 assert_eq(re2[r2i], ri);
1351 ranges.resize(ranges.size()+1);
1352 Range& range = ranges.back();
1353 assert_eq(diffs, r2[r2i].numMms);
1354 range.stratum = diffs;
1355 range.numMms = diffs;
1356 assert_eq(0, range.mms.size());
1357 assert_eq(0, range.refcs.size());
1359 assert_neq(mmpos1, 0xffffffff);
1360 assert_eq(mmpos1, r2[r2i].mms[0]);
1361 assert_neq(-1, refc1);
1362 assert_eq(refc1, r2[r2i].refcs[0]);
1363 range.mms.push_back(mmpos1);
1364 range.refcs.push_back(refc1);
1366 assert_neq(mmpos2, 0xffffffff);
1367 assert_eq(mmpos2, r2[r2i].mms[1]);
1368 assert_neq(-1, refc2);
1369 assert_eq(refc2, r2[r2i].refcs[1]);
1370 range.mms.push_back(mmpos2);
1371 range.refcs.push_back(refc2);
1375 results.push_back(ri);
1376 if(--numToFind == 0) return;
1378 assert_leq(duplicates, r2.size());
1379 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1385 * Concrete RefAligner for finding nearby 2-mismatch hits given an
1388 template<typename TStr>
1389 class ThreeMMRefAligner : public RefAligner<TStr> {
1391 typedef seqan::String<seqan::Dna5> TDna5Str;
1392 typedef seqan::String<char> TCharStr;
1393 typedef std::vector<Range> TRangeVec;
1394 typedef std::vector<uint32_t> TU32Vec;
1395 typedef std::pair<uint64_t, uint64_t> TU64Pair;
1396 typedef std::set<TU64Pair> TSetPairs;
1400 ThreeMMRefAligner(bool color, bool verbose, bool quiet) :
1401 RefAligner<TStr>(color, verbose, quiet) { }
1403 virtual ~ThreeMMRefAligner() { }
1407 * Because we're doing end-to-end exact, we don't care which end of
1408 * 'qry' is the 5' end.
1410 void naiveFind(uint32_t numToFind,
1413 const TDna5Str& qry,
1414 const TCharStr& quals,
1421 bool seedOnLeft) const
1423 assert_gt(numToFind, 0);
1424 const uint32_t qlen = seqan::length(qry);
1425 assert_geq(end - begin, qlen); // caller should have checked this
1426 assert_gt(end, begin);
1428 uint32_t qend = end - qlen;
1429 uint32_t lim = qend - begin;
1430 uint32_t halfway = begin + (lim >> 1);
1432 for(uint32_t i = 1; i <= lim+1; i++) {
1433 uint32_t ri; // leftmost position in candidate alignment
1434 uint32_t rir; // same, minus begin; for indexing into ref[]
1436 ri = halfway + (i >> 1); rir = ri - begin;
1437 assert_leq(ri, qend);
1439 ri = halfway - (i >> 1); rir = ri - begin;
1440 assert_geq(ri, begin);
1443 // Do the naive comparison
1446 uint32_t mmOff1 = 0xffffffff;
1448 uint32_t mmOff2 = 0xffffffff;
1450 uint32_t mmOff3 = 0xffffffff;
1452 for(uint32_t j = 0; j < qlen; j++) {
1454 // Count Ns in the reference as mismatches
1455 const int q = (int)qry[j];
1456 const int r = (int)ref[rir + j];
1459 if(q == 4 || r == 4 || q != r) {
1461 // Disallow alignments that involve an N in the
1463 const int r = (int)ref[rir + j];
1465 // N in reference; bail
1469 const int q = (int)qry[j];
1476 // Too many; reject this alignment
1479 } else if(mms == 3) {
1480 // Second one; remember offset and ref char
1483 } else if(mms == 2) {
1484 // Second one; remember offset and ref char
1489 // First one; remember offset and ref char
1497 ranges.resize(ranges.size()+1);
1498 Range& range = ranges.back();
1499 range.stratum = mms;
1501 assert_eq(0, range.mms.size());
1502 assert_eq(0, range.refcs.size());
1504 assert_lt(mmOff1, qlen);
1505 assert(refc1 == 'A' || refc1 == 'C' || refc1 == 'G' || refc1 == 'T');
1506 range.mms.push_back(mmOff1);
1507 range.refcs.push_back(refc1);
1509 assert_lt(mmOff2, qlen);
1510 assert(refc2 == 'A' || refc2 == 'C' || refc2 == 'G' || refc2 == 'T');
1511 range.mms.push_back(mmOff2);
1512 range.refcs.push_back(refc2);
1515 assert_lt(mmOff3, qlen);
1516 assert(refc3 == 'A' || refc3 == 'C' || refc3 == 'G' || refc3 == 'T');
1517 range.mms.push_back(mmOff3);
1518 range.refcs.push_back(refc3);
1522 results.push_back(ri);
1529 * Because we're doing end-to-end exact, we don't care which end of
1530 * 'qry' is the 5' end.
1532 virtual void anchor64Find(uint32_t numToFind,
1535 const TDna5Str& qry,
1536 const TCharStr& quals,
1541 TSetPairs* pairs = NULL,
1542 uint32_t aoff = 0xffffffff,
1543 bool seedOnLeft = false) const
1545 assert_gt(numToFind, 0);
1546 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
1547 ASSERT_ONLY(uint32_t duplicates = 0);
1548 ASSERT_ONLY(uint32_t r2i = 0);
1549 const uint32_t qlen = seqan::length(qry);
1550 assert_geq(end - begin, qlen); // caller should have checked this
1551 assert_gt(end, begin);
1554 // Get results from the naive matcher for sanity-checking
1555 TRangeVec r2; TU32Vec re2;
1556 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
1557 re2, pairs, aoff, seedOnLeft);
1559 const uint32_t anchorBitPairs = min<int>(qlen, 32);
1560 const int lhsShift = ((anchorBitPairs - 1) << 1);
1561 const uint32_t anchorCushion = 32 - anchorBitPairs;
1562 // anchorOverhang = # read bases not included in the anchor
1563 const uint32_t anchorOverhang = (qlen <= 32 ? 0 : (qlen - 32));
1564 const uint32_t lim = end - qlen - begin;
1565 const uint32_t halfway = begin + (lim >> 1);
1566 uint64_t anchor = 0llu;
1567 uint64_t buffw = 0llu; // rotating ref sequence buffer
1568 // OR the 'diff' buffer with this so that we can always count
1569 // 'N's as mismatches
1570 uint64_t diffMask = 0llu;
1571 // Set up a mask that we'll apply to the two bufs every round
1572 // to discard bits that were rotated out of the anchor area
1573 uint64_t clearMask = 0xffffffffffffffffllu;
1574 bool useMask = false;
1575 if(anchorBitPairs < 32) {
1576 clearMask >>= ((32-anchorBitPairs) << 1);
1584 uint32_t skipLeftToRights = 0;
1585 uint32_t skipRightToLefts = 0;
1586 // Construct the 'anchor' 64-bit buffer so that it holds all of
1587 // the first 'anchorBitPairs' bit pairs of the query.
1588 for(uint32_t i = 0; i < anchorBitPairs; i++) {
1589 int c = (int)qry[i]; // next query character
1590 int r = (int)ref[halfway - begin + i]; // next reference character
1593 // The right-to-left direction absorbs the candidate
1594 // alignment based at 'halfway'; so skipLeftToRights is
1596 skipLeftToRights = max(skipLeftToRights, i);
1597 skipRightToLefts = max(skipRightToLefts, anchorBitPairs - i);
1601 // Special case: query has an 'N'
1603 if(++nsInAnchor > 3) {
1604 // More than two 'N's in the anchor region; can't
1605 // possibly have a 2-mismatch hit anywhere
1606 assert_eq(r2.size(), ranges.size() - rangesInitSz);
1607 return; // can't match if query has Ns
1608 } else if(nsInAnchor == 3) {
1611 } else if(nsInAnchor == 2) {
1615 assert_eq(1, nsInAnchor);
1619 // Make it look like an 'A' in the anchor
1621 diffMask = (diffMask << 2llu) | 1llu;
1625 anchor = ((anchor << 2llu) | c);
1626 buffw = ((buffw << 2llu) | r);
1628 assert_leq(nPoss, 3);
1629 // Check whether read is disqualified by Ns outside of the anchor
1631 for(uint32_t i = anchorBitPairs; i < qlen; i++) {
1632 if((int)qry[i] == 4) {
1633 if(++nsInAnchor > 3) {
1634 assert_eq(r2.size(), ranges.size() - rangesInitSz);
1635 return; // can't match if query has Ns
1639 uint64_t bufbw = buffw;
1640 // We're moving the right-hand edge of the anchor along until
1641 // it's 'anchorOverhang' chars from the end of the target region.
1642 // Note that we're not making a 3'/5' distinction here; if we
1643 // were, we might need to make the 'anchorOverhang' adjustment on
1644 // the left end of the range rather than the right.
1646 uint32_t riHi = halfway;
1647 uint32_t rirHi = halfway - begin;
1648 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
1649 uint32_t riLo = halfway + 1;
1650 uint32_t rirLo = halfway - begin + 1;
1651 for(uint32_t i = 1; i <= lim + 1; i++) {
1652 int r; // new reference char
1654 assert_lt(skipLeftToRights, qlen);
1655 assert_leq(skipRightToLefts, qlen);
1658 // Moving left-to-right
1662 r = (int)ref[rirHiAnchor];
1665 skipLeftToRights = anchorBitPairs;
1668 // Bring in new base pair at the least significant
1670 buffw = ((buffw << 2llu) | r);
1671 if(useMask) buffw &= clearMask;
1672 if(skipLeftToRights > 0) {
1676 diff = (buffw ^ anchor) | diffMask;
1679 // Moving right-to-left
1682 r = (int)ref[rirLo];
1685 skipRightToLefts = qlen;
1690 // Bring in new base pair at the most significant
1692 bufbw |= ((uint64_t)r << lhsShift);
1694 if(skipRightToLefts > 0) {
1698 diff = (bufbw ^ anchor) | diffMask;
1700 if((diff & 0xffff000000000000llu) &&
1701 (diff & 0x0000ffff00000000llu) &&
1702 (diff & 0x00000000ffff0000llu) &&
1703 (diff & 0x000000000000ffffllu)) continue;
1704 uint32_t ri = hi ? riLo : riHi;
1705 uint32_t rir = hi ? rirLo : rirHi;
1706 // Could use pop count
1707 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
1708 // As a first cut, see if there are too many mismatches in
1709 // the first and last parts of the anchor
1710 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
1711 if(diffs > 3) continue;
1712 diffs += u8toMms[(int)diff8[1]] +
1713 u8toMms[(int)diff8[2]] +
1714 u8toMms[(int)diff8[3]] +
1715 u8toMms[(int)diff8[4]] +
1716 u8toMms[(int)diff8[5]] +
1717 u8toMms[(int)diff8[6]];
1718 uint32_t mmpos1 = 0xffffffff;
1720 uint32_t mmpos2 = 0xffffffff;
1722 uint32_t mmpos3 = 0xffffffff;
1725 // Too many differences
1727 } else if(nPoss > 1 && diffs == nPoss) {
1728 // There was one difference, but there was also one N,
1729 // so we already know where the difference is
1731 refc1 = "ACGT"[(int)ref[rir + nPos1]];
1734 refc2 = "ACGT"[(int)ref[rir + nPos2]];
1737 refc3 = "ACGT"[(int)ref[rir + nPos3]];
1740 } else if(diffs > 0) {
1741 // Figure out which position mismatched
1742 uint64_t diff2 = diff;
1744 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
1745 assert_neq(0, diff);
1746 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos1 -= 8; }
1747 assert_neq(0, diff);
1748 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos1 -= 4; }
1749 assert_neq(0, diff);
1750 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos1 -= 2; }
1751 assert_neq(0, diff);
1752 if((diff & 0x3llu) == 0) { mmpos1--; }
1753 assert_neq(0, diff);
1754 assert_geq(mmpos1, 0);
1755 assert_lt(mmpos1, 32);
1756 mmpos1 -= anchorCushion;
1757 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
1759 // Figure out the second mismatched position
1760 diff2 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos1 + anchorCushion) << 1));
1761 uint64_t diff3 = diff2;
1763 if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
1764 assert_neq(0, diff2);
1765 if((diff2 & 0xffffllu) == 0) { diff2 >>= 16llu; mmpos2 -= 8; }
1766 assert_neq(0, diff2);
1767 if((diff2 & 0xffllu) == 0) { diff2 >>= 8llu; mmpos2 -= 4; }
1768 assert_neq(0, diff2);
1769 if((diff2 & 0xfllu) == 0) { diff2 >>= 4llu; mmpos2 -= 2; }
1770 assert_neq(0, diff2);
1771 if((diff2 & 0x3llu) == 0) { mmpos2--; }
1772 assert_neq(0, diff2);
1773 mmpos2 -= anchorCushion;
1774 assert_geq(mmpos2, 0);
1775 assert_lt(mmpos2, 32);
1776 assert_neq(mmpos1, mmpos2);
1777 refc2 = "ACGT"[(int)ref[rir + mmpos2]];
1778 uint32_t mmpos2orig = mmpos2;
1779 if(mmpos2 < mmpos1) {
1780 uint32_t mmtmp = mmpos1;
1783 int refctmp = refc1;
1787 assert_lt(mmpos1, mmpos2);
1789 // Figure out the second mismatched position
1790 diff3 &= ~(0xc000000000000000llu >> (uint64_t)((mmpos2orig + anchorCushion) << 1));
1792 if((diff3 & 0xffffffffllu) == 0) { diff3 >>= 32llu; mmpos3 -= 16; }
1793 assert_neq(0, diff3);
1794 if((diff3 & 0xffffllu) == 0) { diff3 >>= 16llu; mmpos3 -= 8; }
1795 assert_neq(0, diff3);
1796 if((diff3 & 0xffllu) == 0) { diff3 >>= 8llu; mmpos3 -= 4; }
1797 assert_neq(0, diff3);
1798 if((diff3 & 0xfllu) == 0) { diff3 >>= 4llu; mmpos3 -= 2; }
1799 assert_neq(0, diff3);
1800 if((diff3 & 0x3llu) == 0) { mmpos3--; }
1801 assert_neq(0, diff3);
1802 mmpos3 -= anchorCushion;
1803 assert_geq(mmpos3, 0);
1804 assert_lt(mmpos3, 32);
1805 assert_neq(mmpos1, mmpos3);
1806 assert_neq(mmpos2, mmpos3);
1807 refc3 = "ACGT"[(int)ref[rir + mmpos3]];
1808 if(mmpos3 < mmpos1) {
1809 uint32_t mmtmp = mmpos1;
1813 int refctmp = refc1;
1817 } else if(mmpos3 < mmpos2) {
1818 uint32_t mmtmp = mmpos2;
1821 int refctmp = refc2;
1825 assert_lt(mmpos1, mmpos2);
1826 assert_lt(mmpos2, mmpos3);
1830 // Now extend the anchor into a longer alignment
1831 bool foundHit = true;
1832 if(anchorOverhang > 0) {
1833 assert_leq(ri + anchorBitPairs + anchorOverhang, end);
1834 bool skipCandidate = false;
1835 for(uint32_t j = 0; j < anchorOverhang; j++) {
1836 int rc = (int)ref[rir + 32 + j];
1838 // Oops, encountered an N in the reference in
1839 // the overhang portion of the candidate
1841 // (Note that we inverted hi earlier)
1844 skipRightToLefts = anchorOverhang - j - 1;
1847 skipLeftToRights = anchorBitPairs + j;
1849 skipCandidate = true; // Skip this candidate
1852 if((int)qry[32 + j] != rc) {
1856 } else if(diffs == 3) {
1858 refc3 = "ACGT"[(int)ref[rir + 32 + j]];
1859 } else if(diffs == 2) {
1861 refc2 = "ACGT"[(int)ref[rir + 32 + j]];
1863 assert_eq(1, diffs);
1865 refc1 = "ACGT"[(int)ref[rir + 32 + j]];
1869 if(skipCandidate) continue;
1871 if(!foundHit) continue;
1875 // By convention, the upstream mate's
1876 // coordinates go in the 'first' field
1877 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
1878 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1880 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
1881 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
1883 if(pairs->find(p) != pairs->end()) {
1884 // We already found this hit! Continue.
1885 ASSERT_ONLY(duplicates++);
1893 assert_leq(diffs, 3);
1894 assert_lt(r2i, r2.size());
1895 assert_eq(re2[r2i], ri);
1896 ranges.resize(ranges.size()+1);
1897 Range& range = ranges.back();
1898 assert_eq(diffs, r2[r2i].numMms);
1899 range.stratum = diffs;
1900 range.numMms = diffs;
1901 assert_eq(0, range.mms.size());
1902 assert_eq(0, range.refcs.size());
1904 assert_neq(mmpos1, 0xffffffff);
1905 assert_eq(mmpos1, r2[r2i].mms[0]);
1906 assert_neq(-1, refc1);
1907 assert_eq(refc1, r2[r2i].refcs[0]);
1908 range.mms.push_back(mmpos1);
1909 range.refcs.push_back(refc1);
1911 assert_neq(mmpos2, 0xffffffff);
1912 assert_eq(mmpos2, r2[r2i].mms[1]);
1913 assert_neq(-1, refc2);
1914 assert_eq(refc2, r2[r2i].refcs[1]);
1915 range.mms.push_back(mmpos2);
1916 range.refcs.push_back(refc2);
1918 assert_neq(mmpos3, 0xffffffff);
1919 assert_eq(mmpos3, r2[r2i].mms[2]);
1920 assert_neq(-1, refc3);
1921 assert_eq(refc3, r2[r2i].refcs[2]);
1922 range.mms.push_back(mmpos3);
1923 range.refcs.push_back(refc3);
1928 results.push_back(ri);
1929 if(--numToFind == 0) return;
1931 assert_leq(duplicates, r2.size());
1932 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
1938 * Concrete RefAligner for finding nearby 1-mismatch hits given an
1941 template<typename TStr>
1942 class Seed0RefAligner : public RefAligner<TStr> {
1944 typedef seqan::String<seqan::Dna5> TDna5Str;
1945 typedef seqan::String<char> TCharStr;
1946 typedef std::vector<Range> TRangeVec;
1947 typedef std::vector<uint32_t> TU32Vec;
1948 typedef std::pair<uint64_t, uint64_t> TU64Pair;
1949 typedef std::set<TU64Pair> TSetPairs;
1953 Seed0RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
1954 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
1956 virtual ~Seed0RefAligner() { }
1960 * This schematic shows the roles played by the begin, qbegin, end,
1961 * qend, halfway, slen, qlen, and lim variables:
1963 * seedOnLeft == true:
1965 * |< lim >|< qlen >|
1966 * --------------------------------------------------------------
1967 * | | slen | qlen-slen | | slen | qlen-slen |
1968 * --------------------------------------------------------------
1970 * begin & qbegin halfway qend end
1972 * seedOnLeft == false:
1974 * |< qlen >|< lim >|
1975 * --------------------------------------------------------------
1976 * | qlen-slen | slen | | qlen-slen | slen | |
1977 * --------------------------------------------------------------
1979 * begin qbegin halfway qend & end
1981 * Note that, for seeds longer than 32 base-pairs, the seed is
1982 * further subdivided into a 32-bit anchor and a seed overhang of
1985 void naiveFind(uint32_t numToFind,
1988 const TDna5Str& qry,
1989 const TCharStr& quals,
1996 bool seedOnLeft) const
1998 assert_gt(numToFind, 0);
1999 assert_gt(end, begin);
2000 const uint32_t qlen = seqan::length(qry);
2002 assert_geq(end - begin, qlen); // caller should have checked this
2003 assert_gt(this->seedLen_, 0);
2004 const uint32_t slen = min(qlen, this->seedLen_);
2005 uint32_t qend = end;
2006 uint32_t qbegin = begin;
2007 // If the seed is on the left-hand side of the alignment, then
2008 // leave a gap at the right-hand side of the interval;
2009 // otherwise, do the opposite
2011 // Leave gap on right-hand side of the interval
2014 // Leave gap on left-hand side of the interval
2017 // lim = number of alignments to try
2018 const uint32_t lim = qend - qbegin;
2019 // halfway = position in the reference to start at (and then
2020 // we work our way out to the right and to the left).
2021 const uint32_t halfway = qbegin + (lim >> 1);
2022 // Vectors for holding edit information
2023 std::vector<uint32_t> nonSeedMms;
2024 std::vector<uint8_t> nonSeedRefcs;
2026 for(uint32_t i = 1; i <= lim+1; i++) {
2027 uint32_t ri; // leftmost position in candidate alignment
2028 uint32_t rir; // same, minus begin; for indexing into ref[]
2030 ri = halfway + (i >> 1); rir = ri - begin;
2031 assert_leq(ri, qend);
2033 ri = halfway - (i >> 1); rir = ri - begin;
2034 assert_geq(ri, begin);
2037 // Do the naive comparison
2040 unsigned int ham = 0;
2042 nonSeedRefcs.clear();
2043 // Walk through each position of the alignment
2044 for(uint32_t jj = 0; jj < qlen; jj++) {
2047 // If seed is on the right, scan right-to-left
2052 uint32_t rirj = rir + j;
2054 assert_geq(rir, jj);
2055 rirj = rir - jj - 1;
2058 // Count Ns in the reference as mismatches
2059 const int q = (int)qry[j];
2060 const int r = (int)ref[rirj];
2063 if(q == 4 || r == 4 || q != r) {
2065 // Disallow alignments that involve an N in the
2067 const int r = (int)ref[rirj];
2069 // N in reference; bail on this alignment
2073 const int q = (int)qry[j];
2080 // More than one mismatch in the anchor; reject
2084 uint8_t qual = phredCharToPhredQual(quals[j]);
2085 ham += mmPenalty(this->maqPenalty_, qual);
2086 if(ham > this->qualMax_) {
2087 // Exceeded quality ceiling; reject
2091 // Legal mismatch outside of the anchor; record it
2093 nonSeedMms.push_back(j);
2094 assert_leq(nonSeedMms.size(), (size_t)mms);
2095 nonSeedRefcs.push_back("ACGTN"[r]);
2100 ranges.resize(ranges.size()+1);
2101 Range& range = ranges.back();
2104 assert_eq(0, range.mms.size());
2105 assert_eq(0, range.refcs.size());
2107 // Be careful to add edits in left-to-right order
2108 // with respect to the read/alignment
2109 const size_t nonSeedMmsSz = nonSeedMms.size();
2110 if(nonSeedMmsSz > 0) {
2112 for(size_t k = 0; k < nonSeedMmsSz; k++) {
2113 range.mms.push_back(nonSeedMms[k]);
2114 range.refcs.push_back(nonSeedRefcs[k]);
2117 for(size_t k = nonSeedMmsSz; k > 0; k--) {
2118 range.mms.push_back(nonSeedMms[k-1]);
2119 range.refcs.push_back(nonSeedRefcs[k-1]);
2124 assert_eq((size_t)mms, range.mms.size());
2125 assert_eq((size_t)mms, range.refcs.size());
2127 results.push_back(ri);
2129 results.push_back(ri - qlen);
2137 * This schematic shows the roles played by the begin, qbegin, end,
2138 * qend, halfway, slen, qlen, and lim variables:
2140 * seedOnLeft == true:
2142 * |< lim >|< qlen >|
2143 * --------------------------------------------------------------
2144 * | | slen | qlen-slen | | slen | qlen-slen |
2145 * --------------------------------------------------------------
2147 * begin & qbegin halfway qend end
2149 * seedOnLeft == false:
2152 * --------------------------------------------------------------
2153 * | qlen-slen | | qlen-slen | slen | | slen |
2154 * --------------------------------------------------------------
2156 * begin qbegin halfway qend end
2158 * Note that, for seeds longer than 32 base-pairs, the seed is
2159 * further subdivided into a 32-bit anchor and a seed overhang of
2162 virtual void anchor64Find(uint32_t numToFind,
2165 const TDna5Str& qry,
2166 const TCharStr& quals,
2171 TSetPairs* pairs = NULL,
2172 uint32_t aoff = 0xffffffff,
2173 bool seedOnLeft = false) const
2175 assert_gt(numToFind, 0);
2176 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
2177 ASSERT_ONLY(uint32_t duplicates = 0);
2178 ASSERT_ONLY(uint32_t r2i = 0);
2179 const uint32_t qlen = seqan::length(qry);
2181 assert_gt(end, begin);
2182 assert_geq(end - begin, qlen); // caller should have checked this
2183 assert_gt(this->seedLen_, 0);
2184 uint32_t slen = min(qlen, this->seedLen_);
2186 // Get results from the naive matcher for sanity-checking
2187 TRangeVec r2; TU32Vec re2;
2188 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
2189 re2, pairs, aoff, seedOnLeft);
2191 const uint32_t anchorBitPairs = min<int>(slen, 32);
2192 const int lhsShift = ((anchorBitPairs - 1) << 1);
2193 ASSERT_ONLY(const uint32_t anchorCushion = 32 - anchorBitPairs);
2194 // seedAnchorOverhang = # seed bases not included in the anchor
2195 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
2196 // seedAnchorOverhang = # seed bases not included in the anchor
2197 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
2198 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
2199 assert_eq(qlen, readSeedOverhang + slen);
2200 uint32_t qend = end;
2201 uint32_t qbegin = begin;
2203 // Leave read-sized gap on right-hand side of the interval
2206 // Leave seed-sized gap on right-hand side and
2207 // non-seed-sized gap on the left-hand side
2208 qbegin += readSeedOverhang;
2211 // lim = # possible alignments in the range
2212 const uint32_t lim = qend - qbegin;
2213 // halfway = point on the genome to radiate out from
2214 const uint32_t halfway = qbegin + (lim >> 1);
2215 uint64_t anchor = 0llu;
2216 uint64_t buffw = 0llu; // rotating ref sequence buffer
2217 // Set up a mask that we'll apply to the two bufs every round
2218 // to discard bits that were rotated out of the anchor area
2219 uint64_t clearMask = 0xffffffffffffffffllu;
2220 bool useMask = false;
2221 if(anchorBitPairs < 32) {
2222 clearMask >>= ((32-anchorBitPairs) << 1);
2225 uint32_t skipLeftToRights = 0;
2226 uint32_t skipRightToLefts = 0;
2227 const uint32_t halfwayRi = halfway - begin;
2228 // Construct the 'anchor' 64-bit buffer so that it holds all of
2229 // the first 'anchorBitPairs' bit pairs of the query.
2230 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
2233 // Fill in the anchor using characters from the right-
2234 // hand side of the query (but take the characters in
2235 // left-to-right order)
2236 i = qlen - slen + ii;
2238 int c = (int)qry[i]; // next query character
2239 int r = (int)ref[halfwayRi + ii]; // next reference character
2241 // The reference character is an N; to mimic the
2242 // behavior of BW alignment, we have to skip all
2243 // alignments that involve an N in the reference. Set
2244 // the skip* variables accordingly.
2246 uint32_t lrSkips = ii;
2247 uint32_t rlSkips = qlen - ii;
2248 if(!seedOnLeft && readSeedOverhang) {
2249 lrSkips += readSeedOverhang;
2250 assert_geq(rlSkips, readSeedOverhang);
2251 rlSkips -= readSeedOverhang;
2253 // The right-to-left direction absorbs the candidate
2254 // alignment based at 'halfway'
2255 skipLeftToRights = max(skipLeftToRights, lrSkips);
2256 skipRightToLefts = max(skipRightToLefts, rlSkips);
2257 assert_leq(skipLeftToRights, qlen);
2258 assert_leq(skipRightToLefts, qlen);
2262 // Special case: query has an 'N'
2264 // One or more 'N's in the anchor region; can't
2265 // possibly have a 0-mismatch hit anywhere
2266 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2267 return; // can't match if query has Ns
2269 anchor = ((anchor << 2llu) | c);
2270 buffw = ((buffw << 2llu) | r);
2272 // Check whether read is disqualified by Ns inside the seed
2273 // region but outside the anchor region
2274 if(seedAnchorOverhang) {
2275 assert_lt(anchorBitPairs, slen);
2276 for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
2279 i = qlen - slen + ii;
2281 if((int)qry[i] == 4) {
2282 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2283 return; // can't match if query has Ns
2287 assert_eq(anchorBitPairs, slen);
2289 uint64_t bufbw = buffw;
2290 // Slide the anchor out in either direction, alternating
2291 // between right-to-left and left-to-right shifts, until all of
2292 // the positions from qbegin to qend have been covered.
2294 uint32_t riHi = halfway;
2295 uint32_t rirHi = halfway - begin;
2296 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
2297 uint32_t riLo = halfway + 1;
2298 uint32_t rirLo = halfway - begin + 1;
2299 uint32_t lrSkips = anchorBitPairs;
2300 uint32_t rlSkips = qlen;
2301 if(!seedOnLeft && readSeedOverhang) {
2302 lrSkips += readSeedOverhang;
2303 assert_geq(rlSkips, readSeedOverhang);
2304 rlSkips -= readSeedOverhang;
2306 for(uint32_t i = 1; i <= lim + 1; i++) {
2307 int r; // new reference char
2309 assert_leq(skipLeftToRights, qlen);
2310 assert_leq(skipRightToLefts, qlen);
2313 // Moving left-to-right
2317 r = (int)ref[rirHiAnchor];
2320 skipLeftToRights = lrSkips;
2323 // Bring in new base pair at the least significant
2325 buffw = ((buffw << 2llu) | r);
2326 if(useMask) buffw &= clearMask;
2327 if(skipLeftToRights > 0) {
2331 diff = buffw ^ anchor;
2334 // Moving right-to-left
2337 r = (int)ref[rirLo];
2340 skipRightToLefts = rlSkips;
2345 // Bring in new base pair at the most significant
2347 bufbw |= ((uint64_t)r << lhsShift);
2349 if(skipRightToLefts > 0) {
2353 diff = bufbw ^ anchor;
2356 uint32_t ri = hi ? riLo : riHi;
2357 uint32_t rir = hi ? rirLo : rirHi;
2358 unsigned int ham = 0;
2359 // If the seed is longer than the anchor, then scan the
2360 // rest of the seed characters
2361 bool foundHit = true;
2362 if(seedAnchorOverhang) {
2363 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
2364 int rc = (int)ref[rir + anchorBitPairs + j];
2366 // Oops, encountered an N in the reference in
2367 // the overhang portion of the candidate
2369 // (Note that we inverted hi earlier)
2372 // Skip out of the seedAnchorOverhang
2373 assert_eq(0, skipRightToLefts);
2374 skipRightToLefts = seedAnchorOverhang - j - 1;
2376 // ...and skip out of the rest of the read
2377 skipRightToLefts += readSeedOverhang;
2381 // Skip out of the seedAnchorOverhang
2382 assert_eq(0, skipLeftToRights);
2383 skipLeftToRights = anchorBitPairs + j;
2385 // ...and skip out of the rest of the read
2386 skipLeftToRights += readSeedOverhang;
2389 foundHit = false; // Skip this candidate
2392 uint32_t qoff = anchorBitPairs + j;
2394 qoff += readSeedOverhang;
2396 assert_lt(qoff, qlen);
2397 if((int)qry[qoff] != rc) {
2402 if(!foundHit) continue;
2404 // If the read is longer than the seed, then scan the rest
2405 // of the read characters; mismatches no longer count
2406 // toward the stratum or the 1-mm limit.
2407 // Vectors for holding edit information
2408 std::vector<uint32_t> nonSeedMms;
2409 std::vector<uint8_t> nonSeedRefcs;
2410 int mms = 0; // start counting total mismatches
2411 if((qlen - slen) > 0) {
2412 // Going left-to-right
2413 for(uint32_t j = 0; j < readSeedOverhang; j++) {
2414 uint32_t roff = rir + slen + j;
2415 uint32_t qoff = slen + j;
2417 assert_geq(roff, qlen);
2421 int rc = (int)ref[roff];
2423 // Oops, encountered an N in the reference in
2424 // the overhang portion of the candidate
2426 // (Note that we inverted hi earlier)
2429 // Skip what's left of the readSeedOverhang
2430 skipRightToLefts = readSeedOverhang - j - 1;
2432 // ...and skip the seed if it's on the right
2433 skipRightToLefts += slen;
2437 // Skip what we've matched of the overhang
2438 skipLeftToRights = j;
2440 // ...and skip the seed if it's on the left
2441 skipLeftToRights += slen;
2444 foundHit = false; // Skip this candidate
2447 if((int)qry[qoff] != rc) {
2448 // Calculate quality of mismatched base
2449 char q = phredCharToPhredQual(quals[qoff]);
2450 ham += mmPenalty(this->maqPenalty_, q);
2451 if(ham > this->qualMax_) {
2452 // Exceeded quality limit
2456 // Legal mismatch outside of the anchor; record it
2458 nonSeedMms.push_back(qoff);
2459 assert_leq(nonSeedMms.size(), (size_t)mms);
2460 nonSeedRefcs.push_back("ACGTN"[rc]);
2463 if(!foundHit) continue;
2466 // Adjust ri if seed is on the right-hand side
2468 ri -= readSeedOverhang;
2469 rir -= readSeedOverhang;
2474 // By convention, the upstream mate's
2475 // coordinates go in the 'first' field
2476 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
2477 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2479 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
2480 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
2482 if(pairs->find(p) != pairs->end()) {
2483 // We already found this hit! Continue.
2484 ASSERT_ONLY(duplicates++);
2492 if(this->verbose_) {
2493 cout << "About to report seed0:" << endl;
2495 for(size_t i = 0; i < qlen; i++) {
2496 cout << (char)qry[i];
2500 for(size_t i = 0; i < qlen; i++) {
2501 cout << "ACGT"[ref[rir+i]];
2505 assert_lt(r2i, r2.size());
2506 assert_eq(re2[r2i], ri);
2507 ranges.resize(ranges.size()+1);
2508 Range& range = ranges.back();
2509 assert_eq((size_t)mms, r2[r2i].numMms);
2512 assert_eq(0, range.mms.size());
2513 assert_eq(0, range.refcs.size());
2515 ASSERT_ONLY(size_t mmcur = 0);
2516 const size_t nonSeedMmsSz = nonSeedMms.size();
2517 for(size_t i = 0; i < nonSeedMmsSz; i++) {
2518 assert_neq(0xffffffff, nonSeedMms[i]);
2519 assert_lt(mmcur, (size_t)mms);
2520 assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
2521 range.mms.push_back(nonSeedMms[i]);
2522 assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
2523 ASSERT_ONLY(mmcur++);
2524 range.refcs.push_back(nonSeedRefcs[i]);
2526 assert_eq(mmcur, r2[r2i].mms.size());
2528 assert_eq((size_t)mms, range.mms.size());
2529 assert_eq((size_t)mms, range.refcs.size());
2531 results.push_back(ri);
2532 if(--numToFind == 0) return;
2534 assert_leq(duplicates, r2.size());
2535 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
2541 * Concrete RefAligner for finding nearby 1-mismatch hits given an
2544 template<typename TStr>
2545 class Seed1RefAligner : public RefAligner<TStr> {
2547 typedef seqan::String<seqan::Dna5> TDna5Str;
2548 typedef seqan::String<char> TCharStr;
2549 typedef std::vector<Range> TRangeVec;
2550 typedef std::vector<uint32_t> TU32Vec;
2551 typedef std::pair<uint64_t, uint64_t> TU64Pair;
2552 typedef std::set<TU64Pair> TSetPairs;
2556 Seed1RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
2557 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
2559 virtual ~Seed1RefAligner() { }
2563 * This schematic shows the roles played by the begin, qbegin, end,
2564 * qend, halfway, slen, qlen, and lim variables:
2566 * seedOnLeft == true:
2568 * |< lim >|< qlen >|
2569 * --------------------------------------------------------------
2570 * | | slen | qlen-slen | | slen | qlen-slen |
2571 * --------------------------------------------------------------
2573 * begin & qbegin halfway qend end
2575 * seedOnLeft == false:
2577 * |< qlen >|< lim >|
2578 * --------------------------------------------------------------
2579 * | qlen-slen | slen | | qlen-slen | slen | |
2580 * --------------------------------------------------------------
2582 * begin qbegin halfway qend & end
2584 * Note that, for seeds longer than 32 base-pairs, the seed is
2585 * further subdivided into a 32-bit anchor and a seed overhang of
2588 void naiveFind(uint32_t numToFind,
2591 const TDna5Str& qry,
2592 const TCharStr& quals,
2599 bool seedOnLeft) const
2601 assert_gt(numToFind, 0);
2602 assert_gt(end, begin);
2603 const uint32_t qlen = seqan::length(qry);
2605 assert_geq(end - begin, qlen); // caller should have checked this
2606 assert_gt(this->seedLen_, 0);
2607 const uint32_t slen = min(qlen, this->seedLen_);
2608 uint32_t qend = end;
2609 uint32_t qbegin = begin;
2610 // If the seed is on the left-hand side of the alignment, then
2611 // leave a gap at the right-hand side of the interval;
2612 // otherwise, do the opposite
2614 // Leave gap on right-hand side of the interval
2617 // Leave gap on left-hand side of the interval
2620 // lim = number of alignments to try
2621 const uint32_t lim = qend - qbegin;
2622 // halfway = position in the reference to start at (and then
2623 // we work our way out to the right and to the left).
2624 const uint32_t halfway = qbegin + (lim >> 1);
2625 // Vectors for holding edit information
2626 std::vector<uint32_t> nonSeedMms;
2627 assert_eq(0, nonSeedMms.size());
2628 std::vector<uint8_t> nonSeedRefcs;
2629 assert_eq(0, nonSeedRefcs.size());
2631 for(uint32_t i = 1; i <= lim+1; i++) {
2632 uint32_t ri; // leftmost position in candidate alignment
2633 uint32_t rir; // same, minus begin; for indexing into ref[]
2635 ri = halfway + (i >> 1); rir = ri - begin;
2636 assert_leq(ri, qend);
2638 ri = halfway - (i >> 1); rir = ri - begin;
2639 assert_geq(ri, begin);
2642 // Do the naive comparison
2645 uint32_t mmOff = 0xffffffff;
2648 unsigned int ham = 0;
2650 nonSeedRefcs.clear();
2651 // Walk through each position of the alignment
2652 for(uint32_t jj = 0; jj < qlen; jj++) {
2655 // If seed is on the right, scan right-to-left
2660 uint32_t rirj = rir + j;
2662 assert_geq(rir, jj);
2663 rirj = rir - jj - 1;
2666 // Count Ns in the reference as mismatches
2667 const int q = (int)qry[j];
2668 const int r = (int)ref[rirj];
2671 if(q == 4 || r == 4 || q != r) {
2673 // Disallow alignments that involve an N in the
2675 const int r = (int)ref[rirj];
2677 // N in reference; bail on this alignment
2681 const int q = (int)qry[j];
2688 if(mms > 1 && jj < slen) {
2689 // More than one mismatch in the anchor; reject
2693 uint8_t qual = phredCharToPhredQual(quals[j]);
2694 ham += mmPenalty(this->maqPenalty_, qual);
2695 if(ham > this->qualMax_) {
2696 // Exceeded quality ceiling; reject
2699 } else if(jj < slen) {
2700 // First mismatch in the anchor; remember offset
2706 // Legal mismatch outside of the anchor; record it
2707 nonSeedMms.push_back(j);
2708 assert_leq(nonSeedMms.size(), (size_t)mms);
2709 nonSeedRefcs.push_back("ACGTN"[r]);
2714 ranges.resize(ranges.size()+1);
2715 Range& range = ranges.back();
2716 assert_leq(seedMms, mms);
2717 range.stratum = seedMms;
2719 assert_eq(0, range.mms.size());
2720 assert_eq(0, range.refcs.size());
2722 // Be careful to add edits in left-to-right order
2723 // with respect to the read/alignment
2724 if(seedOnLeft && seedMms) {
2725 assert_lt(mmOff, qlen);
2726 range.mms.push_back(mmOff);
2727 range.refcs.push_back(refc);
2729 const size_t nonSeedMmsSz = nonSeedMms.size();
2730 if(nonSeedMmsSz > 0) {
2732 for(size_t k = 0; k < nonSeedMmsSz; k++) {
2733 range.mms.push_back(nonSeedMms[k]);
2734 range.refcs.push_back(nonSeedRefcs[k]);
2737 for(size_t k = nonSeedMmsSz; k > 0; k--) {
2738 range.mms.push_back(nonSeedMms[k-1]);
2739 range.refcs.push_back(nonSeedRefcs[k-1]);
2743 if(!seedOnLeft && seedMms) {
2744 assert_lt(mmOff, qlen);
2745 range.mms.push_back(mmOff);
2746 range.refcs.push_back(refc);
2749 assert_eq((size_t)mms, range.mms.size());
2750 assert_eq((size_t)mms, range.refcs.size());
2752 results.push_back(ri);
2754 results.push_back(ri - qlen);
2762 * This schematic shows the roles played by the begin, qbegin, end,
2763 * qend, halfway, slen, qlen, and lim variables:
2765 * seedOnLeft == true:
2767 * |< lim >|< qlen >|
2768 * --------------------------------------------------------------
2769 * | | slen | qlen-slen | | slen | qlen-slen |
2770 * --------------------------------------------------------------
2772 * begin & qbegin halfway qend end
2774 * seedOnLeft == false:
2777 * --------------------------------------------------------------
2778 * | qlen-slen | | qlen-slen | slen | | slen |
2779 * --------------------------------------------------------------
2781 * begin qbegin halfway qend end
2783 * Note that, for seeds longer than 32 base-pairs, the seed is
2784 * further subdivided into a 32-bit anchor and a seed overhang of
2787 virtual void anchor64Find(uint32_t numToFind,
2790 const TDna5Str& qry,
2791 const TCharStr& quals,
2796 TSetPairs* pairs = NULL,
2797 uint32_t aoff = 0xffffffff,
2798 bool seedOnLeft = false) const
2800 assert_gt(numToFind, 0);
2801 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
2802 ASSERT_ONLY(uint32_t duplicates = 0);
2803 ASSERT_ONLY(uint32_t r2i = 0);
2804 const uint32_t qlen = seqan::length(qry);
2806 assert_gt(end, begin);
2807 assert_geq(end - begin, qlen); // caller should have checked this
2808 assert_gt(this->seedLen_, 0);
2809 uint32_t slen = min(qlen, this->seedLen_);
2811 // Get results from the naive matcher for sanity-checking
2812 TRangeVec r2; TU32Vec re2;
2813 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
2814 re2, pairs, aoff, seedOnLeft);
2816 const uint32_t anchorBitPairs = min<int>(slen, 32);
2817 const int lhsShift = ((anchorBitPairs - 1) << 1);
2818 const uint32_t anchorCushion = 32 - anchorBitPairs;
2819 // seedAnchorOverhang = # seed bases not included in the anchor
2820 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
2821 // seedAnchorOverhang = # seed bases not included in the anchor
2822 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
2823 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
2824 assert_eq(qlen, readSeedOverhang + slen);
2825 uint32_t qend = end;
2826 uint32_t qbegin = begin;
2828 // Leave read-sized gap on right-hand side of the interval
2831 // Leave seed-sized gap on right-hand side and
2832 // non-seed-sized gap on the left-hand side
2833 qbegin += readSeedOverhang;
2836 // lim = # possible alignments in the range
2837 const uint32_t lim = qend - qbegin;
2838 // halfway = point on the genome to radiate out from
2839 const uint32_t halfway = qbegin + (lim >> 1);
2840 uint64_t anchor = 0llu;
2841 uint64_t buffw = 0llu; // rotating ref sequence buffer
2842 // OR the 'diff' buffer with this so that we can always count
2843 // 'N's as mismatches
2844 uint64_t diffMask = 0llu;
2845 // Set up a mask that we'll apply to the two bufs every round
2846 // to discard bits that were rotated out of the anchor area
2847 uint64_t clearMask = 0xffffffffffffffffllu;
2848 bool useMask = false;
2849 if(anchorBitPairs < 32) {
2850 clearMask >>= ((32-anchorBitPairs) << 1);
2855 uint32_t skipLeftToRights = 0;
2856 uint32_t skipRightToLefts = 0;
2857 const uint32_t halfwayRi = halfway - begin;
2858 // Construct the 'anchor' 64-bit buffer so that it holds all of
2859 // the first 'anchorBitPairs' bit pairs of the query.
2860 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
2863 // Fill in the anchor using characters from the right-
2864 // hand side of the query (but take the characters in
2865 // left-to-right order)
2866 i = qlen - slen + ii;
2868 int c = (int)qry[i]; // next query character
2869 int r = (int)ref[halfwayRi + ii]; // next reference character
2871 // The reference character is an N; to mimic the
2872 // behavior of BW alignment, we have to skip all
2873 // alignments that involve an N in the reference. Set
2874 // the skip* variables accordingly.
2876 uint32_t lrSkips = ii;
2877 uint32_t rlSkips = qlen - ii;
2878 if(!seedOnLeft && readSeedOverhang) {
2879 lrSkips += readSeedOverhang;
2880 assert_geq(rlSkips, readSeedOverhang);
2881 rlSkips -= readSeedOverhang;
2883 // The right-to-left direction absorbs the candidate
2884 // alignment based at 'halfway'
2885 skipLeftToRights = max(skipLeftToRights, lrSkips);
2886 skipRightToLefts = max(skipRightToLefts, rlSkips);
2887 assert_leq(skipLeftToRights, qlen);
2888 assert_leq(skipRightToLefts, qlen);
2892 // Special case: query has an 'N'
2894 if(++nsInSeed > 1) {
2895 // More than one 'N' in the anchor region; can't
2896 // possibly have a 1-mismatch hit anywhere
2897 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2898 return; // can't match if query has Ns
2900 nPos = (int)ii; // w/r/t LHS of anchor
2901 // Make it look like an 'A' in the anchor
2903 diffMask = (diffMask << 2llu) | 1llu;
2907 anchor = ((anchor << 2llu) | c);
2908 buffw = ((buffw << 2llu) | r);
2910 // Check whether read is disqualified by Ns inside the seed
2911 // region but outside the anchor region
2912 if(seedAnchorOverhang) {
2913 assert_lt(anchorBitPairs, slen);
2914 for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
2917 i = qlen - slen + ii;
2919 if((int)qry[i] == 4) {
2920 if(++nsInSeed > 1) {
2921 assert_eq(r2.size(), ranges.size() - rangesInitSz);
2922 return; // can't match if query has Ns
2927 assert_eq(anchorBitPairs, slen);
2929 uint64_t bufbw = buffw;
2930 // Slide the anchor out in either direction, alternating
2931 // between right-to-left and left-to-right shifts, until all of
2932 // the positions from qbegin to qend have been covered.
2934 uint32_t riHi = halfway;
2935 uint32_t rirHi = halfway - begin;
2936 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
2937 uint32_t riLo = halfway + 1;
2938 uint32_t rirLo = halfway - begin + 1;
2939 uint32_t lrSkips = anchorBitPairs;
2940 uint32_t rlSkips = qlen;
2941 if(!seedOnLeft && readSeedOverhang) {
2942 lrSkips += readSeedOverhang;
2943 assert_geq(rlSkips, readSeedOverhang);
2944 rlSkips -= readSeedOverhang;
2946 for(uint32_t i = 1; i <= lim + 1; i++) {
2947 int r; // new reference char
2949 assert_leq(skipLeftToRights, qlen);
2950 assert_leq(skipRightToLefts, qlen);
2953 // Moving left-to-right
2957 r = (int)ref[rirHiAnchor];
2960 skipLeftToRights = lrSkips;
2963 // Bring in new base pair at the least significant
2965 buffw = ((buffw << 2llu) | r);
2966 if(useMask) buffw &= clearMask;
2967 if(skipLeftToRights > 0) {
2971 diff = (buffw ^ anchor) | diffMask;
2974 // Moving right-to-left
2977 r = (int)ref[rirLo];
2980 skipRightToLefts = rlSkips;
2985 // Bring in new base pair at the most significant
2987 bufbw |= ((uint64_t)r << lhsShift);
2989 if(skipRightToLefts > 0) {
2993 diff = (bufbw ^ anchor) | diffMask;
2995 if((diff & 0xffffffff00000000llu) &&
2996 (diff & 0x00000000ffffffffllu)) continue;
2997 uint32_t ri = hi ? riLo : riHi;
2998 uint32_t rir = hi ? rirLo : rirHi;
2999 // Could use pop count
3000 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
3001 // As a first cut, see if there are too many mismatches in
3002 // the first and last parts of the anchor
3003 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
3004 if(diffs > 1) continue;
3005 diffs += u8toMms[(int)diff8[1]] +
3006 u8toMms[(int)diff8[2]] +
3007 u8toMms[(int)diff8[3]] +
3008 u8toMms[(int)diff8[4]] +
3009 u8toMms[(int)diff8[5]] +
3010 u8toMms[(int)diff8[6]];
3011 uint32_t mmpos = 0xffffffff;
3013 unsigned int ham = 0;
3015 // Too many differences in the seed; stop
3017 } else if(diffs == 1 && nPos != -1) {
3018 // There was one difference, but there was also one N,
3019 // so we already know where the difference is
3021 assert_lt(mmpos, anchorBitPairs);
3022 refc = "ACGT"[(int)ref[rir + nPos]];
3024 mmpos += readSeedOverhang;
3026 char q = quals[mmpos];
3027 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3028 if(ham > this->qualMax_) {
3029 // Exceeded quality limit
3032 } else if(diffs == 1) {
3033 // Figure out which position mismatched
3035 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos -= 16; }
3036 assert_neq(0, diff);
3037 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos -= 8; }
3038 assert_neq(0, diff);
3039 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos -= 4; }
3040 assert_neq(0, diff);
3041 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos -= 2; }
3042 assert_neq(0, diff);
3043 if((diff & 0x3llu) == 0) { mmpos--; }
3044 assert_neq(0, diff);
3045 assert_geq(mmpos, 0);
3046 assert_lt(mmpos, 32);
3047 mmpos -= anchorCushion;
3048 assert_lt(mmpos, anchorBitPairs);
3049 refc = "ACGT"[(int)ref[rir + mmpos]];
3051 mmpos += readSeedOverhang;
3053 char q = quals[mmpos];
3054 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3055 if(ham > this->qualMax_) {
3056 // Exceeded quality limit
3060 // If the seed is longer than the anchor, then scan the
3061 // rest of the seed characters
3062 bool foundHit = true;
3063 if(seedAnchorOverhang) {
3064 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
3065 int rc = (int)ref[rir + anchorBitPairs + j];
3067 // Oops, encountered an N in the reference in
3068 // the overhang portion of the candidate
3070 // (Note that we inverted hi earlier)
3073 // Skip out of the seedAnchorOverhang
3074 assert_eq(0, skipRightToLefts);
3075 skipRightToLefts = seedAnchorOverhang - j - 1;
3077 // ...and skip out of the rest of the read
3078 skipRightToLefts += readSeedOverhang;
3082 // Skip out of the seedAnchorOverhang
3083 assert_eq(0, skipLeftToRights);
3084 skipLeftToRights = anchorBitPairs + j;
3086 // ...and skip out of the rest of the read
3087 skipLeftToRights += readSeedOverhang;
3090 foundHit = false; // Skip this candidate
3093 uint32_t qoff = anchorBitPairs + j;
3095 qoff += readSeedOverhang;
3097 assert_lt(qoff, qlen);
3098 if((int)qry[qoff] != rc) {
3103 assert_eq(0xffffffff, mmpos);
3105 assert_eq(-1, refc);
3106 refc = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
3107 char q = phredCharToPhredQual(quals[qoff]);
3108 ham += mmPenalty(this->maqPenalty_, q);
3109 if(ham > this->qualMax_) {
3110 // Exceeded quality limit
3117 if(!foundHit) continue;
3119 // If the read is longer than the seed, then scan the rest
3120 // of the read characters; mismatches no longer count
3121 // toward the stratum or the 1-mm limit.
3122 // Vectors for holding edit information
3123 std::vector<uint32_t> nonSeedMms;
3124 std::vector<uint8_t> nonSeedRefcs;
3125 int mms = diffs; // start counting total mismatches
3126 if((qlen - slen) > 0) {
3127 // Going left-to-right
3128 for(uint32_t j = 0; j < readSeedOverhang; j++) {
3129 uint32_t roff = rir + slen + j;
3130 uint32_t qoff = slen + j;
3132 assert_geq(roff, qlen);
3136 int rc = (int)ref[roff];
3138 // Oops, encountered an N in the reference in
3139 // the overhang portion of the candidate
3141 // (Note that we inverted hi earlier)
3144 // Skip what's left of the readSeedOverhang
3145 skipRightToLefts = readSeedOverhang - j - 1;
3147 // ...and skip the seed if it's on the right
3148 skipRightToLefts += slen;
3152 // Skip what we've matched of the overhang
3153 skipLeftToRights = j;
3155 // ...and skip the seed if it's on the left
3156 skipLeftToRights += slen;
3159 foundHit = false; // Skip this candidate
3162 if((int)qry[qoff] != rc) {
3163 // Calculate quality of mismatched base
3164 char q = phredCharToPhredQual(quals[qoff]);
3165 ham += mmPenalty(this->maqPenalty_, q);
3166 if(ham > this->qualMax_) {
3167 // Exceeded quality limit
3171 // Legal mismatch outside of the anchor; record it
3173 nonSeedMms.push_back(qoff);
3174 assert_leq(nonSeedMms.size(), (size_t)mms);
3175 nonSeedRefcs.push_back("ACGTN"[rc]);
3178 if(!foundHit) continue;
3181 // Adjust ri if seed is on the right-hand side
3183 ri -= readSeedOverhang;
3184 rir -= readSeedOverhang;
3189 // By convention, the upstream mate's
3190 // coordinates go in the 'first' field
3191 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
3192 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
3194 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
3195 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
3197 if(pairs->find(p) != pairs->end()) {
3198 // We already found this hit! Continue.
3199 ASSERT_ONLY(duplicates++);
3207 if(this->verbose_) {
3208 cout << "About to report:" << endl;
3210 for(size_t i = 0; i < qlen; i++) {
3211 cout << (char)qry[i];
3215 for(size_t i = 0; i < qlen; i++) {
3216 cout << "ACGT"[ref[rir+i]];
3220 assert_leq(diffs, 1);
3221 assert_geq((size_t)mms, diffs);
3222 assert_lt(r2i, r2.size());
3223 assert_eq(re2[r2i], ri);
3224 ranges.resize(ranges.size()+1);
3225 Range& range = ranges.back();
3226 assert_eq((size_t)mms, r2[r2i].numMms);
3227 range.stratum = diffs;
3229 assert_eq(0, range.mms.size());
3230 assert_eq(0, range.refcs.size());
3232 ASSERT_ONLY(size_t mmcur = 0);
3233 if(seedOnLeft && diffs > 0) {
3234 assert_neq(mmpos, 0xffffffff);
3235 assert_lt(mmpos, qlen);
3236 assert_lt(mmcur, (size_t)mms);
3237 assert_eq(mmpos, r2[r2i].mms[mmcur]);
3238 assert_neq(-1, refc);
3239 assert_eq(refc, r2[r2i].refcs[mmcur]);
3240 ASSERT_ONLY(mmcur++);
3241 range.mms.push_back(mmpos);
3242 range.refcs.push_back(refc);
3244 const size_t nonSeedMmsSz = nonSeedMms.size();
3245 for(size_t i = 0; i < nonSeedMmsSz; i++) {
3246 assert_neq(0xffffffff, nonSeedMms[i]);
3247 assert_lt(mmcur, (size_t)mms);
3248 assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
3249 range.mms.push_back(nonSeedMms[i]);
3250 assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
3251 ASSERT_ONLY(mmcur++);
3252 range.refcs.push_back(nonSeedRefcs[i]);
3254 if(!seedOnLeft && diffs > 0) {
3255 assert_neq(mmpos, 0xffffffff);
3256 assert_lt(mmpos, qlen);
3257 assert_lt(mmcur, (size_t)mms);
3258 assert_eq(mmpos, r2[r2i].mms[mmcur]);
3259 assert_neq(-1, refc);
3260 assert_eq(refc, r2[r2i].refcs[mmcur]);
3261 ASSERT_ONLY(mmcur++);
3262 range.mms.push_back(mmpos);
3263 range.refcs.push_back(refc);
3265 assert_eq(mmcur, r2[r2i].mms.size());
3267 assert_eq((size_t)mms, range.mms.size());
3268 assert_eq((size_t)mms, range.refcs.size());
3270 results.push_back(ri);
3271 if(--numToFind == 0) return;
3273 assert_leq(duplicates, r2.size());
3274 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
3280 * Concrete RefAligner for finding nearby 2-mismatch hits given an
3283 template<typename TStr>
3284 class Seed2RefAligner : public RefAligner<TStr> {
3286 typedef seqan::String<seqan::Dna5> TDna5Str;
3287 typedef seqan::String<char> TCharStr;
3288 typedef std::vector<Range> TRangeVec;
3289 typedef std::vector<uint32_t> TU32Vec;
3290 typedef std::pair<uint64_t, uint64_t> TU64Pair;
3291 typedef std::set<TU64Pair> TSetPairs;
3295 Seed2RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
3296 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
3298 virtual ~Seed2RefAligner() { }
3302 * This schematic shows the roles played by the begin, qbegin, end,
3303 * qend, halfway, slen, qlen, and lim variables:
3305 * seedOnLeft == true:
3307 * |< lim >|< qlen >|
3308 * --------------------------------------------------------------
3309 * | | slen | qlen-slen | | slen | qlen-slen |
3310 * --------------------------------------------------------------
3312 * begin & qbegin halfway qend end
3314 * seedOnLeft == false:
3316 * |< qlen >|< lim >|
3317 * --------------------------------------------------------------
3318 * | qlen-slen | slen | | qlen-slen | slen | |
3319 * --------------------------------------------------------------
3321 * begin qbegin halfway qend & end
3323 * Note that, for seeds longer than 32 base-pairs, the seed is
3324 * further subdivided into a 32-bit anchor and a seed overhang of
3327 void naiveFind(uint32_t numToFind,
3330 const TDna5Str& qry,
3331 const TCharStr& quals,
3338 bool seedOnLeft) const
3340 assert_gt(numToFind, 0);
3341 assert_gt(end, begin);
3342 const uint32_t qlen = seqan::length(qry);
3344 assert_geq(end - begin, qlen); // caller should have checked this
3345 assert_gt(this->seedLen_, 0);
3346 const uint32_t slen = min(qlen, this->seedLen_);
3347 uint32_t qend = end;
3348 uint32_t qbegin = begin;
3349 // If the seed is on the left-hand side of the alignment, then
3350 // leave a gap at the right-hand side of the interval;
3351 // otherwise, do the opposite
3353 // Leave gap on right-hand side of the interval
3356 // Leave gap on left-hand side of the interval
3359 // lim = number of alignments to try
3360 const uint32_t lim = qend - qbegin;
3361 // halfway = position in the reference to start at (and then
3362 // we work our way out to the right and to the left).
3363 const uint32_t halfway = qbegin + (lim >> 1);
3364 // Vectors for holding edit information
3365 std::vector<uint32_t> nonSeedMms;
3366 std::vector<uint8_t> nonSeedRefcs;
3368 for(uint32_t i = 1; i <= lim+1; i++) {
3369 uint32_t ri; // leftmost position in candidate alignment
3370 uint32_t rir; // same, minus begin; for indexing into ref[]
3372 ri = halfway + (i >> 1); rir = ri - begin;
3373 assert_leq(ri, qend);
3375 ri = halfway - (i >> 1); rir = ri - begin;
3376 assert_geq(ri, begin);
3379 // Do the naive comparison
3382 uint32_t mmOff1 = 0xffffffff;
3384 uint32_t mmOff2 = 0xffffffff;
3387 unsigned int ham = 0;
3389 nonSeedRefcs.clear();
3390 // Walk through each position of the alignment
3391 for(uint32_t jj = 0; jj < qlen; jj++) {
3394 // If seed is on the right, scan right-to-left
3399 uint32_t rirj = rir + j;
3401 assert_geq(rir, jj);
3402 rirj = rir - jj - 1;
3405 // Count Ns in the reference as mismatches
3406 const int q = (int)qry[j];
3407 const int r = (int)ref[rirj];
3410 if(q == 4 || r == 4 || q != r) {
3412 // Disallow alignments that involve an N in the
3414 const int r = (int)ref[rirj];
3416 // N in reference; bail on this alignment
3420 const int q = (int)qry[j];
3427 if(mms > 2 && jj < slen) {
3428 // More than one mismatch in the anchor; reject
3432 uint8_t qual = phredCharToPhredQual(quals[j]);
3433 ham += mmPenalty(this->maqPenalty_, qual);
3434 if(ham > this->qualMax_) {
3435 // Exceeded quality ceiling; reject
3438 } else if(mms == 1 && jj < slen) {
3439 // First mismatch in the anchor; remember offset
3444 } else if(mms == 2 && jj < slen) {
3445 // Second mismatch in the anchor; remember offset
3451 // Legal mismatch outside of the anchor; record it
3452 nonSeedMms.push_back(j);
3453 assert_leq(nonSeedMms.size(), (size_t)mms);
3454 nonSeedRefcs.push_back("ACGTN"[r]);
3459 ranges.resize(ranges.size()+1);
3460 Range& range = ranges.back();
3461 assert_leq(seedMms, mms);
3462 range.stratum = seedMms;
3464 assert_eq(0, range.mms.size());
3465 assert_eq(0, range.refcs.size());
3467 // Be careful to add edits in left-to-right order
3468 // with respect to the read/alignment
3469 if(seedOnLeft && seedMms) {
3470 assert_lt(mmOff1, qlen);
3471 range.mms.push_back(mmOff1);
3472 range.refcs.push_back(refc1);
3474 assert_lt(mmOff1, mmOff2);
3475 assert_lt(mmOff2, qlen);
3476 range.mms.push_back(mmOff2);
3477 range.refcs.push_back(refc2);
3480 const size_t nonSeedMmsSz = nonSeedMms.size();
3481 if(nonSeedMmsSz > 0) {
3483 for(size_t k = 0; k < nonSeedMmsSz; k++) {
3484 range.mms.push_back(nonSeedMms[k]);
3485 range.refcs.push_back(nonSeedRefcs[k]);
3488 for(size_t k = nonSeedMmsSz; k > 0; k--) {
3489 range.mms.push_back(nonSeedMms[k-1]);
3490 range.refcs.push_back(nonSeedRefcs[k-1]);
3494 if(!seedOnLeft && seedMms) {
3496 assert_lt(mmOff2, mmOff1);
3497 assert_lt(mmOff2, qlen);
3498 range.mms.push_back(mmOff2);
3499 range.refcs.push_back(refc2);
3501 assert_lt(mmOff1, qlen);
3502 range.mms.push_back(mmOff1);
3503 range.refcs.push_back(refc1);
3506 assert_eq((size_t)mms, range.mms.size());
3507 assert_eq((size_t)mms, range.refcs.size());
3509 results.push_back(ri);
3511 results.push_back(ri - qlen);
3519 * This schematic shows the roles played by the begin, qbegin, end,
3520 * qend, halfway, slen, qlen, and lim variables:
3522 * seedOnLeft == true:
3524 * |< lim >|< qlen >|
3525 * --------------------------------------------------------------
3526 * | | slen | qlen-slen | | slen | qlen-slen |
3527 * --------------------------------------------------------------
3529 * begin & qbegin halfway qend end
3531 * seedOnLeft == false:
3534 * --------------------------------------------------------------
3535 * | qlen-slen | | qlen-slen | slen | | slen |
3536 * --------------------------------------------------------------
3538 * begin qbegin halfway qend end
3540 * Note that, for seeds longer than 32 base-pairs, the seed is
3541 * further subdivided into a 32-bit anchor and a seed overhang of
3544 virtual void anchor64Find(uint32_t numToFind,
3547 const TDna5Str& qry,
3548 const TCharStr& quals,
3553 TSetPairs* pairs = NULL,
3554 uint32_t aoff = 0xffffffff,
3555 bool seedOnLeft = false) const
3557 assert_gt(numToFind, 0);
3558 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
3559 ASSERT_ONLY(uint32_t duplicates = 0);
3560 ASSERT_ONLY(uint32_t r2i = 0);
3561 const uint32_t qlen = seqan::length(qry);
3563 assert_gt(end, begin);
3564 assert_geq(end - begin, qlen); // caller should have checked this
3565 assert_gt(this->seedLen_, 0);
3566 uint32_t slen = min(qlen, this->seedLen_);
3568 // Get results from the naive matcher for sanity-checking
3569 TRangeVec r2; TU32Vec re2;
3570 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
3571 re2, pairs, aoff, seedOnLeft);
3573 const uint32_t anchorBitPairs = min<int>(slen, 32);
3574 const int lhsShift = ((anchorBitPairs - 1) << 1);
3575 const uint32_t anchorCushion = 32 - anchorBitPairs;
3576 // seedAnchorOverhang = # seed bases not included in the anchor
3577 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
3578 // seedAnchorOverhang = # seed bases not included in the anchor
3579 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
3580 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
3581 assert_eq(qlen, readSeedOverhang + slen);
3582 uint32_t qend = end;
3583 uint32_t qbegin = begin;
3585 // Leave read-sized gap on right-hand side of the interval
3588 // Leave seed-sized gap on right-hand side and
3589 // non-seed-sized gap on the left-hand side
3590 qbegin += readSeedOverhang;
3593 // lim = # possible alignments in the range
3594 const uint32_t lim = qend - qbegin;
3595 // halfway = point on the genome to radiate out from
3596 const uint32_t halfway = qbegin + (lim >> 1);
3597 uint64_t anchor = 0llu;
3598 uint64_t buffw = 0llu; // rotating ref sequence buffer
3599 // OR the 'diff' buffer with this so that we can always count
3600 // 'N's as mismatches
3601 uint64_t diffMask = 0llu;
3602 // Set up a mask that we'll apply to the two bufs every round
3603 // to discard bits that were rotated out of the anchor area
3604 uint64_t clearMask = 0xffffffffffffffffllu;
3605 bool useMask = false;
3606 if(anchorBitPairs < 32) {
3607 clearMask >>= ((32-anchorBitPairs) << 1);
3614 uint32_t skipLeftToRights = 0;
3615 uint32_t skipRightToLefts = 0;
3616 const uint32_t halfwayRi = halfway - begin;
3617 assert_leq(anchorBitPairs, slen);
3618 // Construct the 'anchor' 64-bit buffer so that it holds all of
3619 // the first 'anchorBitPairs' bit pairs of the query.
3620 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
3623 // Fill in the anchor using characters from the seed
3624 // portion of the read, starting at the left. Note
3625 // that we're subtracting by slen rather than
3626 // anchorBitPairs because we want the seed anchor
3627 // overhang to be on the right-hand side
3628 i = qlen - slen + ii;
3630 int c = (int)qry[i]; // next query character
3631 int r = (int)ref[halfwayRi + ii]; // next reference character
3633 // The reference character is an N; to mimic the
3634 // behavior of BW alignment, we have to skip all
3635 // alignments that involve an N in the reference. Set
3636 // the skip* variables accordingly.
3638 uint32_t lrSkips = ii;
3639 uint32_t rlSkips = qlen - ii;
3640 if(!seedOnLeft && readSeedOverhang) {
3641 lrSkips += readSeedOverhang;
3642 assert_geq(rlSkips, readSeedOverhang);
3643 rlSkips -= readSeedOverhang;
3645 // The right-to-left direction absorbs the candidate
3646 // alignment based at 'halfway'
3647 skipLeftToRights = max(skipLeftToRights, lrSkips);
3648 skipRightToLefts = max(skipRightToLefts, rlSkips);
3649 assert_leq(skipLeftToRights, qlen);
3650 assert_leq(skipRightToLefts, qlen);
3654 // Special case: query has an 'N'
3656 if(++nsInSeed > 2) {
3657 // More than one 'N' in the anchor region; can't
3658 // possibly have a 1-mismatch hit anywhere
3659 assert_eq(r2.size(), ranges.size() - rangesInitSz);
3660 return; // can't match if query has Ns
3663 nPos1 = (int)ii; // w/r/t LHS of anchor
3665 assert_eq(2, nsInSeed);
3666 nPos2 = (int)ii; // w/r/t LHS of anchor
3667 assert_gt(nPos2, nPos1);
3669 // Make it look like an 'A' in the anchor
3671 diffMask = (diffMask << 2llu) | 1llu;
3675 anchor = ((anchor << 2llu) | c);
3676 buffw = ((buffw << 2llu) | r);
3678 // Check whether read is disqualified by Ns inside the seed
3679 // region but outside the anchor region
3680 if(seedAnchorOverhang) {
3681 assert_lt(anchorBitPairs, slen);
3682 for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
3685 i = qlen - slen + ii;
3687 if((int)qry[i] == 4) {
3688 if(++nsInSeed > 2) {
3689 assert_eq(r2.size(), ranges.size() - rangesInitSz);
3690 return; // can't match if query has Ns
3695 assert_eq(anchorBitPairs, slen);
3697 uint64_t bufbw = buffw;
3698 // Slide the anchor out in either direction, alternating
3699 // between right-to-left and left-to-right shifts, until all of
3700 // the positions from qbegin to qend have been covered.
3702 uint32_t riHi = halfway;
3703 uint32_t rirHi = halfway - begin;
3704 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
3705 uint32_t riLo = halfway + 1;
3706 uint32_t rirLo = halfway - begin + 1;
3707 uint32_t lrSkips = anchorBitPairs;
3708 uint32_t rlSkips = qlen;
3709 if(!seedOnLeft && readSeedOverhang) {
3710 lrSkips += readSeedOverhang;
3711 assert_geq(rlSkips, readSeedOverhang);
3712 rlSkips -= readSeedOverhang;
3714 for(uint32_t i = 1; i <= lim + 1; i++) {
3715 int r; // new reference char
3717 assert_leq(skipLeftToRights, qlen);
3718 assert_leq(skipRightToLefts, qlen);
3721 // Moving left-to-right
3725 r = (int)ref[rirHiAnchor];
3728 skipLeftToRights = lrSkips;
3731 // Bring in new base pair at the least significant
3733 buffw = ((buffw << 2llu) | r);
3734 if(useMask) buffw &= clearMask;
3735 if(skipLeftToRights > 0) {
3739 diff = (buffw ^ anchor) | diffMask;
3742 // Moving right-to-left
3745 r = (int)ref[rirLo];
3748 skipRightToLefts = rlSkips;
3753 // Bring in new base pair at the most significant
3755 bufbw |= ((uint64_t)r << lhsShift);
3757 if(skipRightToLefts > 0) {
3761 diff = (bufbw ^ anchor) | diffMask;
3763 if((diff & 0xf00f00f00f00f00fllu) &&
3764 (diff & 0x0f00f00f00f00f00llu) &&
3765 (diff & 0x00f00f00f00f00f0llu)) continue;
3766 if((diff & 0xc30c30c30c30c30cllu) &&
3767 (diff & 0x30c30c30c30c30c3llu) &&
3768 (diff & 0x0c30c30c30c30c30llu)) continue;
3769 uint32_t ri = hi ? riLo : riHi;
3770 uint32_t rir = hi ? rirLo : rirHi;
3771 // Could use pop count
3772 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
3773 // As a first cut, see if there are too many mismatches in
3774 // the first and last parts of the anchor
3775 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
3776 if(diffs > 2) continue;
3777 diffs += u8toMms[(int)diff8[1]] +
3778 u8toMms[(int)diff8[2]] +
3779 u8toMms[(int)diff8[3]] +
3780 u8toMms[(int)diff8[4]] +
3781 u8toMms[(int)diff8[5]] +
3782 u8toMms[(int)diff8[6]];
3783 uint32_t mmpos1 = 0xffffffff;
3785 uint32_t mmpos2 = 0xffffffff;
3787 unsigned int ham = 0;
3789 // Too many differences in the seed; stop
3791 } else if(nPoss > 1 && diffs == nPoss) {
3792 // There was one difference, but there was also one N,
3793 // so we already know where the difference is
3795 refc1 = "ACGT"[(int)ref[rir + nPos1]];
3797 mmpos1 += readSeedOverhang;
3799 char q = quals[mmpos1];
3800 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3801 if(ham > this->qualMax_) {
3802 // Exceeded quality limit
3807 refc2 = "ACGT"[(int)ref[rir + nPos2]];
3809 mmpos2 += readSeedOverhang;
3812 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3813 if(ham > this->qualMax_) {
3814 // Exceeded quality limit
3819 } else if(diffs > 0) {
3820 // Figure out which position mismatched
3821 uint64_t diff2 = diff;
3823 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
3824 assert_neq(0, diff);
3825 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos1 -= 8; }
3826 assert_neq(0, diff);
3827 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos1 -= 4; }
3828 assert_neq(0, diff);
3829 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos1 -= 2; }
3830 assert_neq(0, diff);
3831 if((diff & 0x3llu) == 0) { mmpos1--; }
3832 assert_neq(0, diff);
3833 assert_geq(mmpos1, 0);
3834 assert_lt(mmpos1, 32);
3835 uint32_t savedMmpos1 = mmpos1;
3836 mmpos1 -= anchorCushion;
3837 assert_lt(mmpos1, anchorBitPairs);
3838 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
3840 mmpos1 += readSeedOverhang;
3842 char q = quals[mmpos1];
3843 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3844 if(ham > this->qualMax_) {
3845 // Exceeded quality limit
3849 // Figure out the second mismatched position
3850 ASSERT_ONLY(uint64_t origDiff2 = diff2);
3851 diff2 &= ~(0xc000000000000000llu >> (uint64_t)((savedMmpos1) << 1));
3852 assert_neq(diff2, origDiff2);
3854 if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
3855 assert_neq(0, diff2);
3856 if((diff2 & 0xffffllu) == 0) { diff2 >>= 16llu; mmpos2 -= 8; }
3857 assert_neq(0, diff2);
3858 if((diff2 & 0xffllu) == 0) { diff2 >>= 8llu; mmpos2 -= 4; }
3859 assert_neq(0, diff2);
3860 if((diff2 & 0xfllu) == 0) { diff2 >>= 4llu; mmpos2 -= 2; }
3861 assert_neq(0, diff2);
3862 if((diff2 & 0x3llu) == 0) { mmpos2--; }
3863 assert_neq(0, diff2);
3864 assert_geq(mmpos2, 0);
3865 assert_lt(mmpos2, 32);
3866 mmpos2 -= anchorCushion;
3867 assert_neq(mmpos1, mmpos2);
3868 refc2 = "ACGT"[(int)ref[rir + mmpos2]];
3870 mmpos2 += readSeedOverhang;
3873 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
3874 if(ham > this->qualMax_) {
3875 // Exceeded quality limit
3878 if(mmpos2 < mmpos1) {
3879 uint32_t mmtmp = mmpos1;
3882 int refctmp = refc1;
3886 assert_lt(mmpos1, mmpos2);
3889 // If the seed is longer than the anchor, then scan the
3890 // rest of the seed characters
3891 bool foundHit = true;
3892 if(seedAnchorOverhang) {
3893 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
3894 int rc = (int)ref[rir + anchorBitPairs + j];
3896 // Oops, encountered an N in the reference in
3897 // the overhang portion of the candidate
3899 // (Note that we inverted hi earlier)
3902 // Skip out of the seedAnchorOverhang
3903 assert_eq(0, skipRightToLefts);
3904 skipRightToLefts = seedAnchorOverhang - j - 1;
3906 // ...and skip out of the rest of the read
3907 skipRightToLefts += readSeedOverhang;
3911 // Skip out of the seedAnchorOverhang
3912 assert_eq(0, skipLeftToRights);
3913 skipLeftToRights = anchorBitPairs + j;
3915 // ...and skip out of the rest of the read
3916 skipLeftToRights += readSeedOverhang;
3919 foundHit = false; // Skip this candidate
3922 uint32_t qoff = anchorBitPairs + j;
3924 qoff += readSeedOverhang;
3926 assert_lt(qoff, qlen);
3927 if((int)qry[qoff] != rc) {
3932 } else if(diffs == 2) {
3933 assert_eq(0xffffffff, mmpos2);
3935 assert_eq(-1, refc2);
3936 refc2 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
3938 assert_eq(1, diffs);
3939 assert_eq(0xffffffff, mmpos1);
3941 assert_eq(-1, refc1);
3942 refc1 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
3944 char q = phredCharToPhredQual(quals[qoff]);
3945 ham += mmPenalty(this->maqPenalty_, q);
3946 if(ham > this->qualMax_) {
3947 // Exceeded quality limit
3953 if(!foundHit) continue;
3955 // If the read is longer than the seed, then scan the rest
3956 // of the read characters; mismatches no longer count
3957 // toward the stratum or the 1-mm limit.
3958 // Vectors for holding edit information
3959 std::vector<uint32_t> nonSeedMms;
3960 std::vector<uint8_t> nonSeedRefcs;
3961 int mms = diffs; // start counting total mismatches
3962 if((qlen - slen) > 0) {
3963 // Going left-to-right
3964 for(uint32_t j = 0; j < readSeedOverhang; j++) {
3965 uint32_t roff = rir + slen + j;
3966 uint32_t qoff = slen + j;
3968 assert_geq(roff, qlen);
3972 int rc = (int)ref[roff];
3974 // Oops, encountered an N in the reference in
3975 // the overhang portion of the candidate
3977 // (Note that we inverted hi earlier)
3980 // Skip what's left of the readSeedOverhang
3981 skipRightToLefts = readSeedOverhang - j - 1;
3983 // ...and skip the seed if it's on the right
3984 skipRightToLefts += slen;
3988 // Skip what we've matched of the overhang
3989 skipLeftToRights = j;
3991 // ...and skip the seed if it's on the left
3992 skipLeftToRights += slen;
3995 foundHit = false; // Skip this candidate
3998 if((int)qry[qoff] != rc) {
3999 // Calculate quality of mismatched base
4000 char q = phredCharToPhredQual(quals[qoff]);
4001 ham += mmPenalty(this->maqPenalty_, q);
4002 if(ham > this->qualMax_) {
4003 // Exceeded quality limit
4007 // Legal mismatch outside of the anchor; record it
4009 nonSeedMms.push_back(qoff);
4010 assert_leq(nonSeedMms.size(), (size_t)mms);
4011 nonSeedRefcs.push_back("ACGTN"[rc]);
4014 if(!foundHit) continue;
4017 // Adjust ri if seed is on the right-hand side
4019 ri -= readSeedOverhang;
4020 rir -= readSeedOverhang;
4025 // By convention, the upstream mate's
4026 // coordinates go in the 'first' field
4027 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
4028 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4030 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
4031 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4033 if(pairs->find(p) != pairs->end()) {
4034 // We already found this hit! Continue.
4035 ASSERT_ONLY(duplicates++);
4043 if(this->verbose_) {
4044 cout << "About to report:" << endl;
4046 for(size_t i = 0; i < qlen; i++) {
4047 cout << (char)qry[i];
4051 for(size_t i = 0; i < qlen; i++) {
4052 cout << "ACGT"[ref[rir+i]];
4056 assert_leq(diffs, 2);
4057 assert_geq((size_t)mms, diffs);
4058 assert_lt(r2i, r2.size());
4059 assert_eq(re2[r2i], ri);
4060 ranges.resize(ranges.size()+1);
4061 Range& range = ranges.back();
4062 assert_eq((size_t)mms, r2[r2i].numMms);
4063 range.stratum = diffs;
4065 assert_eq(0, range.mms.size());
4066 assert_eq(0, range.refcs.size());
4068 ASSERT_ONLY(size_t mmcur = 0);
4069 if(seedOnLeft && diffs > 0) {
4070 assert_neq(mmpos1, 0xffffffff);
4071 assert_lt(mmpos1, qlen);
4072 assert_lt(mmcur, (size_t)mms);
4073 assert_eq(mmpos1, r2[r2i].mms[mmcur]);
4074 assert_neq(-1, refc1);
4075 assert_eq(refc1, r2[r2i].refcs[mmcur]);
4076 ASSERT_ONLY(mmcur++);
4077 range.mms.push_back(mmpos1);
4078 range.refcs.push_back(refc1);
4080 assert_eq(2, diffs);
4081 assert_neq(mmpos2, 0xffffffff);
4082 assert_lt(mmpos2, qlen);
4083 assert_lt(mmcur, (size_t)mms);
4084 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
4085 assert_neq(-1, refc2);
4086 assert_eq(refc2, r2[r2i].refcs[mmcur]);
4087 ASSERT_ONLY(mmcur++);
4088 range.mms.push_back(mmpos2);
4089 range.refcs.push_back(refc2);
4092 const size_t nonSeedMmsSz = nonSeedMms.size();
4093 for(size_t i = 0; i < nonSeedMmsSz; i++) {
4094 assert_neq(0xffffffff, nonSeedMms[i]);
4095 assert_lt(mmcur, (size_t)mms);
4096 assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
4097 range.mms.push_back(nonSeedMms[i]);
4098 assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
4099 ASSERT_ONLY(mmcur++);
4100 range.refcs.push_back(nonSeedRefcs[i]);
4102 if(!seedOnLeft && diffs > 0) {
4103 assert_neq(mmpos1, 0xffffffff);
4104 assert_lt(mmpos1, qlen);
4105 assert_lt(mmcur, (size_t)mms);
4106 assert_eq(mmpos1, r2[r2i].mms[mmcur]);
4107 assert_neq(-1, refc1);
4108 assert_eq(refc1, r2[r2i].refcs[mmcur]);
4109 ASSERT_ONLY(mmcur++);
4110 range.mms.push_back(mmpos1);
4111 range.refcs.push_back(refc1);
4113 assert_eq(2, diffs);
4114 assert_neq(mmpos2, 0xffffffff);
4115 assert_lt(mmpos2, qlen);
4116 assert_lt(mmcur, (size_t)mms);
4117 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
4118 assert_neq(-1, refc2);
4119 assert_eq(refc2, r2[r2i].refcs[mmcur]);
4120 ASSERT_ONLY(mmcur++);
4121 range.mms.push_back(mmpos2);
4122 range.refcs.push_back(refc2);
4125 assert_eq(mmcur, r2[r2i].mms.size());
4127 assert_eq((size_t)mms, range.mms.size());
4128 assert_eq((size_t)mms, range.refcs.size());
4130 results.push_back(ri);
4131 if(--numToFind == 0) return;
4133 assert_leq(duplicates, r2.size());
4134 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
4140 * Concrete RefAligner for finding nearby 3-mismatch hits given an
4143 template<typename TStr>
4144 class Seed3RefAligner : public RefAligner<TStr> {
4146 typedef seqan::String<seqan::Dna5> TDna5Str;
4147 typedef seqan::String<char> TCharStr;
4148 typedef std::vector<Range> TRangeVec;
4149 typedef std::vector<uint32_t> TU32Vec;
4150 typedef std::pair<uint64_t, uint64_t> TU64Pair;
4151 typedef std::set<TU64Pair> TSetPairs;
4155 Seed3RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) :
4156 RefAligner<TStr>(color, verbose, quiet, seedLen, qualMax, maqPenalty) { }
4158 virtual ~Seed3RefAligner() { }
4162 * This schematic shows the roles played by the begin, qbegin, end,
4163 * qend, halfway, slen, qlen, and lim variables:
4165 * seedOnLeft == true:
4167 * |< lim >|< qlen >|
4168 * --------------------------------------------------------------
4169 * | | slen | qlen-slen | | slen | qlen-slen |
4170 * --------------------------------------------------------------
4172 * begin & qbegin halfway qend end
4174 * seedOnLeft == false:
4176 * |< qlen >|< lim >|
4177 * --------------------------------------------------------------
4178 * | qlen-slen | slen | | qlen-slen | slen | |
4179 * --------------------------------------------------------------
4181 * begin qbegin halfway qend & end
4183 * Note that, for seeds longer than 32 base-pairs, the seed is
4184 * further subdivided into a 32-bit anchor and a seed overhang of
4187 void naiveFind(uint32_t numToFind,
4190 const TDna5Str& qry,
4191 const TCharStr& quals,
4198 bool seedOnLeft) const
4200 assert_gt(numToFind, 0);
4201 assert_gt(end, begin);
4202 const uint32_t qlen = seqan::length(qry);
4204 assert_geq(end - begin, qlen); // caller should have checked this
4205 assert_gt(this->seedLen_, 0);
4206 const uint32_t slen = min(qlen, this->seedLen_);
4207 uint32_t qend = end;
4208 uint32_t qbegin = begin;
4209 // If the seed is on the left-hand side of the alignment, then
4210 // leave a gap at the right-hand side of the interval;
4211 // otherwise, do the opposite
4213 // Leave gap on right-hand side of the interval
4216 // Leave gap on left-hand side of the interval
4219 // lim = number of alignments to try
4220 const uint32_t lim = qend - qbegin;
4221 // halfway = position in the reference to start at (and then
4222 // we work our way out to the right and to the left).
4223 const uint32_t halfway = qbegin + (lim >> 1);
4224 // Vectors for holding edit information
4225 std::vector<uint32_t> nonSeedMms;
4226 std::vector<uint8_t> nonSeedRefcs;
4228 for(uint32_t i = 1; i <= lim+1; i++) {
4229 uint32_t ri; // leftmost position in candidate alignment
4230 uint32_t rir; // same, minus begin; for indexing into ref[]
4232 ri = halfway + (i >> 1); rir = ri - begin;
4233 assert_leq(ri, qend);
4235 ri = halfway - (i >> 1); rir = ri - begin;
4236 assert_geq(ri, begin);
4239 // Do the naive comparison
4242 uint32_t mmOff1 = 0xffffffff;
4244 uint32_t mmOff2 = 0xffffffff;
4246 uint32_t mmOff3 = 0xffffffff;
4249 unsigned int ham = 0;
4251 nonSeedRefcs.clear();
4252 // Walk through each position of the alignment
4253 for(uint32_t jj = 0; jj < qlen; jj++) {
4256 // If seed is on the right, scan right-to-left
4261 uint32_t rirj = rir + j;
4263 assert_geq(rir, jj);
4264 rirj = rir - jj - 1;
4267 // Count Ns in the reference as mismatches
4268 const int q = (int)qry[j];
4269 const int r = (int)ref[rirj];
4272 if(q == 4 || r == 4 || q != r) {
4274 // Disallow alignments that involve an N in the
4276 const int r = (int)ref[rirj];
4278 // N in reference; bail on this alignment
4282 const int q = (int)qry[j];
4289 if(mms > 3 && jj < slen) {
4290 // More than one mismatch in the anchor; reject
4294 uint8_t qual = phredCharToPhredQual(quals[j]);
4295 ham += mmPenalty(this->maqPenalty_, qual);
4296 if(ham > this->qualMax_) {
4297 // Exceeded quality ceiling; reject
4300 } else if(mms == 1 && jj < slen) {
4301 // First mismatch in the anchor; remember offset
4306 } else if(mms == 2 && jj < slen) {
4307 // Second mismatch in the anchor; remember offset
4312 } else if(mms == 3 && jj < slen) {
4313 // Third mismatch in the anchor; remember offset
4319 // Legal mismatch outside of the anchor; record it
4320 nonSeedMms.push_back(j);
4321 assert_leq(nonSeedMms.size(), (size_t)mms);
4322 nonSeedRefcs.push_back("ACGTN"[r]);
4327 ranges.resize(ranges.size()+1);
4328 Range& range = ranges.back();
4329 assert_leq(seedMms, mms);
4330 range.stratum = seedMms;
4332 assert_eq(0, range.mms.size());
4333 assert_eq(0, range.refcs.size());
4335 // Be careful to add edits in left-to-right order
4336 // with respect to the read/alignment
4337 if(seedOnLeft && seedMms) {
4338 assert_lt(mmOff1, qlen);
4339 range.mms.push_back(mmOff1);
4340 range.refcs.push_back(refc1);
4342 assert_lt(mmOff1, mmOff2);
4343 assert_lt(mmOff2, qlen);
4344 range.mms.push_back(mmOff2);
4345 range.refcs.push_back(refc2);
4347 assert_lt(mmOff2, mmOff3);
4348 assert_lt(mmOff3, qlen);
4349 range.mms.push_back(mmOff3);
4350 range.refcs.push_back(refc3);
4354 const size_t nonSeedMmsSz = nonSeedMms.size();
4355 if(nonSeedMmsSz > 0) {
4357 for(size_t k = 0; k < nonSeedMmsSz; k++) {
4358 range.mms.push_back(nonSeedMms[k]);
4359 range.refcs.push_back(nonSeedRefcs[k]);
4362 for(size_t k = nonSeedMmsSz; k > 0; k--) {
4363 range.mms.push_back(nonSeedMms[k-1]);
4364 range.refcs.push_back(nonSeedRefcs[k-1]);
4368 if(!seedOnLeft && seedMms) {
4371 assert_lt(mmOff3, mmOff2);
4372 assert_lt(mmOff3, qlen);
4373 range.mms.push_back(mmOff3);
4374 range.refcs.push_back(refc3);
4376 assert_lt(mmOff2, mmOff1);
4377 assert_lt(mmOff2, qlen);
4378 range.mms.push_back(mmOff2);
4379 range.refcs.push_back(refc2);
4381 assert_lt(mmOff1, qlen);
4382 range.mms.push_back(mmOff1);
4383 range.refcs.push_back(refc1);
4386 assert_eq((size_t)mms, range.mms.size());
4387 assert_eq((size_t)mms, range.refcs.size());
4389 results.push_back(ri);
4391 results.push_back(ri - qlen);
4399 * This schematic shows the roles played by the begin, qbegin, end,
4400 * qend, halfway, slen, qlen, and lim variables:
4402 * seedOnLeft == true:
4404 * |< lim >|< qlen >|
4405 * --------------------------------------------------------------
4406 * | | slen | qlen-slen | | slen | qlen-slen |
4407 * --------------------------------------------------------------
4409 * begin & qbegin halfway qend end
4411 * seedOnLeft == false:
4414 * --------------------------------------------------------------
4415 * | qlen-slen | | qlen-slen | slen | | slen |
4416 * --------------------------------------------------------------
4418 * begin qbegin halfway qend end
4420 * Note that, for seeds longer than 32 base-pairs, the seed is
4421 * further subdivided into a 32-bit anchor and a seed overhang of
4424 virtual void anchor64Find(uint32_t numToFind,
4427 const TDna5Str& qry,
4428 const TCharStr& quals,
4433 TSetPairs* pairs = NULL,
4434 uint32_t aoff = 0xffffffff,
4435 bool seedOnLeft = false) const
4437 assert_gt(numToFind, 0);
4438 ASSERT_ONLY(const uint32_t rangesInitSz = ranges.size());
4439 ASSERT_ONLY(uint32_t duplicates = 0);
4440 ASSERT_ONLY(uint32_t r2i = 0);
4441 const uint32_t qlen = seqan::length(qry);
4443 assert_gt(end, begin);
4444 assert_geq(end - begin, qlen); // caller should have checked this
4445 assert_gt(this->seedLen_, 0);
4446 uint32_t slen = min(qlen, this->seedLen_);
4448 // Get results from the naive matcher for sanity-checking
4449 TRangeVec r2; TU32Vec re2;
4450 naiveFind(numToFind, tidx, ref, qry, quals, begin, end, r2,
4451 re2, pairs, aoff, seedOnLeft);
4453 const uint32_t anchorBitPairs = min<int>(slen, 32);
4454 const int lhsShift = ((anchorBitPairs - 1) << 1);
4455 const uint32_t anchorCushion = 32 - anchorBitPairs;
4456 // seedAnchorOverhang = # seed bases not included in the anchor
4457 const uint32_t seedAnchorOverhang = (slen <= anchorBitPairs ? 0 : (slen - anchorBitPairs));
4458 // seedAnchorOverhang = # seed bases not included in the anchor
4459 const uint32_t readSeedOverhang = (slen == qlen ? 0 : (qlen - slen));
4460 assert(anchorCushion == 0 || seedAnchorOverhang == 0);
4461 assert_eq(qlen, readSeedOverhang + slen);
4462 uint32_t qend = end;
4463 uint32_t qbegin = begin;
4465 // Leave read-sized gap on right-hand side of the interval
4468 // Leave seed-sized gap on right-hand side and
4469 // non-seed-sized gap on the left-hand side
4470 qbegin += readSeedOverhang;
4473 // lim = # possible alignments in the range
4474 const uint32_t lim = qend - qbegin;
4475 // halfway = point on the genome to radiate out from
4476 const uint32_t halfway = qbegin + (lim >> 1);
4477 uint64_t anchor = 0llu;
4478 uint64_t buffw = 0llu; // rotating ref sequence buffer
4479 // OR the 'diff' buffer with this so that we can always count
4480 // 'N's as mismatches
4481 uint64_t diffMask = 0llu;
4482 // Set up a mask that we'll apply to the two bufs every round
4483 // to discard bits that were rotated out of the anchor area
4484 uint64_t clearMask = 0xffffffffffffffffllu;
4485 bool useMask = false;
4486 if(anchorBitPairs < 32) {
4487 clearMask >>= ((32-anchorBitPairs) << 1);
4495 uint32_t skipLeftToRights = 0;
4496 uint32_t skipRightToLefts = 0;
4497 const uint32_t halfwayRi = halfway - begin;
4498 // Construct the 'anchor' 64-bit buffer so that it holds all of
4499 // the first 'anchorBitPairs' bit pairs of the query.
4500 for(uint32_t ii = 0; ii < anchorBitPairs; ii++) {
4503 // Fill in the anchor using characters from the right-
4504 // hand side of the query (but take the characters in
4505 // left-to-right order). Be sure to subtract slen from
4506 // qlen; not anchorBitPairs from qlen. We want the
4507 // characters in the seedAnchorOverhang region to be to
4508 // the right of the characters in the anchor.
4509 i = qlen - slen + ii;
4511 int c = (int)qry[i]; // next query character
4512 int r = (int)ref[halfwayRi + ii]; // next reference character
4514 // The reference character is an N; to mimic the
4515 // behavior of BW alignment, we have to skip all
4516 // alignments that involve an N in the reference. Set
4517 // the skip* variables accordingly.
4519 uint32_t lrSkips = ii;
4520 uint32_t rlSkips = qlen - ii;
4521 if(!seedOnLeft && readSeedOverhang) {
4522 lrSkips += readSeedOverhang;
4523 assert_geq(rlSkips, readSeedOverhang);
4524 rlSkips -= readSeedOverhang;
4526 // The right-to-left direction absorbs the candidate
4527 // alignment based at 'halfway'
4528 skipLeftToRights = max(skipLeftToRights, lrSkips);
4529 skipRightToLefts = max(skipRightToLefts, rlSkips);
4530 assert_leq(skipLeftToRights, qlen);
4531 assert_leq(skipRightToLefts, qlen);
4535 // Special case: query has an 'N'
4537 if(++nsInSeed > 3) {
4538 // More than one 'N' in the anchor region; can't
4539 // possibly have a 1-mismatch hit anywhere
4540 assert_eq(r2.size(), ranges.size() - rangesInitSz);
4541 return; // can't match if query has Ns
4544 nPos1 = (int)ii; // w/r/t LHS of anchor
4545 } else if(nsInSeed == 2) {
4546 nPos2 = (int)ii; // w/r/t LHS of anchor
4547 assert_gt(nPos2, nPos1);
4549 assert_eq(3, nsInSeed);
4550 nPos3 = (int)ii; // w/r/t LHS of anchor
4551 assert_gt(nPos3, nPos2);
4553 // Make it look like an 'A' in the anchor
4555 diffMask = (diffMask << 2llu) | 1llu;
4559 anchor = ((anchor << 2llu) | c);
4560 buffw = ((buffw << 2llu) | r);
4562 // Check whether read is disqualified by Ns inside the seed
4563 // region but outside the anchor region
4564 if(seedAnchorOverhang) {
4565 assert_lt(anchorBitPairs, slen);
4566 for(uint32_t ii = anchorBitPairs; ii < slen; ii++) {
4569 i = qlen - slen + ii;
4571 if((int)qry[i] == 4) {
4572 if(++nsInSeed > 3) {
4573 assert_eq(r2.size(), ranges.size() - rangesInitSz);
4574 return; // can't match if query has Ns
4579 assert_eq(anchorBitPairs, slen);
4581 uint64_t bufbw = buffw;
4582 // Slide the anchor out in either direction, alternating
4583 // between right-to-left and left-to-right shifts, until all of
4584 // the positions from qbegin to qend have been covered.
4586 uint32_t riHi = halfway;
4587 uint32_t rirHi = halfway - begin;
4588 uint32_t rirHiAnchor = rirHi + anchorBitPairs - 1;
4589 uint32_t riLo = halfway + 1;
4590 uint32_t rirLo = halfway - begin + 1;
4591 uint32_t lrSkips = anchorBitPairs;
4592 uint32_t rlSkips = qlen;
4593 if(!seedOnLeft && readSeedOverhang) {
4594 lrSkips += readSeedOverhang;
4595 assert_geq(rlSkips, readSeedOverhang);
4596 rlSkips -= readSeedOverhang;
4598 for(uint32_t i = 1; i <= lim + 1; i++) {
4599 int r; // new reference char
4601 assert_leq(skipLeftToRights, qlen);
4602 assert_leq(skipRightToLefts, qlen);
4605 // Moving left-to-right
4609 r = (int)ref[rirHiAnchor];
4612 skipLeftToRights = lrSkips;
4615 // Bring in new base pair at the least significant
4617 buffw = ((buffw << 2llu) | r);
4618 if(useMask) buffw &= clearMask;
4619 if(skipLeftToRights > 0) {
4623 diff = (buffw ^ anchor) | diffMask;
4626 // Moving right-to-left
4629 r = (int)ref[rirLo];
4632 skipRightToLefts = rlSkips;
4637 // Bring in new base pair at the most significant
4639 bufbw |= ((uint64_t)r << lhsShift);
4641 if(skipRightToLefts > 0) {
4645 diff = (bufbw ^ anchor) | diffMask;
4647 if((diff & 0xf000f000f000f000llu) &&
4648 (diff & 0x0f000f000f000f00llu) &&
4649 (diff & 0x00f000f000f000f0llu) &&
4650 (diff & 0x000f000f000f000fllu)) continue;
4651 if((diff & 0xc003c003c003c003llu) &&
4652 (diff & 0x3c003c003c003c00llu) &&
4653 (diff & 0x03c003c003c003c0llu) &&
4654 (diff & 0x003c003c003c003cllu)) continue;
4655 uint32_t ri = hi ? riLo : riHi;
4656 uint32_t rir = hi ? rirLo : rirHi;
4657 // Could use pop count
4658 uint8_t *diff8 = reinterpret_cast<uint8_t*>(&diff);
4659 // As a first cut, see if there are too many mismatches in
4660 // the first and last parts of the anchor
4661 uint32_t diffs = u8toMms[(int)diff8[0]] + u8toMms[(int)diff8[7]];
4662 if(diffs > 3) continue;
4663 diffs += u8toMms[(int)diff8[1]] +
4664 u8toMms[(int)diff8[2]] +
4665 u8toMms[(int)diff8[3]] +
4666 u8toMms[(int)diff8[4]] +
4667 u8toMms[(int)diff8[5]] +
4668 u8toMms[(int)diff8[6]];
4669 uint32_t mmpos1 = 0xffffffff;
4671 uint32_t mmpos2 = 0xffffffff;
4673 uint32_t mmpos3 = 0xffffffff;
4675 unsigned int ham = 0;
4677 // Too many differences in the seed; stop
4679 } else if(nPoss > 1 && diffs == nPoss) {
4680 // There was one difference, but there was also one N,
4681 // so we already know where the difference is
4683 refc1 = "ACGT"[(int)ref[rir + nPos1]];
4685 mmpos1 += readSeedOverhang;
4687 char q = quals[mmpos1];
4688 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4689 if(ham > this->qualMax_) {
4690 // Exceeded quality limit
4695 refc2 = "ACGT"[(int)ref[rir + nPos2]];
4697 mmpos2 += readSeedOverhang;
4700 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4701 if(ham > this->qualMax_) {
4702 // Exceeded quality limit
4707 refc3 = "ACGT"[(int)ref[rir + nPos3]];
4709 mmpos3 += readSeedOverhang;
4712 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4713 if(ham > this->qualMax_) {
4714 // Exceeded quality limit
4719 } else if(diffs > 0) {
4720 // Figure out which position mismatched
4721 uint64_t diff2 = diff;
4723 if((diff & 0xffffffffllu) == 0) { diff >>= 32llu; mmpos1 -= 16; }
4724 assert_neq(0, diff);
4725 if((diff & 0xffffllu) == 0) { diff >>= 16llu; mmpos1 -= 8; }
4726 assert_neq(0, diff);
4727 if((diff & 0xffllu) == 0) { diff >>= 8llu; mmpos1 -= 4; }
4728 assert_neq(0, diff);
4729 if((diff & 0xfllu) == 0) { diff >>= 4llu; mmpos1 -= 2; }
4730 assert_neq(0, diff);
4731 if((diff & 0x3llu) == 0) { mmpos1--; }
4732 assert_neq(0, diff);
4733 assert_geq(mmpos1, 0);
4734 assert_lt(mmpos1, 32);
4735 uint32_t savedMmpos1 = mmpos1;
4736 mmpos1 -= anchorCushion;
4737 assert_lt(mmpos1, anchorBitPairs);
4738 refc1 = "ACGT"[(int)ref[rir + mmpos1]];
4740 mmpos1 += readSeedOverhang;
4742 char q = quals[mmpos1];
4743 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4744 if(ham > this->qualMax_) {
4745 // Exceeded quality limit
4749 // Figure out the second mismatched position
4750 ASSERT_ONLY(uint64_t origDiff2 = diff2);
4751 diff2 &= ~(0xc000000000000000llu >> (uint64_t)((savedMmpos1) << 1));
4752 uint64_t diff3 = diff2;
4753 assert_neq(diff2, origDiff2);
4755 if((diff2 & 0xffffffffllu) == 0) { diff2 >>= 32llu; mmpos2 -= 16; }
4756 assert_neq(0, diff2);
4757 if((diff2 & 0xffffllu) == 0) { diff2 >>= 16llu; mmpos2 -= 8; }
4758 assert_neq(0, diff2);
4759 if((diff2 & 0xffllu) == 0) { diff2 >>= 8llu; mmpos2 -= 4; }
4760 assert_neq(0, diff2);
4761 if((diff2 & 0xfllu) == 0) { diff2 >>= 4llu; mmpos2 -= 2; }
4762 assert_neq(0, diff2);
4763 if((diff2 & 0x3llu) == 0) { mmpos2--; }
4764 assert_neq(0, diff2);
4765 assert_geq(mmpos2, 0);
4766 assert_lt(mmpos2, 32);
4767 uint32_t savedMmpos2 = mmpos2;
4768 mmpos2 -= anchorCushion;
4769 assert_neq(mmpos1, mmpos2);
4770 refc2 = "ACGT"[(int)ref[rir + mmpos2]];
4772 mmpos2 += readSeedOverhang;
4775 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4776 if(ham > this->qualMax_) {
4777 // Exceeded quality limit
4780 if(mmpos2 < mmpos1) {
4781 uint32_t mmtmp = mmpos1;
4784 int refctmp = refc1;
4788 assert_lt(mmpos1, mmpos2);
4790 // Figure out the second mismatched position
4791 ASSERT_ONLY(uint32_t origDiff3 = diff3);
4792 diff3 &= ~(0xc000000000000000llu >> (uint64_t)((savedMmpos2) << 1));
4793 assert_neq(diff3, origDiff3);
4795 if((diff3 & 0xffffffffllu) == 0) { diff3 >>= 32llu; mmpos3 -= 16; }
4796 assert_neq(0, diff3);
4797 if((diff3 & 0xffffllu) == 0) { diff3 >>= 16llu; mmpos3 -= 8; }
4798 assert_neq(0, diff3);
4799 if((diff3 & 0xffllu) == 0) { diff3 >>= 8llu; mmpos3 -= 4; }
4800 assert_neq(0, diff3);
4801 if((diff3 & 0xfllu) == 0) { diff3 >>= 4llu; mmpos3 -= 2; }
4802 assert_neq(0, diff3);
4803 if((diff3 & 0x3llu) == 0) { mmpos3--; }
4804 assert_neq(0, diff3);
4805 assert_geq(mmpos3, 0);
4806 assert_lt(mmpos3, 32);
4807 mmpos3 -= anchorCushion;
4808 assert_neq(mmpos2, mmpos3);
4809 assert_neq(mmpos1, mmpos3);
4810 refc3 = "ACGT"[(int)ref[rir + mmpos3]];
4812 mmpos3 += readSeedOverhang;
4815 ham += mmPenalty(this->maqPenalty_, phredCharToPhredQual(q));
4816 if(ham > this->qualMax_) {
4817 // Exceeded quality limit
4820 if(mmpos3 < mmpos1) {
4821 uint32_t mmtmp = mmpos1;
4825 int refctmp = refc1;
4829 } else if(mmpos3 < mmpos2) {
4830 uint32_t mmtmp = mmpos2;
4833 int refctmp = refc2;
4837 assert_lt(mmpos1, mmpos2);
4838 assert_lt(mmpos2, mmpos3);
4842 // If the seed is longer than the anchor, then scan the
4843 // rest of the seed characters
4844 bool foundHit = true;
4845 if(seedAnchorOverhang) {
4846 for(uint32_t j = 0; j < seedAnchorOverhang; j++) {
4847 int rc = (int)ref[rir + anchorBitPairs + j];
4849 // Oops, encountered an N in the reference in
4850 // the overhang portion of the candidate
4852 // (Note that we inverted hi earlier)
4855 // Skip out of the seedAnchorOverhang
4856 assert_eq(0, skipRightToLefts);
4857 skipRightToLefts = seedAnchorOverhang - j - 1;
4859 // ...and skip out of the rest of the read
4860 skipRightToLefts += readSeedOverhang;
4864 // Skip out of the seedAnchorOverhang
4865 assert_eq(0, skipLeftToRights);
4866 skipLeftToRights = anchorBitPairs + j;
4868 // ...and skip out of the rest of the read
4869 skipLeftToRights += readSeedOverhang;
4872 foundHit = false; // Skip this candidate
4875 uint32_t qoff = anchorBitPairs + j;
4877 qoff += readSeedOverhang;
4879 assert_lt(qoff, qlen);
4880 if((int)qry[qoff] != rc) {
4885 } else if(diffs == 3) {
4886 assert_eq(0xffffffff, mmpos3);
4888 assert_eq(-1, refc3);
4889 refc3 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
4890 } else if(diffs == 2) {
4891 assert_eq(0xffffffff, mmpos2);
4893 assert_eq(-1, refc2);
4894 refc2 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
4896 assert_eq(1, diffs);
4897 assert_eq(0xffffffff, mmpos1);
4899 assert_eq(-1, refc1);
4900 refc1 = "ACGT"[(int)ref[rir + anchorBitPairs + j]];
4902 char q = phredCharToPhredQual(quals[qoff]);
4903 ham += mmPenalty(this->maqPenalty_, q);
4904 if(ham > this->qualMax_) {
4905 // Exceeded quality limit
4911 if(!foundHit) continue;
4913 // If the read is longer than the seed, then scan the rest
4914 // of the read characters; mismatches no longer count
4915 // toward the stratum or the 1-mm limit.
4916 // Vectors for holding edit information
4917 std::vector<uint32_t> nonSeedMms;
4918 std::vector<uint8_t> nonSeedRefcs;
4919 int mms = diffs; // start counting total mismatches
4920 if((qlen - slen) > 0) {
4921 // Going left-to-right
4922 for(uint32_t j = 0; j < readSeedOverhang; j++) {
4923 uint32_t roff = rir + slen + j;
4924 uint32_t qoff = slen + j;
4926 assert_geq(roff, qlen);
4930 int rc = (int)ref[roff];
4932 // Oops, encountered an N in the reference in
4933 // the overhang portion of the candidate
4935 // (Note that we inverted hi earlier)
4938 // Skip what's left of the readSeedOverhang
4939 skipRightToLefts = readSeedOverhang - j - 1;
4941 // ...and skip the seed if it's on the right
4942 skipRightToLefts += slen;
4946 // Skip what we've matched of the overhang
4947 skipLeftToRights = j;
4949 // ...and skip the seed if it's on the left
4950 skipLeftToRights += slen;
4953 foundHit = false; // Skip this candidate
4956 if((int)qry[qoff] != rc) {
4957 // Calculate quality of mismatched base
4958 char q = phredCharToPhredQual(quals[qoff]);
4959 ham += mmPenalty(this->maqPenalty_, q);
4960 if(ham > this->qualMax_) {
4961 // Exceeded quality limit
4965 // Legal mismatch outside of the anchor; record it
4967 nonSeedMms.push_back(qoff);
4968 assert_leq(nonSeedMms.size(), (size_t)mms);
4969 nonSeedRefcs.push_back("ACGTN"[rc]);
4972 if(!foundHit) continue;
4975 // Adjust ri if seed is on the right-hand side
4977 ri -= readSeedOverhang;
4978 rir -= readSeedOverhang;
4983 // By convention, the upstream mate's
4984 // coordinates go in the 'first' field
4985 p.first = ((uint64_t)tidx << 32) | (uint64_t)ri;
4986 p.second = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4988 p.second = ((uint64_t)tidx << 32) | (uint64_t)ri;
4989 p.first = ((uint64_t)tidx << 32) | (uint64_t)aoff;
4991 if(pairs->find(p) != pairs->end()) {
4992 // We already found this hit! Continue.
4993 ASSERT_ONLY(duplicates++);
5001 if(this->verbose_) {
5002 cout << "About to report:" << endl;
5004 for(size_t i = 0; i < qlen; i++) {
5005 cout << (char)qry[i];
5009 for(size_t i = 0; i < qlen; i++) {
5010 cout << "ACGT"[ref[rir+i]];
5014 assert_leq(diffs, 3);
5015 assert_geq((size_t)mms, diffs);
5016 assert_lt(r2i, r2.size());
5017 assert_eq(re2[r2i], ri);
5018 ranges.resize(ranges.size()+1);
5019 Range& range = ranges.back();
5020 assert_eq((size_t)mms, r2[r2i].numMms);
5021 range.stratum = diffs;
5023 assert_eq(0, range.mms.size());
5024 assert_eq(0, range.refcs.size());
5026 ASSERT_ONLY(size_t mmcur = 0);
5027 if(seedOnLeft && diffs > 0) {
5028 assert_neq(mmpos1, 0xffffffff);
5029 assert_lt(mmpos1, qlen);
5030 assert_lt(mmcur, (size_t)mms);
5031 assert_eq(mmpos1, r2[r2i].mms[mmcur]);
5032 assert_neq(-1, refc1);
5033 assert_eq(refc1, r2[r2i].refcs[mmcur]);
5034 ASSERT_ONLY(mmcur++);
5035 range.mms.push_back(mmpos1);
5036 range.refcs.push_back(refc1);
5038 assert_neq(mmpos2, 0xffffffff);
5039 assert_lt(mmpos2, qlen);
5040 assert_lt(mmcur, (size_t)mms);
5041 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
5042 assert_neq(-1, refc2);
5043 assert_eq(refc2, r2[r2i].refcs[mmcur]);
5044 ASSERT_ONLY(mmcur++);
5045 range.mms.push_back(mmpos2);
5046 range.refcs.push_back(refc2);
5048 assert_eq(3, diffs);
5049 assert_neq(mmpos3, 0xffffffff);
5050 assert_lt(mmpos3, qlen);
5051 assert_lt(mmcur, (size_t)mms);
5052 assert_eq(mmpos3, r2[r2i].mms[mmcur]);
5053 assert_neq(-1, refc3);
5054 assert_eq(refc3, r2[r2i].refcs[mmcur]);
5055 ASSERT_ONLY(mmcur++);
5056 range.mms.push_back(mmpos3);
5057 range.refcs.push_back(refc3);
5061 const size_t nonSeedMmsSz = nonSeedMms.size();
5062 for(size_t i = 0; i < nonSeedMmsSz; i++) {
5063 assert_neq(0xffffffff, nonSeedMms[i]);
5064 assert_lt(mmcur, (size_t)mms);
5065 assert_eq(nonSeedMms[i], r2[r2i].mms[mmcur]);
5066 range.mms.push_back(nonSeedMms[i]);
5067 assert_eq(nonSeedRefcs[i], r2[r2i].refcs[mmcur]);
5068 ASSERT_ONLY(mmcur++);
5069 range.refcs.push_back(nonSeedRefcs[i]);
5071 if(!seedOnLeft && diffs > 0) {
5072 assert_neq(mmpos1, 0xffffffff);
5073 assert_lt(mmpos1, qlen);
5074 assert_lt(mmcur, (size_t)mms);
5075 assert_eq(mmpos1, r2[r2i].mms[mmcur]);
5076 assert_neq(-1, refc1);
5077 assert_eq(refc1, r2[r2i].refcs[mmcur]);
5078 ASSERT_ONLY(mmcur++);
5079 range.mms.push_back(mmpos1);
5080 range.refcs.push_back(refc1);
5082 assert_neq(mmpos2, 0xffffffff);
5083 assert_lt(mmpos2, qlen);
5084 assert_lt(mmcur, (size_t)mms);
5085 assert_eq(mmpos2, r2[r2i].mms[mmcur]);
5086 assert_neq(-1, refc2);
5087 assert_eq(refc2, r2[r2i].refcs[mmcur]);
5088 ASSERT_ONLY(mmcur++);
5089 range.mms.push_back(mmpos2);
5090 range.refcs.push_back(refc2);
5092 assert_eq(3, diffs);
5093 assert_neq(mmpos3, 0xffffffff);
5094 assert_lt(mmpos3, qlen);
5095 assert_lt(mmcur, (size_t)mms);
5096 assert_eq(mmpos3, r2[r2i].mms[mmcur]);
5097 assert_neq(-1, refc3);
5098 assert_eq(refc3, r2[r2i].refcs[mmcur]);
5099 ASSERT_ONLY(mmcur++);
5100 range.mms.push_back(mmpos3);
5101 range.refcs.push_back(refc3);
5105 assert_eq(mmcur, r2[r2i].mms.size());
5107 assert_eq((size_t)mms, range.mms.size());
5108 assert_eq((size_t)mms, range.refcs.size());
5110 results.push_back(ri);
5111 if(--numToFind == 0) return;
5113 assert_leq(duplicates, r2.size());
5114 assert_geq(r2.size() - duplicates, ranges.size() - rangesInitSz);
5119 #endif /* REF_ALIGNER_H_ */