11 #include <seqan/sequence.h>
13 #include "assert_helpers.h"
15 #include "threading.h"
26 * Classes for dealing with reporting alignments.
30 using namespace seqan;
32 /// Constants for the various output modes
42 /// Names of the various output modes
43 static const std::string output_type_names[] = {
51 typedef pair<uint32_t,uint32_t> U32Pair;
54 * Encapsulates a hit, including a text-id/text-offset pair, a pattern
55 * id, and a boolean indicating whether it matched as its forward or
56 * reverse-complement version.
60 Hit() : stratum(-1) { }
63 * Initialize this Ent to represent the given hit.
// Copies this Hit's position/stratum info into a HitSetEnt; 'amap'
// is an annotation map consulted elsewhere in the (elided) body.
65 void toHitSetEnt(HitSetEnt& hse, AnnotationMap *amap) const {
69 hse.stratum = stratum;
// No mismatches recorded -> nothing more to transfer
71 if(mms.count() == 0) return;
// Walk every read position; for positions flagged in 'mms' the
// (elided) loop body presumably appends an edit, then the lines
// below stamp it as a mismatch and record the reference character.
72 for(int i = 0; i < (int)length(); i++) {
75 hse.back().type = EDIT_TYPE_MM;
77 hse.back().chr = refcs[i];
83 * Convert a list of Hit objects to a single HitSet object suitable
// All hits are assumed to be for the same read: the name/sequence/
// qualities are taken from hits.front().
86 static void toHitSet(const std::vector<Hit>& hits, HitSet& hs,
89 if(hits.empty()) return;
// Copy read metadata from the first hit
91 hs.name = hits.front().patName;
92 hs.seq = hits.front().patSeq;
93 hs.qual = hits.front().quals;
94 hs.color = hits.front().color;
// Canonicalize to the forward orientation: if the first hit was
// reverse-complement, flip the sequence (color-aware) and qualities
95 if(!hits.front().fw) {
97 reverseComplementInPlace(hs.seq, hs.color);
98 reverseInPlace(hs.qual);
100 // Convert hits to entries
101 hs.ents.resize(hits.size());
102 for(size_t i = 0; i < hs.ents.size(); i++) {
103 hits[i].toHitSetEnt(hs.ents[i], amap);
108 * Populate a vector of Hits with hits from the given HitSet.
// Inverse of toHitSet(): expands a compact HitSet back into one Hit
// per entry, re-deriving per-hit sequence/quality orientation.
110 static void fromHitSet(std::vector<Hit>& hits, const HitSet& hs) {
112 hits.resize(hs.size());
113 for(size_t i = 0; i < hs.size(); i++) {
115 hits[i].oms = hs[i].oms;
116 hits[i].fw = hs[i].fw;
117 hits[i].stratum = hs[i].stratum;
118 hits[i].cost = hs[i].cost;
// Mate info is deliberately dropped here (commented-out source)
119 hits[i].mate = 0; //hs[i].mate;
120 hits[i].patName = hs.name;
121 hits[i].patSeq = hs.seq;
122 hits[i].quals = hs.qual;
123 hits[i].color = hs.color;
124 size_t len = seqan::length(hs.seq);
// (elided guard) restore reverse-complement orientation for this hit
126 reverseComplementInPlace(hits[i].patSeq, hs.color);
127 reverseInPlace(hits[i].quals);
// Rebuild the mismatch bitmask and reference characters from the
// HitSet's per-entry edit list (only EDIT_TYPE_MM edits apply)
129 hits[i].refcs.resize(len);
130 for(size_t j = 0; j < hs[i].size(); j++) {
131 if(hs[i][j].type != EDIT_TYPE_MM) continue;
132 hits[i].mms.set(hs[i][j].pos);
133 hits[i].refcs[hs[i][j].pos] = hs[i][j].chr;
138 U32Pair h; /// reference index & offset
139 U32Pair mh; /// reference index & offset for mate
140 uint32_t patId; /// read index
141 String<char> patName; /// read name
142 String<Dna5> patSeq; /// read sequence
143 String<Dna5> colSeq; /// original color sequence, not decoded
144 String<char> quals; /// read qualities
145 String<char> colQuals;/// original color qualities, not decoded
// NOTE: FixedBitset<1024> caps the representable read length at 1024
// positions for both mismatch masks below.
146 FixedBitset<1024> mms; /// nucleotide mismatch mask
147 FixedBitset<1024> cmms; /// color mismatch mask (if relevant)
148 vector<char> refcs; /// reference characters for mms
149 vector<char> crefcs; /// reference characters for cmms
150 uint32_t oms; /// # of other possible mappings; 0 -> this is unique
151 bool fw; /// orientation of read in alignment
152 bool mfw; /// orientation of mate in alignment
153 uint16_t mlen; /// length of mate
154 int8_t stratum; /// stratum of hit (= mismatches in seed)
155 uint32_t cost; /// total cost, factoring in stratum and quality penalty
156 uint8_t mate; /// matedness; 0 = not a mate
157 /// 1 = upstream mate
158 /// 2 = downstream mate
159 bool color; /// read is in colorspace?
160 uint32_t seed; /// pseudo-random seed for aligned read
163 * Return true if this Hit is internally consistent.  Otherwise,
164 * throw an assertion.
// Invariant: the stratum is encoded in the high bits (>= bit 14) of
// 'cost', so cost can never be below stratum << 14. Assumes stratum
// is non-negative here (a -1 sentinel would make the shift UB).
167 assert_geq(cost, (uint32_t)(stratum << 14));
171 size_t length() const { return seqan::length(patSeq); }
// Member-wise copy assignment. (Several assignments, including the
// h/mh position pairs and the return statement, fall in elided lines.)
173 Hit& operator = (const Hit &other) {
176 this->patId = other.patId;
177 this->patName = other.patName;
178 this->patSeq = other.patSeq;
179 this->colSeq = other.colSeq;
180 this->quals = other.quals;
181 this->colQuals= other.colQuals;
182 this->mms = other.mms;
// NOTE(review): cmms is assigned twice in this operator (here and
// again near the end) — harmless but redundant; one copy suffices.
183 this->cmms = other.cmms;
184 this->refcs = other.refcs;
185 this->crefcs = other.crefcs;
186 this->oms = other.oms;
188 this->mfw = other.mfw;
189 this->mlen = other.mlen;
190 this->stratum = other.stratum;
191 this->cost = other.cost;
192 this->mate = other.mate;
193 this->color = other.color;
194 this->cmms = other.cmms;
195 this->seed = other.seed;
201 * Compare hits a and b; a < b if its cost is less than b's. If
202 * there's a tie, break on position and orientation.
// Strict-weak ordering on (cost, position pair, orientation); used
// when sorting candidate hits so cheaper alignments come first.
204 class HitCostCompare {
206 bool operator() (const Hit& a, const Hit& b) {
207 if(a.cost < b.cost) return true;
208 if(a.cost > b.cost) return false;
209 if(a.h < b.h) return true;
210 if(a.h > b.h) return false;
211 if(a.fw < b.fw) return true;
212 if(a.fw > b.fw) return false;
217 /// Sort by text-id then by text-offset
218 bool operator< (const Hit& a, const Hit& b);
221 * Table for holding recalibration counts, along the lines of the table
222 * presented in the SOAPsnp paper in Genome Res.  Each element maps a
223 * read allele (o), quality score (q), cycle (c), and reference allele
224 * (H), to a count of how many times that combination is observed in a
225 * reported alignment.
227 * RecalTable is not synchronized, so it's assumed that threads are
228 * incrementing the counters from within critical sections.
// Constructor: derives the bit-shift layout used by calcIdx() —
// quality occupies the low (6 - qualShift) bits, then 2 bits each
// for reference allele, subject allele, and finally the cycle.
232 RecalTable(int maxCycle,
234 int qualShift) : maxCycle_(maxCycle),
236 qualShift_(qualShift),
237 shift1_(6 - qualShift_),
238 shift2_(shift1_ + 2),
239 shift3_(shift2_ + 2),
// Sanity warnings: a degenerate table would never accumulate counts
243 cerr << "Warning: maximum cycle for recalibration table is 0" << endl;
244 } else if(maxQual >> qualShift == 0) {
245 cerr << "Warning: maximum quality value " << maxQual << ", when shifted, is 0" << endl;
246 } else if(qualShift > 5) {
247 cerr << "Warning: quality shift value " << qualShift << " exceeds ceiling of 5" << endl;
250 len_ = maxCycle_ * 4 /* subj alleles*/ * 4 /* ref alleles */ * 64 /* quals */;
251 ents_ = new uint32_t[len_];
253 throw std::bad_alloc();
// NOTE(review): 'len_ << 2' hard-codes sizeof(uint32_t) == 4;
// 'len_ * sizeof(uint32_t)' would state the intent explicitly.
255 memset(ents_, 0, len_ << 2);
256 } catch(std::bad_alloc& e) {
257 cerr << "Error allocating recalibration table with " << len_ << " entries" << endl;
// Destructor body: the NULL check is redundant (delete[] NULL is a
// no-op) but harmless.
264 if(ents_ != NULL) delete[] ents_;
268 * Factor a new alignment into the recalibration table.
270 void commitHit(const Hit& h) {
271 // Iterate through the pattern from 5' to 3', calculate the
272 // shifted quality value, obtain the reference character, and
273 // increment the appropriate counter
275 for(int i = 0; i < (int)h.length(); i++) {
// 'ii' mirrors i for reverse-strand hits (5'-to-3' walk)
278 ii = h.length() - ii - 1;
280 int qc = (int)h.patSeq[ii];
// NOTE(review): refcs is indexed with 'i' while patSeq/quals use the
// orientation-adjusted 'ii' — verify this asymmetry is intentional.
283 rc = charToDna5[(int)h.refcs[i]];
// Convert Phred+33 ASCII quality to a numeric value
286 int q = (int)h.quals[ii]-33;
289 ents_[calcIdx(i, qc, rc, q)]++;
294 * Print the contents of the recalibration table.
// One row per cycle; within a row, counts for each (subject allele,
// reference allele, shifted quality) combination, tab-separated.
296 void print (std::ostream& out) const {
297 if(ents_ == NULL) return;
298 const int lim = maxCycle_;
299 for(int i = 0; i < lim; i++) {
300 out << "t" << i << "\t";
301 // Iterate over subject alleles
302 for(int j = 0; j < 4; j++) {
303 // Iterate over reference alleles
304 for(int k = 0; k < 4; k++) {
305 // Iterate over qualities
306 int lim2 = maxQual_ >> qualShift_;
307 for(int l = 0; l < lim2; l++) {
308 out << ents_[calcIdx(i, j, k, l)] << '\t';
319 * Calculate index into the ents_ array given cycle, subject
320 * allele, reference allele, and (shifted) quality.
// Packs the four coordinates into one flat index using the shifts
// computed in the constructor.
322 int calcIdx(int cyc, int sa, int ra, int q) const {
323 int ret = q | (ra << shift1_) | (sa << shift2_) | (cyc << shift3_);
324 assert_lt(ret, len_);
330 const int qualShift_;
// Parameter-list fragment (filenames for the aligned-, unaligned- and
// maxed-read dump options) spliced into HitSink constructor signatures.
338 #define DECL_HIT_DUMPS \
339 const std::string& dumpAl, \
340 const std::string& dumpUnal, \
341 const std::string& dumpMax
// Matching member-initializer fragment for the DECL_HIT_DUMPS params.
343 #define INIT_HIT_DUMPS \
344 dumpAlBase_(dumpAl), \
345 dumpUnalBase_(dumpUnal), \
346 dumpMaxBase_(dumpMax)
// Extended parameter-list fragment; also threads through the
// recalibration table and reference-name map.
348 #define DECL_HIT_DUMPS2 \
352 RecalTable *recalTable, \
353 std::vector<std::string>* refnames
355 #define PASS_HIT_DUMPS \
360 #define PASS_HIT_DUMPS2 \
368 * Encapsulates an object that accepts hits, optionally retains them in
369 * a vector, and does something else with them according to
370 * descendent's implementation of pure virtual member reportHitImpl().
// Single-stream constructor: all alignments go to one OutFileBuf.
374 explicit HitSink(OutFileBuf* out,
379 vector<string>* refnames = NULL) :
387 onePairFile_(onePairFile),
388 sampleMax_(sampleMax),
394 numReportedPaired_(0llu),
396 ssmode_(ios_base::out)
398 _outs.push_back(out);
400 MUTEX_INIT(_locks[0]);
401 MUTEX_INIT(_mainlock);
406 * Open a number of output streams; usually one per reference
407 * sequence.  For now, we give them names refXXXXX.map where XXXXX
408 * is the 0-padded reference index.  Someday we may want to include
409 * the name of the reference sequence in the filename somehow.
// Multi-stream constructor (--refout style): one stream per reference.
411 explicit HitSink(size_t numOuts,
416 vector<string>* refnames = NULL) :
423 onePairFile_(onePairFile),
424 sampleMax_(sampleMax),
426 ssmode_(ios_base::out)
428 // Open all files for writing and initialize all locks
429 for(size_t i = 0; i < numOuts; i++) {
430 _outs.push_back(NULL); // we open output streams lazily
432 MUTEX_INIT(_locks[i]);
434 MUTEX_INIT(_mainlock);
439 * Destroy HitSink object.
444 // Delete all non-NULL output streams
445 for(size_t i = 0; i < _outs.size(); i++) {
446 if(_outs[i] != NULL) {
456 * Call this whenever this HitSink is wrapped by a new
457 * HitSinkPerThread.  This helps us keep track of whether the main
458 * lock or any of the per-stream locks will be contended.
465 * Called by concrete subclasses to figure out which elements of
466 * the _outs/_locks array to use when outputting the alignment.
// Out-of-range reference indexes collapse to stream 0 (the single-
// stream case); the in-range mapping is in an elided line.
468 size_t refIdxToStreamIdx(size_t refIdx) {
469 if(refIdx >= _outs.size()) return 0;
474 * Append a single hit to the given output stream.
476 virtual void append(ostream& o, const Hit& h) = 0;
479 * Report a batch of hits; all in the given vector.
481 virtual void reportHits(vector<Hit>& hs) {
482 reportHits(hs, 0, hs.size());
486 * Report a batch of hits from a vector, perhaps subsetting it.
// Formats each hit into a per-hit stringstream, then writes it to the
// stream for its reference, taking/releasing per-stream locks only
// when the target stream changes between consecutive hits.
488 virtual void reportHits(vector<Hit>& hs, size_t start, size_t end) {
489 assert_geq(end, start);
490 if(end-start == 0) return;
491 bool paired = hs[start].mate > 0;
492 // Sort reads so that those against the same reference sequence
// (so stream switches — and hence lock churn — are minimized)
494 if(_outs.size() > 1 && end-start > 2) {
495 sort(hs.begin() + start, hs.begin() + end);
498 for(size_t i = start; i < end; i++) {
499 const Hit& h = hs[i];
// 'diff' = did the target stream change vs. the previous hit?
// (the i > start guard lives in an elided line — presumably hs[i-1]
// is only read when i != start; verify against full source)
503 diff = (refIdxToStreamIdx(h.h.first) != refIdxToStreamIdx(hs[i-1].h.first));
504 if(diff) unlock(hs[i-1].h.first);
// Format into a fixed 4K stack buffer to avoid heap allocation
506 ostringstream ss(ssmode_);
507 ss.rdbuf()->pubsetbuf(buf, 4096);
509 if(i == start || diff) {
512 out(h.h.first).writeChars(buf, ss.tellp());
514 unlock(hs[end-1].h.first);
// Update global counters (under the main lock in elided lines)
519 if(paired) numReportedPaired_ += (end-start);
520 else numReported_ += (end-start);
// Feed one hit / many hits into the recalibration table, if enabled.
524 void commitHit(const Hit& hit) {
525 if(recalTable_ != NULL) {
526 recalTable_->commitHit(hit);
530 void commitHits(const std::vector<Hit>& hits) {
531 if(recalTable_ != NULL) {
532 const size_t sz = hits.size();
533 for(size_t i = 0; i < sz; i++) {
540 * Called when all alignments are complete.  It is assumed that no
541 * synchronization is necessary.
// Closes streams, prints the end-of-run summary to stderr (and
// Hadoop-style counters if requested), then dumps the recal table.
543 void finish(bool hadoopOut) {
544 // Close output streams
547 // Print information about how many unpaired and/or paired
548 // reads were aligned.
549 uint64_t tot = numAligned_ + numUnaligned_ + numMaxed_;
550 double alPct = 0.0, unalPct = 0.0, maxPct = 0.0;
// (elided guard: percentages only computed when tot > 0)
552 alPct = 100.0 * (double)numAligned_ / (double)tot;
553 unalPct = 100.0 * (double)numUnaligned_ / (double)tot;
554 maxPct = 100.0 * (double)numMaxed_ / (double)tot;
556 cerr << "# reads processed: " << tot << endl;
557 cerr << "# reads with at least one reported alignment: "
558 << numAligned_ << " (" << fixed << setprecision(2)
559 << alPct << "%)" << endl;
560 cerr << "# reads that failed to align: "
561 << numUnaligned_ << " (" << fixed << setprecision(2)
562 << unalPct << "%)" << endl;
// Wording differs depending on whether -M (sample) or -m (suppress)
// was in effect; the branch condition is in an elided line.
565 cerr << "# reads with alignments sampled due to -M: "
566 << numMaxed_ << " (" << fixed << setprecision(2)
567 << maxPct << "%)" << endl;
569 cerr << "# reads with alignments suppressed due to -m: "
570 << numMaxed_ << " (" << fixed << setprecision(2)
571 << maxPct << "%)" << endl;
// Summary of how many alignments were actually reported, split by
// unpaired vs. paired (paired counter counts mates, hence >> 1).
575 assert_eq(0llu, numReported_);
576 cerr << "No alignments" << endl;
578 else if(numReportedPaired_ > 0 && numReported_ == 0) {
579 cerr << "Reported " << (numReportedPaired_ >> 1)
580 << " paired-end alignments to " << _outs.size()
581 << " output stream(s)" << endl;
583 else if(numReported_ > 0 && numReportedPaired_ == 0) {
584 cerr << "Reported " << numReported_
585 << " alignments to " << _outs.size()
586 << " output stream(s)" << endl;
589 assert_gt(numReported_, 0);
590 assert_gt(numReportedPaired_, 0);
591 cerr << "Reported " << (numReportedPaired_ >> 1)
592 << " paired-end alignments and " << numReported_
593 << " singleton alignments to " << _outs.size()
594 << " output stream(s)" << endl;
// Hadoop streaming counter lines (parsed by the Hadoop framework)
597 cerr << "reporter:counter:Bowtie,Reads with reported alignments," << numAligned_ << endl;
598 cerr << "reporter:counter:Bowtie,Reads with no alignments," << numUnaligned_ << endl;
599 cerr << "reporter:counter:Bowtie,Reads exceeding -m limit," << numMaxed_ << endl;
600 cerr << "reporter:counter:Bowtie,Unpaired alignments reported," << numReported_ << endl;
601 cerr << "reporter:counter:Bowtie,Paired alignments reported," << numReportedPaired_ << endl;
604 // Print the recalibration table.
605 if(recalTable_ != NULL) {
606 recalTable_->print(cout);
610 /// Returns the alignment output stream; if the stream needs to be
611 /// created, create it
// Lazily opens refXXXXX.map for the stream backing 'refIdx'
// (cf. the multi-stream constructor, which pushes NULLs).
612 OutFileBuf& out(size_t refIdx) {
613 size_t strIdx = refIdxToStreamIdx(refIdx);
614 if(_outs[strIdx] == NULL) {
// Zero-pad the stream index to (at least) five digits
618 if (strIdx < 10) oss << "0000";
619 else if(strIdx < 100) oss << "000";
620 else if(strIdx < 1000) oss << "00";
621 else if(strIdx < 10000) oss << "0";
622 oss << strIdx << ".map";
623 _outs[strIdx] = new OutFileBuf(oss.str().c_str(), ssmode_ == ios_base::binary);
625 assert(_outs[strIdx] != NULL);
626 return *(_outs[strIdx]);
630 * Lock the monolithic lock for this HitSink.  This is useful when,
631 * for example, outputting a read to an unaligned-read file.
634 MUTEX_LOCK(_mainlock);
638 * Unlock the monolithic lock for this HitSink.  This is useful
639 * when, for example, outputting a read to an unaligned-read file.
642 MUTEX_UNLOCK(_mainlock);
646 * Return true iff this HitSink dumps aligned reads to an output
647 * stream (i.e., iff --alfa or --alfq are specified).
649 bool dumpsAlignedReads() {
650 return dumpAlignFlag_;
654 * Return true iff this HitSink dumps unaligned reads to an output
655 * stream (i.e., iff --unfa or --unfq are specified).
657 bool dumpsUnalignedReads() {
658 return dumpUnalignFlag_;
662 * Return true iff this HitSink dumps maxed-out reads to an output
663 * stream (i.e., iff --maxfa or --maxfq are specified).
// Includes the unaligned flag because dumpMaxed() falls back to
// dumpUnal() when no dedicated maxed-out file was requested.
665 bool dumpsMaxedReads() {
666 return dumpMaxedFlag_ || dumpUnalignFlag_;
670 * Return true iff this HitSink dumps either unaligned or maxed-
671 * out reads to an output stream (i.e., iff --unfa, --maxfa,
672 * --unfq, or --maxfq are specified).
675 return dumpAlignFlag_ || dumpUnalignFlag_ || dumpMaxedFlag_;
679 * Dump an aligned read to all of the appropriate output streams.
680 * Be careful to synchronize correctly - there may be multiple
681 * simultaneous writers.
// Output files are opened lazily, under the relevant dump lock, on
// the first read dumped; quality files are only opened if the read
// actually carries an original-quality buffer.
683 void dumpAlign(PatternSourcePerThread& p) {
684 if(!dumpAlignFlag_) return;
685 if(!p.paired() || onePairFile_) {
686 // Dump unpaired read to an aligned-read file of the same format
687 if(!dumpAlBase_.empty()) {
688 MUTEX_LOCK(dumpAlignLock_);
689 if(dumpAl_ == NULL) {
690 assert(dumpAlQv_ == NULL);
691 dumpAl_ = openOf(dumpAlBase_, 0, "");
692 assert(dumpAl_ != NULL);
693 if(p.bufa().qualOrigBufLen > 0) {
694 dumpAlQv_ = openOf(dumpAlBase_ + ".qual", 0, "");
695 assert(dumpAlQv_ != NULL);
// Write the read (and qualities, if present) verbatim in its
// original input format
698 dumpAl_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
699 if(dumpAlQv_ != NULL) {
700 dumpAlQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
702 MUTEX_UNLOCK(dumpAlignLock_);
705 // Dump paired-end read to an aligned-read file (or pair of
706 // files) of the same format
707 if(!dumpAlBase_.empty()) {
708 MUTEX_LOCK(dumpAlignLockPE_);
709 if(dumpAl_1_ == NULL) {
710 assert(dumpAlQv_1_ == NULL);
711 assert(dumpAlQv_2_ == NULL);
712 dumpAl_1_ = openOf(dumpAlBase_, 1, "");
713 dumpAl_2_ = openOf(dumpAlBase_, 2, "");
714 assert(dumpAl_1_ != NULL);
715 assert(dumpAl_2_ != NULL);
716 if(p.bufa().qualOrigBufLen > 0) {
717 dumpAlQv_1_ = openOf(dumpAlBase_ + ".qual", 1, "");
718 dumpAlQv_2_ = openOf(dumpAlBase_ + ".qual", 2, "");
719 assert(dumpAlQv_1_ != NULL);
720 assert(dumpAlQv_2_ != NULL);
// bufa = first mate, bufb = second mate
723 dumpAl_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
724 dumpAl_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen);
725 if(dumpAlQv_1_ != NULL) {
726 dumpAlQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
727 dumpAlQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen);
729 MUTEX_UNLOCK(dumpAlignLockPE_);
735 * Dump an unaligned read to all of the appropriate output streams.
736 * Be careful to synchronize correctly - there may be multiple
737 * simultaneous writers.
// Mirrors dumpAlign(), but for reads with no reported alignments.
739 void dumpUnal(PatternSourcePerThread& p) {
740 if(!dumpUnalignFlag_) return;
741 if(!p.paired() || onePairFile_) {
742 // Dump unpaired read to an unaligned-read file of the same format
743 if(!dumpUnalBase_.empty()) {
744 MUTEX_LOCK(dumpUnalLock_);
745 if(dumpUnal_ == NULL) {
746 assert(dumpUnalQv_ == NULL);
747 dumpUnal_ = openOf(dumpUnalBase_, 0, "");
748 assert(dumpUnal_ != NULL);
749 if(p.bufa().qualOrigBufLen > 0) {
750 dumpUnalQv_ = openOf(dumpUnalBase_ + ".qual", 0, "");
751 assert(dumpUnalQv_ != NULL);
754 dumpUnal_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
755 if(dumpUnalQv_ != NULL) {
756 dumpUnalQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
758 MUTEX_UNLOCK(dumpUnalLock_);
761 // Dump paired-end read to an unaligned-read file (or pair
762 // of files) of the same format
763 if(!dumpUnalBase_.empty()) {
764 MUTEX_LOCK(dumpUnalLockPE_);
765 if(dumpUnal_1_ == NULL) {
// NOTE(review): this assert is a tautology (it repeats the enclosing
// if-condition); the parallel code in dumpAlign asserts the *Qv*
// pointers here — likely this was meant to be dumpUnalQv_1_.
766 assert(dumpUnal_1_ == NULL);
767 assert(dumpUnal_2_ == NULL);
768 dumpUnal_1_ = openOf(dumpUnalBase_, 1, "");
769 dumpUnal_2_ = openOf(dumpUnalBase_, 2, "");
770 assert(dumpUnal_1_ != NULL);
771 assert(dumpUnal_2_ != NULL);
772 if(p.bufa().qualOrigBufLen > 0) {
773 dumpUnalQv_1_ = openOf(dumpUnalBase_ + ".qual", 1, "");
774 dumpUnalQv_2_ = openOf(dumpUnalBase_ + ".qual", 2, "");
777 dumpUnal_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
778 dumpUnal_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen);
779 if(dumpUnalQv_1_ != NULL) {
780 dumpUnalQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
781 dumpUnalQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen);
783 MUTEX_UNLOCK(dumpUnalLockPE_);
789 * Dump a maxed-out read to all of the appropriate output streams.
790 * Be careful to synchronize correctly - there may be multiple
791 * simultaneous writers.
793 void dumpMaxed(PatternSourcePerThread& p) {
// No dedicated maxed-out file: fall back to treating the read as
// unaligned (this is why dumpsMaxedReads() also checks the unaligned
// flag), then return (elided).
794 if(!dumpMaxedFlag_) {
795 if(dumpUnalignFlag_) dumpUnal(p);
798 if(!p.paired() || onePairFile_) {
799 // Dump unpaired read to an maxed-out-read file of the same format
800 if(!dumpMaxBase_.empty()) {
801 MUTEX_LOCK(dumpMaxLock_);
802 if(dumpMax_ == NULL) {
803 dumpMax_ = openOf(dumpMaxBase_, 0, "");
804 assert(dumpMax_ != NULL);
805 if(p.bufa().qualOrigBufLen > 0) {
806 dumpMaxQv_ = openOf(dumpMaxBase_ + ".qual", 0, "");
809 dumpMax_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
810 if(dumpMaxQv_ != NULL) {
811 dumpMaxQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
813 MUTEX_UNLOCK(dumpMaxLock_);
816 // Dump paired-end read to a maxed-out-read file (or pair
817 // of files) of the same format
818 if(!dumpMaxBase_.empty()) {
819 MUTEX_LOCK(dumpMaxLockPE_);
820 if(dumpMax_1_ == NULL) {
821 assert(dumpMaxQv_1_ == NULL);
822 assert(dumpMaxQv_2_ == NULL);
823 dumpMax_1_ = openOf(dumpMaxBase_, 1, "");
824 dumpMax_2_ = openOf(dumpMaxBase_, 2, "");
825 assert(dumpMax_1_ != NULL);
826 assert(dumpMax_2_ != NULL);
827 if(p.bufa().qualOrigBufLen > 0) {
828 dumpMaxQv_1_ = openOf(dumpMaxBase_ + ".qual", 1, "");
829 dumpMaxQv_2_ = openOf(dumpMaxBase_ + ".qual", 2, "");
832 dumpMax_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
833 dumpMax_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen);
834 if(dumpMaxQv_1_ != NULL) {
835 dumpMaxQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
836 dumpMaxQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen);
838 MUTEX_UNLOCK(dumpMaxLockPE_);
844 * Report a maxed-out read.  Typically we do nothing, but we might
845 * want to print a placeholder when output is chained.
847 virtual void reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p) {
854 * Report an unaligned read.  Typically we do nothing, but we might
855 * want to print a placeholder when output is chained.
857 virtual void reportUnaligned(PatternSourcePerThread& p) {
865 /// Implementation of hit-report
// Single-hit path: bumps the paired counter for mated hits (the
// unpaired counter update is in an elided line).
866 virtual void reportHit(const Hit& h) {
871 if(h.mate > 0) numReportedPaired_++;
878 * Close (and flush) all OutFileBufs.
881 // Flush and close all non-NULL output streams
882 for(size_t i = 0; i < _outs.size(); i++) {
883 if(_outs[i] != NULL && !_outs[i]->closed()) {
890 * Lock the output buffer for the output stream for reference with
891 * index 'refIdx'.  By default, hits for all references are
892 * directed to the same output stream, but if --refout is
893 * specified, each reference has its own reference stream.
895 void lock(size_t refIdx) {
896 size_t strIdx = refIdxToStreamIdx(refIdx);
897 MUTEX_LOCK(_locks[strIdx]);
901 * Lock the output buffer for the output stream for reference with
902 * index 'refIdx'.  By default, hits for all references are
903 * directed to the same output stream, but if --refout is
904 * specified, each reference has its own reference stream.
906 void unlock(size_t refIdx) {
907 size_t strIdx = refIdxToStreamIdx(refIdx);
908 MUTEX_UNLOCK(_locks[strIdx]);
911 vector<OutFileBuf*> _outs; /// the alignment output stream(s)
912 bool _deleteOuts; /// Whether to delete elements of _outs upon exit
913 RecalTable *recalTable_; /// recalibration table
914 vector<string>* _refnames; /// map from reference indexes to names
915 int _numWrappers; /// # threads owning a wrapper for this HitSink
916 vector<MUTEX_T> _locks; /// pthreads mutexes for per-file critical sections
917 MUTEX_T _mainlock; /// pthreads mutexes for fields of this object
919 // Output filenames for dumping
920 std::string dumpAlBase_;
921 std::string dumpUnalBase_;
922 std::string dumpMaxBase_;
// All dump streams below are opened lazily (see dumpAlign/dumpUnal/
// dumpMaxed) and start out NULL via initDumps().
927 // Output streams for dumping sequences
928 std::ofstream *dumpAl_; // for single-ended reads
929 std::ofstream *dumpAl_1_; // for first mates
930 std::ofstream *dumpAl_2_; // for second mates
931 std::ofstream *dumpUnal_; // for single-ended reads
932 std::ofstream *dumpUnal_1_; // for first mates
933 std::ofstream *dumpUnal_2_; // for second mates
934 std::ofstream *dumpMax_; // for single-ended reads
935 std::ofstream *dumpMax_1_; // for first mates
936 std::ofstream *dumpMax_2_; // for second mates
938 // Output streams for dumping qualities
939 std::ofstream *dumpAlQv_; // for single-ended reads
940 std::ofstream *dumpAlQv_1_; // for first mates
941 std::ofstream *dumpAlQv_2_; // for second mates
942 std::ofstream *dumpUnalQv_; // for single-ended reads
943 std::ofstream *dumpUnalQv_1_; // for first mates
944 std::ofstream *dumpUnalQv_2_; // for second mates
945 std::ofstream *dumpMaxQv_; // for single-ended reads
946 std::ofstream *dumpMaxQv_1_; // for first mates
947 std::ofstream *dumpMaxQv_2_; // for second mates
950 * Open an ofstream with given name; output error message and quit
// mateType 0 = single-ended (name used as-is), 1/2 = first/second
// mate: "_1"/"_2" is inserted before the final extension (or
// appended, plus 'suffix', when the name has no dot).
953 std::ofstream* openOf(const std::string& name,
955 const std::string& suffix)
957 std::string s = name;
958 size_t dotoff = name.find_last_of(".");
960 if(dotoff == string::npos) {
961 s += "_1"; s += suffix;
963 s = name.substr(0, dotoff) + "_1" + s.substr(dotoff);
965 } else if(mateType == 2) {
966 if(dotoff == string::npos) {
967 s += "_2"; s += suffix;
969 s = name.substr(0, dotoff) + "_2" + s.substr(dotoff);
971 } else if(mateType != 0) {
972 cerr << "Bad mate type " << mateType << endl; throw 1;
974 std::ofstream* tmp = new ofstream(s.c_str(), ios::out);
// Failure path distinguishes single- vs paired-end in its message
977 cerr << "Could not open single-ended aligned/unaligned-read file for writing: " << name << endl;
979 cerr << "Could not open paired-end aligned/unaligned-read file for writing: " << name << endl;
987 * Initialize all the locks for dumping.
// Also NULLs every lazily-opened dump stream and derives the
// tri-state dump flags from whether a base filename was supplied.
990 dumpAl_ = dumpAl_1_ = dumpAl_2_ = NULL;
991 dumpUnal_ = dumpUnal_1_ = dumpUnal_2_ = NULL;
992 dumpMax_ = dumpMax_1_ = dumpMax_2_ = NULL;
993 dumpAlQv_ = dumpAlQv_1_ = dumpAlQv_2_ = NULL;
994 dumpUnalQv_ = dumpUnalQv_1_ = dumpUnalQv_2_ = NULL;
995 dumpMaxQv_ = dumpMaxQv_1_ = dumpMaxQv_2_ = NULL;
996 dumpAlignFlag_ = !dumpAlBase_.empty();
997 dumpUnalignFlag_ = !dumpUnalBase_.empty();
998 dumpMaxedFlag_ = !dumpMaxBase_.empty();
999 MUTEX_INIT(dumpAlignLock_);
1000 MUTEX_INIT(dumpAlignLockPE_);
1001 MUTEX_INIT(dumpUnalLock_);
1002 MUTEX_INIT(dumpUnalLockPE_);
1003 MUTEX_INIT(dumpMaxLock_);
1004 MUTEX_INIT(dumpMaxLockPE_);
// Close and free every dump stream that was actually opened.
// (Pointers are not re-NULLed; callers must not reuse them after this.)
1007 void destroyDumps() {
1008 if(dumpAl_ != NULL) { dumpAl_->close(); delete dumpAl_; }
1009 if(dumpAl_1_ != NULL) { dumpAl_1_->close(); delete dumpAl_1_; }
1010 if(dumpAl_2_ != NULL) { dumpAl_2_->close(); delete dumpAl_2_; }
1011 if(dumpUnal_ != NULL) { dumpUnal_->close(); delete dumpUnal_; }
1012 if(dumpUnal_1_ != NULL) { dumpUnal_1_->close(); delete dumpUnal_1_; }
1013 if(dumpUnal_2_ != NULL) { dumpUnal_2_->close(); delete dumpUnal_2_; }
1014 if(dumpMax_ != NULL) { dumpMax_->close(); delete dumpMax_; }
1015 if(dumpMax_1_ != NULL) { dumpMax_1_->close(); delete dumpMax_1_; }
1016 if(dumpMax_2_ != NULL) { dumpMax_2_->close(); delete dumpMax_2_; }
1017 if(dumpAlQv_ != NULL) { dumpAlQv_->close(); delete dumpAlQv_; }
1018 if(dumpAlQv_1_ != NULL) { dumpAlQv_1_->close(); delete dumpAlQv_1_; }
1019 if(dumpAlQv_2_ != NULL) { dumpAlQv_2_->close(); delete dumpAlQv_2_; }
1020 if(dumpUnalQv_ != NULL) { dumpUnalQv_->close(); delete dumpUnalQv_; }
1021 if(dumpUnalQv_1_ != NULL) { dumpUnalQv_1_->close(); delete dumpUnalQv_1_; }
1022 if(dumpUnalQv_2_ != NULL) { dumpUnalQv_2_->close(); delete dumpUnalQv_2_; }
1023 if(dumpMaxQv_ != NULL) { dumpMaxQv_->close(); delete dumpMaxQv_; }
1024 if(dumpMaxQv_1_ != NULL) { dumpMaxQv_1_->close(); delete dumpMaxQv_1_; }
1025 if(dumpMaxQv_2_ != NULL) { dumpMaxQv_2_->close(); delete dumpMaxQv_2_; }
1028 // Locks for dumping
1029 MUTEX_T dumpAlignLock_;
1030 MUTEX_T dumpAlignLockPE_; // _1 and _2
1031 MUTEX_T dumpUnalLock_;
1032 MUTEX_T dumpUnalLockPE_; // _1 and _2
1033 MUTEX_T dumpMaxLock_;
1034 MUTEX_T dumpMaxLockPE_; // _1 and _2
1036 // false -> no dumping
1037 bool dumpAlignFlag_;
1038 bool dumpUnalignFlag_;
1039 bool dumpMaxedFlag_;
1041 volatile bool first_; /// true -> first hit hasn't yet been reported
1042 volatile uint64_t numAligned_; /// # reads with >= 1 alignment
1043 volatile uint64_t numUnaligned_;/// # reads with no alignments
1044 volatile uint64_t numMaxed_; /// # reads with # alignments exceeding -m ceiling
1045 volatile uint64_t numReported_; /// # single-ended alignments reported
1046 volatile uint64_t numReportedPaired_; /// # paired-end alignments reported
1047 bool quiet_; /// true -> don't print alignment stats at the end
1048 ios_base::openmode ssmode_; /// output mode for stringstreams
1052 * A per-thread wrapper for a HitSink.  Incorporates state that a
1053 * single search thread cares about.
1055 class HitSinkPerThread {
// max = -m/-M ceiling on hits per read; n = max hits to report
1057 HitSinkPerThread(HitSink& sink, uint32_t max, uint32_t n) :
1059 _bestRemainingStratum(0),
1060 _numValidHits(0llu),
1071 virtual ~HitSinkPerThread() { }
1073 /// Return the vector of retained hits
1074 vector<Hit>& retainedHits() { return _hits; }
1076 /// Finalize current read
// Decides, from the per-read hit count, whether the read is maxed /
// unaligned / aligned, routes it to the proper dump and report
// paths, then flushes (at most _n of) the buffered hits to _sink.
1077 virtual uint32_t finishRead(PatternSourcePerThread& p, bool report, bool dump) {
1078 uint32_t ret = finishReadImpl();
1079 _bestRemainingStratum = 0;
1081 _bufferedHits.clear();
1084 bool maxed = (ret > _max);
1085 bool unal = (ret == 0);
1086 if(dump && (unal || maxed)) {
1087 // Either no reportable hits were found or the number of
1088 // reportable hits exceeded the -m limit specified by the
1090 assert(ret == 0 || ret > _max);
1091 if(maxed) _sink.dumpMaxed(p);
1092 else _sink.dumpUnal(p);
1096 // Report that the read maxed-out; useful for chaining output
1097 if(dump) _sink.reportMaxed(_bufferedHits, p);
1098 _bufferedHits.clear();
1100 // Report that the read failed to align; useful for chaining output
1101 if(dump) _sink.reportUnaligned(p);
1103 // Flush buffered hits
1104 assert_gt(_bufferedHits.size(), 0);
// Truncate to the -k/-n reporting ceiling before flushing
1105 if(_bufferedHits.size() > _n) {
1106 _bufferedHits.resize(_n);
1108 _sink.reportHits(_bufferedHits);
1110 ret = _bufferedHits.size();
1111 _bufferedHits.clear();
1113 assert_eq(0, _bufferedHits.size());
1117 virtual uint32_t finishReadImpl() = 0;
1117 virtual uint32_t finishReadImpl() = 0;
1120 * Implementation for hit reporting; update per-thread _hits and
1121 * _numReportableHits variables and call the master HitSink to do the actual
1124 virtual void bufferHit(const Hit& h, int stratum) {
1126 // Ensure all buffered hits have the same patid
1127 for(size_t i = 1; i < _bufferedHits.size(); i++) {
1128 assert_eq(_bufferedHits[0].patId, _bufferedHits[i].patId);
1131 _bufferedHits.push_back(h);
1135 * Concrete subclasses override this to (possibly) report a hit and
1136 * return true iff the caller should continue to report more hits.
1138 virtual bool reportHit(const Hit& h, int stratum) {
1144 /// Return the number of valid hits so far (not necessarily
1145 /// reported). It's up to the concrete subclasses
1146 uint64_t numValidHits() { return _numValidHits; }
1149 * Return true if there are no more reportable hits.
1151 bool finishedWithStratum(int stratum) {
1152 bool ret = finishedWithStratumImpl(stratum);
1153 _bestRemainingStratum = stratum+1;
1158 * Use the given set of hits as a starting point. By default, we don't
1160 virtual bool setHits(HitSet& hs) {
1162 cerr << "Error: default setHits() called with non-empty HitSet" << endl;
1169 * Return true if there are no reportable hits with the given cost
1172 virtual bool irrelevantCost(uint16_t cost) {
1177 * Concrete subclasses override this to determine whether the
1178 * search routine should keep searching after having finished
1179 * reporting all alignments at the given stratum.
1181 virtual bool finishedWithStratumImpl(int stratum) = 0;
1183 /// The mhits maximum
1184 uint32_t overThresh() { return _max; }
1186 /// Whether this thread, for this read, knows that we have already
1187 /// exceeded the mhits maximum
1188 bool exceededOverThresh() { return hitsForThisRead_ > _max; }
1190 /// Return whether we span strata
1191 virtual bool spanStrata() = 0;
1193 /// Return whether we report only the best possible hits
1194 virtual bool best() = 0;
1197 * Return true iff the underlying HitSink dumps unaligned or
1201 return _sink.dumpsReads();
1205 * Return true iff there are currently no buffered hits.
1207 bool empty() const {
1208 return _bufferedHits.empty();
1212 * Return the number of currently buffered hits.
1215 return _bufferedHits.size();
1219 * Return max # hits to report (*2 in paired-end mode because mates
1222 virtual uint32_t maxHits() {
1227 HitSink& _sink; /// Ultimate destination of reported hits
1228 /// Least # mismatches in alignments that will be reported in the
1229 /// future. Updated by the search routine.
1230 int _bestRemainingStratum;
1231 /// # hits reported to this HitSink so far (not all of which were
1232 /// necesssary reported to _sink)
1233 uint64_t _numValidHits;
1234 vector<Hit> _hits; /// Repository for retained hits
1235 /// Buffered hits, to be reported and flushed at end of read-phase
1236 vector<Hit> _bufferedHits;
1238 // Following variables are declared in the parent but maintained in
1239 // the concrete subcalsses
1240 uint32_t hitsForThisRead_; /// # hits for this read so far
1241 uint32_t _max; /// don't report any hits if there were > _max
1242 uint32_t _n; /// report at most _n hits
1246 * Abstract parent factory for HitSinkPerThreads.
// One factory per reporting policy; each search thread asks it for a
// private HitSinkPerThread wrapping the shared HitSink.
1248 class HitSinkPerThreadFactory {
1250 virtual ~HitSinkPerThreadFactory() { }
1251 virtual HitSinkPerThread* create() const = 0;
1252 virtual HitSinkPerThread* createMult(uint32_t m) const = 0;
1254 /// Free memory associated with a per-thread hit sink
1255 virtual void destroy(HitSinkPerThread* sink) const {
1256 assert(sink != NULL);
1257 // Free the HitSinkPerThread
1263 * Report first N good alignments encountered; trust search routine
1264 * to try alignments in something approximating a best-first order.
1265 * Best used in combination with a stringent alignment policy.
1267 class NGoodHitSinkPerThread : public HitSinkPerThread {
1270 NGoodHitSinkPerThread(
1274 HitSinkPerThread(sink, max, n)
1277 virtual bool spanStrata() {
1278 return true; // we span strata
1281 virtual bool best() {
1282 return false; // we settle for "good" hits
1285 /// Finalize current read
1286 virtual uint32_t finishReadImpl() {
1287 uint32_t ret = hitsForThisRead_;
1288 hitsForThisRead_ = 0;
1293 * Report and then return true if we've already reported N good
1294 * hits. Ignore the stratum - it's not relevant for finding "good"
1297 virtual bool reportHit(const Hit& h, int stratum) {
1298 HitSinkPerThread::reportHit(h, stratum);
1300 if(hitsForThisRead_ > _max) {
1301 return true; // done - report nothing
1303 //if(hitsForThisRead_ <= _n) {
1304 // Only report hit if we haven't
1305 bufferHit(h, stratum);
// NOTE(review): this condition is also true when _max < _n, not only
// when _max is unset (0xffffffff) — confirm the comment below matches
// the intended policy.
1307 if(hitsForThisRead_ == _n &&
1308 (_max == 0xffffffff || _max < _n))
1310 return true; // already reported N good hits and max isn't set; stop!
1312 return false; // not at N or max yet; keep going
1316 * Always return false; the search routine should only stop once
1317 * reportHit() says it has reported N hits.
1319 virtual bool finishedWithStratumImpl(int stratum) { return false; }
1323 * Concrete factory for FirstNGoodHitSinkPerThreads.
1325 class NGoodHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1327 NGoodHitSinkPerThreadFactory(
1337 * Allocate a new NGoodHitSinkPerThread object on the heap,
1338 * using the parameters given in the constructor.
1340 virtual HitSinkPerThread* create() const {
1341 return new NGoodHitSinkPerThread(sink_, n_, max_);
// As create(), but multiply the limits by 'm'; 0xffffffff is treated
// as the "unset" sentinel and is deliberately left unscaled.
1343 virtual HitSinkPerThread* createMult(uint32_t m) const {
1344 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1345 uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m);
1346 return new NGoodHitSinkPerThread(sink_, n, max);
1356 * Report the first N best alignments encountered in a single
1357 * alignment stratum assuming that we're receiving the alignments in
1360 class NBestFirstStratHitSinkPerThread : public HitSinkPerThread {
1363 NBestFirstStratHitSinkPerThread(
1368 HitSinkPerThread(sink, max, n),
1369 bestStratum_(999), mult_(mult)
1373 * false -> we do not allow strata to be spanned
1375 virtual bool spanStrata() {
1376 return false; // we do not span strata
1380 * true -> we report best hits
1382 virtual bool best() {
1387 * Report the hit; return true iff we've already reported enough
1388 * hits and the search routine should stop.
1389 virtual bool reportHit(const Hit& h, int stratum) {
1390 HitSinkPerThread::reportHit(h, stratum);
1391 // This hit is within the best possible remaining stratum,
1392 // so it should definitely count
1394 // It doesn't exceed the limit, so buffer it
1395 if(stratum < bestStratum_) {
1396 bestStratum_ = stratum;
1398 if(hitsForThisRead_ > _max) {
1399 return true; // done - report nothing
1401 //if(hitsForThisRead_ <= _n) {
1402 bufferHit(h, stratum);
1404 if(hitsForThisRead_ == _n &&
1405 (_max == 0xffffffff || _max < _n))
1407 return true; // already reported N good hits; stop!
1409 return false; // not at N yet; keep going
1413 * Finalize current read by reporting any buffered hits from best
1414 * to worst until they're all reported or until we've reported all
1417 virtual uint32_t finishReadImpl() {
1418 uint32_t ret = hitsForThisRead_;
1419 hitsForThisRead_ = 0;
1421 const size_t sz = _bufferedHits.size();
1422 for(size_t i = 0; i < sz; i++) {
1423 // Set 'oms' according to the number of other alignments
1425 _bufferedHits[i].oms = (sz / mult_) - 1;
1431 * If we had any alignments at all and we're now moving on to a new
1432 * stratum, then we're done.
1434 virtual bool finishedWithStratumImpl(int stratum) {
1435 return hitsForThisRead_ > 0;
1439 * If there have been any hits reported so far, classify any
1440 * subsequent alignments with higher strata as irrelevant.
1442 virtual bool irrelevantCost(uint16_t cost) {
1443 if(hitsForThisRead_) {
// The stratum is encoded in the top two bits of the 16-bit cost
// (cost >> 14); see also updateCutoff() in ChainingHitSinkPerThread.
1444 // irrelevant iff at worse stratum
1445 return ((int)cost >> 14) > bestStratum_;
1452 int bestStratum_; /// best stratum observed so far (999 = none yet)
1453 uint32_t mult_; /// number of batched-up alignments
1457 * Concrete factory for NBestStratHitSinkPerThread.
1459 class NBestFirstStratHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1461 NBestFirstStratHitSinkPerThreadFactory(
1471 * Allocate a new NBestFirstStratHitSinkPerThread object on the
1472 * heap, using the parameters given in the constructor.
1474 virtual HitSinkPerThread* create() const {
1475 return new NBestFirstStratHitSinkPerThread(sink_, n_, max_, 1);
// Scaled variant: 0xffffffff is the "unset" sentinel, left unscaled.
1477 virtual HitSinkPerThread* createMult(uint32_t m) const {
1478 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1479 uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m);
1480 return new NBestFirstStratHitSinkPerThread(sink_, n, max, m);
// Per-thread sink used in chained (multi-phase) alignment: maintains a
// HitSet (hs_) in parallel with _bufferedHits.
1492 class ChainingHitSinkPerThread : public HitSinkPerThread {
1495 ChainingHitSinkPerThread(HitSink& sink,
1500 HitSinkPerThread(sink, max, n),
1501 mult_(mult), strata_(strata), cutoff_(0xffff)
1508 * Return true iff we're allowed to span strata.
1510 virtual bool spanStrata() { return !strata_; }
1513 * true -> we report best hits
1515 virtual bool best() { return true; }
1518 * Report and then return false if we've already reported N.
1520 virtual bool reportHit(const Hit& h, int stratum) {
1521 HitSinkPerThread::reportHit(h, stratum);
1522 assert(hs_->sorted());
1523 assert(hs_ != NULL);
1524 assert_eq(h.stratum, stratum);
1525 assert_eq(1, mult_);
1526 assert(consistentStrata());
1527 assert(!irrelevantCost(h.cost));
1529 if(!hs_->empty() && strata_ && stratum < hs_->front().stratum) {
1531 _bufferedHits.clear();
1532 hitsForThisRead_ = 0;
1534 assert(consistentStrata());
1537 if(!hs_->empty() && hs_->tryReplacing(h.h, h.fw, h.cost, replPos)) {
1538 if(replPos != 0xffffffff) {
1539 // Replaced an existing hit
1540 assert_lt(replPos, _bufferedHits.size());
1541 _bufferedHits[replPos] = h;
1544 // Size didn't change, so no need to check against _max and _n
1545 assert(hs_->sorted());
1546 assert(consistentStrata());
1550 hs_->back().h = h.h;
1551 hs_->back().fw = h.fw;
1552 hs_->back().stratum = h.stratum;
1553 hs_->back().cost = h.cost;
1555 if(hs_->size() > _max) {
1556 assert_eq(hs_->size(), _bufferedHits.size());
1557 return true; // done - report nothing
1559 _bufferedHits.push_back(h);
1561 hs_->size() == _n &&
1562 (_max == 0xffffffff || _max < _n))
1564 assert_eq(hs_->size(), _bufferedHits.size());
1565 return true; // already reported N good hits; stop!
1568 assert(consistentStrata());
1570 assert_eq(hs_->size(), _bufferedHits.size());
1572 return false; // not at N yet; keep going
1576 * Finalize current read by reporting any buffered hits from best
1577 * to worst until they're all reported or until we've reported all
1580 virtual uint32_t finishReadImpl() {
1581 assert_eq(1, mult_);
1582 assert(hs_ != NULL);
1583 assert(consistentStrata());
1584 uint32_t ret = hitsForThisRead_;
1585 hitsForThisRead_ = 0;
1586 if(!hs_->empty() && hs_->size() < _n) {
1587 const size_t sz = _bufferedHits.size();
1588 for(size_t i = 0; i < sz; i++) {
1589 // Set 'oms' according to the number of other alignments
1591 _bufferedHits[i].oms = (sz / mult_) - 1;
// Emit hits in best-first (lowest-cost-first) order.
1594 std::sort(_bufferedHits.begin(), _bufferedHits.end(), HitCostCompare());
1595 if(hs_->size() > _n) {
1596 _bufferedHits.resize(_n);
1598 assert(consistentStrata());
1599 // Make sure that a chained, maxed-out read gets treated as
1600 // maxed-out and not as unaligned
1601 if(hs_->empty() && hs_->maxedStratum != -1) {
1602 assert(_bufferedHits.empty());
1603 // Boy, this is stupid. Need to switch to just using HitSet
1604 // internally all the time.
// Fabricate _max+1 placeholder hits so downstream code sees "maxed".
1605 _bufferedHits.resize(_max+1);
1606 for(size_t i = 0; i < _max+1; i++) {
1607 _bufferedHits[i].stratum = hs_->maxedStratum;
1610 } else if(!hs_->empty() && hs_->maxedStratum != -1) {
1611 assert_lt(hs_->front().stratum, hs_->maxedStratum);
1617 * Set the initial set of Hits.
1619 virtual bool setHits(HitSet& hs) {
1621 assert(hs_ != NULL);
1624 hitsForThisRead_ = hs.size();
1625 assert(_bufferedHits.empty());
1626 assert_geq(hs.maxedStratum, -1);
1627 assert_lt(hs.maxedStratum, 4);
1629 assert_eq(-1, hs.maxedStratum);
1631 Hit::fromHitSet(_bufferedHits, hs);
1632 assert(!_bufferedHits.empty());
1633 assert_leq(_bufferedHits.size(), _max);
1634 assert(consistentStrata());
1635 } else if(hs.maxedStratum != -1) {
1636 if(hs.maxedStratum == 0) {
// Anything at or above the maxed stratum is irrelevant from now on;
// stratum occupies the top two bits of the 16-bit cost.
1641 cutoff_ = (hs.maxedStratum << 14);
1643 assert_eq(hs_->size(), _bufferedHits.size());
1649 * If we had any alignments at all and we're now moving on to a new
1650 * stratum, then we're done.
1652 virtual bool finishedWithStratumImpl(int stratum) {
1658 * If there have been any hits reported so far, classify any
1659 * subsequent alignments with higher strata as irrelevant.
1661 virtual bool irrelevantCost(uint16_t cost) {
// cutoff_ == 0 means "no cutoff yet" here; otherwise anything costlier
// than cutoff_ can be skipped by the search routine.
1662 if(cutoff_ == 0) return false;
1663 return cost > cutoff_;
1670 * Sanity check that, if we're in strata_ mode, all 'stratum's
1671 * should be the same among hits.
1673 bool consistentStrata() {
1674 if(hs_->empty() || !strata_) return true;
1675 int stratum = hs_->front().stratum;
1676 for(size_t i = 1; i < hs_->size(); i++) {
1677 assert_eq(stratum, (*hs_)[i].stratum);
1684 * Update the lowest relevant cost.
1686 void updateCutoff() {
1687 ASSERT_ONLY(uint16_t origCutoff = cutoff_);
1688 assert(hs_->sorted());
1689 assert(hs_->empty() || hs_->back().cost >= hs_->front().cost);
1690 bool atCapacity = (hs_->size() >= _n);
1691 if(atCapacity && (_max == 0xffffffff || _max < _n)) {
1692 cutoff_ = min(hs_->back().cost, cutoff_);
1694 if(strata_ && !hs_->empty()) {
// Round the worst retained cost up to the start of the next stratum
// (stratum = top two bits of cost).
1695 uint16_t sc = hs_->back().cost;
1696 sc = ((sc >> 14) + 1) << 14;
1697 cutoff_ = min(cutoff_, sc);
1698 assert_leq(cutoff_, origCutoff);
1705 bool strata_; /// true -> reporting is stratified
1706 uint16_t cutoff_; /// the smallest irrelevant cost
1710 * Concrete factory for ChainingHitSinkPerThread.
1712 class ChainingHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1714 ChainingHitSinkPerThreadFactory(
1726 * Allocate a new ChainingHitSinkPerThread object on the heap,
1727 * using the parameters given in the constructor.
1729 virtual HitSinkPerThread* create() const {
1730 return new ChainingHitSinkPerThread(sink_, n_, max_, strata_, 1);
// Scaled variant: 0xffffffff is the "unset" sentinel, left unscaled.
1732 virtual HitSinkPerThread* createMult(uint32_t m) const {
1733 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1734 uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m);
1735 return new ChainingHitSinkPerThread(sink_, n, max, strata_, m);
1746 * Report all valid alignments.
1748 class AllHitSinkPerThread : public HitSinkPerThread {
1751 AllHitSinkPerThread(
1754 HitSinkPerThread(sink, max, 0xffffffff) { }
1756 virtual bool spanStrata() {
1757 return true; // we span strata
1760 virtual bool best() {
1761 return true; // we report "best" hits
1765 * Report the hit and return false (keep going); we're finding all
1766 * hits, so the search should continue unless _max is exceeded.
1768 virtual bool reportHit(const Hit& h, int stratum) {
1769 HitSinkPerThread::reportHit(h, stratum);
1771 if(hitsForThisRead_ > _max) {
1772 return true; // done - report nothing
1774 bufferHit(h, stratum);
1775 return false; // reporting all; always keep going
1779 * Finalize; do nothing because we haven't buffered anything
1781 virtual uint32_t finishReadImpl() {
1782 uint32_t ret = hitsForThisRead_;
1783 hitsForThisRead_ = 0;
1788 * Always return false; search routine should not stop.
1790 virtual bool finishedWithStratumImpl(int stratum) { return false; }
1794 * Concrete factory for AllHitSinkPerThread.
1796 class AllHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1798 AllHitSinkPerThreadFactory(
1806 * Allocate a new AllHitSinkPerThread object on the heap,
1807 * using the parameters given in the constructor.
1809 virtual HitSinkPerThread* create() const {
1810 return new AllHitSinkPerThread(sink_, max_);
// Scaled variant: 0xffffffff is the "unset" sentinel, left unscaled.
1812 virtual HitSinkPerThread* createMult(uint32_t m) const {
1813 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1814 return new AllHitSinkPerThread(sink_, max);
1823 * Sink that prints lines like this:
1824 * (pat-id)[-|+]:<hit1-text-id,hit2-text-offset>,<hit2-text-id...
1826 * Activated with --concise
1828 class ConciseHitSink : public HitSink {
1831 * Construct a single-stream ConciseHitSink (default)
1833 ConciseHitSink(OutFileBuf* out,
1836 bool reportOpps = false) :
1837 HitSink(out, PASS_HIT_DUMPS2),
1838 _reportOpps(reportOpps),
1839 offBase_(offBase) { }
1842 * Construct a multi-stream ConciseHitSink with one stream per
1843 * reference string (see --refout)
1845 ConciseHitSink(size_t numOuts,
1848 bool reportOpps = false) :
1849 HitSink(numOuts, PASS_HIT_DUMPS2),
1850 _reportOpps(reportOpps),
1851 offBase_(offBase) { }
1854 * Append a concise representation of the hit to the given output
1855 * stream.
1856 static void append(ostream& ss, const Hit& h, int offBase, bool reportOpps) {
1859 assert(h.mate == 1 || h.mate == 2);
1860 ss << '/' << (int)h.mate;
1862 ss << (h.fw? "+" : "-") << ":";
1863 // .first is text id, .second is offset
1864 ss << "<" << h.h.first << "," << (h.h.second + offBase) << "," << h.mms.count();
1865 if(reportOpps) ss << "," << h.oms;
1870 * Append a concise representation of the hit using this sink's
1871 * configured offBase_/_reportOpps settings.
1872 void append(ostream& ss, const Hit& h) {
1873 ConciseHitSink::append(ss, h, this->offBase_, this->_reportOpps);
1879 * Report a concise alignment to the appropriate output stream.
1881 virtual void reportHit(const Hit& h) {
1882 HitSink::reportHit(h);
1886 out(h.h.first).writeString(ss.str());
1892 int offBase_; /// Add this to reference offsets before outputting.
1893 /// (An easy way to make things 1-based instead of
1898 * Print the given string. If ws = true, print only up to and not
1899 * including the first space or tab. Useful for printing reference
1902 inline void printUptoWs(std::ostream& os, const std::string& str, bool ws) {
1906 size_t pos = str.find_first_of(" \t");
1907 if(pos != string::npos) {
// Truncate at the first whitespace character.
1908 os << str.substr(0, pos);
1916 * Sink that prints lines like this:
1917 * pat-name \t [-|+] \t ref-name \t ref-off \t pat \t qual \t #-alt-hits \t mm-list
1919 class VerboseHitSink : public HitSink {
1922 * Construct a single-stream VerboseHitSink (default)
1924 VerboseHitSink(OutFileBuf* out,
1929 const Bitset& suppressOuts,
1931 AnnotationMap *amap,
1934 int partition = 0) :
1935 HitSink(out, PASS_HIT_DUMPS2),
1936 partition_(partition),
1938 colorSeq_(colorSeq),
1939 colorQual_(colorQual),
1941 suppress_(suppressOuts),
1943 rmap_(rmap), amap_(amap)
1947 * Construct a multi-stream VerboseHitSink with one stream per
1948 * reference string (see --refout)
1950 VerboseHitSink(size_t numOuts,
1955 const Bitset& suppressOuts,
1957 AnnotationMap *amap,
1960 int partition = 0) :
1961 HitSink(numOuts, PASS_HIT_DUMPS2),
1962 partition_(partition),
1964 colorSeq_(colorSeq),
1965 colorQual_(colorQual),
// Static formatter; definition lives in the .cpp file.
1974 static void append(ostream& ss,
1976 const vector<string>* refnames,
1978 AnnotationMap *amap,
1985 const Bitset& suppress);
1988 * Append a verbose, readable hit to the output stream
1989 * corresponding to the hit.
1991 virtual void append(ostream& ss, const Hit& h) {
1992 VerboseHitSink::append(ss, h, _refnames, rmap_, amap_,
1993 fullRef_, partition_, offBase_,
1994 colorSeq_, colorQual_, cost_,
2001 virtual void reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p);
2006 * Report a verbose, human-readable alignment to the appropriate
2009 virtual void reportHit(const Hit& h) {
2014 * Report a verbose, human-readable alignment to the appropriate
// output stream; 'count' controls whether it counts toward totals.
2017 virtual void reportHit(const Hit& h, bool count) {
2018 if(count) HitSink::reportHit(h);
2021 // Make sure to grab lock before writing to output stream
2023 out(h.h.first).writeString(ss.str());
2028 int partition_; /// partition size, or 0 if partitioning is disabled
2029 int offBase_; /// Add this to reference offsets before outputting.
2030 /// (An easy way to make things 1-based instead of
2032 bool colorSeq_; /// true -> print colorspace alignment sequence in colors
2033 bool colorQual_; /// true -> print colorspace quals as originals, not decoded
2034 bool cost_; /// true -> print stratum and cost
2035 Bitset suppress_; /// output fields to suppress
2036 bool fullRef_; /// print full reference name
2037 ReferenceMap *rmap_; /// mapping to reference coordinate system.
2038 AnnotationMap *amap_; ///
2042 * Sink for outputting alignments in a binary format.
2044 class ChainingHitSink : public HitSink {
2048 * Construct a single-stream ChainingHitSink (default)
2050 ChainingHitSink(OutFileBuf* out, bool strata, AnnotationMap *amap, DECL_HIT_DUMPS2) :
2051 HitSink(out, PASS_HIT_DUMPS2), amap_(amap), strata_(strata)
// Binary output mode for the underlying stringstreams.
2053 ssmode_ |= ios_base::binary;
2057 * Report a batch of hits.
2059 virtual void reportHits(vector<Hit>& hs);
2062 * Append a binary alignment to the output stream corresponding to
2063 * the reference sequence involved.
2065 virtual void append(ostream& o, const Hit& h) {
2066 cerr << "Error: ChainingHitSink::append() not implemented" << endl;
2073 virtual void reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p);
2078 virtual void reportUnaligned(PatternSourcePerThread& p);
2081 AnnotationMap *amap_;
2086 * Sink that does nothing.
2088 class StubHitSink : public HitSink {
2090 StubHitSink() : HitSink(new OutFileBuf(".tmp"), "", "", "", false, false, NULL) { }
2091 virtual void append(ostream& o, const Hit& h) { }