5 #ifndef ALIGNER_METRICS_H_
6 #define ALIGNER_METRICS_H_
10 #include <seqan/sequence.h>
17 * Borrowed from http://www.johndcook.com/standard_deviation.html,
18 * which in turn is borrowed from Knuth.
22 RunningStat() : m_n(0), m_tot(0.0) { }
32 // See Knuth TAOCP vol 2, 3rd edition, page 232
37 m_newM = m_oldM + (x - m_oldM)/m_n;
38 m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
39 // set up for next iteration
54 return (m_n > 0) ? m_newM : 0.0;
57 double variance() const {
58 return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
61 double stddev() const {
62 return sqrt(variance());
68 double m_oldM, m_newM, m_oldS, m_newS;
72 * Encapsulates a set of metrics that we would like an aligner to keep
73 * track of, so that we can possibly use it to diagnose performance
76 class AlignerMetrics {
84 curIsLowEntropy_(false),
85 curIsHomoPoly_(false),
94 threeOrMoreNReads_(0),
95 lessThanThreeNRreads_(0),
99 backtracksPerHomoRead_(),
100 bwtOpsPerLoEntRead_(),
101 backtracksPerLoEntRead_(),
102 bwtOpsPerHiEntRead_(),
103 backtracksPerHiEntRead_(),
104 bwtOpsPerAlignedRead_(),
105 backtracksPerAlignedRead_(),
106 bwtOpsPerUnalignedRead_(),
107 backtracksPerUnalignedRead_(),
109 backtracksPer0nRead_(),
111 backtracksPer1nRead_(),
113 backtracksPer2nRead_(),
114 bwtOpsPer3orMoreNRead_(),
115 backtracksPer3orMoreNRead_(),
116 timer_(cout, "", false)
119 void printSummary() {
123 cout << "AlignerMetrics:" << endl;
124 cout << " # Reads: " << reads_ << endl;
125 float hopct = (reads_ > 0) ? (((float)homoReads_)/((float)reads_)) : (0.0);
127 cout << " % homo-polymeric: " << (hopct) << endl;
128 float lopct = (reads_ > 0) ? ((float)lowEntReads_/(float)(reads_)) : (0.0);
130 cout << " % low-entropy: " << (lopct) << endl;
131 float unpct = (reads_ > 0) ? ((float)unalignedReads_/(float)(reads_)) : (0.0);
133 cout << " % unaligned: " << (unpct) << endl;
134 float npct = (reads_ > 0) ? ((float)threeOrMoreNReads_/(float)(reads_)) : (0.0);
136 cout << " % with 3 or more Ns: " << (npct) << endl;
138 cout << " Total BWT ops: avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl;
139 cout << " Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl;
140 time_t elapsed = timer_.elapsed();
141 cout << " BWT ops per second: " << (bwtOpsPerRead_.tot()/elapsed) << endl;
142 cout << " Backtracks per second: " << (backtracksPerRead_.tot()/elapsed) << endl;
144 cout << " Homo-poly:" << endl;
145 cout << " BWT ops: avg: " << bwtOpsPerHomoRead_.mean() << ", stddev: " << bwtOpsPerHomoRead_.stddev() << endl;
146 cout << " Backtracks: avg: " << backtracksPerHomoRead_.mean() << ", stddev: " << backtracksPerHomoRead_.stddev() << endl;
147 cout << " Low-entropy:" << endl;
148 cout << " BWT ops: avg: " << bwtOpsPerLoEntRead_.mean() << ", stddev: " << bwtOpsPerLoEntRead_.stddev() << endl;
149 cout << " Backtracks: avg: " << backtracksPerLoEntRead_.mean() << ", stddev: " << backtracksPerLoEntRead_.stddev() << endl;
150 cout << " High-entropy:" << endl;
151 cout << " BWT ops: avg: " << bwtOpsPerHiEntRead_.mean() << ", stddev: " << bwtOpsPerHiEntRead_.stddev() << endl;
152 cout << " Backtracks: avg: " << backtracksPerHiEntRead_.mean() << ", stddev: " << backtracksPerHiEntRead_.stddev() << endl;
154 cout << " Unaligned:" << endl;
155 cout << " BWT ops: avg: " << bwtOpsPerUnalignedRead_.mean() << ", stddev: " << bwtOpsPerUnalignedRead_.stddev() << endl;
156 cout << " Backtracks: avg: " << backtracksPerUnalignedRead_.mean() << ", stddev: " << backtracksPerUnalignedRead_.stddev() << endl;
157 cout << " Aligned:" << endl;
158 cout << " BWT ops: avg: " << bwtOpsPerAlignedRead_.mean() << ", stddev: " << bwtOpsPerAlignedRead_.stddev() << endl;
159 cout << " Backtracks: avg: " << backtracksPerAlignedRead_.mean() << ", stddev: " << backtracksPerAlignedRead_.stddev() << endl;
161 cout << " 0 Ns:" << endl;
162 cout << " BWT ops: avg: " << bwtOpsPer0nRead_.mean() << ", stddev: " << bwtOpsPer0nRead_.stddev() << endl;
163 cout << " Backtracks: avg: " << backtracksPer0nRead_.mean() << ", stddev: " << backtracksPer0nRead_.stddev() << endl;
164 cout << " 1 N:" << endl;
165 cout << " BWT ops: avg: " << bwtOpsPer1nRead_.mean() << ", stddev: " << bwtOpsPer1nRead_.stddev() << endl;
166 cout << " Backtracks: avg: " << backtracksPer1nRead_.mean() << ", stddev: " << backtracksPer1nRead_.stddev() << endl;
167 cout << " 2 Ns:" << endl;
168 cout << " BWT ops: avg: " << bwtOpsPer2nRead_.mean() << ", stddev: " << bwtOpsPer2nRead_.stddev() << endl;
169 cout << " Backtracks: avg: " << backtracksPer2nRead_.mean() << ", stddev: " << backtracksPer2nRead_.stddev() << endl;
170 cout << " >2 Ns:" << endl;
171 cout << " BWT ops: avg: " << bwtOpsPer3orMoreNRead_.mean() << ", stddev: " << bwtOpsPer3orMoreNRead_.stddev() << endl;
172 cout << " Backtracks: avg: " << backtracksPer3orMoreNRead_.mean() << ", stddev: " << backtracksPer3orMoreNRead_.stddev() << endl;
179 void nextRead(const seqan::String<seqan::Dna5>& read) {
184 float ent = entropyDna5(read);
185 curIsLowEntropy_ = (ent < 0.75f);
186 curIsHomoPoly_ = (ent < 0.001f);
187 curHadRanges_ = false;
192 const size_t len = seqan::length(read);
193 for(size_t i = 0; i < len; i++) {
194 if((int)read[i] == 4) curNumNs_++;
201 void setReadHasRange() {
202 curHadRanges_ = true;
206 * Commit the running statistics for this read to
210 if(curIsHomoPoly_) homoReads_++;
211 else if(curIsLowEntropy_) lowEntReads_++;
213 if(curHadRanges_) alignedReads_++;
214 else unalignedReads_++;
215 bwtOpsPerRead_.push((float)curBwtOps_);
216 backtracksPerRead_.push((float)curBacktracks_);
217 // Drill down by entropy
219 bwtOpsPerHomoRead_.push((float)curBwtOps_);
220 backtracksPerHomoRead_.push((float)curBacktracks_);
221 } else if(curIsLowEntropy_) {
222 bwtOpsPerLoEntRead_.push((float)curBwtOps_);
223 backtracksPerLoEntRead_.push((float)curBacktracks_);
225 bwtOpsPerHiEntRead_.push((float)curBwtOps_);
226 backtracksPerHiEntRead_.push((float)curBacktracks_);
228 // Drill down by whether it aligned
230 bwtOpsPerAlignedRead_.push((float)curBwtOps_);
231 backtracksPerAlignedRead_.push((float)curBacktracks_);
233 bwtOpsPerUnalignedRead_.push((float)curBwtOps_);
234 backtracksPerUnalignedRead_.push((float)curBacktracks_);
237 lessThanThreeNRreads_++;
238 bwtOpsPer0nRead_.push((float)curBwtOps_);
239 backtracksPer0nRead_.push((float)curBacktracks_);
240 } else if(curNumNs_ == 1) {
241 lessThanThreeNRreads_++;
242 bwtOpsPer1nRead_.push((float)curBwtOps_);
243 backtracksPer1nRead_.push((float)curBacktracks_);
244 } else if(curNumNs_ == 2) {
245 lessThanThreeNRreads_++;
246 bwtOpsPer2nRead_.push((float)curBwtOps_);
247 backtracksPer2nRead_.push((float)curBacktracks_);
249 threeOrMoreNReads_++;
250 bwtOpsPer3orMoreNRead_.push((float)curBwtOps_);
251 backtracksPer3orMoreNRead_.push((float)curBacktracks_);
255 // Running-total of the number of backtracks and BWT ops for the
257 uint32_t curBacktracks_;
264 // true iff the current read is low entropy
265 bool curIsLowEntropy_;
266 // true if current read is all 1 char (or very close)
268 // true iff the current read has had one or more ranges reported
270 // number of Ns in current read
277 // # low-entropy reads
278 uint32_t lowEntReads_;
279 // # high-entropy reads
280 uint32_t hiEntReads_;
281 // # reads with alignments
282 uint32_t alignedReads_;
283 // # reads without alignments
284 uint32_t unalignedReads_;
285 // # reads with 3 or more Ns
286 uint32_t threeOrMoreNReads_;
287 // # reads with < 3 Ns
288 uint32_t lessThanThreeNRreads_;
290 // Distribution of BWT operations per read
291 RunningStat bwtOpsPerRead_;
292 RunningStat backtracksPerRead_;
294 // Distribution of BWT operations per homo-poly read
295 RunningStat bwtOpsPerHomoRead_;
296 RunningStat backtracksPerHomoRead_;
298 // Distribution of BWT operations per low-entropy read
299 RunningStat bwtOpsPerLoEntRead_;
300 RunningStat backtracksPerLoEntRead_;
302 // Distribution of BWT operations per high-entropy read
303 RunningStat bwtOpsPerHiEntRead_;
304 RunningStat backtracksPerHiEntRead_;
306 // Distribution of BWT operations per read that "aligned" (for
307 // which a range was arrived at - range may not have necessarily
308 // lead to an alignment)
309 RunningStat bwtOpsPerAlignedRead_;
310 RunningStat backtracksPerAlignedRead_;
312 // Distribution of BWT operations per read that didn't align
313 RunningStat bwtOpsPerUnalignedRead_;
314 RunningStat backtracksPerUnalignedRead_;
316 // Distribution of BWT operations/backtracks per read with no Ns
317 RunningStat bwtOpsPer0nRead_;
318 RunningStat backtracksPer0nRead_;
320 // Distribution of BWT operations/backtracks per read with one N
321 RunningStat bwtOpsPer1nRead_;
322 RunningStat backtracksPer1nRead_;
324 // Distribution of BWT operations/backtracks per read with two Ns
325 RunningStat bwtOpsPer2nRead_;
326 RunningStat backtracksPer2nRead_;
328 // Distribution of BWT operations/backtracks per read with three or
330 RunningStat bwtOpsPer3orMoreNRead_;
331 RunningStat backtracksPer3orMoreNRead_;
336 #endif /* ALIGNER_METRICS_H_ */