Commit patch to not break on spaces.
[bowtie.git] / aligner_metrics.h
1 /*
2  * aligner_metrics.h
3  */
4
5 #ifndef ALIGNER_METRICS_H_
6 #define ALIGNER_METRICS_H_
7
8 #include <math.h>
9 #include <iostream>
10 #include <seqan/sequence.h>
11 #include "alphabet.h"
12 #include "timer.h"
13
14 using namespace std;
15
16 /**
17  * Borrowed from http://www.johndcook.com/standard_deviation.html,
18  * which in turn is borrowed from Knuth.
19  */
20 class RunningStat {
21 public:
22         RunningStat() : m_n(0), m_tot(0.0) { }
23
24         void clear() {
25                 m_n = 0;
26                 m_tot = 0.0;
27         }
28
29         void push(float x) {
30                 m_n++;
31                 m_tot += x;
32                 // See Knuth TAOCP vol 2, 3rd edition, page 232
33                 if (m_n == 1) {
34                         m_oldM = m_newM = x;
35                         m_oldS = 0.0;
36                 } else {
37                         m_newM = m_oldM + (x - m_oldM)/m_n;
38                         m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
39                         // set up for next iteration
40                         m_oldM = m_newM;
41                         m_oldS = m_newS;
42                 }
43         }
44
45         int num() const {
46                 return m_n;
47         }
48
49         double tot() const {
50                 return m_tot;
51         }
52
53         double mean() const {
54                 return (m_n > 0) ? m_newM : 0.0;
55         }
56
57         double variance() const {
58                 return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
59         }
60
61         double stddev() const {
62                 return sqrt(variance());
63         }
64
65 private:
66         int m_n;
67         double m_tot;
68         double m_oldM, m_newM, m_oldS, m_newS;
69 };
70
71 /**
72  * Encapsulates a set of metrics that we would like an aligner to keep
73  * track of, so that we can possibly use it to diagnose performance
74  * issues.
75  */
76 class AlignerMetrics {
77
78 public:
79
80         AlignerMetrics() :
81                 curBacktracks_(0),
82                 curBwtOps_(0),
83                 first_(true),
84                 curIsLowEntropy_(false),
85                 curIsHomoPoly_(false),
86                 curHadRanges_(false),
87                 curNumNs_(0),
88                 reads_(0),
89                 homoReads_(0),
90                 lowEntReads_(0),
91                 hiEntReads_(0),
92                 alignedReads_(0),
93                 unalignedReads_(0),
94                 threeOrMoreNReads_(0),
95                 lessThanThreeNRreads_(0),
96                 bwtOpsPerRead_(),
97                 backtracksPerRead_(),
98                 bwtOpsPerHomoRead_(),
99                 backtracksPerHomoRead_(),
100                 bwtOpsPerLoEntRead_(),
101                 backtracksPerLoEntRead_(),
102                 bwtOpsPerHiEntRead_(),
103                 backtracksPerHiEntRead_(),
104                 bwtOpsPerAlignedRead_(),
105                 backtracksPerAlignedRead_(),
106                 bwtOpsPerUnalignedRead_(),
107                 backtracksPerUnalignedRead_(),
108                 bwtOpsPer0nRead_(),
109                 backtracksPer0nRead_(),
110                 bwtOpsPer1nRead_(),
111                 backtracksPer1nRead_(),
112                 bwtOpsPer2nRead_(),
113                 backtracksPer2nRead_(),
114                 bwtOpsPer3orMoreNRead_(),
115                 backtracksPer3orMoreNRead_(),
116                 timer_(cout, "", false)
117                 { }
118
119         void printSummary() {
120                 if(!first_) {
121                         finishRead();
122                 }
123                 cout << "AlignerMetrics:" << endl;
124                 cout << "  # Reads:             " << reads_ << endl;
125                 float hopct = (reads_ > 0) ? (((float)homoReads_)/((float)reads_)) : (0.0);
126                 hopct *= 100.0;
127                 cout << "  % homo-polymeric:    " << (hopct) << endl;
128                 float lopct = (reads_ > 0) ? ((float)lowEntReads_/(float)(reads_)) : (0.0);
129                 lopct *= 100.0;
130                 cout << "  % low-entropy:       " << (lopct) << endl;
131                 float unpct = (reads_ > 0) ? ((float)unalignedReads_/(float)(reads_)) : (0.0);
132                 unpct *= 100.0;
133                 cout << "  % unaligned:         " << (unpct) << endl;
134                 float npct = (reads_ > 0) ? ((float)threeOrMoreNReads_/(float)(reads_)) : (0.0);
135                 npct *= 100.0;
136                 cout << "  % with 3 or more Ns: " << (npct) << endl;
137                 cout << endl;
138                 cout << "  Total BWT ops:    avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl;
139                 cout << "  Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl;
140                 time_t elapsed = timer_.elapsed();
141                 cout << "  BWT ops per second:    " << (bwtOpsPerRead_.tot()/elapsed) << endl;
142                 cout << "  Backtracks per second: " << (backtracksPerRead_.tot()/elapsed) << endl;
143                 cout << endl;
144                 cout << "  Homo-poly:" << endl;
145                 cout << "    BWT ops:    avg: " << bwtOpsPerHomoRead_.mean() << ", stddev: " << bwtOpsPerHomoRead_.stddev() << endl;
146                 cout << "    Backtracks: avg: " << backtracksPerHomoRead_.mean() << ", stddev: " << backtracksPerHomoRead_.stddev() << endl;
147                 cout << "  Low-entropy:" << endl;
148                 cout << "    BWT ops:    avg: " << bwtOpsPerLoEntRead_.mean() << ", stddev: " << bwtOpsPerLoEntRead_.stddev() << endl;
149                 cout << "    Backtracks: avg: " << backtracksPerLoEntRead_.mean() << ", stddev: " << backtracksPerLoEntRead_.stddev() << endl;
150                 cout << "  High-entropy:" << endl;
151                 cout << "    BWT ops:    avg: " << bwtOpsPerHiEntRead_.mean() << ", stddev: " << bwtOpsPerHiEntRead_.stddev() << endl;
152                 cout << "    Backtracks: avg: " << backtracksPerHiEntRead_.mean() << ", stddev: " << backtracksPerHiEntRead_.stddev() << endl;
153                 cout << endl;
154                 cout << "  Unaligned:" << endl;
155                 cout << "    BWT ops:    avg: " << bwtOpsPerUnalignedRead_.mean() << ", stddev: " << bwtOpsPerUnalignedRead_.stddev() << endl;
156                 cout << "    Backtracks: avg: " << backtracksPerUnalignedRead_.mean() << ", stddev: " << backtracksPerUnalignedRead_.stddev() << endl;
157                 cout << "  Aligned:" << endl;
158                 cout << "    BWT ops:    avg: " << bwtOpsPerAlignedRead_.mean() << ", stddev: " << bwtOpsPerAlignedRead_.stddev() << endl;
159                 cout << "    Backtracks: avg: " << backtracksPerAlignedRead_.mean() << ", stddev: " << backtracksPerAlignedRead_.stddev() << endl;
160                 cout << endl;
161                 cout << "  0 Ns:" << endl;
162                 cout << "    BWT ops:    avg: " << bwtOpsPer0nRead_.mean() << ", stddev: " << bwtOpsPer0nRead_.stddev() << endl;
163                 cout << "    Backtracks: avg: " << backtracksPer0nRead_.mean() << ", stddev: " << backtracksPer0nRead_.stddev() << endl;
164                 cout << "  1 N:" << endl;
165                 cout << "    BWT ops:    avg: " << bwtOpsPer1nRead_.mean() << ", stddev: " << bwtOpsPer1nRead_.stddev() << endl;
166                 cout << "    Backtracks: avg: " << backtracksPer1nRead_.mean() << ", stddev: " << backtracksPer1nRead_.stddev() << endl;
167                 cout << "  2 Ns:" << endl;
168                 cout << "    BWT ops:    avg: " << bwtOpsPer2nRead_.mean() << ", stddev: " << bwtOpsPer2nRead_.stddev() << endl;
169                 cout << "    Backtracks: avg: " << backtracksPer2nRead_.mean() << ", stddev: " << backtracksPer2nRead_.stddev() << endl;
170                 cout << "  >2 Ns:" << endl;
171                 cout << "    BWT ops:    avg: " << bwtOpsPer3orMoreNRead_.mean() << ", stddev: " << bwtOpsPer3orMoreNRead_.stddev() << endl;
172                 cout << "    Backtracks: avg: " << backtracksPer3orMoreNRead_.mean() << ", stddev: " << backtracksPer3orMoreNRead_.stddev() << endl;
173                 cout << endl;
174         }
175
176         /**
177          *
178          */
179         void nextRead(const seqan::String<seqan::Dna5>& read) {
180                 if(!first_) {
181                         finishRead();
182                 }
183                 first_ = false;
184                 float ent = entropyDna5(read);
185                 curIsLowEntropy_ = (ent < 0.75f);
186                 curIsHomoPoly_ = (ent < 0.001f);
187                 curHadRanges_ = false;
188                 curBwtOps_ = 0;
189                 curBacktracks_ = 0;
190                 // Count Ns
191                 curNumNs_ = 0;
192                 const size_t len = seqan::length(read);
193                 for(size_t i = 0; i < len; i++) {
194                         if((int)read[i] == 4) curNumNs_++;
195                 }
196         }
197
198         /**
199          *
200          */
201         void setReadHasRange() {
202                 curHadRanges_ = true;
203         }
204
205         /**
206          * Commit the running statistics for this read to
207          */
208         void finishRead() {
209                 reads_++;
210                 if(curIsHomoPoly_) homoReads_++;
211                 else if(curIsLowEntropy_) lowEntReads_++;
212                 else hiEntReads_++;
213                 if(curHadRanges_) alignedReads_++;
214                 else unalignedReads_++;
215                 bwtOpsPerRead_.push((float)curBwtOps_);
216                 backtracksPerRead_.push((float)curBacktracks_);
217                 // Drill down by entropy
218                 if(curIsHomoPoly_) {
219                         bwtOpsPerHomoRead_.push((float)curBwtOps_);
220                         backtracksPerHomoRead_.push((float)curBacktracks_);
221                 } else if(curIsLowEntropy_) {
222                         bwtOpsPerLoEntRead_.push((float)curBwtOps_);
223                         backtracksPerLoEntRead_.push((float)curBacktracks_);
224                 } else {
225                         bwtOpsPerHiEntRead_.push((float)curBwtOps_);
226                         backtracksPerHiEntRead_.push((float)curBacktracks_);
227                 }
228                 // Drill down by whether it aligned
229                 if(curHadRanges_) {
230                         bwtOpsPerAlignedRead_.push((float)curBwtOps_);
231                         backtracksPerAlignedRead_.push((float)curBacktracks_);
232                 } else {
233                         bwtOpsPerUnalignedRead_.push((float)curBwtOps_);
234                         backtracksPerUnalignedRead_.push((float)curBacktracks_);
235                 }
236                 if(curNumNs_ == 0) {
237                         lessThanThreeNRreads_++;
238                         bwtOpsPer0nRead_.push((float)curBwtOps_);
239                         backtracksPer0nRead_.push((float)curBacktracks_);
240                 } else if(curNumNs_ == 1) {
241                         lessThanThreeNRreads_++;
242                         bwtOpsPer1nRead_.push((float)curBwtOps_);
243                         backtracksPer1nRead_.push((float)curBacktracks_);
244                 } else if(curNumNs_ == 2) {
245                         lessThanThreeNRreads_++;
246                         bwtOpsPer2nRead_.push((float)curBwtOps_);
247                         backtracksPer2nRead_.push((float)curBacktracks_);
248                 } else {
249                         threeOrMoreNReads_++;
250                         bwtOpsPer3orMoreNRead_.push((float)curBwtOps_);
251                         backtracksPer3orMoreNRead_.push((float)curBacktracks_);
252                 }
253         }
254
255         // Running-total of the number of backtracks and BWT ops for the
256         // current read
257         uint32_t curBacktracks_;
258         uint32_t curBwtOps_;
259
260 protected:
261
262         bool first_;
263
264         // true iff the current read is low entropy
265         bool curIsLowEntropy_;
266         // true if current read is all 1 char (or very close)
267         bool curIsHomoPoly_;
268         // true iff the current read has had one or more ranges reported
269         bool curHadRanges_;
270         // number of Ns in current read
271         int curNumNs_;
272
273         // # reads
274         uint32_t reads_;
275         // # homo-poly reads
276         uint32_t homoReads_;
277         // # low-entropy reads
278         uint32_t lowEntReads_;
279         // # high-entropy reads
280         uint32_t hiEntReads_;
281         // # reads with alignments
282         uint32_t alignedReads_;
283         // # reads without alignments
284         uint32_t unalignedReads_;
285         // # reads with 3 or more Ns
286         uint32_t threeOrMoreNReads_;
287         // # reads with < 3 Ns
288         uint32_t lessThanThreeNRreads_;
289
290         // Distribution of BWT operations per read
291         RunningStat bwtOpsPerRead_;
292         RunningStat backtracksPerRead_;
293
294         // Distribution of BWT operations per homo-poly read
295         RunningStat bwtOpsPerHomoRead_;
296         RunningStat backtracksPerHomoRead_;
297
298         // Distribution of BWT operations per low-entropy read
299         RunningStat bwtOpsPerLoEntRead_;
300         RunningStat backtracksPerLoEntRead_;
301
302         // Distribution of BWT operations per high-entropy read
303         RunningStat bwtOpsPerHiEntRead_;
304         RunningStat backtracksPerHiEntRead_;
305
306         // Distribution of BWT operations per read that "aligned" (for
307         // which a range was arrived at - range may not have necessarily
308         // lead to an alignment)
309         RunningStat bwtOpsPerAlignedRead_;
310         RunningStat backtracksPerAlignedRead_;
311
312         // Distribution of BWT operations per read that didn't align
313         RunningStat bwtOpsPerUnalignedRead_;
314         RunningStat backtracksPerUnalignedRead_;
315
316         // Distribution of BWT operations/backtracks per read with no Ns
317         RunningStat bwtOpsPer0nRead_;
318         RunningStat backtracksPer0nRead_;
319
320         // Distribution of BWT operations/backtracks per read with one N
321         RunningStat bwtOpsPer1nRead_;
322         RunningStat backtracksPer1nRead_;
323
324         // Distribution of BWT operations/backtracks per read with two Ns
325         RunningStat bwtOpsPer2nRead_;
326         RunningStat backtracksPer2nRead_;
327
328         // Distribution of BWT operations/backtracks per read with three or
329         // more Ns
330         RunningStat bwtOpsPer3orMoreNRead_;
331         RunningStat backtracksPer3orMoreNRead_;
332
333         Timer timer_;
334 };
335
336 #endif /* ALIGNER_METRICS_H_ */