Commit patch to not break on spaces.
[bowtie.git] / hit.cpp
1 #include "hit.h"
2 #include "hit_set.h"
3 #include "search_globals.h"
4
5 using namespace std;
6 using namespace seqan;
7
8 /// Sort by text-id then by text-offset
9 bool operator< (const Hit& a, const Hit& b) {
10     return a.h < b.h;
11 }
12
13 /**
14  * Report a batch of hits to a chaining file.
15  */
16 void ChainingHitSink::reportHits(vector<Hit>& hs) {
17         size_t hssz = hs.size();
18         assert_gt(hssz, 0);
19         assert_eq(0, hs[0].mate);
20         // Convert vector<Hit> into HitSet
21         {
22                 // Critical section for output stream 0
23                 HitSet s;
24                 Hit::toHitSet(hs, s, amap_);
25                 lock(0);
26                 s.serialize(out(0));
27                 unlock(0);
28         }
29         {
30                 // Global critical section
31                 mainlock();
32                 commitHits(hs); // Commit to recalibration table
33                 first_ = false;
34                 numReported_ += hssz;
35                 numAligned_++;
36                 mainunlock();
37         }
38 }
39
40 /**
41  * Report a maxed-out read.  Typically we do nothing, but we might
42  * want to print a placeholder when output is chained.
43  */
44 void ChainingHitSink::reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p) {
45         HitSink::reportMaxed(hs, p);
46         assert(!hs.empty());
47         int8_t loStrat = (strata_ ? hs.front().stratum : 0);
48         HitSet s;
49         p.bufa().toHitSet(s);
50         s.maxedStratum = loStrat;
51         lock(0);
52         s.serialize(out(0));
53         unlock(0);
54 }
55
56 /**
57  * Report an unaligned read.  Typically we do nothing, but we might
58  * want to print a placeholder when output is chained.
59  */
60 void ChainingHitSink::reportUnaligned(PatternSourcePerThread& p) {
61         HitSink::reportUnaligned(p);
62         // Read is unaligned; just report a huge starting stratum
63         HitSet s;
64         p.bufa().toHitSet(s);
65         lock(0);
66         s.serialize(out(0));
67         unlock(0);
68 }
69
70 /**
71  * Report a maxed-out read.
72  */
73 void VerboseHitSink::reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p) {
74         HitSink::reportMaxed(hs, p);
75         if(sampleMax_) {
76                 RandomSource rand;
77                 rand.init(p.bufa().seed);
78                 assert_gt(hs.size(), 0);
79                 bool paired = hs.front().mate > 0;
80                 size_t num = 1;
81                 if(paired) {
82                         num = 0;
83                         int bestStratum = 999;
84                         for(size_t i = 0; i < hs.size()-1; i += 2) {
85                                 int strat = min(hs[i].stratum, hs[i+1].stratum);
86                                 if(strat < bestStratum) {
87                                         bestStratum = strat;
88                                         num = 1;
89                                 } else if(strat == bestStratum) {
90                                         num++;
91                                 }
92                         }
93                         assert_leq(num, hs.size());
94                         uint32_t r = rand.nextU32() % num;
95                         num = 0;
96                         for(size_t i = 0; i < hs.size()-1; i += 2) {
97                                 int strat = min(hs[i].stratum, hs[i+1].stratum);
98                                 if(strat == bestStratum) {
99                                         if(num == r) {
100                                                 hs[i].oms = hs[i+1].oms = hs.size()/2;
101                                                 reportHits(hs, i, i+2);
102                                                 break;
103                                         }
104                                         num++;
105                                 }
106                         }
107                         assert_eq(num, r);
108                 } else {
109                         for(size_t i = 1; i < hs.size(); i++) {
110                                 assert_geq(hs[i].stratum, hs[i-1].stratum);
111                                 if(hs[i].stratum == hs[i-1].stratum) num++;
112                                 else break;
113                         }
114                         assert_leq(num, hs.size());
115                         uint32_t r = rand.nextU32() % num;
116                         Hit& h = hs[r];
117                         h.oms = hs.size();
118                         reportHit(h, false);
119                 }
120         }
121 }
122
123 /**
124  * Append a verbose, readable hit to the given output stream.
125  */
126 void VerboseHitSink::append(ostream& ss,
127                    const Hit& h,
128                    const vector<string>* refnames,
129                    ReferenceMap *rmap,
130                    AnnotationMap *amap,
131                    bool fullRef,
132                    int partition,
133                    int offBase,
134                    bool colorSeq,
135                    bool colorQual,
136                    bool cost,
137                    const Bitset& suppress)
138 {
139         bool spill = false;
140         int spillAmt = 0;
141         uint32_t pdiv = 0xffffffff;
142         uint32_t pmod = 0xffffffff;
143         do {
144                 bool dospill = false;
145                 if(spill) {
146                         // The read spilled over a partition boundary and so
147                         // needs to be printed more than once
148                         spill = false;
149                         dospill = true;
150                         spillAmt++;
151                 }
152                 assert(!spill);
153                 size_t field = 0;
154                 bool firstfield = true;
155                 if(partition != 0) {
156                         int pospart = abs(partition);
157                         if(!suppress.test(field++)) {
158                                 if(firstfield) firstfield = false;
159                                 else ss << '\t';
160                                 // Output a partitioning key
161                                 // First component of the key is the reference index
162                                 if(refnames != NULL && rmap != NULL) {
163                                         printUptoWs(ss, rmap->getName(h.h.first), !fullRef);
164                                 } else if(refnames != NULL && h.h.first < refnames->size()) {
165                                         printUptoWs(ss, (*refnames)[h.h.first], !fullRef);
166                                 } else {
167                                         ss << h.h.first;
168                                 }
169                         }
170                         ostringstream ss2, ss3;
171                         // Next component of the key is the partition id
172                         if(!dospill) {
173                                 pdiv = (h.h.second + offBase) / pospart;
174                                 pmod = (h.h.second + offBase) % pospart;
175                         }
176                         assert_neq(0xffffffff, pdiv);
177                         assert_neq(0xffffffff, pmod);
178                         if(dospill) assert_gt(spillAmt, 0);
179                         ss2 << (pdiv + (dospill ? spillAmt : 0));
180                         if(partition > 0 &&
181                            (pmod + h.length()) >= ((uint32_t)pospart * (spillAmt + 1))) {
182                                 // Spills into the next partition so we need to
183                                 // output another alignment for that partition
184                                 spill = true;
185                         }
186                         if(!suppress.test(field++)) {
187                                 if(firstfield) firstfield = false;
188                                 else ss << '\t';
189                                 // Print partition id with leading 0s so that Hadoop
190                                 // can do lexicographical sort (modern Hadoop versions
191                                 // seen to support numeric)
192                                 string s2 = ss2.str();
193                                 size_t partDigits = 1;
194                                 if(pospart >= 10) partDigits++;
195                                 if(pospart >= 100) partDigits++;
196                                 if(pospart >= 1000) partDigits++;
197                                 if(pospart >= 10000) partDigits++;
198                                 if(pospart >= 100000) partDigits++;
199                                 for(size_t i = s2.length(); i < (10-partDigits); i++) {
200                                         ss << "0";
201                                 }
202                                 ss << s2.c_str();
203                         }
204                         if(!suppress.test(field++)) {
205                                 if(firstfield) firstfield = false;
206                                 else ss << '\t';
207                                 // Print offset with leading 0s
208                                 ss3 << (h.h.second + offBase);
209                                 string s3 = ss3.str();
210                                 for(size_t i = s3.length(); i < 9; i++) {
211                                         ss << "0";
212                                 }
213                                 ss << s3;
214                         }
215                         if(!suppress.test(field++)) {
216                                 if(firstfield) firstfield = false;
217                                 else ss << '\t';
218                                 ss << (h.fw? "+":"-");
219                         }
220                         // end if(partition != 0)
221                 } else {
222                         assert(!dospill);
223                         if(!suppress.test(field++)) {
224                                 if(firstfield) firstfield = false;
225                                 else ss << '\t';
226                                 ss << h.patName;
227                         }
228                         if(!suppress.test(field++)) {
229                                 if(firstfield) firstfield = false;
230                                 else ss << '\t';
231                                 ss << (h.fw? '+' : '-');
232                         }
233                         if(!suppress.test(field++)) {
234                                 if(firstfield) firstfield = false;
235                                 else ss << '\t';
236                                 // .first is text id, .second is offset
237                                 if(refnames != NULL && rmap != NULL) {
238                                         printUptoWs(ss, rmap->getName(h.h.first), !fullRef);
239                                 } else if(refnames != NULL && h.h.first < refnames->size()) {
240                                         printUptoWs(ss, (*refnames)[h.h.first], !fullRef);
241                                 } else {
242                                         ss << h.h.first;
243                                 }
244                         }
245                         if(!suppress.test(field++)) {
246                                 if(firstfield) firstfield = false;
247                                 else ss << '\t';
248                                 ss << (h.h.second + offBase);
249                         }
250                         // end else clause of if(partition != 0)
251                 }
252                 if(!suppress.test(field++)) {
253                         if(firstfield) firstfield = false;
254                         else ss << '\t';
255                         const String<Dna5>* pat = &h.patSeq;
256                         if(h.color && colorSeq) pat = &h.colSeq;
257                         ss << *pat;
258                 }
259                 if(!suppress.test(field++)) {
260                         if(firstfield) firstfield = false;
261                         else ss << '\t';
262                         const String<char>* qual = &h.quals;
263                         if(h.color && colorQual) qual = &h.colQuals;
264                         ss << *qual;
265                 }
266                 if(!suppress.test(field++)) {
267                         if(firstfield) firstfield = false;
268                         else ss << '\t';
269                         ss << h.oms;
270                 }
271                 if(!suppress.test(field++)) {
272                         if(firstfield) firstfield = false;
273                         else ss << '\t';
274                         // Look for SNP annotations falling within the alignment
275                         map<int, char> snpAnnots;
276                         const size_t len = length(h.patSeq);
277                         if(amap != NULL) {
278                                 AnnotationMap::Iter ai = amap->lower_bound(h.h);
279                                 for(; ai != amap->end(); ai++) {
280                                         assert_geq(ai->first.first, h.h.first);
281                                         if(ai->first.first != h.h.first) {
282                                                 // Different chromosome
283                                                 break;
284                                         }
285                                         if(ai->first.second >= h.h.second + len) {
286                                                 // Doesn't fall into alignment
287                                                 break;
288                                         }
289                                         if(ai->second.first != 'S') {
290                                                 // Not a SNP annotation
291                                                 continue;
292                                         }
293                                         size_t off = ai->first.second - h.h.second;
294                                         if(!h.fw) off = len - off - 1;
295                                         snpAnnots[off] = ai->second.second;
296                                 }
297                         }
298                         // Output mismatch column
299                         bool firstmm = true;
300                         for (unsigned int i = 0; i < len; ++ i) {
301                                 if(h.mms.test(i)) {
302                                         // There's a mismatch at this position
303                                         if (!firstmm) ss << ",";
304                                         ss << i; // position
305                                         assert_gt(h.refcs.size(), i);
306                                         char refChar = toupper(h.refcs[i]);
307                                         char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1]);
308                                         assert_neq(refChar, qryChar);
309                                         ss << ":" << refChar << ">" << qryChar;
310                                         firstmm = false;
311                                 } else if(snpAnnots.find(i) != snpAnnots.end()) {
312                                         if (!firstmm) ss << ",";
313                                         ss << i; // position
314                                         char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1]);
315                                         ss << "S:" << snpAnnots[i] << ">" << qryChar;
316                                         firstmm = false;
317                                 }
318                         }
319                         if(partition != 0 && firstmm) ss << '-';
320                 }
321                 if(partition != 0) {
322                         // Fields addded as of Crossbow 0.1.4
323                         if(!suppress.test(field++)) {
324                                 if(firstfield) firstfield = false;
325                                 else ss << '\t';
326                                 ss << (int)h.mate;
327                         }
328                         // Print label, or whole read name if label isn't found
329                         if(!suppress.test(field++)) {
330                                 if(firstfield) firstfield = false;
331                                 else ss << '\t';
332                                 int labelOff = -1;
333                                 // If LB: field is present, print its value
334                                 for(int i = 0; i < (int)seqan::length(h.patName)-3; i++) {
335                                         if(h.patName[i]   == 'L' &&
336                                            h.patName[i+1] == 'B' &&
337                                            h.patName[i+2] == ':' &&
338                                            ((i == 0) || h.patName[i-1] == ';'))
339                                         {
340                                                 labelOff = i+3;
341                                                 for(int j = labelOff; j < (int)seqan::length(h.patName); j++) {
342                                                         if(h.patName[j] != ';') {
343                                                                 ss << h.patName[j];
344                                                         } else {
345                                                                 break;
346                                                         }
347                                                 }
348                                         }
349                                 }
350                                 // Otherwise, print the whole read name
351                                 if(labelOff == -1) ss << h.patName;
352                         }
353                 }
354                 if(cost) {
355                         // Stratum
356                         if(!suppress.test(field++)) {
357                                 if(firstfield) firstfield = false;
358                                 else ss << '\t';
359                                 ss << (int)h.stratum;
360                         }
361                         // Cost
362                         if(!suppress.test(field++)) {
363                                 if(firstfield) firstfield = false;
364                                 else ss << '\t';
365                                 ss << (int)h.cost;
366                         }
367                 }
368                 if(showSeed) {
369                         // Seed
370                         if(!suppress.test(field++)) {
371                                 if(firstfield) firstfield = false;
372                                 else ss << '\t';
373                                 ss << h.seed;
374                         }
375                 }
376                 ss << endl;
377         } while(spill);
378 }