3 #include "search_globals.h"
8 /// Sort by text-id then by text-offset
9 bool operator< (const Hit& a, const Hit& b) {
14 * Report a batch of hits to a chaining file.
16 void ChainingHitSink::reportHits(vector<Hit>& hs) {
17 size_t hssz = hs.size();
19 assert_eq(0, hs[0].mate);
20 // Convert vector<Hit> into HitSet
22 // Critical section for output stream 0
24 Hit::toHitSet(hs, s, amap_);
30 // Global critical section
32 commitHits(hs); // Commit to recalibration table
41 * Report a maxed-out read. Typically we do nothing, but we might
42 * want to print a placeholder when output is chained.
44 void ChainingHitSink::reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p) {
45 HitSink::reportMaxed(hs, p);
47 int8_t loStrat = (strata_ ? hs.front().stratum : 0);
50 s.maxedStratum = loStrat;
57 * Report an unaligned read. Typically we do nothing, but we might
58 * want to print a placeholder when output is chained.
60 void ChainingHitSink::reportUnaligned(PatternSourcePerThread& p) {
61 HitSink::reportUnaligned(p);
62 // Read is unaligned; just report a huge starting stratum
71 * Report a maxed-out read.
73 void VerboseHitSink::reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p) {
74 HitSink::reportMaxed(hs, p);
77 rand.init(p.bufa().seed);
78 assert_gt(hs.size(), 0);
79 bool paired = hs.front().mate > 0;
83 int bestStratum = 999;
84 for(size_t i = 0; i < hs.size()-1; i += 2) {
85 int strat = min(hs[i].stratum, hs[i+1].stratum);
86 if(strat < bestStratum) {
89 } else if(strat == bestStratum) {
93 assert_leq(num, hs.size());
94 uint32_t r = rand.nextU32() % num;
96 for(size_t i = 0; i < hs.size()-1; i += 2) {
97 int strat = min(hs[i].stratum, hs[i+1].stratum);
98 if(strat == bestStratum) {
100 hs[i].oms = hs[i+1].oms = hs.size()/2;
101 reportHits(hs, i, i+2);
109 for(size_t i = 1; i < hs.size(); i++) {
110 assert_geq(hs[i].stratum, hs[i-1].stratum);
111 if(hs[i].stratum == hs[i-1].stratum) num++;
114 assert_leq(num, hs.size());
115 uint32_t r = rand.nextU32() % num;
124 * Append a verbose, readable hit to the given output stream.
126 void VerboseHitSink::append(ostream& ss,
128 const vector<string>* refnames,
137 const Bitset& suppress)
141 uint32_t pdiv = 0xffffffff;
142 uint32_t pmod = 0xffffffff;
144 bool dospill = false;
146 // The read spilled over a partition boundary and so
147 // needs to be printed more than once
154 bool firstfield = true;
156 int pospart = abs(partition);
157 if(!suppress.test(field++)) {
158 if(firstfield) firstfield = false;
160 // Output a partitioning key
161 // First component of the key is the reference index
162 if(refnames != NULL && rmap != NULL) {
163 printUptoWs(ss, rmap->getName(h.h.first), !fullRef);
164 } else if(refnames != NULL && h.h.first < refnames->size()) {
165 printUptoWs(ss, (*refnames)[h.h.first], !fullRef);
170 ostringstream ss2, ss3;
171 // Next component of the key is the partition id
173 pdiv = (h.h.second + offBase) / pospart;
174 pmod = (h.h.second + offBase) % pospart;
176 assert_neq(0xffffffff, pdiv);
177 assert_neq(0xffffffff, pmod);
178 if(dospill) assert_gt(spillAmt, 0);
179 ss2 << (pdiv + (dospill ? spillAmt : 0));
181 (pmod + h.length()) >= ((uint32_t)pospart * (spillAmt + 1))) {
182 // Spills into the next partition so we need to
183 // output another alignment for that partition
186 if(!suppress.test(field++)) {
187 if(firstfield) firstfield = false;
189 // Print partition id with leading 0s so that Hadoop
190 // can do lexicographical sort (modern Hadoop versions
191 // seen to support numeric)
192 string s2 = ss2.str();
193 size_t partDigits = 1;
194 if(pospart >= 10) partDigits++;
195 if(pospart >= 100) partDigits++;
196 if(pospart >= 1000) partDigits++;
197 if(pospart >= 10000) partDigits++;
198 if(pospart >= 100000) partDigits++;
199 for(size_t i = s2.length(); i < (10-partDigits); i++) {
204 if(!suppress.test(field++)) {
205 if(firstfield) firstfield = false;
207 // Print offset with leading 0s
208 ss3 << (h.h.second + offBase);
209 string s3 = ss3.str();
210 for(size_t i = s3.length(); i < 9; i++) {
215 if(!suppress.test(field++)) {
216 if(firstfield) firstfield = false;
218 ss << (h.fw? "+":"-");
220 // end if(partition != 0)
223 if(!suppress.test(field++)) {
224 if(firstfield) firstfield = false;
228 if(!suppress.test(field++)) {
229 if(firstfield) firstfield = false;
231 ss << (h.fw? '+' : '-');
233 if(!suppress.test(field++)) {
234 if(firstfield) firstfield = false;
236 // .first is text id, .second is offset
237 if(refnames != NULL && rmap != NULL) {
238 printUptoWs(ss, rmap->getName(h.h.first), !fullRef);
239 } else if(refnames != NULL && h.h.first < refnames->size()) {
240 printUptoWs(ss, (*refnames)[h.h.first], !fullRef);
245 if(!suppress.test(field++)) {
246 if(firstfield) firstfield = false;
248 ss << (h.h.second + offBase);
250 // end else clause of if(partition != 0)
252 if(!suppress.test(field++)) {
253 if(firstfield) firstfield = false;
255 const String<Dna5>* pat = &h.patSeq;
256 if(h.color && colorSeq) pat = &h.colSeq;
259 if(!suppress.test(field++)) {
260 if(firstfield) firstfield = false;
262 const String<char>* qual = &h.quals;
263 if(h.color && colorQual) qual = &h.colQuals;
266 if(!suppress.test(field++)) {
267 if(firstfield) firstfield = false;
271 if(!suppress.test(field++)) {
272 if(firstfield) firstfield = false;
274 // Look for SNP annotations falling within the alignment
275 map<int, char> snpAnnots;
276 const size_t len = length(h.patSeq);
278 AnnotationMap::Iter ai = amap->lower_bound(h.h);
279 for(; ai != amap->end(); ai++) {
280 assert_geq(ai->first.first, h.h.first);
281 if(ai->first.first != h.h.first) {
282 // Different chromosome
285 if(ai->first.second >= h.h.second + len) {
286 // Doesn't fall into alignment
289 if(ai->second.first != 'S') {
290 // Not a SNP annotation
293 size_t off = ai->first.second - h.h.second;
294 if(!h.fw) off = len - off - 1;
295 snpAnnots[off] = ai->second.second;
298 // Output mismatch column
300 for (unsigned int i = 0; i < len; ++ i) {
302 // There's a mismatch at this position
303 if (!firstmm) ss << ",";
305 assert_gt(h.refcs.size(), i);
306 char refChar = toupper(h.refcs[i]);
307 char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1]);
308 assert_neq(refChar, qryChar);
309 ss << ":" << refChar << ">" << qryChar;
311 } else if(snpAnnots.find(i) != snpAnnots.end()) {
312 if (!firstmm) ss << ",";
314 char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1]);
315 ss << "S:" << snpAnnots[i] << ">" << qryChar;
319 if(partition != 0 && firstmm) ss << '-';
322 // Fields addded as of Crossbow 0.1.4
323 if(!suppress.test(field++)) {
324 if(firstfield) firstfield = false;
328 // Print label, or whole read name if label isn't found
329 if(!suppress.test(field++)) {
330 if(firstfield) firstfield = false;
333 // If LB: field is present, print its value
334 for(int i = 0; i < (int)seqan::length(h.patName)-3; i++) {
335 if(h.patName[i] == 'L' &&
336 h.patName[i+1] == 'B' &&
337 h.patName[i+2] == ':' &&
338 ((i == 0) || h.patName[i-1] == ';'))
341 for(int j = labelOff; j < (int)seqan::length(h.patName); j++) {
342 if(h.patName[j] != ';') {
350 // Otherwise, print the whole read name
351 if(labelOff == -1) ss << h.patName;
356 if(!suppress.test(field++)) {
357 if(firstfield) firstfield = false;
359 ss << (int)h.stratum;
362 if(!suppress.test(field++)) {
363 if(firstfield) firstfield = false;
370 if(!suppress.test(field++)) {
371 if(firstfield) firstfield = false;