Imported Upstream version 0.12.7
[bowtie.git] / sam.cpp
1 /*
2  * sam.cpp
3  *
4  *  Created on: Sep 23, 2009
5  *      Author: Ben Langmead
6  */
7
8 #include <vector>
9 #include <string>
10 #include <iostream>
11 #include "pat.h"
12 #include "hit.h"
13 #include "sam.h"
14
15 using namespace std;
16
17 /**
18  * Write the SAM header lines.
19  */
20 void SAMHitSink::appendHeaders(OutFileBuf& os,
21                                size_t numRefs,
22                                const vector<string>& refnames,
23                                bool color,
24                                bool nosq,
25                                ReferenceMap *rmap,
26                                const uint32_t* plen,
27                                bool fullRef,
28                                const char *cmdline,
29                                const char *rgline)
30 {
31         ostringstream ss;
32         ss << "@HD\tVN:1.0\tSO:unsorted" << endl;
33         if(!nosq) {
34                 for(size_t i = 0; i < numRefs; i++) {
35                         // RNAME
36                         ss << "@SQ\tSN:";
37                         if(!refnames.empty() && rmap != NULL) {
38                                 printUptoWs(ss, rmap->getName(i), !fullRef);
39                         } else if(i < refnames.size()) {
40                                 printUptoWs(ss, refnames[i], !fullRef);
41                         } else {
42                                 ss << i;
43                         }
44                         ss << "\tLN:" << (plen[i] + (color ? 1 : 0)) << endl;
45                 }
46         }
47         if(rgline != NULL) {
48                 ss << "@RG\t" << rgline << endl;
49         }
50         ss << "@PG\tID:Bowtie\tVN:" << BOWTIE_VERSION << "\tCL:\"" << cmdline << "\"" << endl;
51         os.writeString(ss.str());
52 }
53
54 /**
55  * Append a SAM output record for an unaligned read.
56  */
57 void SAMHitSink::appendAligned(ostream& ss,
58                                const Hit& h,
59                                int mapq,
60                                int xms, // value for XM:I field
61                                const vector<string>* refnames,
62                                ReferenceMap *rmap,
63                                AnnotationMap *amap,
64                                bool fullRef,
65                                int offBase)
66 {
67         // QNAME
68         if(h.mate > 0) {
69                 // truncate final 2 chars
70                 for(int i = 0; i < (int)seqan::length(h.patName)-2; i++) {
71                         if(isspace(h.patName[i])) break;
72                         ss << h.patName[i];
73                 }
74         } else {
75                 for(int i = 0; i < (int)seqan::length(h.patName); i++) {
76                         if(isspace(h.patName[i])) break;
77                         ss << h.patName[i];
78                 }
79         }
80         ss << '\t';
81         // FLAG
82         int flags = 0;
83         if(h.mate == 1) {
84                 flags |= SAM_FLAG_PAIRED | SAM_FLAG_FIRST_IN_PAIR | SAM_FLAG_MAPPED_PAIRED;
85         } else if(h.mate == 2) {
86                 flags |= SAM_FLAG_PAIRED | SAM_FLAG_SECOND_IN_PAIR | SAM_FLAG_MAPPED_PAIRED;
87         }
88         if(!h.fw) flags |= SAM_FLAG_QUERY_STRAND;
89         if(h.mate > 0 && !h.mfw) flags |= SAM_FLAG_MATE_STRAND;
90         ss << flags << "\t";
91         // RNAME
92         if(refnames != NULL && rmap != NULL) {
93                 printUptoWs(ss, rmap->getName(h.h.first), !fullRef);
94         } else if(refnames != NULL && h.h.first < refnames->size()) {
95                 printUptoWs(ss, (*refnames)[h.h.first], !fullRef);
96         } else {
97                 ss << h.h.first;
98         }
99         // POS
100         ss << '\t' << (h.h.second + 1);
101         // MAPQ
102         ss << "\t" << mapq;
103         // CIGAR
104         ss << '\t' << h.length() << 'M';
105         // MRNM
106         if(h.mate > 0) {
107                 ss << "\t=";
108         } else {
109                 ss << "\t*";
110         }
111         // MPOS
112         if(h.mate > 0) {
113                 ss << '\t' << (h.mh.second + 1);
114         } else {
115                 ss << "\t0";
116         }
117         // ISIZE
118         ss << '\t';
119         if(h.mate > 0) {
120                 assert_eq(h.h.first, h.mh.first);
121                 int64_t inslen = 0;
122                 if(h.h.second > h.mh.second) {
123                         inslen = (int64_t)h.h.second - (int64_t)h.mh.second + (int64_t)h.length();
124                         inslen = -inslen;
125                 } else {
126                         inslen = (int64_t)h.mh.second - (int64_t)h.h.second + (int64_t)h.mlen;
127                 }
128                 ss << inslen;
129         } else {
130                 ss << '0';
131         }
132         // SEQ
133         ss << '\t' << h.patSeq;
134         // QUAL
135         ss << '\t' << h.quals;
136         //
137         // Optional fields
138         //
139         // Always output stratum
140         ss << "\tXA:i:" << (int)h.stratum;
141         // Always output cost
142         //ss << "\tXC:i:" << (int)h.cost;
143         // Look for SNP annotations falling within the alignment
144         // Output MD field
145         size_t len = length(h.patSeq);
146         int nm = 0;
147         int run = 0;
148         ss << "\tMD:Z:";
149         const FixedBitset<1024> *mms = &h.mms;
150         const String<Dna5>* pat = &h.patSeq;
151         const vector<char>* refcs = &h.refcs;
152         if(h.color && false) {
153                 // Disabled: print MD:Z string w/r/t to colors, not letters
154                 mms = &h.cmms;
155                 pat = &h.colSeq;
156                 assert_eq(length(h.colSeq), len+1);
157                 len = length(h.colSeq);
158                 refcs = &h.crefcs;
159         }
160         if(h.fw) {
161                 for (int i = 0; i < (int)len; ++ i) {
162                         if(mms->test(i)) {
163                                 nm++;
164                                 // There's a mismatch at this position
165                                 assert_gt((int)refcs->size(), i);
166                                 char refChar = toupper((*refcs)[i]);
167                                 ASSERT_ONLY(char qryChar = (h.fw ? (*pat)[i] : (*pat)[len-i-1]));
168                                 assert_neq(refChar, qryChar);
169                                 ss << run << refChar;
170                                 run = 0;
171                         } else {
172                                 run++;
173                         }
174                 }
175         } else {
176                 for (int i = len-1; i >= 0; -- i) {
177                         if(mms->test(i)) {
178                                 nm++;
179                                 // There's a mismatch at this position
180                                 assert_gt((int)refcs->size(), i);
181                                 char refChar = toupper((*refcs)[i]);
182                                 ASSERT_ONLY(char qryChar = (h.fw ? (*pat)[i] : (*pat)[len-i-1]));
183                                 assert_neq(refChar, qryChar);
184                                 ss << run << refChar;
185                                 run = 0;
186                         } else {
187                                 run++;
188                         }
189                 }
190         }
191         ss << run;
192         // Add optional edit distance field
193         ss << "\tNM:i:" << nm;
194         if(h.color) ss << "\tCM:i:" << h.cmms.count();
195         if(xms > 0)  ss << "\tXM:i:" << xms;
196         ss << endl;
197 }
198
199 /**
200  * Report a verbose, human-readable alignment to the appropriate
201  * output stream.
202  */
203 void SAMHitSink::reportHit(const Hit& h, int mapq, int xms) {
204         if(xms == 0) {
205                 // Otherwise, this is actually a sampled read and belongs in
206                 // the same category as maxed reads
207                 HitSink::reportHit(h);
208         }
209         ostringstream ss;
210         append(ss, h, mapq, xms);
211         // Make sure to grab lock before writing to output stream
212         lock(h.h.first);
213         out(h.h.first).writeString(ss.str());
214         unlock(h.h.first);
215 }
216
217 /**
218  * Report a batch of hits from a vector, perhaps subsetting it.
219  */
220 void SAMHitSink::reportHits(vector<Hit>& hs,
221                             size_t start, size_t end,
222                             int mapq, int xms)
223 {
224         assert_geq(end, start);
225         if(end-start == 0) return;
226         assert_gt(hs[start].mate, 0);
227         char buf[4096];
228         lock(0);
229         for(size_t i = start; i < end; i++) {
230                 ostringstream ss(ssmode_);
231                 ss.rdbuf()->pubsetbuf(buf, 4096);
232                 append(ss, hs[i], mapq, xms);
233                 out(0).writeChars(buf, ss.tellp());
234         }
235         unlock(0);
236         mainlock();
237         commitHits(hs);
238         first_ = false;
239         numAligned_++;
240         numReportedPaired_ += (end-start);
241         mainunlock();
242 }
243
244 /**
245  * Report either an unaligned read or a read that exceeded the -m
246  * ceiling.  We output placeholders for most of the fields in this
247  * case.
248  */
249 void SAMHitSink::reportUnOrMax(PatternSourcePerThread& p,
250                                vector<Hit>* hs,
251                                bool un) // lower bound on number of other hits
252 {
253         if(un) HitSink::reportUnaligned(p);
254         else   HitSink::reportMaxed(*hs, p);
255         ostringstream ss;
256         bool paired = !p.bufb().empty();
257         assert(paired || p.bufa().mate == 0);
258         assert(!paired || p.bufa().mate > 0);
259         assert(un || hs->size() > 0);
260         assert(!un || hs == NULL || hs->size() == 0);
261         size_t hssz = 0;
262         if(hs != NULL) hssz = hs->size();
263         if(paired) {
264                 // truncate final 2 chars
265                 for(int i = 0; i < (int)seqan::length(p.bufa().name)-2; i++) {
266                         ss << p.bufa().name[i];
267                 }
268         } else {
269                 ss << p.bufa().name;
270         }
271         ss << "\t"
272            << (SAM_FLAG_UNMAPPED | (paired ? (SAM_FLAG_PAIRED | SAM_FLAG_FIRST_IN_PAIR | SAM_FLAG_MATE_UNMAPPED) : 0)) << "\t*"
273            << "\t0\t0\t*\t*\t0\t0\t"
274            << p.bufa().patFw << "\t" << p.bufa().qual << "\tXM:i:"
275            << (paired ? (hssz+1)/2 : hssz) << endl;
276         if(paired) {
277                 // truncate final 2 chars
278                 for(int i = 0; i < (int)seqan::length(p.bufb().name)-2; i++) {
279                         ss << p.bufb().name[i];
280                 }
281                 ss << "\t"
282                    << (SAM_FLAG_UNMAPPED | (paired ? (SAM_FLAG_PAIRED | SAM_FLAG_SECOND_IN_PAIR | SAM_FLAG_MATE_UNMAPPED) : 0)) << "\t*"
283                    << "\t0\t0\t*\t*\t0\t0\t"
284                    << p.bufb().patFw << "\t" << p.bufb().qual << "\tXM:i:"
285                    << (hssz+1)/2 << endl;
286         }
287         lock(0);
288         out(0).writeString(ss.str());
289         unlock(0);
290 }
291
292 /**
293  * Append a SAM alignment to the given output stream.
294  */
295 void SAMHitSink::append(ostream& ss,
296                         const Hit& h,
297                         int mapq,
298                         int xms,
299                         const vector<string>* refnames,
300                         ReferenceMap *rmap,
301                         AnnotationMap *amap,
302                         bool fullRef,
303                         int offBase)
304 {
305         appendAligned(ss, h, mapq, xms, refnames, rmap, amap, fullRef, offBase);
306 }
307
308 /**
309  * Report maxed-out read; if sampleMax_ is set, then report 1 alignment
310  * at random.
311  */
312 void SAMHitSink::reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p) {
313         if(sampleMax_) {
314                 HitSink::reportMaxed(hs, p);
315                 RandomSource rand;
316                 rand.init(p.bufa().seed);
317                 assert_gt(hs.size(), 0);
318                 bool paired = hs.front().mate > 0;
319                 size_t num = 1;
320                 if(paired) {
321                         num = 0;
322                         int bestStratum = 999;
323                         for(size_t i = 0; i < hs.size()-1; i += 2) {
324                                 int strat = min(hs[i].stratum, hs[i+1].stratum);
325                                 if(strat < bestStratum) {
326                                         bestStratum = strat;
327                                         num = 1;
328                                 } else if(strat == bestStratum) {
329                                         num++;
330                                 }
331                         }
332                         assert_leq(num, hs.size());
333                         uint32_t r = rand.nextU32() % num;
334                         num = 0;
335                         for(size_t i = 0; i < hs.size()-1; i += 2) {
336                                 int strat = min(hs[i].stratum, hs[i+1].stratum);
337                                 if(strat == bestStratum) {
338                                         if(num == r) {
339                                                 reportHits(hs, i, i+2, 0, hs.size()/2+1);
340                                                 break;
341                                         }
342                                         num++;
343                                 }
344                         }
345                         assert_eq(num, r);
346                 } else {
347                         for(size_t i = 1; i < hs.size(); i++) {
348                                 assert_geq(hs[i].stratum, hs[i-1].stratum);
349                                 if(hs[i].stratum == hs[i-1].stratum) num++;
350                                 else break;
351                         }
352                         assert_leq(num, hs.size());
353                         uint32_t r = rand.nextU32() % num;
354                         reportHit(hs[r], /*MAPQ*/0, /*XM:I*/hs.size()+1);
355                 }
356         } else {
357                 reportUnOrMax(p, &hs, false);
358         }
359 }