Imported Upstream version 0.12.7
[bowtie.git] / ebwt_search_util.h
1 #ifndef EBWT_SEARCH_UTIL_H_
2 #define EBWT_SEARCH_UTIL_H_
3
4 #include <iostream>
5 #include <vector>
6 #include <map>
7 #include <stdint.h>
8 #include <seqan/sequence.h>
9 #include "hit.h"
10 #include "qual.h"
11
12 /// Encapsulates a change made to a query base, i.e. "the 3rd base from
13 /// the 5' end was changed from an A to a T".  Useful when using
14 /// for matching seeded by "seedlings".
15 struct QueryMutation {
16         QueryMutation() : pos(0), oldBase(0), newBase(0) { }
17         QueryMutation(uint16_t _pos, uint8_t _oldBase, uint8_t _newBase) :
18                 pos(_pos), oldBase(_oldBase), newBase(_newBase)
19         {
20                 assert_neq(oldBase, newBase);
21                 assert_leq(oldBase, 4);
22                 assert_lt(newBase, 4);
23         }
24         uint16_t pos;
25         uint8_t oldBase; // original base from the read
26         uint8_t newBase; // mutated to fit the reference in at least one place
27 };
28
29 /**
30  * Encapsulates a partial alignment.  Supports up to 256 positions and
31  * up to 3 substitutions.  The 'type' field of all the alternative
32  * structs tells us whether this entry is a singleton entry, an offset
33  * into the spillover list, a non-tail entry in the spillover list, or
34  * a tail entry in the spillover list.
35  */
36 typedef union {
37         struct {
38                 uint64_t pos0  : 16;   // mismatched pos 1
39                 uint64_t pos1  : 16;   // mismatched pos 2
40                 uint64_t pos2  : 16;   // mismatched pos 3
41                 uint64_t char0 : 2;    // substituted char for pos 1
42                 uint64_t char1 : 2;    // substituted char for pos 2
43                 uint64_t char2 : 2;    // substituted char for pos 3
44                 uint64_t reserved : 8;
45                 uint64_t type  : 2;    // type of entry; 0=singleton_entry,
46                                        // 1=list_offset, 2=list_entry,
47                                        // 3=list_tail
48         } entry;
49         struct {
50                 uint64_t off   : 62;// offset into list
51                 uint64_t type  : 2; // type of entry; 0=singleton,
52                             // 1=list_offset, 2=list_entry,
53                             // 3=list_tail
54         } off; // offset into list
55         struct {
56                 uint64_t unk   : 62;// padding
57                 uint64_t type  : 2; // type of entry; 0=singleton,
58                             // 1=list_offset, 2=list_entry,
59                             // 3=list_tail
60         } unk;   // unknown
61         struct {
62                 uint64_t u64   : 64;
63         } u64;
64
65         /**
66          *
67          */
68         bool repOk(uint32_t qualMax, uint32_t slen, const String<char>& quals, bool maqPenalty) {
69                 uint32_t qual = 0;
70                 assert_leq(slen, seqan::length(quals));
71                 if(entry.pos0 != 0xffff) {
72                         assert_lt(entry.pos0, slen);
73                         qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos0]));
74                 }
75                 if(entry.pos1 != 0xffff) {
76                         assert_lt(entry.pos1, slen);
77                         qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos1]));
78                 }
79                 if(entry.pos2 != 0xffff) {
80                         assert_lt(entry.pos2, slen);
81                         qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos2]));
82                 }
83                 assert_leq(qual, qualMax);
84                 return true;
85         }
86
87 } PartialAlignment;
88
89 #ifndef NDEBUG
90 static bool sameHalfPartialAlignment(PartialAlignment pa1, PartialAlignment pa2) {
91         if(pa1.unk.type == 1 || pa2.unk.type == 1) return false;
92         assert_neq(0xffff, pa1.entry.pos0);
93         assert_neq(0xffff, pa2.entry.pos0);
94
95         // Make sure pa1's pos0 is represented in pa1
96         if(pa1.entry.pos0 == pa2.entry.pos0) {
97                 if(pa1.entry.char0 != pa2.entry.char0) return false;
98         } else if(pa1.entry.pos0 == pa2.entry.pos1) {
99                 if(pa1.entry.char0 != pa2.entry.char1) return false;
100         } else if(pa1.entry.pos0 == pa2.entry.pos2) {
101                 if(pa1.entry.char0 != pa2.entry.char2) return false;
102         } else {
103                 return false;
104         }
105         if(pa1.entry.pos1 != 0xffff) {
106                 if       (pa1.entry.pos1 == pa2.entry.pos0) {
107                         if(pa1.entry.char1 != pa2.entry.char0) return false;
108                 } else if(pa1.entry.pos1 == pa2.entry.pos1) {
109                         if(pa1.entry.char1 != pa2.entry.char1) return false;
110                 } else if(pa1.entry.pos1 == pa2.entry.pos2) {
111                         if(pa1.entry.char1 != pa2.entry.char2) return false;
112                 } else {
113                         return false;
114                 }
115         }
116         if(pa1.entry.pos2 != 0xffff) {
117                 if       (pa1.entry.pos2 == pa2.entry.pos0) {
118                         if(pa1.entry.char2 != pa2.entry.char0) return false;
119                 } else if(pa1.entry.pos2 == pa2.entry.pos1) {
120                         if(pa1.entry.char2 != pa2.entry.char1) return false;
121                 } else if(pa1.entry.pos2 == pa2.entry.pos2) {
122                         if(pa1.entry.char2 != pa2.entry.char2) return false;
123                 } else {
124                         return false;
125                 }
126         }
127         return true;
128 }
129
130 static bool samePartialAlignment(PartialAlignment pa1, PartialAlignment pa2) {
131         return sameHalfPartialAlignment(pa1, pa2) && sameHalfPartialAlignment(pa2, pa1);
132 }
133
134 static bool validPartialAlignment(PartialAlignment pa) {
135         if(pa.entry.pos0 != 0xffff) {
136                 if(pa.entry.pos0 == pa.entry.pos1) return false;
137                 if(pa.entry.pos0 == pa.entry.pos2) return false;
138         } else {
139                 if(pa.entry.pos1 != 0xffff) return false;
140                 if(pa.entry.pos2 != 0xffff) return false;
141         }
142
143         if(pa.entry.pos1 != 0xffff) {
144                 if(pa.entry.pos1 == pa.entry.pos2) return false;
145         } else {
146                 if(pa.entry.pos2 != 0xffff) return false;
147         }
148         return true;
149 }
150 #endif
151
152 extern
153 void printHit(const vector<String<Dna5> >& os,
154                           const Hit& h,
155                           const String<Dna5>& qry,
156                           size_t qlen,
157                           uint32_t unrevOff,
158                           uint32_t oneRevOff,
159                           uint32_t twoRevOff,
160                           uint32_t threeRevOff,
161                           bool ebwtFw);
162
163 /**
164  * A synchronized data structure for storing partial alignments
165  * associated with patids, with particular attention to compactness.
166  */
167 class PartialAlignmentManager {
168 public:
169         PartialAlignmentManager(size_t listSz = 10 * 1024 * 1024) {
170                 MUTEX_INIT(_partialLock);
171                 // Reserve space for 10M partialsList entries = 40 MB
172                 _partialsList.reserve(listSz);
173         }
174
175         ~PartialAlignmentManager() { }
176
177         /**
178          * Add a set of partial alignments for a particular patid into the
179          * partial-alignment database.  This version locks the database,
180          * and so is safe to call if there are potential readers or
181          * writers currently running.
182          */
183         void addPartials(uint32_t patid, const vector<PartialAlignment>& ps) {
184                 if(ps.size() == 0) return;
185                 MUTEX_LOCK(_partialLock);
186                 size_t origPlSz = _partialsList.size();
187                 // Assert that the entry doesn't exist yet
188                 assert(_partialsMap.find(patid) == _partialsMap.end());
189                 if(ps.size() == 1) {
190                         _partialsMap[patid] = ps[0];
191                         _partialsMap[patid].entry.type = 0; // singleton
192                 } else {
193 #ifndef NDEBUG
194                         // Make sure there are not duplicate entries
195                         for(size_t i = 0; i < ps.size()-1; i++)
196                                 for(size_t j = i+1; j < ps.size(); j++)
197                                         assert(!samePartialAlignment(ps[i], ps[j]));
198 #endif
199                         // Insert a "pointer" record into the map that refers to
200                         // the stretch of the _partialsList vector that contains
201                         // the partial alignments.
202                         PartialAlignment al;
203                         al.u64.u64 = 0xffffffffffffffffllu;
204                         al.off.off = origPlSz;
205                         al.off.type = 1; // list offset
206                         _partialsMap[patid] = al; // install pointer
207                         assert_gt(ps.size(), 1);
208                         // Now add all the non-tail partial alignments (all but the
209                         // last) to the _partialsList
210                         for(size_t i = 0; i < ps.size()-1; i++) {
211                                 assert(validPartialAlignment(ps[i]));
212                                 _partialsList.push_back(ps[i]);
213                                 // list entry (non-tail)
214                                 _partialsList.back().entry.type = 2;
215                         }
216                         // Now add the tail (last) partial alignment and mark it as
217                         // such
218                         assert(validPartialAlignment(ps.back()));
219                         _partialsList.push_back(ps.back());
220                         // list tail
221                         _partialsList.back().entry.type = 3;
222 #ifndef NDEBUG
223                         // Make sure there are not duplicate entries
224                         assert_eq(_partialsList.size(), origPlSz + ps.size());
225                         for(size_t i = origPlSz; i < _partialsList.size()-1; i++) {
226                                 for(size_t j = i+1; j < _partialsList.size(); j++) {
227                                         assert(!samePartialAlignment(_partialsList[i], _partialsList[j]));
228                                 }
229                         }
230 #endif
231                 }
232                 // Assert that we added an entry
233                 assert(_partialsMap.find(patid) != _partialsMap.end());
234                 MUTEX_UNLOCK(_partialLock);
235         }
236
237         /**
238          * Get a set of partial alignments for a particular patid out of
239          * the partial-alignment database.
240          */
241         void getPartials(uint32_t patid, vector<PartialAlignment>& ps) {
242                 assert_eq(0, ps.size());
243                 MUTEX_LOCK(_partialLock);
244                 getPartialsUnsync(patid, ps);
245                 MUTEX_UNLOCK(_partialLock);
246         }
247
248         /**
249          * Get a set of partial alignments for a particular patid out of
250          * the partial-alignment database.  This version does not attempt to
251          * lock the database.  This is more efficient than the synchronized
252          * version, but is unsafe if there are other threads that may be
253          * writing to the database.
254          */
255         void getPartialsUnsync(uint32_t patid, vector<PartialAlignment>& ps) {
256                 assert_eq(0, ps.size());
257                 if(_partialsMap.find(patid) == _partialsMap.end()) {
258                         return;
259                 }
260                 PartialAlignment al;
261                 al.u64.u64 = _partialsMap[patid].u64.u64;
262                 uint32_t type = al.unk.type;
263                 if(type == 0) {
264                         // singleton
265                         ps.push_back(al);
266                 } else {
267                         // list
268                         assert_eq(1, type);
269                         uint32_t off = al.off.off;
270                         do {
271                                 assert_lt(off, _partialsList.size());
272                                 ASSERT_ONLY(type = _partialsList[off].entry.type);
273                                 assert(type == 2 || type == 3);
274 #ifndef NDEBUG
275                                 // Make sure this entry isn't equal to any other entry
276                                 for(size_t i = 0; i < ps.size(); i++) {
277                                         assert(validPartialAlignment(ps[i]));
278                                         assert(!samePartialAlignment(ps[i], _partialsList[off]));
279                                 }
280 #endif
281                                 assert(validPartialAlignment(_partialsList[off]));
282                                 ps.push_back(_partialsList[off]);
283                                 ASSERT_ONLY(uint32_t pos0 = ps.back().entry.pos0);
284                                 ASSERT_ONLY(uint32_t pos1 = ps.back().entry.pos1);
285                                 ASSERT_ONLY(uint32_t pos2 = ps.back().entry.pos2);
286                                 assert(pos1 == 0xffff || pos0 != pos1);
287                                 assert(pos2 == 0xffff || pos0 != pos2);
288                                 assert(pos2 == 0xffff || pos1 != pos2);
289                         } while(_partialsList[off++].entry.type == 2);
290                         assert_eq(3, _partialsList[off-1].entry.type);
291                 }
292                 assert_gt(ps.size(), 0);
293         }
294
295         /// Call to clear the database when there is only one element in it
296         void clear(uint32_t patid) {
297                 assert_eq(1, _partialsMap.count(patid));
298                 assert_eq(1, _partialsMap.size());
299                 _partialsMap.erase(patid);
300                 assert_eq(0, _partialsMap.size());
301                 _partialsList.clear();
302                 assert_eq(0, _partialsList.size());
303         }
304
305         size_t size() {
306                 return _partialsMap.size();
307         }
308
309         /**
310          * Convert a partial alignment into a QueryMutation string.
311          */
312         static uint8_t toMutsString(const PartialAlignment& pal,
313                                     const String<Dna5>& seq,
314                                     const String<char>& quals,
315                                     String<QueryMutation>& muts,
316                                     bool maqPenalty = true)
317         {
318                 reserve(muts, 4);
319                 assert_eq(0, length(muts));
320                 uint32_t plen = length(seq);
321                 assert_gt(plen, 0);
322                 assert_neq(1, pal.unk.type);
323                 // Do first mutation
324                 uint8_t oldQuals = 0;
325                 uint32_t pos0 = pal.entry.pos0;
326                 assert_lt(pos0, plen);
327                 uint16_t tpos0 = plen-1-pos0;
328                 uint32_t chr0 = pal.entry.char0;
329                 uint8_t oldChar = (uint8_t)seq[tpos0];
330                 uint8_t oldQual0 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos0]));
331                 assert_leq(oldQual0, 99);
332                 oldQuals += oldQual0; // take quality hit
333                 appendValue(muts, QueryMutation(tpos0, oldChar, chr0)); // apply mutation
334                 if(pal.entry.pos1 != 0xffff) {
335                         // Do second mutation
336                         uint32_t pos1 = pal.entry.pos1;
337                         assert_lt(pos1, plen);
338                         uint16_t tpos1 = plen-1-pos1;
339                         uint32_t chr1 = pal.entry.char1;
340                         oldChar = (uint8_t)seq[tpos1];
341                         uint8_t oldQual1 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos1]));
342                         assert_leq(oldQual1, 99);
343                         oldQuals += oldQual1; // take quality hit
344                         assert_neq(tpos1, tpos0);
345                         appendValue(muts, QueryMutation(tpos1, oldChar, chr1)); // apply mutation
346                         if(pal.entry.pos2 != 0xffff) {
347                                 // Do second mutation
348                                 uint32_t pos2 = pal.entry.pos2;
349                                 assert_lt(pos2, plen);
350                                 uint16_t tpos2 = plen-1-pos2;
351                                 uint32_t chr2 = pal.entry.char2;
352                                 oldChar = (uint8_t)seq[tpos2];
353                                 uint8_t oldQual2 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos2]));
354                                 assert_leq(oldQual2, 99);
355                                 oldQuals += oldQual2; // take quality hit
356                                 assert_neq(tpos2, tpos0);
357                                 assert_neq(tpos2, tpos1);
358                                 append(muts, QueryMutation(tpos2, oldChar, chr2)); // apply mutation
359                         }
360                 }
361                 assert_gt(length(muts), 0);
362                 assert_leq(length(muts), 3);
363                 return oldQuals;
364         }
365 private:
366         /// Maps patids to partial alignments for that patid
367         map<uint32_t, PartialAlignment> _partialsMap;
368         /// Overflow for when a patid has more than 1 partial alignment
369         vector<PartialAlignment> _partialsList;
370         /// Lock for 'partialsMap' and 'partialsList'; necessary because
371         /// search threads will be reading and writing them
372         MUTEX_T _partialLock;
373 };
374
375 #endif /* EBWT_SEARCH_UTIL_H_ */