1 #ifndef EBWT_SEARCH_UTIL_H_
2 #define EBWT_SEARCH_UTIL_H_
8 #include <seqan/sequence.h>
12 /// Encapsulates a change made to a query base, i.e. "the 3rd base from
13 /// the 5' end was changed from an A to a T". Useful when using
14 /// for matching seeded by "seedlings".
15 struct QueryMutation {
16 QueryMutation() : pos(0), oldBase(0), newBase(0) { }
17 QueryMutation(uint16_t _pos, uint8_t _oldBase, uint8_t _newBase) :
18 pos(_pos), oldBase(_oldBase), newBase(_newBase)
20 assert_neq(oldBase, newBase);
21 assert_leq(oldBase, 4);
22 assert_lt(newBase, 4);
25 uint8_t oldBase; // original base from the read
26 uint8_t newBase; // mutated to fit the reference in at least one place
30 * Encapsulates a partial alignment. Supports up to 256 positions and
31 * up to 3 substitutions. The 'type' field of all the alternative
32 * structs tells us whether this entry is a singleton entry, an offset
33 * into the spillover list, a non-tail entry in the spillover list, or
34 * a tail entry in the spillover list.
38 uint64_t pos0 : 16; // mismatched pos 1
39 uint64_t pos1 : 16; // mismatched pos 2
40 uint64_t pos2 : 16; // mismatched pos 3
41 uint64_t char0 : 2; // substituted char for pos 1
42 uint64_t char1 : 2; // substituted char for pos 2
43 uint64_t char2 : 2; // substituted char for pos 3
44 uint64_t reserved : 8;
45 uint64_t type : 2; // type of entry; 0=singleton_entry,
46 // 1=list_offset, 2=list_entry,
50 uint64_t off : 62;// offset into list
51 uint64_t type : 2; // type of entry; 0=singleton,
52 // 1=list_offset, 2=list_entry,
54 } off; // offset into list
56 uint64_t unk : 62;// padding
57 uint64_t type : 2; // type of entry; 0=singleton,
58 // 1=list_offset, 2=list_entry,
68 bool repOk(uint32_t qualMax, uint32_t slen, const String<char>& quals, bool maqPenalty) {
70 assert_leq(slen, seqan::length(quals));
71 if(entry.pos0 != 0xffff) {
72 assert_lt(entry.pos0, slen);
73 qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos0]));
75 if(entry.pos1 != 0xffff) {
76 assert_lt(entry.pos1, slen);
77 qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos1]));
79 if(entry.pos2 != 0xffff) {
80 assert_lt(entry.pos2, slen);
81 qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos2]));
83 assert_leq(qual, qualMax);
90 static bool sameHalfPartialAlignment(PartialAlignment pa1, PartialAlignment pa2) {
91 if(pa1.unk.type == 1 || pa2.unk.type == 1) return false;
92 assert_neq(0xffff, pa1.entry.pos0);
93 assert_neq(0xffff, pa2.entry.pos0);
95 // Make sure pa1's pos0 is represented in pa1
96 if(pa1.entry.pos0 == pa2.entry.pos0) {
97 if(pa1.entry.char0 != pa2.entry.char0) return false;
98 } else if(pa1.entry.pos0 == pa2.entry.pos1) {
99 if(pa1.entry.char0 != pa2.entry.char1) return false;
100 } else if(pa1.entry.pos0 == pa2.entry.pos2) {
101 if(pa1.entry.char0 != pa2.entry.char2) return false;
105 if(pa1.entry.pos1 != 0xffff) {
106 if (pa1.entry.pos1 == pa2.entry.pos0) {
107 if(pa1.entry.char1 != pa2.entry.char0) return false;
108 } else if(pa1.entry.pos1 == pa2.entry.pos1) {
109 if(pa1.entry.char1 != pa2.entry.char1) return false;
110 } else if(pa1.entry.pos1 == pa2.entry.pos2) {
111 if(pa1.entry.char1 != pa2.entry.char2) return false;
116 if(pa1.entry.pos2 != 0xffff) {
117 if (pa1.entry.pos2 == pa2.entry.pos0) {
118 if(pa1.entry.char2 != pa2.entry.char0) return false;
119 } else if(pa1.entry.pos2 == pa2.entry.pos1) {
120 if(pa1.entry.char2 != pa2.entry.char1) return false;
121 } else if(pa1.entry.pos2 == pa2.entry.pos2) {
122 if(pa1.entry.char2 != pa2.entry.char2) return false;
130 static bool samePartialAlignment(PartialAlignment pa1, PartialAlignment pa2) {
131 return sameHalfPartialAlignment(pa1, pa2) && sameHalfPartialAlignment(pa2, pa1);
134 static bool validPartialAlignment(PartialAlignment pa) {
135 if(pa.entry.pos0 != 0xffff) {
136 if(pa.entry.pos0 == pa.entry.pos1) return false;
137 if(pa.entry.pos0 == pa.entry.pos2) return false;
139 if(pa.entry.pos1 != 0xffff) return false;
140 if(pa.entry.pos2 != 0xffff) return false;
143 if(pa.entry.pos1 != 0xffff) {
144 if(pa.entry.pos1 == pa.entry.pos2) return false;
146 if(pa.entry.pos2 != 0xffff) return false;
// NOTE(review): this declaration is truncated in this copy of the file —
// several parameters are missing from the visible list and there is no
// terminating ');'.  Restore the full signature from the original header
// before compiling; do not guess at the missing parameters.
void printHit(const vector<String<Dna5> >& os,
const String<Dna5>& qry,
uint32_t threeRevOff,
164 * A synchronized data structure for storing partial alignments
165 * associated with patids, with particular attention to compactness.
167 class PartialAlignmentManager {
169 PartialAlignmentManager(size_t listSz = 10 * 1024 * 1024) {
170 MUTEX_INIT(_partialLock);
171 // Reserve space for 10M partialsList entries = 40 MB
172 _partialsList.reserve(listSz);
175 ~PartialAlignmentManager() { }
178 * Add a set of partial alignments for a particular patid into the
179 * partial-alignment database. This version locks the database,
180 * and so is safe to call if there are potential readers or
181 * writers currently running.
183 void addPartials(uint32_t patid, const vector<PartialAlignment>& ps) {
184 if(ps.size() == 0) return;
185 MUTEX_LOCK(_partialLock);
186 size_t origPlSz = _partialsList.size();
187 // Assert that the entry doesn't exist yet
188 assert(_partialsMap.find(patid) == _partialsMap.end());
190 _partialsMap[patid] = ps[0];
191 _partialsMap[patid].entry.type = 0; // singleton
194 // Make sure there are not duplicate entries
195 for(size_t i = 0; i < ps.size()-1; i++)
196 for(size_t j = i+1; j < ps.size(); j++)
197 assert(!samePartialAlignment(ps[i], ps[j]));
199 // Insert a "pointer" record into the map that refers to
200 // the stretch of the _partialsList vector that contains
201 // the partial alignments.
203 al.u64.u64 = 0xffffffffffffffffllu;
204 al.off.off = origPlSz;
205 al.off.type = 1; // list offset
206 _partialsMap[patid] = al; // install pointer
207 assert_gt(ps.size(), 1);
208 // Now add all the non-tail partial alignments (all but the
209 // last) to the _partialsList
210 for(size_t i = 0; i < ps.size()-1; i++) {
211 assert(validPartialAlignment(ps[i]));
212 _partialsList.push_back(ps[i]);
213 // list entry (non-tail)
214 _partialsList.back().entry.type = 2;
216 // Now add the tail (last) partial alignment and mark it as
218 assert(validPartialAlignment(ps.back()));
219 _partialsList.push_back(ps.back());
221 _partialsList.back().entry.type = 3;
223 // Make sure there are not duplicate entries
224 assert_eq(_partialsList.size(), origPlSz + ps.size());
225 for(size_t i = origPlSz; i < _partialsList.size()-1; i++) {
226 for(size_t j = i+1; j < _partialsList.size(); j++) {
227 assert(!samePartialAlignment(_partialsList[i], _partialsList[j]));
232 // Assert that we added an entry
233 assert(_partialsMap.find(patid) != _partialsMap.end());
234 MUTEX_UNLOCK(_partialLock);
238 * Get a set of partial alignments for a particular patid out of
239 * the partial-alignment database.
241 void getPartials(uint32_t patid, vector<PartialAlignment>& ps) {
242 assert_eq(0, ps.size());
243 MUTEX_LOCK(_partialLock);
244 getPartialsUnsync(patid, ps);
245 MUTEX_UNLOCK(_partialLock);
249 * Get a set of partial alignments for a particular patid out of
250 * the partial-alignment database. This version does not attempt to
251 * lock the database. This is more efficient than the synchronized
252 * version, but is unsafe if there are other threads that may be
253 * writing to the database.
255 void getPartialsUnsync(uint32_t patid, vector<PartialAlignment>& ps) {
256 assert_eq(0, ps.size());
257 if(_partialsMap.find(patid) == _partialsMap.end()) {
261 al.u64.u64 = _partialsMap[patid].u64.u64;
262 uint32_t type = al.unk.type;
269 uint32_t off = al.off.off;
271 assert_lt(off, _partialsList.size());
272 ASSERT_ONLY(type = _partialsList[off].entry.type);
273 assert(type == 2 || type == 3);
275 // Make sure this entry isn't equal to any other entry
276 for(size_t i = 0; i < ps.size(); i++) {
277 assert(validPartialAlignment(ps[i]));
278 assert(!samePartialAlignment(ps[i], _partialsList[off]));
281 assert(validPartialAlignment(_partialsList[off]));
282 ps.push_back(_partialsList[off]);
283 ASSERT_ONLY(uint32_t pos0 = ps.back().entry.pos0);
284 ASSERT_ONLY(uint32_t pos1 = ps.back().entry.pos1);
285 ASSERT_ONLY(uint32_t pos2 = ps.back().entry.pos2);
286 assert(pos1 == 0xffff || pos0 != pos1);
287 assert(pos2 == 0xffff || pos0 != pos2);
288 assert(pos2 == 0xffff || pos1 != pos2);
289 } while(_partialsList[off++].entry.type == 2);
290 assert_eq(3, _partialsList[off-1].entry.type);
292 assert_gt(ps.size(), 0);
295 /// Call to clear the database when there is only one element in it
296 void clear(uint32_t patid) {
297 assert_eq(1, _partialsMap.count(patid));
298 assert_eq(1, _partialsMap.size());
299 _partialsMap.erase(patid);
300 assert_eq(0, _partialsMap.size());
301 _partialsList.clear();
302 assert_eq(0, _partialsList.size());
306 return _partialsMap.size();
310 * Convert a partial alignment into a QueryMutation string.
312 static uint8_t toMutsString(const PartialAlignment& pal,
313 const String<Dna5>& seq,
314 const String<char>& quals,
315 String<QueryMutation>& muts,
316 bool maqPenalty = true)
319 assert_eq(0, length(muts));
320 uint32_t plen = length(seq);
322 assert_neq(1, pal.unk.type);
324 uint8_t oldQuals = 0;
325 uint32_t pos0 = pal.entry.pos0;
326 assert_lt(pos0, plen);
327 uint16_t tpos0 = plen-1-pos0;
328 uint32_t chr0 = pal.entry.char0;
329 uint8_t oldChar = (uint8_t)seq[tpos0];
330 uint8_t oldQual0 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos0]));
331 assert_leq(oldQual0, 99);
332 oldQuals += oldQual0; // take quality hit
333 appendValue(muts, QueryMutation(tpos0, oldChar, chr0)); // apply mutation
334 if(pal.entry.pos1 != 0xffff) {
335 // Do second mutation
336 uint32_t pos1 = pal.entry.pos1;
337 assert_lt(pos1, plen);
338 uint16_t tpos1 = plen-1-pos1;
339 uint32_t chr1 = pal.entry.char1;
340 oldChar = (uint8_t)seq[tpos1];
341 uint8_t oldQual1 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos1]));
342 assert_leq(oldQual1, 99);
343 oldQuals += oldQual1; // take quality hit
344 assert_neq(tpos1, tpos0);
345 appendValue(muts, QueryMutation(tpos1, oldChar, chr1)); // apply mutation
346 if(pal.entry.pos2 != 0xffff) {
347 // Do second mutation
348 uint32_t pos2 = pal.entry.pos2;
349 assert_lt(pos2, plen);
350 uint16_t tpos2 = plen-1-pos2;
351 uint32_t chr2 = pal.entry.char2;
352 oldChar = (uint8_t)seq[tpos2];
353 uint8_t oldQual2 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos2]));
354 assert_leq(oldQual2, 99);
355 oldQuals += oldQual2; // take quality hit
356 assert_neq(tpos2, tpos0);
357 assert_neq(tpos2, tpos1);
358 append(muts, QueryMutation(tpos2, oldChar, chr2)); // apply mutation
361 assert_gt(length(muts), 0);
362 assert_leq(length(muts), 3);
366 /// Maps patids to partial alignments for that patid
367 map<uint32_t, PartialAlignment> _partialsMap;
368 /// Overflow for when a patid has more than 1 partial alignment
369 vector<PartialAlignment> _partialsList;
370 /// Lock for 'partialsMap' and 'partialsList'; necessary because
371 /// search threads will be reading and writing them
372 MUTEX_T _partialLock;
375 #endif /* EBWT_SEARCH_UTIL_H_ */