/*
 * range_cache.h
 *
 * Classes that encapsulate the caching of ranges of already-resolved
 * reference offsets.
 */

#ifndef RANGE_CACHE_H_
#define RANGE_CACHE_H_

#include <stdint.h>
#include <utility>
#include <iostream>
#include <stdexcept>
#include <map>
#include <set>
#include <vector>
#include <cstring>
#include "ebwt.h"
#include "row_chaser.h"

#define RANGE_NOT_SET 0xffffffff
#define RANGE_CACHE_BAD_ALLOC 0xffffffff

/**
 * Manages a pool of memory used exclusively for range cache entries.
 * This manager is allocate-only; it exists mainly so that we can avoid
 * lots of new[]s and delete[]s.
 *
 * A given stretch of words may be one of two types: a cache entry, or
 * a cache entry wrapper.  A cache entry has a length and a list of
 * already-resolved reference positions.  A cache entry wrapper has a
 * pointer to a cache entry for a different range, along with an
 * integer indicating how many "jumps" to the left that range is from
 * the one that owns the wrapper.
 */
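/*
 * Word layout (an illustrative sketch inferred from alloc()/init() below,
 * not a normative spec):
 *
 *   Cache entry:  word 0     = length N of the cached range
 *                 words 1..N = resolved reference offsets for the N rows,
 *                              with 0xffffffff meaning "not yet resolved"
 *
 *   Wrapper:      word 0     = 0x80000000 | jumps  (hi bit marks a wrapper)
 *                 word 1     = pool offset of the target cache entry
 */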
class RangeCacheMemPool {
public:
	RangeCacheMemPool(uint32_t lim /* max cache size in bytes */) :
		lim_(lim >> 2 /* convert to words */), occ_(0), buf_(NULL),
		closed_(false)
	{
		if(lim_ > 0) {
			try {
				buf_ = new uint32_t[lim_];
				if(buf_ == NULL) throw std::bad_alloc();
			} catch(std::bad_alloc& e) {
				cerr << "Allocation error allocating " << lim_
				     << " words of range-cache memory" << endl;
				throw 1;
			}
			assert(buf_ != NULL);
			// Fill with 1s to signal that these elements are
			// uninitialized
			memset(buf_, 0xff, lim_ << 2 /* convert back to bytes */);
		}
	}

	~RangeCacheMemPool() {
		// Release all word memory!
		if(lim_ > 0) delete[] buf_;
	}

	/**
	 * Allocate numElts elements from the word pool.
	 */
	uint32_t alloc(uint32_t numElts) {
		assert_gt(numElts, 0);
		assert_leq(occ_, lim_);
		if(occ_ + numElts > lim_ || numElts >= 0x80000000) {
			return RANGE_CACHE_BAD_ALLOC;
		}
		assert_gt(lim_, 0);
		uint32_t ret = occ_;
		assert(allocs_.find(ret) == allocs_.end());
		ASSERT_ONLY(allocs_.insert(ret));
#ifndef NDEBUG
		for(size_t i = 0; i < numElts; i++) {
			assert_eq(0xffffffff, buf_[occ_ + i]);
		}
#endif
		// Clear the first elt so that we don't think there's already
		// something there
		buf_[occ_] = 0;
		occ_ += numElts;
		assert_leq(occ_, lim_);
		if(lim_ - occ_ < 10) {
			// No more room - don't try anymore
			closed_ = true;
		}
		return ret;
	}

	/**
	 * Turn a pool-array index into a pointer; check that it doesn't
	 * fall outside the pool first.
	 */
	inline uint32_t *get(uint32_t off) {
		assert_gt(lim_, 0);
		assert_lt(off, lim_);
		assert(allocs_.find(off) != allocs_.end());
		uint32_t *ret = buf_ + off;
		assert_neq(0x80000000, ret[0]);
		assert_neq(0xffffffff, ret[0]);
		return ret;
	}

	/**
	 * Return true iff there's no more room in the cache.
	 */
	inline bool closed() {
		return closed_;
	}

private:
	uint32_t lim_;  /// limit on number of 32-bit words to dish out in total
	uint32_t occ_;  /// number of occupied words
	uint32_t *buf_; /// buffer of 32-bit words
	bool closed_;   /// true iff alloc() has run out of room
#ifndef NDEBUG
	std::set<uint32_t> allocs_; // elements allocated
#endif
};
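
/*
 * Illustrative sketch of how a caller might use the pool (hypothetical
 * usage, not taken from the bowtie sources):
 *
 *   RangeCacheMemPool pool(4 * 1024);      // 4096 bytes = 1024 words
 *   uint32_t off = pool.alloc(5);          // reserve 5 words
 *   if(off != RANGE_CACHE_BAD_ALLOC) {
 *       uint32_t *ent = pool.get(off);     // offset -> pointer
 *       ent[0] = 4;                        // e.g., range length in word 0
 *   }
 *
 * Note that there is no way to free an individual allocation; the pool is
 * allocate-only and all of its memory is released when it is destroyed.
 */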

/**
 * A view to a range of cached reference positions.
 */
class RangeCacheEntry {

	typedef Ebwt<String<Dna> > TEbwt;
	typedef std::pair<uint32_t,uint32_t> U32Pair;
	typedef RowChaser<String<Dna> > TRowChaser;

public:
	/**
	 * Create an empty, invalid cache entry.
	 */
	RangeCacheEntry(bool sanity = false) :
		top_(0xffffffff), jumps_(0), len_(0), ents_(NULL), ebwt_(NULL),
		verbose_(false), sanity_(sanity)
	{ }

	/**
	 * Create a new RangeCacheEntry from the data in the pool at 'ent'.
	 */
	RangeCacheEntry(RangeCacheMemPool& pool, uint32_t top,
	                uint32_t ent, TEbwt* ebwt, bool sanity = false) :
		verbose_(false), sanity_(sanity)
	{
		init(pool, top, ent, ebwt);
	}

	/**
	 * Initialize a RangeCacheEntry from the data in the pool at 'ent'.
	 */
	void init(RangeCacheMemPool& pool, uint32_t top, uint32_t ent, TEbwt* ebwt) {
		assert(ebwt != NULL);
		top_ = top;
		ebwt_ = ebwt;
		uint32_t *ents = pool.get(ent);
		assert_neq(0x80000000, ents[0]);
		// Is hi bit set?
		if((ents[0] & 0x80000000) != 0) {
			// If so, the target is a wrapper and the non-hi bits
			// contain the # jumps
			jumps_ = (ents[0] & ~0x80000000);
			assert_gt(jumps_, 0);
			assert_leq(jumps_, ebwt_->_eh._len);
			// Get the target entry
			uint32_t *dest = pool.get(ents[1]);
			// Get the length from the target entry
			len_ = dest[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = dest + 1;
		} else {
			// Not a wrapper, so there are no jumps
			jumps_ = 0;
			// Get the length from the target entry
			len_  = ents[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = ents + 1;
		}
		assert(sanityCheckEnts());
	}

	/**
	 * Initialize a wrapper with given number of jumps and given target
	 * entry index.
	 */
	void init(RangeCacheMemPool& pool, uint32_t top, uint32_t jumps,
	          uint32_t ent, TEbwt* ebwt)
	{
		assert(ebwt != NULL);
		ebwt_ = ebwt;
		top_ = top;
		jumps_ = jumps;
		uint32_t *ents = pool.get(ent);
		// Must not be a wrapper
		assert_eq(0, ents[0] & 0x80000000);
		// Get the length from the target entry
		len_ = ents[0];
		assert_gt(len_, 0);
		assert_leq(len_, ebwt_->_eh._len);
		// Get the pointer to the entries themselves
		ents_ = ents + 1;
		assert_leq(top_ + len_, ebwt_->_eh._len);
		assert(sanityCheckEnts());
	}

	uint32_t len() const   {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return len_;
	}

	uint32_t jumps() const {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return jumps_;
	}

	/**
	 * Invalidate this entry.
	 */
	void reset() {
		ents_ = NULL;
	}

	/**
	 * Return true iff this object represents a valid cache entry.
	 */
	bool valid() const {
		return ents_ != NULL;
	}

	TEbwt *ebwt() {
		return ebwt_;
	}

	/**
	 * Install a result obtained by a client of this cache, adjusting
	 * for how many jumps down the tunnel this entry is situated from
	 * the client's query range.
	 */
	void install(uint32_t elt, uint32_t val) {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(jumps_, val);
		assert_neq(0xffffffff, val);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_) {
			val -= jumps_;
			if(verbose_) cout << "Installed reference offset: " << (top_ + elt) << endl;
			ASSERT_ONLY(uint32_t sanity = TRowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, val);
#ifndef NDEBUG
			for(size_t i = 0; i < len_; i++) {
				if(i == elt) continue;
				assert_neq(val, ents_[i]);
			}
#endif
			ents_[elt] = val;
		} else {
			// ignore install request
			if(verbose_) cout << "Fell off end of cache entry for install: " << (top_ + elt) << endl;
		}
	}

	/**
	 * Get an element from the cache, adjusted for tunnel jumps.
	 */
	inline uint32_t get(uint32_t elt) const {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return RANGE_NOT_SET;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_ && ents_[elt] != RANGE_NOT_SET) {
			if(verbose_) cout << "Retrieved result from cache: " << (top_ + elt) << endl;
			uint32_t ret = ents_[elt] + jumps_;
			ASSERT_ONLY(uint32_t sanity = TRowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, ret);
			return ret;
		} else {
			if(verbose_) cout << "Cache entry not set: " << (top_ + elt) << endl;
			return RANGE_NOT_SET;
		}
	}
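
	/*
	 * How the jumps_ adjustment in install()/get() works, as a worked
	 * example (the numbers are hypothetical): suppose this entry is a
	 * wrapper lying jumps_ = 3 positions down the tunnel from the
	 * client's range.  If the client resolves reference offset 1000 for
	 * element 'elt', install() stores 1000 - 3 = 997, i.e. the offset
	 * relative to the tunnel-end range; a later get() for the same
	 * element returns 997 + 3 = 1000, the offset the client expects.
	 */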

	/**
	 * Check that len_ and the ents_ array both make sense.
	 */
	static bool sanityCheckEnts(uint32_t len, uint32_t *ents, TEbwt* ebwt) {
		assert_gt(len, 0);
		assert_leq(len, ebwt->_eh._len);
		if(len < 10) {
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == 0xffffffff) continue;
				assert_leq(ents[i], ebwt->_eh._len);
				for(size_t j = i+1; j < len; j++) {
					if(ents[j] == 0xffffffff) continue;
					assert_neq(ents[i], ents[j]);
				}
			}
		} else {
			std::set<uint32_t> seen;
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == 0xffffffff) continue;
				assert(seen.find(ents[i]) == seen.end());
				seen.insert(ents[i]);
			}
		}
		return true;
	}

	/**
	 * Check that len_ and the ents_ array both make sense.
	 */
	bool sanityCheckEnts() {
		return RangeCacheEntry::sanityCheckEnts(len_, ents_, ebwt_);
	}

private:

	uint32_t top_;   /// top pointer for this range
	uint32_t jumps_; /// how many tunnel-jumps it is away from the requester
	uint32_t len_;   /// # of entries in cache entry
	uint32_t *ents_; /// ptr to entries, which are flat offs within joined ref
	//U32Pair *ents_;  /// pointer to entries, which are tidx,toff pairs
	TEbwt    *ebwt_; /// index that alignments are in
	bool     verbose_; /// be talkative?
	bool     sanity_;  /// do consistency checks?
};

/**
 * Caches the results of previous range resolutions, keyed by the 'top'
 * offset of each range.  Cached results are lists of already-resolved
 * reference offsets for the rows of the range.
 */
class RangeCache {

	typedef Ebwt<String<Dna> > TEbwt;
	typedef std::vector<uint32_t> TU32Vec;
	typedef std::map<uint32_t, uint32_t> TMap;
	typedef std::map<uint32_t, uint32_t>::iterator TMapItr;

public:
	RangeCache(uint32_t lim, TEbwt* ebwt) :
		lim_(lim), map_(), pool_(lim), closed_(false), ebwt_(ebwt), sanity_(true) { }

	/**
	 * Given top and bot offsets, retrieve the canonical cache entry
	 * that best covers that range.  The cache entry may not directly
	 * correspond to the top offset provided; rather, it might be an
	 * entry that lies "at the end of the tunnel" when top and bot are
	 * walked backward.
	 */
	bool lookup(uint32_t top, uint32_t bot, RangeCacheEntry& ent) {
		if(ebwt_ == NULL || lim_ == 0) return false;
		assert_gt(bot, top);
		ent.reset();
		TMapItr itr = map_.find(top);
		if(itr == map_.end()) {
			// No cache entry for the given 'top' offset
			if(closed_) {
				return false; // failed to get cache entry
			} else {
				if(pool_.closed()) {
					closed_ = true;
					return false; // failed to get cache entry
				}
			}
			// Use the tunnel
			bool ret = tunnel(top, bot, ent);
			return ret;
		} else {
			// There is a cache entry for the given 'top' offset
			uint32_t ret = itr->second;
			ent.init(pool_, top, ret, ebwt_);
			return true; // success
		}
	}

	/**
	 * Exhaustively check all entries linked to from map_ to ensure
	 * they're well-formed.
	 */
	bool repOk() {
#ifndef NDEBUG
		for(TMapItr itr = map_.begin(); itr != map_.end(); itr++) {
			uint32_t top = itr->first;
			uint32_t idx = itr->second;
			uint32_t jumps = 0;
			assert_leq(top, ebwt_->_eh._len);
			uint32_t *ents = pool_.get(idx);
			if((ents[0] & 0x80000000) != 0) {
				jumps = ents[0] & ~0x80000000;
				assert_leq(jumps, ebwt_->_eh._len);
				idx = ents[1];
				ents = pool_.get(idx);
			}
			uint32_t len = ents[0];
			assert_leq(top + len, ebwt_->_eh._len);
			RangeCacheEntry::sanityCheckEnts(len, ents + 1, ebwt_);
		}
#endif
		return true;
	}

protected:

	/**
	 * Tunnel through to the first range that 1) includes all the same
	 * suffixes (though longer) as the given range, and 2) has a cache
	 * entry for it.
	 */
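	//
	// A sketch of the idea (the offsets are hypothetical): starting from
	// the query range [top, bot) = [100, 105), each LF step maps both
	// ends of the range one character to the left.  As long as the
	// spread (bot - top) stays at 5, the new range contains exactly the
	// same suffixes, each extended by one character, so a cache entry
	// for the new range is also valid for the query range once we
	// remember how many steps ("jumps") were taken.  The walk stops when
	// the spread shrinks or the characters at top and bot differ.
	//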
	bool tunnel(uint32_t top, uint32_t bot, RangeCacheEntry& ent) {
		assert_gt(bot, top);
		TU32Vec tops;
		const uint32_t spread = bot - top;
		SideLocus tloc, bloc;
		SideLocus::initFromTopBot(top, bot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
		uint32_t newtop = top, newbot = bot;
		uint32_t jumps = 0;
		// Walk left through the tunnel
		while(true) {
			if(ebwt_->rowL(tloc) != ebwt_->rowL(bloc)) {
				// Different characters at top and bot positions of
				// BWT; this means that the calls to mapLF below are
				// guaranteed to yield rows in two different character-
				// sections of the BWT.
				break;
			}
			// Advance top and bot
			newtop = ebwt_->mapLF(tloc);
			newbot = ebwt_->mapLF(bloc);
			assert_geq(newbot, newtop);
			assert_leq(newbot - newtop, spread);
			// If the new spread is the same as the old spread, we can
			// be confident that the new range includes all of the same
			// suffixes as the last range (though longer by 1 char)
			if((newbot - newtop) == spread) {
				// Check if newtop is already cached
				TMapItr itr = map_.find(newtop);
				jumps++;
				if(itr != map_.end()) {
					// This range, which is further to the left in the
					// same tunnel as the query range, has a cache
					// entry already, so use that
					uint32_t idx = itr->second;
					uint32_t *ents = pool_.get(idx);
					if((ents[0] & 0x80000000) != 0) {
						// The cache entry we found was a wrapper; make
						// a new wrapper that points to that wrapper's
						// target, with the appropriate number of jumps
						jumps += (ents[0] & ~0x80000000);
						idx = ents[1];
					}
					// Allocate a new wrapper
					uint32_t newentIdx = pool_.alloc(2);
					if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
						// We successfully made a new wrapper entry;
						// now populate it and install it in map_
						uint32_t *newent = pool_.get(newentIdx); // get ptr to it
						assert_eq(0, newent[0]);
						newent[0] = 0x80000000 | jumps; // set jumps
						newent[1] = idx;                // set target
						assert(map_.find(top) == map_.end());
						map_[top] = newentIdx;
						if(sanity_) assert(repOk());
					}
					// Initialize the entry
					ent.init(pool_, top, jumps, idx, ebwt_);
					return true;
				}
				// Save this range
				tops.push_back(newtop);
				SideLocus::initFromTopBot(newtop, newbot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
			} else {
				// Not all the suffixes were preserved, so we can't
				// link the source range's cached result to this
				// range's cached results
				break;
			}
			assert_eq(jumps, tops.size());
		}
		assert_eq(jumps, tops.size());
		// Try to create a new cache entry for the leftmost range in
		// the tunnel (which might be the query range)
		uint32_t newentIdx = pool_.alloc(spread + 1);
		if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
			// Successfully allocated new range cache entry; install it
			uint32_t *newent = pool_.get(newentIdx);
			assert_eq(0, newent[0]);
			// Store cache-range length in first word
			newent[0] = spread;
			assert_lt(newent[0], 0x80000000);
			assert_eq(spread, newent[0]);
			uint32_t entTop = top;
			uint32_t jumps = 0;
			if(tops.size() > 0) {
				entTop = tops.back();
				jumps = tops.size();
			}
			// Cache the entry for the end of the tunnel
			assert(map_.find(entTop) == map_.end());
			map_[entTop] = newentIdx;
			if(sanity_) assert(repOk());
			ent.init(pool_, entTop, jumps, newentIdx, ebwt_);
			assert_eq(spread, newent[0]);
			if(jumps > 0) {
				assert_neq(entTop, top);
				// Cache a wrapper entry for the query range (if possible)
				uint32_t wrapentIdx = pool_.alloc(2);
				if(wrapentIdx != RANGE_CACHE_BAD_ALLOC) {
					uint32_t *wrapent = pool_.get(wrapentIdx);
					assert_eq(0, wrapent[0]);
					wrapent[0] = 0x80000000 | jumps;
					wrapent[1] = newentIdx;
					assert(map_.find(top) == map_.end());
					map_[top] = wrapentIdx;
					if(sanity_) assert(repOk());
				}
			}
			return true;
		} else {
			// Could not allocate new range cache entry
			return false;
		}
	}

	uint32_t lim_;           /// Total number of key/val bytes to keep in cache
	TMap map_;               /// Maps range 'top' offsets to pool offsets of their entries
	RangeCacheMemPool pool_; /// Memory pool
	bool closed_;            /// Out of space; no new entries
	TEbwt* ebwt_;            /// Index that alignments are in
	bool sanity_;            /// Do consistency checks?
};
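
/*
 * Illustrative sketch of the intended lookup/install protocol (hypothetical
 * caller code, not taken from the bowtie sources; 'resolveSomehow' stands
 * in for whatever mechanism the caller uses to resolve a row):
 *
 *   RangeCache cache(limBytes, &ebwt);
 *   RangeCacheEntry ent;
 *   if(cache.lookup(top, bot, ent) && ent.valid()) {
 *       for(uint32_t i = 0; i < bot - top; i++) {
 *           uint32_t off = ent.get(i);          // cached result, if any
 *           if(off == RANGE_NOT_SET) {
 *               off = resolveSomehow(top + i);  // hypothetical resolver
 *               ent.install(i, off);            // share it with later queries
 *           }
 *       }
 *   }
 */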

#endif /* RANGE_CACHE_H_ */