/*
 * Classes that encapsulate the caching of ranges of reference
 * positions resolved from ranges of BWT rows.
 */

#ifndef RANGE_CACHE_H_
#define RANGE_CACHE_H_

#include <iostream>
#include <map>
#include <set>
#include <vector>
#include <utility>
#include <cstring>
#include <new>
#include <stdint.h>
#include "assert_helpers.h"
#include "ebwt.h"
#include "row_chaser.h"

using namespace std;

#define RANGE_NOT_SET 0xffffffff
#define RANGE_CACHE_BAD_ALLOC 0xffffffff
/**
 * Manages a pool of memory used exclusively for range cache entries.
 * This manager is allocate-only; it exists mainly so that we can avoid
 * lots of new[]s and delete[]s.
 *
 * A given stretch of words may be one of two types: a cache entry, or
 * a cache entry wrapper. A cache entry has a length and a list of
 * already-resolved reference positions. A cache entry wrapper has a
 * pointer to a cache entry for a different range, along with an
 * integer indicating how many "jumps" to the left that range is from
 * the one that owns the wrapper.
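 *
 * Layout sketch (illustrative; reconstructed from the code below, not
 * from the original comment). A cache entry occupies len+1 words:
 *
 *   [ len | off0 | off1 | ... | off(len-1) ]
 *
 * where each off is a resolved reference offset, or 0xffffffff if not
 * yet resolved. A wrapper occupies 2 words, with the high bit of the
 * first word set to mark it as a wrapper:
 *
 *   [ 0x80000000|jumps | pool index of target cache entry ]
 */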
class RangeCacheMemPool {
public:
	RangeCacheMemPool(uint32_t lim /* max cache size in bytes */) :
		lim_(lim >> 2 /* convert to words */), occ_(0), buf_(NULL),
		closed_(false)
	{
		if(lim_ > 0) {
			try {
				buf_ = new uint32_t[lim_];
				if(buf_ == NULL) throw std::bad_alloc();
			} catch(std::bad_alloc& e) {
				cerr << "Allocation error allocating " << lim_
				     << " words of range-cache memory" << endl;
				throw;
			}
			// Fill with 1s to signal that these elements are
			// uninitialized
			memset(buf_, 0xff, lim_ << 2 /* convert back to bytes */);
		}
	}

	~RangeCacheMemPool() {
		// Release all word memory!
		if(lim_ > 0) delete[] buf_;
	}
	/**
	 * Allocate numElts elements (words) from the pool; return the
	 * index of the first allocated word, or RANGE_CACHE_BAD_ALLOC if
	 * there isn't enough room.
	 */
	uint32_t alloc(uint32_t numElts) {
		assert_gt(numElts, 0);
		assert_leq(occ_, lim_);
		if(occ_ + numElts > lim_ || numElts >= 0x80000000) {
			return RANGE_CACHE_BAD_ALLOC;
		}
		uint32_t ret = occ_;
		assert(allocs_.find(ret) == allocs_.end());
		ASSERT_ONLY(allocs_.insert(ret));
		// Check that the allocated words are still in their
		// uninitialized (all-1s) state
#ifndef NDEBUG
		for(size_t i = 0; i < numElts; i++) {
			assert_eq(0xffffffff, buf_[occ_ + i]);
		}
#endif
		// Clear the first elt so that we don't think there's already
		// something meaningful stored there
		buf_[occ_] = 0;
		occ_ += numElts;
		assert_leq(occ_, lim_);
		if(lim_ - occ_ < 10) {
			// No more room - don't try anymore
			closed_ = true;
		}
		return ret;
	}
	/**
	 * Turn a pool-array index into a pointer; check that it doesn't
	 * fall outside the pool first.
	 */
	inline uint32_t *get(uint32_t off) {
		assert_lt(off, lim_);
		assert_lt(off, occ_);
		assert(allocs_.find(off) != allocs_.end());
		uint32_t *ret = buf_ + off;
		assert_neq(0x80000000, ret[0]);
		assert_neq(0xffffffff, ret[0]);
		return ret;
	}
	/**
	 * Return true iff there's no more room in the cache.
	 */
	inline bool closed() {
		return closed_;
	}

private:
	uint32_t lim_;  /// limit on number of 32-bit words to dish out in total
	uint32_t occ_;  /// number of occupied words
	uint32_t *buf_; /// buffer of 32-bit words
	bool closed_;   /// true iff a recent alloc() left little or no room
#ifndef NDEBUG
	std::set<uint32_t> allocs_; // elements allocated
#endif
};
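
/*
 * Illustrative usage sketch (not part of the original source); shows
 * the allocate-only protocol the pool expects. The sizes are made up:
 *
 *   RangeCacheMemPool pool(4096);       // 4096 bytes = 1024 words
 *   uint32_t idx = pool.alloc(5);       // entry: 1 length word + 4 offsets
 *   if(idx != RANGE_CACHE_BAD_ALLOC) {
 *       uint32_t *ent = pool.get(idx);  // turn index into pointer
 *       ent[0] = 4;                     // length goes in the first word
 *       // ent[1..4] hold resolved offsets; 0xffffffff = not yet set
 *   }
 *   // There is no free(): memory is reclaimed only when the pool dies.
 */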
/**
 * A view to a range of cached reference positions.
 */
class RangeCacheEntry {
public:
	typedef Ebwt<String<Dna> > TEbwt;
	typedef std::pair<uint32_t,uint32_t> U32Pair;
	typedef RowChaser<String<Dna> > TRowChaser;
	/**
	 * Construct an invalid (unset) entry.
	 */
	RangeCacheEntry(bool sanity = false) :
		top_(0xffffffff), jumps_(0), len_(0), ents_(NULL), ebwt_(NULL),
		verbose_(false), sanity_(sanity)
	{ }

	/**
	 * Create a new RangeCacheEntry from the data in the pool at 'ent'.
	 */
	RangeCacheEntry(RangeCacheMemPool& pool, uint32_t top,
	                uint32_t ent, TEbwt* ebwt, bool sanity = false) :
		verbose_(false), sanity_(sanity)
	{
		init(pool, top, ent, ebwt);
	}
	/**
	 * Initialize a RangeCacheEntry from the data in the pool at 'ent'.
	 */
	void init(RangeCacheMemPool& pool, uint32_t top, uint32_t ent, TEbwt* ebwt) {
		assert(ebwt != NULL);
		top_ = top;
		ebwt_ = ebwt;
		uint32_t *ents = pool.get(ent);
		assert_neq(0x80000000, ents[0]);
		// Is the hi bit of the first word set?
		if((ents[0] & 0x80000000) != 0) {
			// If so, the target is a wrapper and the non-hi bits
			// contain the # jumps
			jumps_ = (ents[0] & ~0x80000000);
			assert_gt(jumps_, 0);
			assert_leq(jumps_, ebwt_->_eh._len);
			// Get the target entry
			uint32_t *dest = pool.get(ents[1]);
			// Get the length from the target entry
			len_ = dest[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = dest + 1;
		} else {
			// Not a wrapper, so there are no jumps
			jumps_ = 0;
			// Get the length from the target entry
			len_ = ents[0];
			assert_leq(top_ + len_, ebwt_->_eh._len);
			assert_gt(len_, 0);
			assert_leq(len_, ebwt_->_eh._len);
			// Get the pointer to the entries themselves
			ents_ = ents + 1;
		}
		assert(sanityCheckEnts());
	}
	/**
	 * Initialize a wrapper with given number of jumps and given target
	 * cache entry index.
	 */
	void init(RangeCacheMemPool& pool, uint32_t top, uint32_t jumps,
	          uint32_t ent, TEbwt* ebwt)
	{
		assert(ebwt != NULL);
		top_ = top;
		jumps_ = jumps;
		ebwt_ = ebwt;
		uint32_t *ents = pool.get(ent);
		// Must not be a wrapper
		assert_eq(0, ents[0] & 0x80000000);
		// Get the length from the target entry
		len_ = ents[0];
		assert_gt(len_, 0);
		assert_leq(len_, ebwt_->_eh._len);
		// Get the pointer to the entries themselves
		ents_ = ents + 1;
		assert_leq(top_ + len_, ebwt_->_eh._len);
		assert(sanityCheckEnts());
	}
	uint32_t len() const {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return len_;
	}

	uint32_t jumps() const {
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		return jumps_;
	}
	/**
	 * Return true iff this object represents a valid cache entry.
	 */
	bool valid() const {
		return ents_ != NULL;
	}
	/**
	 * Install a result obtained by a client of this cache; be sure to
	 * adjust for how many jumps down the tunnel the cache entry is
	 * situated.
	 */
	void install(uint32_t elt, uint32_t val) {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(jumps_, val);
		assert_neq(0xffffffff, val);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_) {
			// Store the value, adjusted for the number of jumps
			ents_[elt] = val - jumps_;
			if(verbose_) cout << "Installed reference offset: " << (top_ + elt) << endl;
			ASSERT_ONLY(uint32_t sanity = TRowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, val);
			if(sanity_) {
				// Installed value must not collide with another entry
				for(size_t i = 0; i < len_; i++) {
					if(i == elt) continue;
					assert_neq(val, ents_[i]);
				}
			}
		} else {
			// 'elt' fell outside this entry's range;
			// ignore install request
			if(verbose_) cout << "Fell off end of cache entry for install: " << (top_ + elt) << endl;
		}
	}
	/**
	 * Get an element from the cache, adjusted for tunnel jumps.
	 */
	inline uint32_t get(uint32_t elt) const {
		if(ents_ == NULL) {
			// This is not a valid cache entry; do nothing
			return RANGE_NOT_SET;
		}
		assert(ents_ != NULL);
		assert(ebwt_ != NULL);
		assert_leq(top_ + len_, ebwt_->_eh._len);
		if(elt < len_ && ents_[elt] != RANGE_NOT_SET) {
			if(verbose_) cout << "Retrieved result from cache: " << (top_ + elt) << endl;
			uint32_t ret = ents_[elt] + jumps_;
			ASSERT_ONLY(uint32_t sanity = TRowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
			assert_eq(sanity, ret);
			return ret;
		} else {
			if(verbose_) cout << "Cache entry not set: " << (top_ + elt) << endl;
			return RANGE_NOT_SET;
		}
	}
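
	/*
	 * Worked example of the jump adjustment (illustrative, with
	 * made-up numbers): suppose this entry is a wrapper with
	 * jumps_ == 3, whose target caches a range 3 tunnel-steps to the
	 * left. A client that resolves row top_+elt to reference offset
	 * 100 calls install(elt, 100); the target entry stores
	 * 100 - 3 = 97, the offset appropriate to its own (left-shifted)
	 * range. A later get(elt) through the same wrapper returns
	 * 97 + 3 = 100.
	 */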
	/**
	 * Check that a length and an array of entries both make sense.
	 */
	static bool sanityCheckEnts(uint32_t len, uint32_t *ents, TEbwt* ebwt) {
		assert_gt(len, 0);
		assert_leq(len, ebwt->_eh._len);
		if(len < 10) {
			// Small entry: check distinctness pairwise
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == 0xffffffff) continue;
				assert_leq(ents[i], ebwt->_eh._len);
				for(size_t j = i+1; j < len; j++) {
					if(ents[j] == 0xffffffff) continue;
					assert_neq(ents[i], ents[j]);
				}
			}
		} else {
			// Large entry: use a set to check distinctness
			std::set<uint32_t> seen;
			for(size_t i = 0; i < len; i++) {
				if(ents[i] == 0xffffffff) continue;
				assert(seen.find(ents[i]) == seen.end());
				seen.insert(ents[i]);
			}
		}
		return true;
	}
	/**
	 * Check that len_ and the ents_ array both make sense.
	 */
	bool sanityCheckEnts() {
		return RangeCacheEntry::sanityCheckEnts(len_, ents_, ebwt_);
	}

private:
	uint32_t top_;   /// top pointer for this range
	uint32_t jumps_; /// how many tunnel-jumps it is away from the requester
	uint32_t len_;   /// # of entries in cache entry
	uint32_t *ents_; /// ptr to entries, which are flat offs within joined ref
	TEbwt *ebwt_;    /// index that alignments are in
	bool verbose_;   /// be talkative?
	bool sanity_;    /// do consistency checks?
};
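
/*
 * Illustrative sketch (not from the original source) of how an entry
 * and a wrapper relate in the pool; indexes and offsets are made up:
 *
 *   pool words:  [e+0]=4  [e+1..e+4]=offsets    <- cache entry, len 4
 *                [w+0]=0x80000003  [w+1]=e      <- wrapper: 3 jumps -> e
 *
 *   RangeCacheEntry ent;
 *   ent.init(pool, top, w, ebwt); // sees the hi bit, follows w[1] to e:
 *                                 // jumps_=3, len_=4, ents_=&pool[e+1]
 */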
/**
 * Manages a cache that maps ranges of BWT rows to pre-resolved
 * reference offsets, backed by a RangeCacheMemPool.
 */
class RangeCache {
	typedef Ebwt<String<Dna> > TEbwt;
	typedef std::vector<uint32_t> TU32Vec;
	typedef std::map<uint32_t, uint32_t> TMap;
	typedef std::map<uint32_t, uint32_t>::iterator TMapItr;

public:
	RangeCache(uint32_t lim, TEbwt* ebwt) :
		lim_(lim), map_(), pool_(lim), closed_(false), ebwt_(ebwt), sanity_(true) { }
	/**
	 * Given top and bot offsets, retrieve the canonical cache entry
	 * that best covers that range. The cache entry may not directly
	 * correspond to the top offset provided; rather, it might be an
	 * entry that lies "at the end of the tunnel" when top and bot are
	 * walked to the left.
	 */
	bool lookup(uint32_t top, uint32_t bot, RangeCacheEntry& ent) {
		if(ebwt_ == NULL || lim_ == 0) return false;
		assert_gt(bot, top);
		TMapItr itr = map_.find(top);
		if(itr == map_.end()) {
			// No cache entry for the given 'top' offset
			if(closed_) {
				return false; // failed to get cache entry
			} else {
				if(pool_.closed()) {
					closed_ = true;
					return false; // failed to get cache entry
				}
			}
			// Use the tunnel
			bool ret = tunnel(top, bot, ent);
			return ret;
		} else {
			// There is a cache entry for the given 'top' offset
			uint32_t ret = itr->second;
			ent.init(pool_, top, ret, ebwt_);
			return true; // success
		}
	}
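
	/*
	 * Illustrative usage sketch (not part of the original source). A
	 * caller aligning against index 'ebwt' might consult the cache
	 * before resolving a row the slow way; resolveViaRowChasing is a
	 * hypothetical stand-in for the caller's fallback resolver:
	 *
	 *   RangeCache cache(4 * 1024 * 1024, ebwt); // 4MB cache
	 *   RangeCacheEntry ent;
	 *   if(cache.lookup(top, bot, ent)) {
	 *       uint32_t off = ent.get(elt);         // RANGE_NOT_SET if unknown
	 *       if(off == RANGE_NOT_SET) {
	 *           off = resolveViaRowChasing(ebwt, top + elt); // hypothetical
	 *           ent.install(elt, off);           // cache for next time
	 *       }
	 *   }
	 */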
	/**
	 * Exhaustively check all entries linked to from map_ to ensure
	 * they're well-formed.
	 */
	bool repOk() {
		for(TMapItr itr = map_.begin(); itr != map_.end(); itr++) {
			uint32_t top = itr->first;
			uint32_t idx = itr->second;
			uint32_t jumps = 0;
			assert_leq(top, ebwt_->_eh._len);
			uint32_t *ents = pool_.get(idx);
			if((ents[0] & 0x80000000) != 0) {
				// Wrapper; chase it to its target entry
				jumps = ents[0] & ~0x80000000;
				assert_leq(jumps, ebwt_->_eh._len);
				idx = ents[1];
				ents = pool_.get(idx);
			}
			uint32_t len = ents[0];
			assert_leq(top + len, ebwt_->_eh._len);
			RangeCacheEntry::sanityCheckEnts(len, ents + 1, ebwt_);
		}
		return true;
	}

protected:
	/**
	 * Tunnel through to the first range that 1) includes all the same
	 * suffixes (though longer) as the given range, and 2) has a cache
	 * entry associated with it.
	 */
	bool tunnel(uint32_t top, uint32_t bot, RangeCacheEntry& ent) {
		assert(map_.find(top) == map_.end()); // no entry for 'top' yet
		const uint32_t spread = bot - top;
		SideLocus tloc, bloc;
		SideLocus::initFromTopBot(top, bot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
		uint32_t newtop = top, newbot = bot;
		uint32_t jumps = 0;
		TU32Vec tops;
		// Walk left through the tunnel
		while(true) {
			if(ebwt_->rowL(tloc) != ebwt_->rowL(bloc)) {
				// Different characters at top and bot positions of
				// BWT; this means that the calls to mapLF below are
				// guaranteed to yield rows in two different character-
				// sections of the BWT.
				break;
			}
			// Advance top and bot
			newtop = ebwt_->mapLF(tloc);
			newbot = ebwt_->mapLF(bloc);
			assert_geq(newbot, newtop);
			assert_leq(newbot - newtop, spread);
			// If the new spread is the same as the old spread, we can
			// be confident that the new range includes all of the same
			// suffixes as the last range (though longer by 1 char)
			if((newbot - newtop) == spread) {
				// Check if newtop is already cached
				TMapItr itr = map_.find(newtop);
				jumps++;
				if(itr != map_.end()) {
					// This range, which is further to the left in the
					// same tunnel as the query range, has a cache
					// entry already, so use that
					uint32_t idx = itr->second;
					uint32_t *ents = pool_.get(idx);
					if((ents[0] & 0x80000000) != 0) {
						// The cache entry we found was a wrapper; make
						// a new wrapper that points to that wrapper's
						// target, with the appropriate number of jumps
						jumps += (ents[0] & ~0x80000000);
						idx = ents[1];
					}
					// Allocate a new wrapper
					uint32_t newentIdx = pool_.alloc(2);
					if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
						// We successfully made a new wrapper entry;
						// now populate it and install it in map_
						uint32_t *newent = pool_.get(newentIdx); // get ptr to it
						assert_eq(0, newent[0]);
						newent[0] = 0x80000000 | jumps; // set jumps
						newent[1] = idx; // set target
						assert(map_.find(top) == map_.end());
						map_[top] = newentIdx;
						if(sanity_) assert(repOk());
					}
					// Initialize the entry
					ent.init(pool_, top, jumps, idx, ebwt_);
					return true;
				}
				tops.push_back(newtop);
				SideLocus::initFromTopBot(newtop, newbot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
			} else {
				// Not all the suffixes were preserved, so we can't
				// link the source range's cached result to this
				// range's cached results
				break;
			}
			assert_eq(jumps, tops.size());
		}
		assert_eq(jumps, tops.size());
		// Try to create a new cache entry for the leftmost range in
		// the tunnel (which might be the query range)
		uint32_t newentIdx = pool_.alloc(spread + 1);
		if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
			// Successfully allocated new range cache entry; install it
			uint32_t *newent = pool_.get(newentIdx);
			assert_eq(0, newent[0]);
			// Store cache-range length in first word
			newent[0] = spread;
			assert_lt(newent[0], 0x80000000);
			assert_eq(spread, newent[0]);
			uint32_t entTop = top;
			if(tops.size() > 0) {
				entTop = tops.back();
			}
			// Cache the entry for the end of the tunnel
			assert(map_.find(entTop) == map_.end());
			map_[entTop] = newentIdx;
			if(sanity_) assert(repOk());
			ent.init(pool_, entTop, jumps, newentIdx, ebwt_);
			assert_eq(spread, newent[0]);
			if(jumps > 0) {
				assert_neq(entTop, top);
				// Cache a wrapper entry for the query range (if possible)
				uint32_t wrapentIdx = pool_.alloc(2);
				if(wrapentIdx != RANGE_CACHE_BAD_ALLOC) {
					uint32_t *wrapent = pool_.get(wrapentIdx);
					assert_eq(0, wrapent[0]);
					wrapent[0] = 0x80000000 | jumps;
					wrapent[1] = newentIdx;
					assert(map_.find(top) == map_.end());
					map_[top] = wrapentIdx;
					if(sanity_) assert(repOk());
				}
			}
			return true;
		} else {
			// Could not allocate new range cache entry
			return false;
		}
	}
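
	/*
	 * Illustrative walk-through (made-up numbers, not from the
	 * original source): suppose lookup(100, 104, ent) misses. tunnel()
	 * LF-maps top and bot in lockstep: (100,104) -> (250,254) ->
	 * (610,614). Each step that preserves the spread of 4 increments
	 * 'jumps'. If (610,614) is the first range with a cache entry, a
	 * 2-jump wrapper for top=100 pointing at that entry is installed
	 * in map_; otherwise a fresh entry is allocated for the leftmost
	 * range and, if jumps > 0, a wrapper for the query range is added
	 * alongside it.
	 */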
	uint32_t lim_;  /// total number of key/val bytes to keep in cache
	TMap map_;      /// map from range 'top' offsets to pool indexes
	RangeCacheMemPool pool_; /// memory pool
	bool closed_;   /// out of space; no new entries
	TEbwt* ebwt_;   /// index that alignments are in
	bool sanity_;   /// do consistency checks?
};

#endif /* RANGE_CACHE_H_ */