Commit patch to not break on spaces.
[bowtie.git] / range_chaser.h
1 /*
2  * range_chaser.h
3  */
4
5 #ifndef RANGE_CHASER_H_
6 #define RANGE_CHASER_H_
7
8 #include <stdint.h>
9 #include "ebwt.h"
10 #include "random_source.h"
11 #include "row_chaser.h"
12 #include "range_cache.h"
13 #include "aligner_metrics.h"
14
15 /**
16  * A class that statefully processes a range by picking one row
17  * randomly and then linearly scanning forward through the range,
18  * reporting reference offsets as we go.
19  */
20 template<typename TStr>
21 class RangeChaser {
22
23         typedef Ebwt<TStr> TEbwt;
24         typedef std::pair<uint32_t,uint32_t> U32Pair;
25         typedef std::vector<U32Pair> U32PairVec;
26         typedef RowChaser<TStr> TRowChaser;
27
28 public:
29         RangeChaser(uint32_t cacheThresh,
30                     RangeCache* cacheFw, RangeCache* cacheBw,
31                     AlignerMetrics *metrics = NULL) :
32                 done(false),
33                 ebwt_(NULL),
34                 qlen_(0),
35                 cacheThresh_(cacheThresh),
36                 top_(0xffffffff),
37                 bot_(0xffffffff),
38                 irow_(0xffffffff),
39                 row_(0xffffffff),
40                 off_(make_pair(0xffffffff, 0)),
41                 tlen_(0),
42                 chaser_(metrics),
43                 cached_(false),
44                 cacheFw_(cacheFw), cacheBw_(cacheBw),
45                 metrics_(metrics)
46         { }
47
48         ~RangeChaser() { }
49
50         /**
51          * Convert a range to a vector of reference loci, where a locus is
52          * a u32 pair of <ref-idx, ref-offset>.
53          */
54         static void toOffs(const TEbwt& ebwt,
55                            uint32_t qlen,
56                            RandomSource& rand,
57                            uint32_t top,
58                            uint32_t bot,
59                            U32PairVec& dest)
60         {
61                 RangeChaser rc(ebwt, rand);
62                 rc.setTopBot(top, bot, qlen);
63                 rc.prep();
64                 while(!rc.done) {
65                         rc.advance();
66                         if(rc.foundOff()) {
67                                 dest.push_back(rc.off());
68                                 rc.reset();
69                                 assert(!rc.foundOff());
70                         }
71                         rc.prep();
72                 }
73         }
74
75         /**
76          * Set the row to chase
77          */
78         void setRow(uint32_t row) {
79                 // Must be within bounds of range
80                 assert_lt(row, bot_);
81                 assert_geq(row, top_);
82                 row_ = row;
83                 while(true) {
84                         // First thing to try is the cache
85                         if(cached_) {
86                                 assert(cacheEnt_.valid());
87                                 uint32_t cached = cacheEnt_.get(row_ - top_);
88                                 assert(cacheEnt_.valid());
89                                 if(cached != RANGE_NOT_SET) {
90                                         // Assert that it matches what we would have got...
91                                         ASSERT_ONLY(uint32_t sanity = TRowChaser::toFlatRefOff(ebwt_, 1, row_));
92                                         assert_eq(sanity, cached);
93                                         // We have a cached result.  Cached result is in the
94                                         // form of an offset into the joined reference string,
95                                         // so now we have to convert it to a tidx/toff pair.
96                                         ebwt_->joinedToTextOff(qlen_, cached, off_.first, off_.second, tlen_);
97                                         // Note: tidx may be 0xffffffff, if alignment overlaps a
98                                         // reference boundary
99                                         if(off_.first != 0xffffffff) {
100                                                 // Bingo, we found a valid result using the cache
101                                                 assert(foundOff());
102                                                 return;
103                                         }
104                                 } else {
105                                         // Wasn't in the cache; use the RowChaser
106                                 }
107                         }
108                         // Second thing to try is the chaser
109                         chaser_.setRow(row_, qlen_, ebwt_);
110                         assert(chaser_.prepped_ || chaser_.done);
111                         // It might be done immediately...
112                         if(chaser_.done) {
113                                 // We're done immediately
114                                 off_ = chaser_.off();
115                                 if(off_.first != 0xffffffff) {
116                                         // This is a valid result
117                                         if(cached_) {
118                                                 // Install the result in the cache
119                                                 assert(cacheEnt_.valid());
120                                                 cacheEnt_.install(row_ - top_, chaser_.flatOff());
121                                                 //if(ebwt_->fw()) assert(cacheFw_->repOk());
122                                                 //else            assert(cacheBw_->repOk());
123                                         }
124                                         tlen_ = chaser_.tlen();
125                                         assert(foundOff());
126                                         return; // found result
127                                 }
128                         } else {
129                                 // Pursue this row
130                                 break;
131                         }
132                         // That row didn't have a valid result, move to the next
133                         row_++;
134                         if(row_ == bot_) {
135                                 // Wrap back to top_
136                                 row_ = top_;
137                         }
138                         if(row_ == irow_) {
139                                 // Exhausted all possible rows
140                                 done = true;
141                                 assert_eq(0xffffffff, off_.first);
142                                 return;
143                         }
144                 }
145                 assert(chaser_.prepped_);
146         }
147
148         /**
149          * Set the next range for us to "chase" (i.e. convert row-by-row
150          * to reference loci).
151          */
152         void setTopBot(uint32_t top,
153                        uint32_t bot,
154                        uint32_t qlen,
155                        RandomSource& rand,
156                        const TEbwt* ebwt)
157         {
158                 assert_neq(0xffffffff, top);
159                 assert_neq(0xffffffff, bot);
160                 assert_gt(bot, top);
161                 assert_gt(qlen, 0);
162                 assert(ebwt != NULL);
163                 ebwt_ = ebwt;
164                 qlen_ = qlen;
165                 top_ = top;
166                 bot_ = bot;
167                 uint32_t spread = bot - top;
168                 irow_ = top + (rand.nextU32() % spread); // initial row
169                 done = false;
170                 cached_ = false;
171                 reset();
172                 if(cacheFw_ != NULL || cacheBw_ != NULL) {
173                         if(spread > cacheThresh_) {
174                                 bool ret = false;
175                                 if(ebwt->fw() && cacheFw_ != NULL) {
176                                         ret = cacheFw_->lookup(top, bot, cacheEnt_);
177                                         if(ret) assert(cacheEnt_.ebwt()->fw());
178                                 } else if(!ebwt->fw() && cacheBw_ != NULL) {
179                                         ret = cacheBw_->lookup(top, bot, cacheEnt_);
180                                         if(ret) assert(!cacheEnt_.ebwt()->fw());
181                                 } else {
182                                         cacheEnt_.reset();
183                                 }
184                                 assert_eq(cacheEnt_.valid(), ret);
185                                 cached_ = ret;
186                         } else {
187                                 cacheEnt_.reset();
188                         }
189                 }
190                 setRow(irow_);
191                 assert(chaser_.prepped_ || foundOff() || done);
192         }
193
194         /**
195          * Advance the step-left process by one step.  Check if we're done.
196          */
197         void advance() {
198                 assert(!done);
199                 assert(chaser_.prepped_ || chaser_.done);
200                 reset();
201                 if(chaser_.done) {
202                         // chaser finished with this row
203                         row_++;
204                         if(row_ == bot_) {
205                                 row_ = top_;
206                         }
207                         if(row_ == irow_) {
208                                 // Exhausted all possible rows
209                                 done = true;
210                                 assert_eq(0xffffffff, off_.first);
211                                 return;
212                         }
213                         setRow(row_);
214                         assert(chaser_.prepped_ || foundOff() || done);
215                 } else {
216                         chaser_.advance();
217                         assert(chaser_.prepped_ || chaser_.done);
218                         if(chaser_.done) {
219                                 // We're done immediately
220                                 off_ = chaser_.off();
221                                 if(off_.first != 0xffffffff) {
222                                         if(cached_) {
223                                                 // Install the result in the cache
224                                                 assert(cacheEnt_.valid());
225                                                 cacheEnt_.install(row_ - top_, chaser_.flatOff());
226                                                 //if(ebwt_->fw()) assert(cacheFw_->repOk());
227                                                 //else            assert(cacheBw_->repOk());
228                                         }
229                                         // Found a reference position
230                                         tlen_ = chaser_.tlen();
231                                         assert(foundOff());
232                                 }
233                         }
234                 }
235         }
236
237         /**
238          * Prepare for the next call to advance() by prefetching relevant
239          * data.  In this case, 'chaser_' is doing this for us.
240          */
241         void prep() {
242                 // nothing
243         }
244
245         /**
246          * Return true iff off_ contains a valid reference location for
247          * this range.
248          */
249         bool foundOff() const {
250                 return off_.first != 0xffffffff;
251         }
252
253         /**
254          * Reset the chaser so that 'off_' does not hold a valid offset and
255          * foundOff() returns false.
256          */
257         void reset() {
258                 off_.first = 0xffffffff;
259         }
260
261         /**
262          * Get the calculated offset.
263          */
264         U32Pair off() const {
265                 return off_;
266         }
267
268         /**
269          * Get the length of the hit reference.
270          */
271         uint32_t tlen() const {
272                 return tlen_;
273         }
274
275         bool done;             /// true = chase is done & answer is in off_
276
277 protected:
278
279         const TEbwt* ebwt_;    /// index to resolve row in
280         uint32_t qlen_;        /// length of read; needed to convert to ref. coordinates
281         uint32_t cacheThresh_; /// ranges wider than thresh use cacheing
282         uint32_t top_;         /// range top
283         uint32_t bot_;         /// range bottom
284         uint32_t irow_;        /// initial randomly-chosen row within range
285         uint32_t row_;         /// current row within range
286         U32Pair off_;          /// calculated offset (0xffffffff if not done)
287         uint32_t tlen_;        /// length of text hit
288         TRowChaser chaser_;    /// stateful row chaser
289         RangeCacheEntry cacheEnt_; /// current cache entry
290         bool cached_;          /// cacheEnt is active for current range?
291         RangeCache* cacheFw_; /// cache for the forward index
292         RangeCache* cacheBw_; /// cache for the backward index
293         AlignerMetrics *metrics_;
294 };
295
296 #endif /* RANGE_CHASER_H_ */