Imported Debian patch 0.12.7-3
[bowtie.git] / blockwise_sa.h
1 #ifndef BLOCKWISE_SA_H_
2 #define BLOCKWISE_SA_H_
3
4 #include <stdint.h>
5 #include <stdlib.h>
6 #include <iostream>
7 #include <sstream>
8 #include <stdexcept>
9 #include <vector>
10 #include <seqan/sequence.h>
11 #include <seqan/index.h>
12 #include "assert_helpers.h"
13 #include "diff_sample.h"
14 #include "multikey_qsort.h"
15 #include "random_source.h"
16 #include "binary_sa_search.h"
17 #include "zbox.h"
18 #include "alphabet.h"
19 #include "timer.h"
20 #include "auto_array.h"
21
22 using namespace std;
23 using namespace seqan;
24
25 // Helpers for printing verbose messages
26
27 #ifndef VMSG_NL
28 #define VMSG_NL(args...) \
29 if(this->verbose()) { \
30         stringstream tmp; \
31         tmp << args << endl; \
32         this->verbose(tmp.str()); \
33 }
34 #endif
35
36 #ifndef VMSG
37 #define VMSG(args...) \
38 if(this->verbose()) { \
39         stringstream tmp; \
40         tmp << args; \
41         this->verbose(tmp.str()); \
42 }
43 #endif
44
45 /**
46  * Abstract parent class for blockwise suffix-array building schemes.
47  */
48 template<typename TStr>
49 class BlockwiseSA {
50 public:
51         BlockwiseSA(const TStr& __text,
52                     uint32_t __bucketSz,
53                     bool __sanityCheck = false,
54                     bool __passMemExc = false,
55                     bool __verbose = false,
56                     ostream& __logger = cout) :
57         _text(__text),
58         _bucketSz(max<uint32_t>(__bucketSz, 2u)),
59         _sanityCheck(__sanityCheck),
60         _passMemExc(__passMemExc),
61         _verbose(__verbose),
62         _itrBucket(),
63         _itrBucketPos(0xffffffff),
64         _itrPushedBackSuffix(0xffffffff),
65         _logger(__logger)
66         { }
67
68         virtual ~BlockwiseSA() { }
69
70         /**
71          * Get the next suffix; compute the next bucket if necessary.
72          */
73         uint32_t nextSuffix() {
74                 if(_itrPushedBackSuffix != 0xffffffff) {
75                         uint32_t tmp = _itrPushedBackSuffix;
76                         _itrPushedBackSuffix = 0xffffffff;
77                         return tmp;
78                 }
79                 while(_itrBucketPos >= length(_itrBucket) ||
80                       length(_itrBucket) == 0)
81                 {
82                         if(!hasMoreBlocks()) {
83                                 throw out_of_range("No more suffixes");
84                         }
85                         nextBlock();
86                         _itrBucketPos = 0;
87                 }
88                 return _itrBucket[_itrBucketPos++];
89         }
90
91         /**
92          * Return true iff the next call to nextSuffix will succeed.
93          */
94         bool hasMoreSuffixes() {
95                 if(_itrPushedBackSuffix != 0xffffffff) return true;
96                 try {
97                         _itrPushedBackSuffix = nextSuffix();
98                 } catch(out_of_range& e) {
99                         assert_eq(0xffffffff, _itrPushedBackSuffix);
100                         return false;
101                 }
102                 return true;
103         }
104
105         /**
106          * Reset the suffix iterator so that the next call to nextSuffix()
107          * returns the lexicographically-first suffix.
108          */
109         void resetSuffixItr() {
110                 clear(_itrBucket);
111                 _itrBucketPos = 0xffffffff;
112                 _itrPushedBackSuffix = 0xffffffff;
113                 reset();
114                 assert(suffixItrIsReset());
115         }
116
117         /**
118          * Returns true iff the next call to nextSuffix() returns the
119          * lexicographically-first suffix.
120          */
121         bool suffixItrIsReset() {
122                 return length(_itrBucket)   == 0 &&
123                        _itrBucketPos        == 0xffffffff &&
124                        _itrPushedBackSuffix == 0xffffffff &&
125                        isReset();
126         }
127
128         const TStr& text()  const { return _text; }
129         uint32_t bucketSz() const { return _bucketSz; }
130         bool sanityCheck()  const { return _sanityCheck; }
131         bool verbose()      const { return _verbose; }
132         ostream& log()      const { return _logger; }
133         uint32_t size()     const { return length(_text)+1; }
134
135 protected:
136         /// Reset back to the first block
137         virtual void reset() = 0;
138         /// Return true iff reset to the first block
139         virtual bool isReset() = 0;
140
141         /**
142          * Grab the next block of sorted suffixes.  The block is guaranteed
143          * to have at most _bucketSz elements.
144          */
145         virtual void nextBlock() = 0;
146         /// Return true iff more blocks are available
147         virtual bool hasMoreBlocks() const = 0;
148         /// Optionally output a verbose message
149         void verbose(const string& s) const {
150                 if(this->verbose()) {
151                         this->log() << s;
152                         this->log().flush();
153                 }
154         }
155
156         const TStr&      _text;        /// original string
157         const uint32_t   _bucketSz;    /// target maximum bucket size
158         const bool       _sanityCheck; /// whether to perform sanity checks
159         const bool       _passMemExc;  /// true -> pass on memory exceptions
160         const bool       _verbose;     /// be talkative
161         String<uint32_t> _itrBucket;   /// current bucket
162         uint32_t         _itrBucketPos;/// offset into current bucket
163         uint32_t         _itrPushedBackSuffix; /// temporary slot for lookahead
164         ostream&         _logger;      /// write log messages here
165 };
166
167 /**
168  * Abstract parent class for a blockwise suffix array builder that
169  * always doles out blocks in lexicographical order.
170  */
171 template<typename TStr>
172 class InorderBlockwiseSA : public BlockwiseSA<TStr> {
173 public:
174         InorderBlockwiseSA(const TStr& __text,
175                            uint32_t __bucketSz,
176                            bool __sanityCheck = false,
177                                bool __passMemExc = false,
178                            bool __verbose = false,
179                            ostream& __logger = cout) :
180         BlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger)
181         { }
182 };
183
184 /**
185  * Build the SA a block at a time according to the scheme outlined in
186  * Karkkainen's "Fast BWT" paper.
187  */
188 template<typename TStr>
189 class KarkkainenBlockwiseSA : public InorderBlockwiseSA<TStr> {
190 public:
191         typedef DifferenceCoverSample<TStr> TDC;
192
193         KarkkainenBlockwiseSA(const TStr& __text,
194                               uint32_t __bucketSz,
195                               uint32_t __dcV,
196                               uint32_t __seed = 0,
197                               bool __sanityCheck = false,
198                                   bool __passMemExc = false,
199                               bool __verbose = false,
200                               ostream& __logger = cout) :
201         InorderBlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger),
202         _sampleSuffs(), _cur(0), _dcV(__dcV), _dc(NULL), _built(false)
203         { _randomSrc.init(__seed); reset(); }
204
205         ~KarkkainenBlockwiseSA() {
206                 if(_dc != NULL) delete _dc; _dc = NULL; // difference cover sample
207         }
208
209         /**
210          * Allocate an amount of memory that simulates the peak memory
211          * usage of the DifferenceCoverSample with the given text and v.
212          * Throws bad_alloc if it's not going to fit in memory.  Returns
213          * the approximate number of bytes the Cover takes at all times.
214          */
215         static size_t simulateAllocs(const TStr& text, uint32_t bucketSz) {
216                 size_t len = length(text);
217                 // _sampleSuffs and _itrBucket are in memory at the peak
218                 size_t bsz = bucketSz;
219                 size_t sssz = len / max<uint32_t>(bucketSz-1, 1);
220                 AutoArray<uint32_t> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/));
221                 return bsz;
222         }
223
224         /// Defined in blockwise_sa.cpp
225         virtual void nextBlock();
226
227         /// Defined in blockwise_sa.cpp
228         virtual void qsort(String<uint32_t>& bucket);
229
230         /// Return true iff more blocks are available
231         virtual bool hasMoreBlocks() const {
232                 return _cur <= length(_sampleSuffs);
233         }
234
235         /// Return the difference-cover period
236         uint32_t dcV() const { return _dcV; }
237
238 protected:
239
240         /**
241          * Initialize the state of the blockwise suffix sort.  If the
242          * difference cover sample and the sample set have not yet been
243          * built, build them.  Then reset the block cursor to point to
244          * the first block.
245          */
246         virtual void reset() {
247                 if(!_built) {
248                         build();
249                 }
250                 assert(_built);
251                 _cur = 0;
252         }
253
254         /// Return true iff we're about to dole out the first bucket
255         virtual bool isReset() {
256                 return _cur == 0;
257         }
258
259 private:
260
261         /**
262          * Calculate the difference-cover sample and sample suffixes.
263          */
264         void build() {
265                 // Calculate difference-cover sample
266                 assert(_dc == NULL);
267                 if(_dcV != 0) {
268                         _dc = new TDC(this->text(), _dcV, this->verbose(), this->sanityCheck());
269                         _dc->build();
270                 }
271                 // Calculate sample suffixes
272                 if(this->bucketSz() <= length(this->text())) {
273                         VMSG_NL("Building samples");
274                         buildSamples();
275                 } else {
276                         VMSG_NL("Skipping building samples since text length " <<
277                                 length(this->text()) << " is less than bucket size: " <<
278                                 this->bucketSz());
279                 }
280                 _built = true;
281         }
282
283         /**
284          * Calculate the lcp between two suffixes using the difference
285          * cover as a tie-breaker.  If the tie-breaker is employed, then
286          * the calculated lcp may be an underestimate.
287          *
288          * Defined in blockwise_sa.cpp
289          */
290         inline bool tieBreakingLcp(uint32_t aOff,
291                                    uint32_t bOff,
292                                    uint32_t& lcp,
293                                    bool& lcpIsSoft);
294
295         /**
296          * Compare two suffixes using the difference-cover sample.
297          */
298         inline bool suffixCmp(uint32_t cmp,
299                               uint32_t i,
300                               int64_t& j,
301                               int64_t& k,
302                               bool& kSoft,
303                               const String<uint32_t>& z);
304
305         void buildSamples();
306
307         String<uint32_t> _sampleSuffs; /// sample suffixes
308         uint32_t         _cur;         /// offset to 1st elt of next block
309         const uint32_t   _dcV;         /// difference-cover periodicity
310         TDC*             _dc;          /// queryable difference-cover data
311         bool             _built;       /// whether samples/DC have been built
312         RandomSource     _randomSrc;   /// source of pseudo-randoms
313 };
314
315 /**
316  * Qsort the set of suffixes whose offsets are in 'bucket'.
317  */
318 template<typename TStr>
319 void KarkkainenBlockwiseSA<TStr>::qsort(String<uint32_t>& bucket) {
320         typedef typename Value<TStr>::Type TAlphabet;
321         const TStr& t = this->text();
322         uint32_t *s = begin(bucket);
323         uint32_t slen = seqan::length(bucket);
324         uint32_t len = seqan::length(t);
325         if(_dc != NULL) {
326                 // Use the difference cover as a tie-breaker if we have it
327                 VMSG_NL("  (Using difference cover)");
328                 // Extract the 'host' array because it's faster to work
329                 // with than the String<> container
330                 uint8_t *host = (uint8_t*)t.data_begin;
331                 mkeyQSortSufDcU8(t, host, len, s, slen, *_dc,
332                                  ValueSize<TAlphabet>::VALUE,
333                                  this->verbose(), this->sanityCheck());
334         } else {
335                 VMSG_NL("  (Not using difference cover)");
336                 // We don't have a difference cover - just do a normal
337                 // suffix sort
338                 mkeyQSortSuf(t, s, slen, ValueSize<TAlphabet>::VALUE,
339                              this->verbose(), this->sanityCheck());
340         }
341 }
342
343 /**
344  * Qsort the set of suffixes whose offsets are in 'bucket'.  This
345  * specialization for packed strings does not attempt to extract and
346  * operate directly on the host string; the fact that the string is
347  * packed means that the array cannot be sorted directly.
348  */
349 template<>
350 void KarkkainenBlockwiseSA<String<Dna, Packed<> > >::qsort(String<uint32_t>& bucket) {
351         const String<Dna, Packed<> >& t = this->text();
352         uint32_t *s = begin(bucket);
353         uint32_t slen = seqan::length(bucket);
354         uint32_t len = seqan::length(t);
355         if(_dc != NULL) {
356                 // Use the difference cover as a tie-breaker if we have it
357                 VMSG_NL("  (Using difference cover)");
358                 // Can't use the text's 'host' array because the backing
359                 // store for the packed string is not one-char-per-elt.
360                 mkeyQSortSufDcU8(t, t, len, s, slen, *_dc,
361                                  ValueSize<Dna>::VALUE,
362                                  this->verbose(), this->sanityCheck());
363         } else {
364                 VMSG_NL("  (Not using difference cover)");
365                 // We don't have a difference cover - just do a normal
366                 // suffix sort
367                 mkeyQSortSuf(t, s, slen, ValueSize<Dna>::VALUE,
368                              this->verbose(), this->sanityCheck());
369         }
370 }
371
372 /**
373  * Select a set of bucket-delineating sample suffixes such that no
374  * bucket is greater than the requested upper limit.  Some care is
375  * taken to make each bucket's size close to the limit without
376  * going over.
377  */
378 template<typename TStr>
379 void KarkkainenBlockwiseSA<TStr>::buildSamples() {
380         typedef typename Value<TStr>::Type TAlphabet;
381         const TStr& t = this->text();
382         uint32_t bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
383         uint32_t len = length(this->text());
384         // Prepare _sampleSuffs array
385         clear(_sampleSuffs);
386         uint32_t numSamples = ((len/bsz)+1)<<1; // ~len/bsz x 2
387         assert_gt(numSamples, 0);
388         VMSG_NL("Reserving space for " << numSamples << " sample suffixes");
389         if(this->_passMemExc) {
390                 reserve(_sampleSuffs, numSamples, Exact());
391                 // Randomly generate samples.  Allow duplicates for now.
392                 VMSG_NL("Generating random suffixes");
393                 for(size_t i = 0; i < numSamples; i++) {
394                         appendValue(_sampleSuffs, _randomSrc.nextU32() % len);
395                 }
396         } else {
397                 try {
398                         reserve(_sampleSuffs, numSamples, Exact());
399                         // Randomly generate samples.  Allow duplicates for now.
400                         VMSG_NL("Generating random suffixes");
401                         for(size_t i = 0; i < numSamples; i++) {
402                                 appendValue(_sampleSuffs, _randomSrc.nextU32() % len);
403                         }
404                 } catch(bad_alloc &e) {
405                         if(this->_passMemExc) {
406                                 throw e; // rethrow immediately
407                         } else {
408                                 cerr << "Could not allocate sample suffix container of " << (numSamples * 4) << " bytes." << endl
409                                      << "Please try using a smaller number of blocks by specifying a larger --bmax or" << endl
410                                      << "a smaller --bmaxdivn" << endl;
411                                 throw 1;
412                         }
413                 }
414         }
415         // Remove duplicates; very important to do this before the call to
416         // mkeyQSortSuf so that it doesn't try to calculate lexicographical
417         // relationships between very long, identical strings, which takes
418         // an extremely long time in general, and causes the stack to grow
419         // linearly with the size of the input
420         {
421                 Timer timer(cout, "QSorting sample offsets, eliminating duplicates time: ", this->verbose());
422                 VMSG_NL("QSorting " << length(_sampleSuffs) << " sample offsets, eliminating duplicates");
423                 sort(begin(_sampleSuffs), end(_sampleSuffs));
424                 size_t sslen = length(_sampleSuffs);
425                 for(size_t i = 0; i < sslen-1; i++) {
426                         if(_sampleSuffs[i] == _sampleSuffs[i+1]) {
427                                 erase(_sampleSuffs, i--);
428                                 sslen--;
429                         }
430                 }
431         }
432         // Multikey quicksort the samples
433         {
434                 Timer timer(cout, "  Multikey QSorting samples time: ", this->verbose());
435                 VMSG_NL("Multikey QSorting " << length(_sampleSuffs) << " samples");
436                 this->qsort(_sampleSuffs);
437         }
438         // Calculate bucket sizes
439         VMSG_NL("Calculating bucket sizes");
440         int limit = 5;
441         // Iterate until all buckets are less than
442         while(--limit >= 0) {
443                 // Calculate bucket sizes by doing a binary search for each
444                 // suffix and noting where it lands
445                 uint32_t numBuckets = length(_sampleSuffs)+1;
446                 String<uint32_t> bucketSzs; // holds computed bucket sizes
447                 String<uint32_t> bucketReps; // holds 1 member of each bucket (for splitting)
448                 try {
449                         // Allocate and initialize containers for holding bucket
450                         // sizes and representatives.
451                         fill(bucketSzs, numBuckets, 0, Exact());
452                         fill(bucketReps, numBuckets, 0xffffffff, Exact());
453                 } catch(bad_alloc &e) {
454                         if(this->_passMemExc) {
455                                 throw e; // rethrow immediately
456                         } else {
457                                 cerr << "Could not allocate sizes, representatives (" << ((numBuckets*8)>>10) << " KB) for blocks." << endl
458                                      << "Please try using a smaller number of blocks by specifying a larger --bmax or a" << endl
459                                      << "smaller --bmaxdivn." << endl;
460                                 throw 1;
461                         }
462                 }
463                 // Iterate through every suffix in the text, determine which
464                 // bucket it falls into by doing a binary search across the
465                 // sorted list of samples, and increment a counter associated
466                 // with that bucket.  Also, keep one representative for each
467                 // bucket so that we can split it later.  We loop in ten
468                 // stretches so that we can print out a helpful progress
469                 // message.  (This step can take a long time.)
470                 {
471                         VMSG_NL("  Binary sorting into buckets");
472                         Timer timer(cout, "  Binary sorting into buckets time: ", this->verbose());
473                         uint32_t lenDiv10 = (len + 9) / 10;
474                         for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
475                                 uint32_t itenNext = iten + lenDiv10;
476                                 if(ten > 0) VMSG_NL("  " << (ten * 10) << "%");
477                                 for(uint32_t i = iten; i < itenNext && i < len; i++) {
478                                         uint32_t r = binarySASearch(t, i, _sampleSuffs);
479                                         if(r == 0xffffffff) continue; // r was one of the samples
480                                         assert_lt(r, numBuckets);
481                                         bucketSzs[r]++;
482                                         assert_lt(bucketSzs[r], len);
483                                         if(bucketReps[r] == 0xffffffff ||
484                                            (_randomSrc.nextU32() & 100) == 0)
485                                         {
486                                                 bucketReps[r] = i; // clobbers previous one, but that's OK
487                                         }
488                                 }
489                         }
490                         VMSG_NL("  100%");
491                 }
492                 // Check for large buckets and mergeable pairs of small buckets
493                 // and split/merge as necessary
494                 int added = 0;
495                 int merged = 0;
496                 assert_eq(length(bucketSzs), numBuckets);
497                 assert_eq(length(bucketReps), numBuckets);
498                 {
499                         Timer timer(cout, "  Splitting and merging time: ", this->verbose());
500                         VMSG_NL("Splitting and merging");
501                         for(int64_t i = 0; i < numBuckets; i++) {
502                                 uint32_t mergedSz = bsz + 1;
503                                 assert(bucketSzs[i] == 0 || bucketReps[i] != 0xffffffff);
504                                 if(i < (int64_t)numBuckets-1) {
505                                         mergedSz = bucketSzs[i] + bucketSzs[i+1] + 1;
506                                 }
507                                 // Merge?
508                                 if(mergedSz <= bsz) {
509                                         bucketSzs[i+1] += (bucketSzs[i]+1);
510                                         // The following may look strange, but it's necessary
511                                         // to ensure that the merged bucket has a representative
512                                         bucketReps[i+1] = _sampleSuffs[i+added];
513                                         erase(_sampleSuffs, i+added);
514                                         erase(bucketSzs, i);
515                                         erase(bucketReps, i);
516                                         i--; // might go to -1 but ++ will overflow back to 0
517                                         numBuckets--;
518                                         merged++;
519                                         assert_eq(numBuckets, length(_sampleSuffs)+1-added);
520                                         assert_eq(numBuckets, length(bucketSzs));
521                                 }
522                                 // Split?
523                                 else if(bucketSzs[i] > bsz) {
524                                         // Add an additional sample from the bucketReps[]
525                                         // set accumulated in the binarySASearch loop; this
526                                         // effectively splits the bucket
527                                         insertValue(_sampleSuffs, i + (added++), bucketReps[i]);
528                                 }
529                         }
530                 }
531                 if(added == 0) {
532                         //if(this->verbose()) {
533                         //      cout << "Final bucket sizes:" << endl;
534                         //      cout << "  (begin): " << bucketSzs[0] << " (" << (int)(bsz - bucketSzs[0]) << ")" << endl;
535                         //      for(uint32_t i = 1; i < numBuckets; i++) {
536                         //              cout << "  " << bucketSzs[i] << " (" << (int)(bsz - bucketSzs[i]) << ")" << endl;
537                         //      }
538                         //}
539                         break;
540                 }
541                 // Otherwise, continue until no more buckets need to be
542                 // split
543                 VMSG_NL("Split " << added << ", merged " << merged << "; iterating...");
544         }
545         // Do *not* force a do-over
546 //      if(limit == 0) {
547 //              VMSG_NL("Iterated too many times; trying again...");
548 //              buildSamples();
549 //      }
550         VMSG_NL("Avg bucket size: " << ((float)(len-length(_sampleSuffs)) / (length(_sampleSuffs)+1)) << " (target: " << bsz << ")");
551 }
552
553 /**
554  * Do a simple LCP calculation on two strings.
555  */
556 template<typename T> inline
557 static uint32_t suffixLcp(const T& t, uint32_t aOff, uint32_t bOff) {
558         uint32_t c = 0;
559         size_t len = length(t);
560         assert_leq(aOff, len);
561         assert_leq(bOff, len);
562         while(aOff + c < len && bOff + c < len && t[aOff + c] == t[bOff + c]) c++;
563         return c;
564 }
565
566 /**
567  * Calculate the lcp between two suffixes using the difference
568  * cover as a tie-breaker.  If the tie-breaker is employed, then
569  * the calculated lcp may be an underestimate.  If the tie-breaker is
570  * employed, lcpIsSoft will be set to true (otherwise, false).
571  */
572 template<typename TStr> inline
573 bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(uint32_t aOff,
574                                                  uint32_t bOff,
575                                                  uint32_t& lcp,
576                                                  bool& lcpIsSoft)
577 {
578         const TStr& t = this->text();
579         uint32_t c = 0;
580         uint32_t tlen = length(t);
581         assert_leq(aOff, tlen);
582         assert_leq(bOff, tlen);
583         assert(_dc != NULL);
584         uint32_t dcDist = _dc->tieBreakOff(aOff, bOff);
585         lcpIsSoft = false; // hard until proven soft
586         while(c < dcDist &&    // we haven't hit the tie breaker
587               c < tlen-aOff && // we haven't fallen off of LHS suffix
588               c < tlen-bOff && // we haven't fallen off of RHS suffix
589               t[aOff+c] == t[bOff+c]) // we haven't hit a mismatch
590                 c++;
591         lcp = c;
592         if(c == tlen-aOff) {
593                 // Fell off LHS (a), a is greater
594                 return false;
595         } else if(c == tlen-bOff) {
596                 // Fell off RHS (b), b is greater
597                 return true;
598         } else if(c == dcDist) {
599                 // Hit a tie-breaker element
600                 lcpIsSoft = true;
601                 assert_neq(dcDist, 0xffffffff);
602                 return _dc->breakTie(aOff+c, bOff+c) < 0;
603         } else {
604                 assert_neq(t[aOff+c], t[bOff+c]);
605                 return t[aOff+c] < t[bOff+c];
606         }
607 }
608
609 /**
610  * Lookup a suffix LCP in the given z array; if the element is not
611  * filled in then calculate it from scratch.
612  */
613 template<typename T>
614 static uint32_t lookupSuffixZ(const T& t,
615                               uint32_t zOff,
616                               uint32_t off,
617                               const String<uint32_t>& z)
618 {
619         if(zOff < length(z)) {
620                 uint32_t ret = z[zOff];
621                 assert_eq(ret, suffixLcp(t, off + zOff, off));
622                 return ret;
623         }
624         assert_leq(off + zOff, length(t));
625         return suffixLcp(t, off + zOff, off);
626 }
627
628 /**
629  * true -> i < cmp
630  * false -> i > cmp
631  */
632 template<typename TStr> inline
633 bool KarkkainenBlockwiseSA<TStr>::suffixCmp(uint32_t cmp,
634                                             uint32_t i,
635                                             int64_t& j,
636                                             int64_t& k,
637                                             bool& kSoft,
638                                             const String<uint32_t>& z)
639 {
640         const TStr& t = this->text();
641         uint32_t len = length(t);
642         // i is not covered by any previous match
643         uint32_t l;
644         if(i > k) {
645                 k = i; // so that i + lHi == kHi
646                 l = 0; // erase any previous l
647                 kSoft = false;
648                 // To be extended
649         }
650         // i is covered by a previous match
651         else /* i <= k */ {
652                 assert_gt((int64_t)i, j);
653                 uint32_t zIdx = i-j;
654                 assert_leq(zIdx, len-cmp);
655                 if(zIdx < _dcV || _dc == NULL) {
656                         // Go as far as the Z-box says
657                         l = lookupSuffixZ(t, zIdx, cmp, z);
658                         if(i + l > len) {
659                                 l = len-i;
660                         }
661                         assert_leq(i + l, len);
662                         // Possibly to be extended
663                 } else {
664                         // But we're past the point of no-more-Z-boxes
665                         bool ret = tieBreakingLcp(i, cmp, l, kSoft);
666                         // Sanity-check tie-breaker
667                         if(this->sanityCheck()) {
668                                 if(ret) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
669                                 else    assert(dollarGt(suffix(t, i), suffix(t, cmp)));
670                         }
671                         j = i;
672                         k = i + l;
673                         if(this->sanityCheck()) {
674                                 if(kSoft) { assert_leq(l, suffixLcp(t, i, cmp)); }
675                                 else      { assert_eq (l, suffixLcp(t, i, cmp)); }
676                         }
677                         return ret;
678                 }
679         }
680
681         // Z box extends exactly as far as previous match (or there
682         // is neither a Z box nor a previous match)
683         if(i + l == k) {
684                 // Extend
685                 while(l < len-cmp && k < len && t[cmp+l] == t[k]) {
686                         k++; l++;
687                 }
688                 j = i; // update furthest-extending LHS
689                 kSoft = false;
690                 assert_eq(l, suffixLcp(t, i, cmp));
691         }
692         // Z box extends further than previous match
693         else if(i + l > k) {
694                 l = k - i; // point to just after previous match
695                 j = i; // update furthest-extending LHS
696                 if(kSoft) {
697                         while(l < len-cmp && k < len && t[cmp+l] == t[k]) {
698                                 k++; l++;
699                         }
700                         kSoft = false;
701                         assert_eq(l, suffixLcp(t, i, cmp));
702                 } else assert_eq(l, suffixLcp(t, i, cmp));
703         }
704
705         // Check that calculated lcp matches actual lcp
706         if(this->sanityCheck()) {
707                 if(!kSoft) {
708                         // l should exactly match lcp
709                         assert_eq(l, suffixLcp(t, i, cmp));
710                 } else {
711                         // l is an underestimate of LCP
712                         assert_leq(l, suffixLcp(t, i, cmp));
713                 }
714         }
715         assert_leq(l+i, len);
716         assert_leq(l, len-cmp);
717
718         // i and cmp should not be the same suffix
719         assert(l != len-cmp || i+l != len);
720
721         // Now we're ready to do a comparison on the next char
722         if(l+i != len && (
723            l == len-cmp || // departure from paper algorithm:
724                            // falling off pattern implies
725                            // pattern is *greater* in our case
726            t[i + l] < t[cmp + l]))
727         {
728                 // Case 2: Text suffix is less than upper sample suffix
729                 if(this->sanityCheck()) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
730                 return true; // suffix at i is less than suffix at cmp
731         }
732         else {
733                 // Case 3: Text suffix is greater than upper sample suffix
734                 if(this->sanityCheck()) assert(dollarGt(suffix(t, i), suffix(t, cmp)));
735                 return false; // suffix at i is less than suffix at cmp
736         }
737 }
738
739 /**
740  * Retrieve the next block.  This is the most performance-critical part
741  * of the blockwise suffix sorting process.
742  */
743 template<typename TStr>
744 void KarkkainenBlockwiseSA<TStr>::nextBlock() {
745         typedef typename Value<TStr>::Type TAlphabet;
746         String<uint32_t>& bucket = this->_itrBucket;
747         VMSG_NL("Getting block " << (_cur+1) << " of " << length(_sampleSuffs)+1);
748         assert(_built);
749         assert_gt(_dcV, 3);
750         assert_leq(_cur, length(_sampleSuffs));
751         const TStr& t = this->text();
752         uint32_t len = length(t);
753         // Set up the bucket
754         clear(bucket);
755         uint32_t lo = 0xffffffff, hi = 0xffffffff;
756         if(length(_sampleSuffs) == 0) {
757                 // Special case: if _sampleSuffs is 0, then multikey-quicksort
758                 // everything
759                 VMSG_NL("  No samples; assembling all-inclusive block");
760                 assert_eq(0, _cur);
761                 try {
762                         if(capacity(bucket) < this->bucketSz()) {
763                                 reserve(bucket, len+1, Exact());
764                         }
765                         for(uint32_t i = 0; i < len; i++) append(bucket, i);
766                 } catch(bad_alloc &e) {
767                         if(this->_passMemExc) {
768                                 throw e; // rethrow immediately
769                         } else {
770                                 cerr << "Could not allocate a master suffix-array block of " << ((len+1) * 4) << " bytes" << endl
771                                      << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
772                                      << "a larger --bmaxdivn" << endl;
773                                 throw 1;
774                         }
775                 }
776         } else {
777                 try {
778                         VMSG_NL("  Reserving size (" << this->bucketSz() << ") for bucket");
779                         // BTL: Add a +100 fudge factor; there seem to be instances
780                         // where a bucket ends up having one more elt than bucketSz()
781                         if(capacity(bucket) < this->bucketSz()+100) {
782                                 reserve(bucket, this->bucketSz()+100, Exact());
783                         }
784                 } catch(bad_alloc &e) {
785                         if(this->_passMemExc) {
786                                 throw e; // rethrow immediately
787                         } else {
788                                 cerr << "Could not allocate a suffix-array block of " << ((this->bucketSz()+1) * 4) << " bytes" << endl;
789                                 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
790                                      << "a larger --bmaxdivn" << endl;
791                                 throw 1;
792                         }
793                 }
794                 // Select upper and lower bounds from _sampleSuffs[] and
795                 // calculate the Z array up to the difference-cover periodicity
796                 // for both.  Be careful about first/last buckets.
797                 String<uint32_t> zLo, zHi;
798                 assert_geq(_cur, 0);
799                 assert_leq(_cur, length(_sampleSuffs));
800                 bool first = (_cur == 0);
801                 bool last  = (_cur == length(_sampleSuffs));
802                 try {
803                         Timer timer(cout, "  Calculating Z arrays time: ", this->verbose());
804                         VMSG_NL("  Calculating Z arrays");
805                         if(!last) {
806                                 // Not the last bucket
807                                 assert_lt(_cur, length(_sampleSuffs));
808                                 hi = _sampleSuffs[_cur];
809                                 fill(zHi, _dcV, 0, Exact());
810                                 assert_eq(zHi[0], 0);
811                                 calcZ(t, hi, zHi, this->verbose(), this->sanityCheck());
812                         }
813                         if(!first) {
814                                 // Not the first bucket
815                                 assert_gt(_cur, 0);
816                                 assert_leq(_cur, length(_sampleSuffs));
817                                 lo = _sampleSuffs[_cur-1];
818                                 fill(zLo, _dcV, 0, Exact());
819                                 assert_gt(_dcV, 3);
820                                 assert_eq(zLo[0], 0);
821                                 calcZ(t, lo, zLo, this->verbose(), this->sanityCheck());
822                         }
823                 } catch(bad_alloc &e) {
824                         if(this->_passMemExc) {
825                                 throw e; // rethrow immediately
826                         } else {
827                                 cerr << "Could not allocate a z-array of " << (_dcV * 4) << " bytes" << endl;
828                                 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
829                                      << "a larger --bmaxdivn" << endl;
830                                 throw 1;
831                         }
832                 }
833
834                 // This is the most critical loop in the algorithm; this is where
835                 // we iterate over all suffixes in the text and pick out those that
836                 // fall into the current bucket.
837                 //
838                 // This loop is based on the SMALLERSUFFIXES function outlined on
839                 // p7 of the "Fast BWT" paper
840                 //
841                 int64_t kHi = -1, kLo = -1;
842                 int64_t jHi = -1, jLo = -1;
843                 bool kHiSoft = false, kLoSoft = false;
844                 assert_eq(0, length(bucket));
845                 {
846                         Timer timer(cout, "  Block accumulator loop time: ", this->verbose());
847                         VMSG_NL("  Entering block accumulator loop:");
848                         uint32_t lenDiv10 = (len + 9) / 10;
849                         for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
850                         uint32_t itenNext = iten + lenDiv10;
851                         if(ten > 0) VMSG_NL("  " << (ten * 10) << "%");
852                         for(uint32_t i = iten; i < itenNext && i < len; i++) {
853                                 assert_lt(jLo, i); assert_lt(jHi, i);
854                                 // Advance the upper-bound comparison by one character
855                                 if(i == hi || i == lo) continue; // equal to one of the bookends
856                                 if(hi != 0xffffffff && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
857                                         continue; // not in the bucket
858                                 }
859                                 if(lo != 0xffffffff && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
860                                         continue; // not in the bucket
861                                 }
862                                 // In the bucket! - add it
863                                 assert_lt(i, len);
864                                 try {
865                                         appendValue(bucket, i);
866                                 } catch(bad_alloc &e) {
867                                         if(this->_passMemExc) {
868                                                 throw e; // rethrow immediately
869                                         } else {
870                                                 cerr << "Could not append element to block of " << ((length(bucket)) * 4) << " bytes" << endl;
871                                                 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
872                                                      << "a larger --bmaxdivn" << endl;
873                                                 throw 1;
874                                         }
875                                 }
876                                 // Not necessarily true; we allow overflowing buckets
877                                 // since we can't guarantee that a good set of sample
878                                 // suffixes can be found in a reasonable amount of time
879                                 //assert_lt(length(bucket), this->bucketSz());
880                         }
881                         } // end loop over all suffixes of t
882                         VMSG_NL("  100%");
883                 }
884         } // end else clause of if(length(_sampleSuffs) == 0)
885         // Sort the bucket
886         if(length(bucket) > 0) {
887                 Timer timer(cout, "  Sorting block time: ", this->verbose());
888                 VMSG_NL("  Sorting block of length " << length(bucket));
889                 this->qsort(bucket);
890         }
891         if(hi != 0xffffffff) {
892                 // Not the final bucket; throw in the sample on the RHS
893                 appendValue(bucket, hi);
894         } else {
895                 // Final bucket; throw in $ suffix
896                 appendValue(bucket, len);
897         }
898         VMSG_NL("Returning block of " << length(bucket));
899         _cur++; // advance to next bucket
900 }
901
902 #endif /*BLOCKWISE_SA_H_*/