1 #ifndef BLOCKWISE_SA_H_
2 #define BLOCKWISE_SA_H_
10 #include <seqan/sequence.h>
11 #include <seqan/index.h>
12 #include "assert_helpers.h"
13 #include "diff_sample.h"
14 #include "multikey_qsort.h"
15 #include "random_source.h"
16 #include "binary_sa_search.h"
20 #include "auto_array.h"
23 using namespace seqan;
25 // Helpers for printing verbose messages
28 #define VMSG_NL(args...) \
29 if(this->verbose()) { \
31 tmp << args << endl; \
32 this->verbose(tmp.str()); \
37 #define VMSG(args...) \
38 if(this->verbose()) { \
41 this->verbose(tmp.str()); \
46 * Abstract parent class for blockwise suffix-array building schemes.
48 template<typename TStr>
51 BlockwiseSA(const TStr& __text,
53 bool __sanityCheck = false,
54 bool __passMemExc = false,
55 bool __verbose = false,
56 ostream& __logger = cout) :
58 _bucketSz(max<uint32_t>(__bucketSz, 2u)),
59 _sanityCheck(__sanityCheck),
60 _passMemExc(__passMemExc),
63 _itrBucketPos(0xffffffff),
64 _itrPushedBackSuffix(0xffffffff),
68 virtual ~BlockwiseSA() { }
71 * Get the next suffix; compute the next bucket if necessary.
73 uint32_t nextSuffix() {
74 if(_itrPushedBackSuffix != 0xffffffff) {
75 uint32_t tmp = _itrPushedBackSuffix;
76 _itrPushedBackSuffix = 0xffffffff;
79 while(_itrBucketPos >= length(_itrBucket) ||
80 length(_itrBucket) == 0)
82 if(!hasMoreBlocks()) {
83 throw out_of_range("No more suffixes");
88 return _itrBucket[_itrBucketPos++];
92 * Return true iff the next call to nextSuffix will succeed.
94 bool hasMoreSuffixes() {
95 if(_itrPushedBackSuffix != 0xffffffff) return true;
97 _itrPushedBackSuffix = nextSuffix();
98 } catch(out_of_range& e) {
99 assert_eq(0xffffffff, _itrPushedBackSuffix);
106 * Reset the suffix iterator so that the next call to nextSuffix()
107 * returns the lexicographically-first suffix.
109 void resetSuffixItr() {
111 _itrBucketPos = 0xffffffff;
112 _itrPushedBackSuffix = 0xffffffff;
114 assert(suffixItrIsReset());
118 * Returns true iff the next call to nextSuffix() returns the
119 * lexicographically-first suffix.
121 bool suffixItrIsReset() {
122 return length(_itrBucket) == 0 &&
123 _itrBucketPos == 0xffffffff &&
124 _itrPushedBackSuffix == 0xffffffff &&
128 const TStr& text() const { return _text; }
129 uint32_t bucketSz() const { return _bucketSz; }
130 bool sanityCheck() const { return _sanityCheck; }
131 bool verbose() const { return _verbose; }
132 ostream& log() const { return _logger; }
133 uint32_t size() const { return length(_text)+1; }
136 /// Reset back to the first block
137 virtual void reset() = 0;
138 /// Return true iff reset to the first block
139 virtual bool isReset() = 0;
142 * Grab the next block of sorted suffixes. The block is guaranteed
143 * to have at most _bucketSz elements.
145 virtual void nextBlock() = 0;
146 /// Return true iff more blocks are available
147 virtual bool hasMoreBlocks() const = 0;
148 /// Optionally output a verbose message
149 void verbose(const string& s) const {
150 if(this->verbose()) {
156 const TStr& _text; /// original string
157 const uint32_t _bucketSz; /// target maximum bucket size
158 const bool _sanityCheck; /// whether to perform sanity checks
159 const bool _passMemExc; /// true -> pass on memory exceptions
160 const bool _verbose; /// be talkative
161 String<uint32_t> _itrBucket; /// current bucket
162 uint32_t _itrBucketPos;/// offset into current bucket
163 uint32_t _itrPushedBackSuffix; /// temporary slot for lookahead
164 ostream& _logger; /// write log messages here
168 * Abstract parent class for a blockwise suffix array builder that
169 * always doles out blocks in lexicographical order.
171 template<typename TStr>
172 class InorderBlockwiseSA : public BlockwiseSA<TStr> {
174 InorderBlockwiseSA(const TStr& __text,
176 bool __sanityCheck = false,
177 bool __passMemExc = false,
178 bool __verbose = false,
179 ostream& __logger = cout) :
180 BlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger)
185 * Build the SA a block at a time according to the scheme outlined in
186 * Karkkainen's "Fast BWT" paper.
188 template<typename TStr>
189 class KarkkainenBlockwiseSA : public InorderBlockwiseSA<TStr> {
191 typedef DifferenceCoverSample<TStr> TDC;
193 KarkkainenBlockwiseSA(const TStr& __text,
197 bool __sanityCheck = false,
198 bool __passMemExc = false,
199 bool __verbose = false,
200 ostream& __logger = cout) :
201 InorderBlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger),
202 _sampleSuffs(), _cur(0), _dcV(__dcV), _dc(NULL), _built(false)
203 { _randomSrc.init(__seed); reset(); }
205 ~KarkkainenBlockwiseSA() {
206 if(_dc != NULL) delete _dc; _dc = NULL; // difference cover sample
210 * Allocate an amount of memory that simulates the peak memory
211 * usage of the DifferenceCoverSample with the given text and v.
212 * Throws bad_alloc if it's not going to fit in memory. Returns
213 * the approximate number of bytes the Cover takes at all times.
215 static size_t simulateAllocs(const TStr& text, uint32_t bucketSz) {
216 size_t len = length(text);
217 // _sampleSuffs and _itrBucket are in memory at the peak
218 size_t bsz = bucketSz;
219 size_t sssz = len / max<uint32_t>(bucketSz-1, 1);
220 AutoArray<uint32_t> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/));
224 /// Defined in blockwise_sa.cpp
225 virtual void nextBlock();
227 /// Defined in blockwise_sa.cpp
228 virtual void qsort(String<uint32_t>& bucket);
230 /// Return true iff more blocks are available
231 virtual bool hasMoreBlocks() const {
232 return _cur <= length(_sampleSuffs);
235 /// Return the difference-cover period
236 uint32_t dcV() const { return _dcV; }
241 * Initialize the state of the blockwise suffix sort. If the
242 * difference cover sample and the sample set have not yet been
243 * built, build them. Then reset the block cursor to point to
246 virtual void reset() {
254 /// Return true iff we're about to dole out the first bucket
255 virtual bool isReset() {
262 * Calculate the difference-cover sample and sample suffixes.
265 // Calculate difference-cover sample
268 _dc = new TDC(this->text(), _dcV, this->verbose(), this->sanityCheck());
271 // Calculate sample suffixes
272 if(this->bucketSz() <= length(this->text())) {
273 VMSG_NL("Building samples");
276 VMSG_NL("Skipping building samples since text length " <<
277 length(this->text()) << " is less than bucket size: " <<
284 * Calculate the lcp between two suffixes using the difference
285 * cover as a tie-breaker. If the tie-breaker is employed, then
286 * the calculated lcp may be an underestimate.
288 * Defined in blockwise_sa.cpp
290 inline bool tieBreakingLcp(uint32_t aOff,
296 * Compare two suffixes using the difference-cover sample.
298 inline bool suffixCmp(uint32_t cmp,
303 const String<uint32_t>& z);
307 String<uint32_t> _sampleSuffs; /// sample suffixes
308 uint32_t _cur; /// offset to 1st elt of next block
309 const uint32_t _dcV; /// difference-cover periodicity
310 TDC* _dc; /// queryable difference-cover data
311 bool _built; /// whether samples/DC have been built
312 RandomSource _randomSrc; /// source of pseudo-randoms
316 * Qsort the set of suffixes whose offsets are in 'bucket'.
318 template<typename TStr>
319 void KarkkainenBlockwiseSA<TStr>::qsort(String<uint32_t>& bucket) {
320 typedef typename Value<TStr>::Type TAlphabet;
321 const TStr& t = this->text();
322 uint32_t *s = begin(bucket);
323 uint32_t slen = seqan::length(bucket);
324 uint32_t len = seqan::length(t);
326 // Use the difference cover as a tie-breaker if we have it
327 VMSG_NL(" (Using difference cover)");
328 // Extract the 'host' array because it's faster to work
329 // with than the String<> container
330 uint8_t *host = (uint8_t*)t.data_begin;
331 mkeyQSortSufDcU8(t, host, len, s, slen, *_dc,
332 ValueSize<TAlphabet>::VALUE,
333 this->verbose(), this->sanityCheck());
335 VMSG_NL(" (Not using difference cover)");
336 // We don't have a difference cover - just do a normal
338 mkeyQSortSuf(t, s, slen, ValueSize<TAlphabet>::VALUE,
339 this->verbose(), this->sanityCheck());
344 * Qsort the set of suffixes whose offsets are in 'bucket'. This
345 * specialization for packed strings does not attempt to extract and
346 * operate directly on the host string; the fact that the string is
347 * packed means that the array cannot be sorted directly.
350 void KarkkainenBlockwiseSA<String<Dna, Packed<> > >::qsort(String<uint32_t>& bucket) {
351 const String<Dna, Packed<> >& t = this->text();
352 uint32_t *s = begin(bucket);
353 uint32_t slen = seqan::length(bucket);
354 uint32_t len = seqan::length(t);
356 // Use the difference cover as a tie-breaker if we have it
357 VMSG_NL(" (Using difference cover)");
358 // Can't use the text's 'host' array because the backing
359 // store for the packed string is not one-char-per-elt.
360 mkeyQSortSufDcU8(t, t, len, s, slen, *_dc,
361 ValueSize<Dna>::VALUE,
362 this->verbose(), this->sanityCheck());
364 VMSG_NL(" (Not using difference cover)");
365 // We don't have a difference cover - just do a normal
367 mkeyQSortSuf(t, s, slen, ValueSize<Dna>::VALUE,
368 this->verbose(), this->sanityCheck());
373 * Select a set of bucket-delineating sample suffixes such that no
374 * bucket is greater than the requested upper limit. Some care is
375 * taken to make each bucket's size close to the limit without
378 template<typename TStr>
379 void KarkkainenBlockwiseSA<TStr>::buildSamples() {
380 typedef typename Value<TStr>::Type TAlphabet;
381 const TStr& t = this->text();
382 uint32_t bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
383 uint32_t len = length(this->text());
384 // Prepare _sampleSuffs array
386 uint32_t numSamples = ((len/bsz)+1)<<1; // ~len/bsz x 2
387 assert_gt(numSamples, 0);
388 VMSG_NL("Reserving space for " << numSamples << " sample suffixes");
389 if(this->_passMemExc) {
390 reserve(_sampleSuffs, numSamples, Exact());
391 // Randomly generate samples. Allow duplicates for now.
392 VMSG_NL("Generating random suffixes");
393 for(size_t i = 0; i < numSamples; i++) {
394 appendValue(_sampleSuffs, _randomSrc.nextU32() % len);
398 reserve(_sampleSuffs, numSamples, Exact());
399 // Randomly generate samples. Allow duplicates for now.
400 VMSG_NL("Generating random suffixes");
401 for(size_t i = 0; i < numSamples; i++) {
402 appendValue(_sampleSuffs, _randomSrc.nextU32() % len);
404 } catch(bad_alloc &e) {
405 if(this->_passMemExc) {
406 throw e; // rethrow immediately
408 cerr << "Could not allocate sample suffix container of " << (numSamples * 4) << " bytes." << endl
409 << "Please try using a smaller number of blocks by specifying a larger --bmax or" << endl
410 << "a smaller --bmaxdivn" << endl;
415 // Remove duplicates; very important to do this before the call to
416 // mkeyQSortSuf so that it doesn't try to calculate lexicographical
417 // relationships between very long, identical strings, which takes
418 // an extremely long time in general, and causes the stack to grow
419 // linearly with the size of the input
421 Timer timer(cout, "QSorting sample offsets, eliminating duplicates time: ", this->verbose());
422 VMSG_NL("QSorting " << length(_sampleSuffs) << " sample offsets, eliminating duplicates");
423 sort(begin(_sampleSuffs), end(_sampleSuffs));
424 size_t sslen = length(_sampleSuffs);
425 for(size_t i = 0; i < sslen-1; i++) {
426 if(_sampleSuffs[i] == _sampleSuffs[i+1]) {
427 erase(_sampleSuffs, i--);
432 // Multikey quicksort the samples
434 Timer timer(cout, " Multikey QSorting samples time: ", this->verbose());
435 VMSG_NL("Multikey QSorting " << length(_sampleSuffs) << " samples");
436 this->qsort(_sampleSuffs);
438 // Calculate bucket sizes
439 VMSG_NL("Calculating bucket sizes");
441 // Iterate until all buckets are less than
442 while(--limit >= 0) {
443 // Calculate bucket sizes by doing a binary search for each
444 // suffix and noting where it lands
445 uint32_t numBuckets = length(_sampleSuffs)+1;
446 String<uint32_t> bucketSzs; // holds computed bucket sizes
447 String<uint32_t> bucketReps; // holds 1 member of each bucket (for splitting)
449 // Allocate and initialize containers for holding bucket
450 // sizes and representatives.
451 fill(bucketSzs, numBuckets, 0, Exact());
452 fill(bucketReps, numBuckets, 0xffffffff, Exact());
453 } catch(bad_alloc &e) {
454 if(this->_passMemExc) {
455 throw e; // rethrow immediately
457 cerr << "Could not allocate sizes, representatives (" << ((numBuckets*8)>>10) << " KB) for blocks." << endl
458 << "Please try using a smaller number of blocks by specifying a larger --bmax or a" << endl
459 << "smaller --bmaxdivn." << endl;
463 // Iterate through every suffix in the text, determine which
464 // bucket it falls into by doing a binary search across the
465 // sorted list of samples, and increment a counter associated
466 // with that bucket. Also, keep one representative for each
467 // bucket so that we can split it later. We loop in ten
468 // stretches so that we can print out a helpful progress
469 // message. (This step can take a long time.)
471 VMSG_NL(" Binary sorting into buckets");
472 Timer timer(cout, " Binary sorting into buckets time: ", this->verbose());
473 uint32_t lenDiv10 = (len + 9) / 10;
474 for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
475 uint32_t itenNext = iten + lenDiv10;
476 if(ten > 0) VMSG_NL(" " << (ten * 10) << "%");
477 for(uint32_t i = iten; i < itenNext && i < len; i++) {
478 uint32_t r = binarySASearch(t, i, _sampleSuffs);
479 if(r == 0xffffffff) continue; // r was one of the samples
480 assert_lt(r, numBuckets);
482 assert_lt(bucketSzs[r], len);
483 if(bucketReps[r] == 0xffffffff ||
484 (_randomSrc.nextU32() & 100) == 0)
486 bucketReps[r] = i; // clobbers previous one, but that's OK
492 // Check for large buckets and mergeable pairs of small buckets
493 // and split/merge as necessary
496 assert_eq(length(bucketSzs), numBuckets);
497 assert_eq(length(bucketReps), numBuckets);
499 Timer timer(cout, " Splitting and merging time: ", this->verbose());
500 VMSG_NL("Splitting and merging");
501 for(int64_t i = 0; i < numBuckets; i++) {
502 uint32_t mergedSz = bsz + 1;
503 assert(bucketSzs[i] == 0 || bucketReps[i] != 0xffffffff);
504 if(i < (int64_t)numBuckets-1) {
505 mergedSz = bucketSzs[i] + bucketSzs[i+1] + 1;
508 if(mergedSz <= bsz) {
509 bucketSzs[i+1] += (bucketSzs[i]+1);
510 // The following may look strange, but it's necessary
511 // to ensure that the merged bucket has a representative
512 bucketReps[i+1] = _sampleSuffs[i+added];
513 erase(_sampleSuffs, i+added);
515 erase(bucketReps, i);
516 i--; // might go to -1 but ++ will overflow back to 0
519 assert_eq(numBuckets, length(_sampleSuffs)+1-added);
520 assert_eq(numBuckets, length(bucketSzs));
523 else if(bucketSzs[i] > bsz) {
524 // Add an additional sample from the bucketReps[]
525 // set accumulated in the binarySASearch loop; this
526 // effectively splits the bucket
527 insertValue(_sampleSuffs, i + (added++), bucketReps[i]);
532 //if(this->verbose()) {
533 // cout << "Final bucket sizes:" << endl;
534 // cout << " (begin): " << bucketSzs[0] << " (" << (int)(bsz - bucketSzs[0]) << ")" << endl;
535 // for(uint32_t i = 1; i < numBuckets; i++) {
536 // cout << " " << bucketSzs[i] << " (" << (int)(bsz - bucketSzs[i]) << ")" << endl;
541 // Otherwise, continue until no more buckets need to be
543 VMSG_NL("Split " << added << ", merged " << merged << "; iterating...");
545 // Do *not* force a do-over
547 // VMSG_NL("Iterated too many times; trying again...");
550 VMSG_NL("Avg bucket size: " << ((float)(len-length(_sampleSuffs)) / (length(_sampleSuffs)+1)) << " (target: " << bsz << ")");
554 * Do a simple LCP calculation on two strings.
556 template<typename T> inline
557 static uint32_t suffixLcp(const T& t, uint32_t aOff, uint32_t bOff) {
559 size_t len = length(t);
560 assert_leq(aOff, len);
561 assert_leq(bOff, len);
562 while(aOff + c < len && bOff + c < len && t[aOff + c] == t[bOff + c]) c++;
567 * Calculate the lcp between two suffixes using the difference
568 * cover as a tie-breaker. If the tie-breaker is employed, then
569 * the calculated lcp may be an underestimate. If the tie-breaker is
570 * employed, lcpIsSoft will be set to true (otherwise, false).
572 template<typename TStr> inline
573 bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(uint32_t aOff,
578 const TStr& t = this->text();
580 uint32_t tlen = length(t);
581 assert_leq(aOff, tlen);
582 assert_leq(bOff, tlen);
584 uint32_t dcDist = _dc->tieBreakOff(aOff, bOff);
585 lcpIsSoft = false; // hard until proven soft
586 while(c < dcDist && // we haven't hit the tie breaker
587 c < tlen-aOff && // we haven't fallen off of LHS suffix
588 c < tlen-bOff && // we haven't fallen off of RHS suffix
589 t[aOff+c] == t[bOff+c]) // we haven't hit a mismatch
593 // Fell off LHS (a), a is greater
595 } else if(c == tlen-bOff) {
596 // Fell off RHS (b), b is greater
598 } else if(c == dcDist) {
599 // Hit a tie-breaker element
601 assert_neq(dcDist, 0xffffffff);
602 return _dc->breakTie(aOff+c, bOff+c) < 0;
604 assert_neq(t[aOff+c], t[bOff+c]);
605 return t[aOff+c] < t[bOff+c];
610 * Lookup a suffix LCP in the given z array; if the element is not
611 * filled in then calculate it from scratch.
614 static uint32_t lookupSuffixZ(const T& t,
617 const String<uint32_t>& z)
619 if(zOff < length(z)) {
620 uint32_t ret = z[zOff];
621 assert_eq(ret, suffixLcp(t, off + zOff, off));
624 assert_leq(off + zOff, length(t));
625 return suffixLcp(t, off + zOff, off);
632 template<typename TStr> inline
633 bool KarkkainenBlockwiseSA<TStr>::suffixCmp(uint32_t cmp,
638 const String<uint32_t>& z)
640 const TStr& t = this->text();
641 uint32_t len = length(t);
642 // i is not covered by any previous match
645 k = i; // so that i + lHi == kHi
646 l = 0; // erase any previous l
650 // i is covered by a previous match
652 assert_gt((int64_t)i, j);
654 assert_leq(zIdx, len-cmp);
655 if(zIdx < _dcV || _dc == NULL) {
656 // Go as far as the Z-box says
657 l = lookupSuffixZ(t, zIdx, cmp, z);
661 assert_leq(i + l, len);
662 // Possibly to be extended
664 // But we're past the point of no-more-Z-boxes
665 bool ret = tieBreakingLcp(i, cmp, l, kSoft);
666 // Sanity-check tie-breaker
667 if(this->sanityCheck()) {
668 if(ret) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
669 else assert(dollarGt(suffix(t, i), suffix(t, cmp)));
673 if(this->sanityCheck()) {
674 if(kSoft) { assert_leq(l, suffixLcp(t, i, cmp)); }
675 else { assert_eq (l, suffixLcp(t, i, cmp)); }
681 // Z box extends exactly as far as previous match (or there
682 // is neither a Z box nor a previous match)
685 while(l < len-cmp && k < len && t[cmp+l] == t[k]) {
688 j = i; // update furthest-extending LHS
690 assert_eq(l, suffixLcp(t, i, cmp));
692 // Z box extends further than previous match
694 l = k - i; // point to just after previous match
695 j = i; // update furthest-extending LHS
697 while(l < len-cmp && k < len && t[cmp+l] == t[k]) {
701 assert_eq(l, suffixLcp(t, i, cmp));
702 } else assert_eq(l, suffixLcp(t, i, cmp));
705 // Check that calculated lcp matches actual lcp
706 if(this->sanityCheck()) {
708 // l should exactly match lcp
709 assert_eq(l, suffixLcp(t, i, cmp));
711 // l is an underestimate of LCP
712 assert_leq(l, suffixLcp(t, i, cmp));
715 assert_leq(l+i, len);
716 assert_leq(l, len-cmp);
718 // i and cmp should not be the same suffix
719 assert(l != len-cmp || i+l != len);
721 // Now we're ready to do a comparison on the next char
723 l == len-cmp || // departure from paper algorithm:
724 // falling off pattern implies
725 // pattern is *greater* in our case
726 t[i + l] < t[cmp + l]))
728 // Case 2: Text suffix is less than upper sample suffix
729 if(this->sanityCheck()) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
730 return true; // suffix at i is less than suffix at cmp
733 // Case 3: Text suffix is greater than upper sample suffix
734 if(this->sanityCheck()) assert(dollarGt(suffix(t, i), suffix(t, cmp)));
735 return false; // suffix at i is less than suffix at cmp
740 * Retrieve the next block. This is the most performance-critical part
741 * of the blockwise suffix sorting process.
743 template<typename TStr>
744 void KarkkainenBlockwiseSA<TStr>::nextBlock() {
745 typedef typename Value<TStr>::Type TAlphabet;
746 String<uint32_t>& bucket = this->_itrBucket;
747 VMSG_NL("Getting block " << (_cur+1) << " of " << length(_sampleSuffs)+1);
750 assert_leq(_cur, length(_sampleSuffs));
751 const TStr& t = this->text();
752 uint32_t len = length(t);
755 uint32_t lo = 0xffffffff, hi = 0xffffffff;
756 if(length(_sampleSuffs) == 0) {
757 // Special case: if _sampleSuffs is 0, then multikey-quicksort
759 VMSG_NL(" No samples; assembling all-inclusive block");
762 if(capacity(bucket) < this->bucketSz()) {
763 reserve(bucket, len+1, Exact());
765 for(uint32_t i = 0; i < len; i++) append(bucket, i);
766 } catch(bad_alloc &e) {
767 if(this->_passMemExc) {
768 throw e; // rethrow immediately
770 cerr << "Could not allocate a master suffix-array block of " << ((len+1) * 4) << " bytes" << endl
771 << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
772 << "a larger --bmaxdivn" << endl;
778 VMSG_NL(" Reserving size (" << this->bucketSz() << ") for bucket");
779 // BTL: Add a +100 fudge factor; there seem to be instances
780 // where a bucket ends up having one more elt than bucketSz()
781 if(capacity(bucket) < this->bucketSz()+100) {
782 reserve(bucket, this->bucketSz()+100, Exact());
784 } catch(bad_alloc &e) {
785 if(this->_passMemExc) {
786 throw e; // rethrow immediately
788 cerr << "Could not allocate a suffix-array block of " << ((this->bucketSz()+1) * 4) << " bytes" << endl;
789 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
790 << "a larger --bmaxdivn" << endl;
794 // Select upper and lower bounds from _sampleSuffs[] and
795 // calculate the Z array up to the difference-cover periodicity
796 // for both. Be careful about first/last buckets.
797 String<uint32_t> zLo, zHi;
799 assert_leq(_cur, length(_sampleSuffs));
800 bool first = (_cur == 0);
801 bool last = (_cur == length(_sampleSuffs));
803 Timer timer(cout, " Calculating Z arrays time: ", this->verbose());
804 VMSG_NL(" Calculating Z arrays");
806 // Not the last bucket
807 assert_lt(_cur, length(_sampleSuffs));
808 hi = _sampleSuffs[_cur];
809 fill(zHi, _dcV, 0, Exact());
810 assert_eq(zHi[0], 0);
811 calcZ(t, hi, zHi, this->verbose(), this->sanityCheck());
814 // Not the first bucket
816 assert_leq(_cur, length(_sampleSuffs));
817 lo = _sampleSuffs[_cur-1];
818 fill(zLo, _dcV, 0, Exact());
820 assert_eq(zLo[0], 0);
821 calcZ(t, lo, zLo, this->verbose(), this->sanityCheck());
823 } catch(bad_alloc &e) {
824 if(this->_passMemExc) {
825 throw e; // rethrow immediately
827 cerr << "Could not allocate a z-array of " << (_dcV * 4) << " bytes" << endl;
828 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
829 << "a larger --bmaxdivn" << endl;
834 // This is the most critical loop in the algorithm; this is where
835 // we iterate over all suffixes in the text and pick out those that
836 // fall into the current bucket.
838 // This loop is based on the SMALLERSUFFIXES function outlined on
839 // p7 of the "Fast BWT" paper
841 int64_t kHi = -1, kLo = -1;
842 int64_t jHi = -1, jLo = -1;
843 bool kHiSoft = false, kLoSoft = false;
844 assert_eq(0, length(bucket));
846 Timer timer(cout, " Block accumulator loop time: ", this->verbose());
847 VMSG_NL(" Entering block accumulator loop:");
848 uint32_t lenDiv10 = (len + 9) / 10;
849 for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
850 uint32_t itenNext = iten + lenDiv10;
851 if(ten > 0) VMSG_NL(" " << (ten * 10) << "%");
852 for(uint32_t i = iten; i < itenNext && i < len; i++) {
853 assert_lt(jLo, i); assert_lt(jHi, i);
854 // Advance the upper-bound comparison by one character
855 if(i == hi || i == lo) continue; // equal to one of the bookends
856 if(hi != 0xffffffff && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
857 continue; // not in the bucket
859 if(lo != 0xffffffff && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
860 continue; // not in the bucket
862 // In the bucket! - add it
865 appendValue(bucket, i);
866 } catch(bad_alloc &e) {
867 if(this->_passMemExc) {
868 throw e; // rethrow immediately
870 cerr << "Could not append element to block of " << ((length(bucket)) * 4) << " bytes" << endl;
871 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
872 << "a larger --bmaxdivn" << endl;
876 // Not necessarily true; we allow overflowing buckets
877 // since we can't guarantee that a good set of sample
878 // suffixes can be found in a reasonable amount of time
879 //assert_lt(length(bucket), this->bucketSz());
881 } // end loop over all suffixes of t
884 } // end else clause of if(length(_sampleSuffs) == 0)
886 if(length(bucket) > 0) {
887 Timer timer(cout, " Sorting block time: ", this->verbose());
888 VMSG_NL(" Sorting block of length " << length(bucket));
891 if(hi != 0xffffffff) {
892 // Not the final bucket; throw in the sample on the RHS
893 appendValue(bucket, hi);
895 // Final bucket; throw in $ suffix
896 appendValue(bucket, len);
898 VMSG_NL("Returning block of " << length(bucket));
899 _cur++; // advance to next bucket
902 #endif /*BLOCKWISE_SA_H_*/