blockwise_sa.h

   1 #ifndef BLOCKWISE_SA_H_
   2 #define BLOCKWISE_SA_H_
   3
   4 #include <stdint.h>
   5 #include <stdlib.h>
   6 #include <iostream>
   7 #include <sstream>
   8 #include <stdexcept>
   9 #include <vector>
  10 #include <seqan/sequence.h>
  11 #include <seqan/index.h>
  12 #include "assert_helpers.h"
  13 #include "diff_sample.h"
  14 #include "multikey_qsort.h"
  15 #include "random_source.h"
  16 #include "binary_sa_search.h"
  17 #include "zbox.h"
  18 #include "alphabet.h"
  19 #include "timer.h"
  20 #include "auto_array.h"
  21
  22 using namespace std;
  23 using namespace seqan;
  24
  25 // Helpers for printing verbose messages
  26
  27 #ifndef VMSG_NL
  28 #define VMSG_NL(args...) \
  29 if(this->verbose()) { \
  30         stringstream tmp; \
  31         tmp << args << endl; \
  32         this->verbose(tmp.str()); \
  33 }
  34 #endif
  35
  36 #ifndef VMSG
  37 #define VMSG(args...) \
  38 if(this->verbose()) { \
  39         stringstream tmp; \
  40         tmp << args; \
  41         this->verbose(tmp.str()); \
  42 }
  43 #endif
  44
  45 /**
  46  * Abstract parent class for blockwise suffix-array building schemes.
  47  */
  48 template<typename TStr>
  49 class BlockwiseSA {
  50 public:
  51         BlockwiseSA(const TStr& __text,
  52                     uint32_t __bucketSz,
  53                     bool __sanityCheck = false,
  54                     bool __passMemExc = false,
  55                     bool __verbose = false,
  56                     ostream& __logger = cout) :
  57         _text(__text),
  58         _bucketSz(max<uint32_t>(__bucketSz, 2u)),
  59         _sanityCheck(__sanityCheck),
  60         _passMemExc(__passMemExc),
  61         _verbose(__verbose),
  62         _itrBucket(),
  63         _itrBucketPos(0xffffffff),
  64         _itrPushedBackSuffix(0xffffffff),
  65         _logger(__logger)
  66         { }
  67
  68         virtual ~BlockwiseSA() { }
  69
  70         /**
  71          * Get the next suffix; compute the next bucket if necessary.
  72          */
  73         uint32_t nextSuffix() {
  74                 if(_itrPushedBackSuffix != 0xffffffff) {
  75                         uint32_t tmp = _itrPushedBackSuffix;
  76                         _itrPushedBackSuffix = 0xffffffff;
  77                         return tmp;
  78                 }
  79                 while(_itrBucketPos >= length(_itrBucket) ||
  80                       length(_itrBucket) == 0)
  81                 {
  82                         if(!hasMoreBlocks()) {
  83                                 throw out_of_range("No more suffixes");
  84                         }
  85                         nextBlock();
  86                         _itrBucketPos = 0;
  87                 }
  88                 return _itrBucket[_itrBucketPos++];
  89         }
  90
  91         /**
  92          * Return true iff the next call to nextSuffix will succeed.
  93          */
  94         bool hasMoreSuffixes() {
  95                 if(_itrPushedBackSuffix != 0xffffffff) return true;
  96                 try {
  97                         _itrPushedBackSuffix = nextSuffix();
  98                 } catch(out_of_range& e) {
  99                         assert_eq(0xffffffff, _itrPushedBackSuffix);
 100                         return false;
 101                 }
 102                 return true;
 103         }
 104
 105         /**
 106          * Reset the suffix iterator so that the next call to nextSuffix()
 107          * returns the lexicographically-first suffix.
 108          */
 109         void resetSuffixItr() {
 110                 clear(_itrBucket);
 111                 _itrBucketPos = 0xffffffff;
 112                 _itrPushedBackSuffix = 0xffffffff;
 113                 reset();
 114                 assert(suffixItrIsReset());
 115         }
 116
 117         /**
 118          * Returns true iff the next call to nextSuffix() returns the
 119          * lexicographically-first suffix.
 120          */
 121         bool suffixItrIsReset() {
 122                 return length(_itrBucket)   == 0 &&
 123                        _itrBucketPos        == 0xffffffff &&
 124                        _itrPushedBackSuffix == 0xffffffff &&
 125                        isReset();
 126         }
 127
 128         const TStr& text()  const { return _text; }
 129         uint32_t bucketSz() const { return _bucketSz; }
 130         bool sanityCheck()  const { return _sanityCheck; }
 131         bool verbose()      const { return _verbose; }
 132         ostream& log()      const { return _logger; }
 133         uint32_t size()     const { return length(_text)+1; }
 134
 135 protected:
 136         /// Reset back to the first block
 137         virtual void reset() = 0;
 138         /// Return true iff reset to the first block
 139         virtual bool isReset() = 0;
 140
 141         /**
 142          * Grab the next block of sorted suffixes.  The block is guaranteed
 143          * to have at most _bucketSz elements.
 144          */
 145         virtual void nextBlock() = 0;
 146         /// Return true iff more blocks are available
 147         virtual bool hasMoreBlocks() const = 0;
 148         /// Optionally output a verbose message
 149         void verbose(const string& s) const {
 150                 if(this->verbose()) {
 151                         this->log() << s;
 152                         this->log().flush();
 153                 }
 154         }
 155
 156         const TStr&      _text;        /// original string
 157         const uint32_t   _bucketSz;    /// target maximum bucket size
 158         const bool       _sanityCheck; /// whether to perform sanity checks
 159         const bool       _passMemExc;  /// true -> pass on memory exceptions
 160         const bool       _verbose;     /// be talkative
 161         String<uint32_t> _itrBucket;   /// current bucket
 162         uint32_t         _itrBucketPos;/// offset into current bucket
 163         uint32_t         _itrPushedBackSuffix; /// temporary slot for lookahead
 164         ostream&         _logger;      /// write log messages here
 165 };
 166
 167 /**
 168  * Abstract parent class for a blockwise suffix array builder that
 169  * always doles out blocks in lexicographical order.
 170  */
 171 template<typename TStr>
 172 class InorderBlockwiseSA : public BlockwiseSA<TStr> {
 173 public:
 174         InorderBlockwiseSA(const TStr& __text,
 175                            uint32_t __bucketSz,
 176                            bool __sanityCheck = false,
 177                                bool __passMemExc = false,
 178                            bool __verbose = false,
 179                            ostream& __logger = cout) :
 180         BlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger)
 181         { }
 182 };
 183
 184 /**
 185  * Build the SA a block at a time according to the scheme outlined in
 186  * Karkkainen's "Fast BWT" paper.
 187  */
 188 template<typename TStr>
 189 class KarkkainenBlockwiseSA : public InorderBlockwiseSA<TStr> {
 190 public:
 191         typedef DifferenceCoverSample<TStr> TDC;
 192
 193         KarkkainenBlockwiseSA(const TStr& __text,
 194                               uint32_t __bucketSz,
 195                               uint32_t __dcV,
 196                               uint32_t __seed = 0,
 197                               bool __sanityCheck = false,
 198                                   bool __passMemExc = false,
 199                               bool __verbose = false,
 200                               ostream& __logger = cout) :
 201         InorderBlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger),
 202         _sampleSuffs(), _cur(0), _dcV(__dcV), _dc(NULL), _built(false)
 203         { _randomSrc.init(__seed); reset(); }
 204
 205         ~KarkkainenBlockwiseSA() {
 206                 if(_dc != NULL) delete _dc; _dc = NULL; // difference cover sample
 207         }
 208
 209         /**
 210          * Allocate an amount of memory that simulates the peak memory
 211          * usage of the DifferenceCoverSample with the given text and v.
 212          * Throws bad_alloc if it's not going to fit in memory.  Returns
 213          * the approximate number of bytes the Cover takes at all times.
 214          */
 215         static size_t simulateAllocs(const TStr& text, uint32_t bucketSz) {
 216                 size_t len = length(text);
 217                 // _sampleSuffs and _itrBucket are in memory at the peak
 218                 size_t bsz = bucketSz;
 219                 size_t sssz = len / max<uint32_t>(bucketSz-1, 1);
 220                 AutoArray<uint32_t> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/));
 221                 return bsz;
 222         }
 223
 224         /// Defined in blockwise_sa.cpp
 225         virtual void nextBlock();
 226
 227         /// Defined in blockwise_sa.cpp
 228         virtual void qsort(String<uint32_t>& bucket);
 229
 230         /// Return true iff more blocks are available
 231         virtual bool hasMoreBlocks() const {
 232                 return _cur <= length(_sampleSuffs);
 233         }
 234
 235         /// Return the difference-cover period
 236         uint32_t dcV() const { return _dcV; }
 237
 238 protected:
 239
 240         /**
 241          * Initialize the state of the blockwise suffix sort.  If the
 242          * difference cover sample and the sample set have not yet been
 243          * built, build them.  Then reset the block cursor to point to
 244          * the first block.
 245          */
 246         virtual void reset() {
 247                 if(!_built) {
 248                         build();
 249                 }
 250                 assert(_built);
 251                 _cur = 0;
 252         }
 253
 254         /// Return true iff we're about to dole out the first bucket
 255         virtual bool isReset() {
 256                 return _cur == 0;
 257         }
 258
 259 private:
 260
 261         /**
 262          * Calculate the difference-cover sample and sample suffixes.
 263          */
 264         void build() {
 265                 // Calculate difference-cover sample
 266                 assert(_dc == NULL);
 267                 if(_dcV != 0) {
 268                         _dc = new TDC(this->text(), _dcV, this->verbose(), this->sanityCheck());
 269                         _dc->build();
 270                 }
 271                 // Calculate sample suffixes
 272                 if(this->bucketSz() <= length(this->text())) {
 273                         VMSG_NL("Building samples");
 274                         buildSamples();
 275                 } else {
 276                         VMSG_NL("Skipping building samples since text length " <<
 277                                 length(this->text()) << " is less than bucket size: " <<
 278                                 this->bucketSz());
 279                 }
 280                 _built = true;
 281         }
 282
 283         /**
 284          * Calculate the lcp between two suffixes using the difference
 285          * cover as a tie-breaker.  If the tie-breaker is employed, then
 286          * the calculated lcp may be an underestimate.
 287          *
 288          * Defined in blockwise_sa.cpp
 289          */
 290         inline bool tieBreakingLcp(uint32_t aOff,
 291                                    uint32_t bOff,
 292                                    uint32_t& lcp,
 293                                    bool& lcpIsSoft);
 294
 295         /**
 296          * Compare two suffixes using the difference-cover sample.
 297          */
 298         inline bool suffixCmp(uint32_t cmp,
 299                               uint32_t i,
 300                               int64_t& j,
 301                               int64_t& k,
 302                               bool& kSoft,
 303                               const String<uint32_t>& z);
 304
 305         void buildSamples();
 306
 307         String<uint32_t> _sampleSuffs; /// sample suffixes
 308         uint32_t         _cur;         /// offset to 1st elt of next block
 309         const uint32_t   _dcV;         /// difference-cover periodicity
 310         TDC*             _dc;          /// queryable difference-cover data
 311         bool             _built;       /// whether samples/DC have been built
 312         RandomSource     _randomSrc;   /// source of pseudo-randoms
 313 };
 314
 315 /**
 316  * Qsort the set of suffixes whose offsets are in 'bucket'.
 317  */
 318 template<typename TStr>
 319 void KarkkainenBlockwiseSA<TStr>::qsort(String<uint32_t>& bucket) {
 320         typedef typename Value<TStr>::Type TAlphabet;
 321         const TStr& t = this->text();
 322         uint32_t *s = begin(bucket);
 323         uint32_t slen = seqan::length(bucket);
 324         uint32_t len = seqan::length(t);
 325         if(_dc != NULL) {
 326                 // Use the difference cover as a tie-breaker if we have it
 327                 VMSG_NL("  (Using difference cover)");
 328                 // Extract the 'host' array because it's faster to work
 329                 // with than the String<> container
 330                 uint8_t *host = (uint8_t*)t.data_begin;
 331                 mkeyQSortSufDcU8(t, host, len, s, slen, *_dc,
 332                                  ValueSize<TAlphabet>::VALUE,
 333                                  this->verbose(), this->sanityCheck());
 334         } else {
 335                 VMSG_NL("  (Not using difference cover)");
 336                 // We don't have a difference cover - just do a normal
 337                 // suffix sort
 338                 mkeyQSortSuf(t, s, slen, ValueSize<TAlphabet>::VALUE,
 339                              this->verbose(), this->sanityCheck());
 340         }
 341 }
 342
 343 /**
 344  * Qsort the set of suffixes whose offsets are in 'bucket'.  This
 345  * specialization for packed strings does not attempt to extract and
 346  * operate directly on the host string; the fact that the string is
 347  * packed means that the array cannot be sorted directly.
 348  */
 349 template<>
 350 void KarkkainenBlockwiseSA<String<Dna, Packed<> > >::qsort(String<uint32_t>& bucket) {
 351         const String<Dna, Packed<> >& t = this->text();
 352         uint32_t *s = begin(bucket);
 353         uint32_t slen = seqan::length(bucket);
 354         uint32_t len = seqan::length(t);
 355         if(_dc != NULL) {
 356                 // Use the difference cover as a tie-breaker if we have it
 357                 VMSG_NL("  (Using difference cover)");
 358                 // Can't use the text's 'host' array because the backing
 359                 // store for the packed string is not one-char-per-elt.
 360                 mkeyQSortSufDcU8(t, t, len, s, slen, *_dc,
 361                                  ValueSize<Dna>::VALUE,
 362                                  this->verbose(), this->sanityCheck());
 363         } else {
 364                 VMSG_NL("  (Not using difference cover)");
 365                 // We don't have a difference cover - just do a normal
 366                 // suffix sort
 367                 mkeyQSortSuf(t, s, slen, ValueSize<Dna>::VALUE,
 368                              this->verbose(), this->sanityCheck());
 369         }
 370 }
 371
 372 /**
 373  * Select a set of bucket-delineating sample suffixes such that no
 374  * bucket is greater than the requested upper limit.  Some care is
 375  * taken to make each bucket's size close to the limit without
 376  * going over.
 377  */
 378 template<typename TStr>
 379 void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 380         typedef typename Value<TStr>::Type TAlphabet;
 381         const TStr& t = this->text();
 382         uint32_t bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
 383         uint32_t len = length(this->text());
 384         // Prepare _sampleSuffs array
 385         clear(_sampleSuffs);
 386         uint32_t numSamples = ((len/bsz)+1)<<1; // ~len/bsz x 2
 387         assert_gt(numSamples, 0);
 388         VMSG_NL("Reserving space for " << numSamples << " sample suffixes");
 389         if(this->_passMemExc) {
 390                 reserve(_sampleSuffs, numSamples, Exact());
 391                 // Randomly generate samples.  Allow duplicates for now.
 392                 VMSG_NL("Generating random suffixes");
 393                 for(size_t i = 0; i < numSamples; i++) {
 394                         appendValue(_sampleSuffs, _randomSrc.nextU32() % len);
 395                 }
 396         } else {
 397                 try {
 398                         reserve(_sampleSuffs, numSamples, Exact());
 399                         // Randomly generate samples.  Allow duplicates for now.
 400                         VMSG_NL("Generating random suffixes");
 401                         for(size_t i = 0; i < numSamples; i++) {
 402                                 appendValue(_sampleSuffs, _randomSrc.nextU32() % len);
 403                         }
 404                 } catch(bad_alloc &e) {
 405                         if(this->_passMemExc) {
 406                                 throw e; // rethrow immediately
 407                         } else {
 408                                 cerr << "Could not allocate sample suffix container of " << (numSamples * 4) << " bytes." << endl
 409                                      << "Please try using a smaller number of blocks by specifying a larger --bmax or" << endl
 410                                      << "a smaller --bmaxdivn" << endl;
 411                                 throw 1;
 412                         }
 413                 }
 414         }
 415         // Remove duplicates; very important to do this before the call to
 416         // mkeyQSortSuf so that it doesn't try to calculate lexicographical
 417         // relationships between very long, identical strings, which takes
 418         // an extremely long time in general, and causes the stack to grow
 419         // linearly with the size of the input
 420         {
 421                 Timer timer(cout, "QSorting sample offsets, eliminating duplicates time: ", this->verbose());
 422                 VMSG_NL("QSorting " << length(_sampleSuffs) << " sample offsets, eliminating duplicates");
 423                 sort(begin(_sampleSuffs), end(_sampleSuffs));
 424                 size_t sslen = length(_sampleSuffs);
 425                 for(size_t i = 0; i < sslen-1; i++) {
 426                         if(_sampleSuffs[i] == _sampleSuffs[i+1]) {
 427                                 erase(_sampleSuffs, i--);
 428                                 sslen--;
 429                         }
 430                 }
 431         }
 432         // Multikey quicksort the samples
 433         {
 434                 Timer timer(cout, "  Multikey QSorting samples time: ", this->verbose());
 435                 VMSG_NL("Multikey QSorting " << length(_sampleSuffs) << " samples");
 436                 this->qsort(_sampleSuffs);
 437         }
 438         // Calculate bucket sizes
 439         VMSG_NL("Calculating bucket sizes");
 440         int limit = 5;
 441         // Iterate until all buckets are less than
 442         while(--limit >= 0) {
 443                 // Calculate bucket sizes by doing a binary search for each
 444                 // suffix and noting where it lands
 445                 uint32_t numBuckets = length(_sampleSuffs)+1;
 446                 String<uint32_t> bucketSzs; // holds computed bucket sizes
 447                 String<uint32_t> bucketReps; // holds 1 member of each bucket (for splitting)
 448                 try {
 449                         // Allocate and initialize containers for holding bucket
 450                         // sizes and representatives.
 451                         fill(bucketSzs, numBuckets, 0, Exact());
 452                         fill(bucketReps, numBuckets, 0xffffffff, Exact());
 453                 } catch(bad_alloc &e) {
 454                         if(this->_passMemExc) {
 455                                 throw e; // rethrow immediately
 456                         } else {
 457                                 cerr << "Could not allocate sizes, representatives (" << ((numBuckets*8)>>10) << " KB) for blocks." << endl
 458                                      << "Please try using a smaller number of blocks by specifying a larger --bmax or a" << endl
 459                                      << "smaller --bmaxdivn." << endl;
 460                                 throw 1;
 461                         }
 462                 }
 463                 // Iterate through every suffix in the text, determine which
 464                 // bucket it falls into by doing a binary search across the
 465                 // sorted list of samples, and increment a counter associated
 466                 // with that bucket.  Also, keep one representative for each
 467                 // bucket so that we can split it later.  We loop in ten
 468                 // stretches so that we can print out a helpful progress
 469                 // message.  (This step can take a long time.)
 470                 {
 471                         VMSG_NL("  Binary sorting into buckets");
 472                         Timer timer(cout, "  Binary sorting into buckets time: ", this->verbose());
 473                         uint32_t lenDiv10 = (len + 9) / 10;
 474                         for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
 475                                 uint32_t itenNext = iten + lenDiv10;
 476                                 if(ten > 0) VMSG_NL("  " << (ten * 10) << "%");
 477                                 for(uint32_t i = iten; i < itenNext && i < len; i++) {
 478                                         uint32_t r = binarySASearch(t, i, _sampleSuffs);
 479                                         if(r == 0xffffffff) continue; // r was one of the samples
 480                                         assert_lt(r, numBuckets);
 481                                         bucketSzs[r]++;
 482                                         assert_lt(bucketSzs[r], len);
 483                                         if(bucketReps[r] == 0xffffffff ||
 484                                            (_randomSrc.nextU32() & 100) == 0)
 485                                         {
 486                                                 bucketReps[r] = i; // clobbers previous one, but that's OK
 487                                         }
 488                                 }
 489                         }
 490                         VMSG_NL("  100%");
 491                 }
 492                 // Check for large buckets and mergeable pairs of small buckets
 493                 // and split/merge as necessary
 494                 int added = 0;
 495                 int merged = 0;
 496                 assert_eq(length(bucketSzs), numBuckets);
 497                 assert_eq(length(bucketReps), numBuckets);
 498                 {
 499                         Timer timer(cout, "  Splitting and merging time: ", this->verbose());
 500                         VMSG_NL("Splitting and merging");
 501                         for(int64_t i = 0; i < numBuckets; i++) {
 502                                 uint32_t mergedSz = bsz + 1;
 503                                 assert(bucketSzs[i] == 0 || bucketReps[i] != 0xffffffff);
 504                                 if(i < (int64_t)numBuckets-1) {
 505                                         mergedSz = bucketSzs[i] + bucketSzs[i+1] + 1;
 506                                 }
 507                                 // Merge?
 508                                 if(mergedSz <= bsz) {
 509                                         bucketSzs[i+1] += (bucketSzs[i]+1);
 510                                         // The following may look strange, but it's necessary
 511                                         // to ensure that the merged bucket has a representative
 512                                         bucketReps[i+1] = _sampleSuffs[i+added];
 513                                         erase(_sampleSuffs, i+added);
 514                                         erase(bucketSzs, i);
 515                                         erase(bucketReps, i);
 516                                         i--; // might go to -1 but ++ will overflow back to 0
 517                                         numBuckets--;
 518                                         merged++;
 519                                         assert_eq(numBuckets, length(_sampleSuffs)+1-added);
 520                                         assert_eq(numBuckets, length(bucketSzs));
 521                                 }
 522                                 // Split?
 523                                 else if(bucketSzs[i] > bsz) {
 524                                         // Add an additional sample from the bucketReps[]
 525                                         // set accumulated in the binarySASearch loop; this
 526                                         // effectively splits the bucket
 527                                         insertValue(_sampleSuffs, i + (added++), bucketReps[i]);
 528                                 }
 529                         }
 530                 }
 531                 if(added == 0) {
 532                         //if(this->verbose()) {
 533                         //      cout << "Final bucket sizes:" << endl;
 534                         //      cout << "  (begin): " << bucketSzs[0] << " (" << (int)(bsz - bucketSzs[0]) << ")" << endl;
 535                         //      for(uint32_t i = 1; i < numBuckets; i++) {
 536                         //              cout << "  " << bucketSzs[i] << " (" << (int)(bsz - bucketSzs[i]) << ")" << endl;
 537                         //      }
 538                         //}
 539                         break;
 540                 }
 541                 // Otherwise, continue until no more buckets need to be
 542                 // split
 543                 VMSG_NL("Split " << added << ", merged " << merged << "; iterating...");
 544         }
 545         // Do *not* force a do-over
 546 //      if(limit == 0) {
 547 //              VMSG_NL("Iterated too many times; trying again...");
 548 //              buildSamples();
 549 //      }
 550         VMSG_NL("Avg bucket size: " << ((float)(len-length(_sampleSuffs)) / (length(_sampleSuffs)+1)) << " (target: " << bsz << ")");
 551 }
 552
 553 /**
 554  * Do a simple LCP calculation on two strings.
 555  */
 556 template<typename T> inline
 557 static uint32_t suffixLcp(const T& t, uint32_t aOff, uint32_t bOff) {
 558         uint32_t c = 0;
 559         size_t len = length(t);
 560         assert_leq(aOff, len);
 561         assert_leq(bOff, len);
 562         while(aOff + c < len && bOff + c < len && t[aOff + c] == t[bOff + c]) c++;
 563         return c;
 564 }
 565
 566 /**
 567  * Calculate the lcp between two suffixes using the difference
 568  * cover as a tie-breaker.  If the tie-breaker is employed, then
 569  * the calculated lcp may be an underestimate.  If the tie-breaker is
 570  * employed, lcpIsSoft will be set to true (otherwise, false).
 571  */
 572 template<typename TStr> inline
 573 bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(uint32_t aOff,
 574                                                  uint32_t bOff,
 575                                                  uint32_t& lcp,
 576                                                  bool& lcpIsSoft)
 577 {
 578         const TStr& t = this->text();
 579         uint32_t c = 0;
 580         uint32_t tlen = length(t);
 581         assert_leq(aOff, tlen);
 582         assert_leq(bOff, tlen);
 583         assert(_dc != NULL);
 584         uint32_t dcDist = _dc->tieBreakOff(aOff, bOff);
 585         lcpIsSoft = false; // hard until proven soft
 586         while(c < dcDist &&    // we haven't hit the tie breaker
 587               c < tlen-aOff && // we haven't fallen off of LHS suffix
 588               c < tlen-bOff && // we haven't fallen off of RHS suffix
 589               t[aOff+c] == t[bOff+c]) // we haven't hit a mismatch
 590                 c++;
 591         lcp = c;
 592         if(c == tlen-aOff) {
 593                 // Fell off LHS (a), a is greater
 594                 return false;
 595         } else if(c == tlen-bOff) {
 596                 // Fell off RHS (b), b is greater
 597                 return true;
 598         } else if(c == dcDist) {
 599                 // Hit a tie-breaker element
 600                 lcpIsSoft = true;
 601                 assert_neq(dcDist, 0xffffffff);
 602                 return _dc->breakTie(aOff+c, bOff+c) < 0;
 603         } else {
 604                 assert_neq(t[aOff+c], t[bOff+c]);
 605                 return t[aOff+c] < t[bOff+c];
 606         }
 607 }
 608
 609 /**
 610  * Lookup a suffix LCP in the given z array; if the element is not
 611  * filled in then calculate it from scratch.
 612  */
 613 template<typename T>
 614 static uint32_t lookupSuffixZ(const T& t,
 615                               uint32_t zOff,
 616                               uint32_t off,
 617                               const String<uint32_t>& z)
 618 {
 619         if(zOff < length(z)) {
 620                 uint32_t ret = z[zOff];
 621                 assert_eq(ret, suffixLcp(t, off + zOff, off));
 622                 return ret;
 623         }
 624         assert_leq(off + zOff, length(t));
 625         return suffixLcp(t, off + zOff, off);
 626 }
 627
 628 /**
 629  * true -> i < cmp
 630  * false -> i > cmp
 631  */
 632 template<typename TStr> inline
 633 bool KarkkainenBlockwiseSA<TStr>::suffixCmp(uint32_t cmp,
 634                                             uint32_t i,
 635                                             int64_t& j,
 636                                             int64_t& k,
 637                                             bool& kSoft,
 638                                             const String<uint32_t>& z)
 639 {
 640         const TStr& t = this->text();
 641         uint32_t len = length(t);
 642         // i is not covered by any previous match
 643         uint32_t l;
 644         if(i > k) {
 645                 k = i; // so that i + lHi == kHi
 646                 l = 0; // erase any previous l
 647                 kSoft = false;
 648                 // To be extended
 649         }
 650         // i is covered by a previous match
 651         else /* i <= k */ {
 652                 assert_gt((int64_t)i, j);
 653                 uint32_t zIdx = i-j;
 654                 assert_leq(zIdx, len-cmp);
 655                 if(zIdx < _dcV || _dc == NULL) {
 656                         // Go as far as the Z-box says
 657                         l = lookupSuffixZ(t, zIdx, cmp, z);
 658                         if(i + l > len) {
 659                                 l = len-i;
 660                         }
 661                         assert_leq(i + l, len);
 662                         // Possibly to be extended
 663                 } else {
 664                         // But we're past the point of no-more-Z-boxes
 665                         bool ret = tieBreakingLcp(i, cmp, l, kSoft);
 666                         // Sanity-check tie-breaker
 667                         if(this->sanityCheck()) {
 668                                 if(ret) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
 669                                 else    assert(dollarGt(suffix(t, i), suffix(t, cmp)));
 670                         }
 671                         j = i;
 672                         k = i + l;
 673                         if(this->sanityCheck()) {
 674                                 if(kSoft) { assert_leq(l, suffixLcp(t, i, cmp)); }
 675                                 else      { assert_eq (l, suffixLcp(t, i, cmp)); }
 676                         }
 677                         return ret;
 678                 }
 679         }
 680
 681         // Z box extends exactly as far as previous match (or there
 682         // is neither a Z box nor a previous match)
 683         if(i + l == k) {
 684                 // Extend
 685                 while(l < len-cmp && k < len && t[cmp+l] == t[k]) {
 686                         k++; l++;
 687                 }
 688                 j = i; // update furthest-extending LHS
 689                 kSoft = false;
 690                 assert_eq(l, suffixLcp(t, i, cmp));
 691         }
 692         // Z box extends further than previous match
 693         else if(i + l > k) {
 694                 l = k - i; // point to just after previous match
 695                 j = i; // update furthest-extending LHS
 696                 if(kSoft) {
 697                         while(l < len-cmp && k < len && t[cmp+l] == t[k]) {
 698                                 k++; l++;
 699                         }
 700                         kSoft = false;
 701                         assert_eq(l, suffixLcp(t, i, cmp));
 702                 } else assert_eq(l, suffixLcp(t, i, cmp));
 703         }
 704
 705         // Check that calculated lcp matches actual lcp
 706         if(this->sanityCheck()) {
 707                 if(!kSoft) {
 708                         // l should exactly match lcp
 709                         assert_eq(l, suffixLcp(t, i, cmp));
 710                 } else {
 711                         // l is an underestimate of LCP
 712                         assert_leq(l, suffixLcp(t, i, cmp));
 713                 }
 714         }
 715         assert_leq(l+i, len);
 716         assert_leq(l, len-cmp);
 717
 718         // i and cmp should not be the same suffix
 719         assert(l != len-cmp || i+l != len);
 720
 721         // Now we're ready to do a comparison on the next char
 722         if(l+i != len && (
 723            l == len-cmp || // departure from paper algorithm:
 724                            // falling off pattern implies
 725                            // pattern is *greater* in our case
 726            t[i + l] < t[cmp + l]))
 727         {
 728                 // Case 2: Text suffix is less than upper sample suffix
 729                 if(this->sanityCheck()) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
 730                 return true; // suffix at i is less than suffix at cmp
 731         }
 732         else {
 733                 // Case 3: Text suffix is greater than upper sample suffix
 734                 if(this->sanityCheck()) assert(dollarGt(suffix(t, i), suffix(t, cmp)));
 735                 return false; // suffix at i is less than suffix at cmp
 736         }
 737 }
 738
 739 /**
 740  * Retrieve the next block.  This is the most performance-critical part
 741  * of the blockwise suffix sorting process.
 742  */
 743 template<typename TStr>
 744 void KarkkainenBlockwiseSA<TStr>::nextBlock() {
 745         typedef typename Value<TStr>::Type TAlphabet;
 746         String<uint32_t>& bucket = this->_itrBucket;
 747         VMSG_NL("Getting block " << (_cur+1) << " of " << length(_sampleSuffs)+1);
 748         assert(_built);
 749         assert_gt(_dcV, 3);
 750         assert_leq(_cur, length(_sampleSuffs));
 751         const TStr& t = this->text();
 752         uint32_t len = length(t);
 753         // Set up the bucket
 754         clear(bucket);
 755         uint32_t lo = 0xffffffff, hi = 0xffffffff;
 756         if(length(_sampleSuffs) == 0) {
 757                 // Special case: if _sampleSuffs is 0, then multikey-quicksort
 758                 // everything
 759                 VMSG_NL("  No samples; assembling all-inclusive block");
 760                 assert_eq(0, _cur);
 761                 try {
 762                         if(capacity(bucket) < this->bucketSz()) {
 763                                 reserve(bucket, len+1, Exact());
 764                         }
 765                         for(uint32_t i = 0; i < len; i++) append(bucket, i);
 766                 } catch(bad_alloc &e) {
 767                         if(this->_passMemExc) {
 768                                 throw e; // rethrow immediately
 769                         } else {
 770                                 cerr << "Could not allocate a master suffix-array block of " << ((len+1) * 4) << " bytes" << endl
 771                                      << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
 772                                      << "a larger --bmaxdivn" << endl;
 773                                 throw 1;
 774                         }
 775                 }
 776         } else {
 777                 try {
 778                         VMSG_NL("  Reserving size (" << this->bucketSz() << ") for bucket");
 779                         // BTL: Add a +100 fudge factor; there seem to be instances
 780                         // where a bucket ends up having one more elt than bucketSz()
 781                         if(capacity(bucket) < this->bucketSz()+100) {
 782                                 reserve(bucket, this->bucketSz()+100, Exact());
 783                         }
 784                 } catch(bad_alloc &e) {
 785                         if(this->_passMemExc) {
 786                                 throw e; // rethrow immediately
 787                         } else {
 788                                 cerr << "Could not allocate a suffix-array block of " << ((this->bucketSz()+1) * 4) << " bytes" << endl;
 789                                 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
 790                                      << "a larger --bmaxdivn" << endl;
 791                                 throw 1;
 792                         }
 793                 }
 794                 // Select upper and lower bounds from _sampleSuffs[] and
 795                 // calculate the Z array up to the difference-cover periodicity
 796                 // for both.  Be careful about first/last buckets.
 797                 String<uint32_t> zLo, zHi;
 798                 assert_geq(_cur, 0);
 799                 assert_leq(_cur, length(_sampleSuffs));
 800                 bool first = (_cur == 0);
 801                 bool last  = (_cur == length(_sampleSuffs));
 802                 try {
 803                         Timer timer(cout, "  Calculating Z arrays time: ", this->verbose());
 804                         VMSG_NL("  Calculating Z arrays");
 805                         if(!last) {
 806                                 // Not the last bucket
 807                                 assert_lt(_cur, length(_sampleSuffs));
 808                                 hi = _sampleSuffs[_cur];
 809                                 fill(zHi, _dcV, 0, Exact());
 810                                 assert_eq(zHi[0], 0);
 811                                 calcZ(t, hi, zHi, this->verbose(), this->sanityCheck());
 812                         }
 813                         if(!first) {
 814                                 // Not the first bucket
 815                                 assert_gt(_cur, 0);
 816                                 assert_leq(_cur, length(_sampleSuffs));
 817                                 lo = _sampleSuffs[_cur-1];
 818                                 fill(zLo, _dcV, 0, Exact());
 819                                 assert_gt(_dcV, 3);
 820                                 assert_eq(zLo[0], 0);
 821                                 calcZ(t, lo, zLo, this->verbose(), this->sanityCheck());
 822                         }
 823                 } catch(bad_alloc &e) {
 824                         if(this->_passMemExc) {
 825                                 throw e; // rethrow immediately
 826                         } else {
 827                                 cerr << "Could not allocate a z-array of " << (_dcV * 4) << " bytes" << endl;
 828                                 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
 829                                      << "a larger --bmaxdivn" << endl;
 830                                 throw 1;
 831                         }
 832                 }
 833
 834                 // This is the most critical loop in the algorithm; this is where
 835                 // we iterate over all suffixes in the text and pick out those that
 836                 // fall into the current bucket.
 837                 //
 838                 // This loop is based on the SMALLERSUFFIXES function outlined on
 839                 // p7 of the "Fast BWT" paper
 840                 //
 841                 int64_t kHi = -1, kLo = -1;
 842                 int64_t jHi = -1, jLo = -1;
 843                 bool kHiSoft = false, kLoSoft = false;
 844                 assert_eq(0, length(bucket));
 845                 {
 846                         Timer timer(cout, "  Block accumulator loop time: ", this->verbose());
 847                         VMSG_NL("  Entering block accumulator loop:");
 848                         uint32_t lenDiv10 = (len + 9) / 10;
 849                         for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
 850                         uint32_t itenNext = iten + lenDiv10;
 851                         if(ten > 0) VMSG_NL("  " << (ten * 10) << "%");
 852                         for(uint32_t i = iten; i < itenNext && i < len; i++) {
 853                                 assert_lt(jLo, i); assert_lt(jHi, i);
 854                                 // Advance the upper-bound comparison by one character
 855                                 if(i == hi || i == lo) continue; // equal to one of the bookends
 856                                 if(hi != 0xffffffff && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
 857                                         continue; // not in the bucket
 858                                 }
 859                                 if(lo != 0xffffffff && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
 860                                         continue; // not in the bucket
 861                                 }
 862                                 // In the bucket! - add it
 863                                 assert_lt(i, len);
 864                                 try {
 865                                         appendValue(bucket, i);
 866                                 } catch(bad_alloc &e) {
 867                                         if(this->_passMemExc) {
 868                                                 throw e; // rethrow immediately
 869                                         } else {
 870                                                 cerr << "Could not append element to block of " << ((length(bucket)) * 4) << " bytes" << endl;
 871                                                 cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
 872                                                      << "a larger --bmaxdivn" << endl;
 873                                                 throw 1;
 874                                         }
 875                                 }
 876                                 // Not necessarily true; we allow overflowing buckets
 877                                 // since we can't guarantee that a good set of sample
 878                                 // suffixes can be found in a reasonable amount of time
 879                                 //assert_lt(length(bucket), this->bucketSz());
 880                         }
 881                         } // end loop over all suffixes of t
 882                         VMSG_NL("  100%");
 883                 }
 884         } // end else clause of if(length(_sampleSuffs) == 0)
 885         // Sort the bucket
 886         if(length(bucket) > 0) {
 887                 Timer timer(cout, "  Sorting block time: ", this->verbose());
 888                 VMSG_NL("  Sorting block of length " << length(bucket));
 889                 this->qsort(bucket);
 890         }
 891         if(hi != 0xffffffff) {
 892                 // Not the final bucket; throw in the sample on the RHS
 893                 appendValue(bucket, hi);
 894         } else {
 895                 // Final bucket; throw in $ suffix
 896                 appendValue(bucket, len);
 897         }
 898         VMSG_NL("Returning block of " << length(bucket));
 899         _cur++; // advance to next bucket
 900 }
 901
 902 #endif /*BLOCKWISE_SA_H_*/