5 #include <seqan/sequence.h>
6 #include <seqan/index.h> // for LarssonSadakane
7 #include "assert_helpers.h"
8 #include "multikey_qsort.h"
10 #include "auto_array.h"
13 using namespace seqan;
16 #define VMSG_NL(args...) \
17 if(this->verbose()) { \
19 tmp << args << endl; \
20 this->verbose(tmp.str()); \
25 #define VMSG(args...) \
26 if(this->verbose()) { \
29 this->verbose(tmp.str()); \
34 * Routines for calculating, sanity-checking, and dispensing difference
35 * cover samples to clients.
44 uint32_t samples[128];
47 /// Array of Colbourn and Ling calculated difference covers up to
48 /// r = 16 (maxV = 5953)
49 static struct sampleEntry clDCs[16];
50 static bool clDCs_calced = false; /// have clDCs been calculated?
53 * Check that the given difference cover 'ds' actually covers all
54 * differences for a periodicity of v.
57 static bool dcRepOk(T v, String<T>& ds) {
58 // diffs[] records all the differences observed
59 bool *covered = new bool[v];
60 for(T i = 1; i < v; i++) {
63 for(T di = T(); di < length(ds); di++) {
64 for(T dj = di+1; dj < length(ds); dj++) {
65 assert_lt(ds[di], ds[dj]);
66 T d1 = (ds[dj] - ds[di]);
67 T d2 = (ds[di] + v - ds[dj]);
75 for(T i = 1; i < v; i++) {
76 if(covered[i] == false) {
86 * Return true iff each element of ts (with length 'limit') is greater
90 static bool increasing(T* ts, size_t limit) {
91 for(size_t i = 0; i < limit-1; i++) {
92 if(ts[i+1] <= ts[i]) return false;
98 * Return true iff the given difference cover covers difference 'diff'
102 static inline bool hasDifference(T *ds, T d, T v, T diff) {
103 // diffs[] records all the differences observed
104 for(T di = T(); di < d; di++) {
105 for(T dj = di+1; dj < d; dj++) {
106 assert_lt(ds[di], ds[dj]);
107 T d1 = (ds[dj] - ds[di]);
108 T d2 = (ds[di] + v - ds[dj]);
111 if(d1 == diff || d2 == diff) return true;
118 * Exhaustively calculate optimal difference cover samples for v = 4,
119 * 8, 16, 32, 64, 128, 256 and store results in p2DCs[]
122 void calcExhaustiveDC(T i, bool verbose = false, bool sanityCheck = false) {
124 bool *diffs = new bool[v];
125 // v is the target period
126 T ld = (T)ceil(sqrt(v));
127 // ud is the upper bound on |D|
129 // for all possible |D|s
133 for(d = ld; d <= ud+1; d++) {
134 // for all possible |D| samples
136 for(T j = 0; j < d; j++) {
139 assert(increasing(ds, d));
142 for(T t = 1; t < v; t++) {
146 // diffs[] records all the differences observed
147 for(T di = 0; di < d; di++) {
148 for(T dj = di+1; dj < d; dj++) {
149 assert_lt(ds[di], ds[dj]);
150 T d1 = (ds[dj] - ds[di]);
151 T d2 = (ds[di] + v - ds[dj]);
156 if(!diffs[d1]) diffCnt++; diffs[d1] = true;
157 if(!diffs[d2]) diffCnt++; diffs[d2] = true;
160 // Do we observe all possible differences (except 0)
163 // Yes, all differences are covered
167 // (Following is commented out because it turns out
169 // Find a missing difference
170 //uint32_t missing = 0xffffffff;
171 //for(uint32_t t = 1; t < v; t++) {
172 // if(diffs[t] == false) {
173 // missing = diffs[t];
177 //assert_neq(missing, 0xffffffff);
178 assert(increasing(ds, d));
179 bool advanced = false;
180 bool keepGoing = false;
183 for(T bd = d-1; bd > 1; bd--) {
185 if(ds[bd] < v-1-dif) {
187 assert_neq(0, ds[bd]);
188 // Reset subsequent ones
189 for(T bdi = bd+1; bdi < d; bdi++) {
190 assert_eq(0, ds[bdi]);
191 ds[bdi] = ds[bdi-1]+1;
192 assert_gt(ds[bdi], ds[bdi-1]);
194 assert(increasing(ds, d));
195 // (Following is commented out because
196 // it turns out it's slow)
197 // See if the new DC has the missing value
198 //if(!hasDifference(ds, d, v, missing)) {
210 // No solution for this |D|
212 assert(increasing(ds, d));
214 } // next sample assignment
222 cout << "Did exhaustive v=" << v << " |D|=" << d << endl;
224 for(T i = 0; i < d; i++) {
226 if(i < d-1) cout << ",";
233 * Routune for calculating the elements of clDCs up to r = 16 using the
234 * technique of Colbourn and Ling.
236 * See http://citeseer.ist.psu.edu/211575.html
238 template <typename T>
239 void calcColbournAndLingDCs(bool verbose = false, bool sanityCheck = false) {
240 for(T r = 0; r < 16; r++) {
241 T maxv = 24*r*r + 36*r + 13; // Corollary 2.3
243 clDCs[r].maxV = maxv;
244 clDCs[r].numSamples = numsamp;
245 memset(clDCs[r].samples, 0, 4 * 128);
247 // clDCs[r].samples[0] = 0;
248 // Fill in the 1^r part of the B series
249 for(i = 1; i < r+1; i++) {
250 clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1;
252 // Fill in the (r + 1)^1 part
253 clDCs[r].samples[r+1] = clDCs[r].samples[r] + r + 1;
254 // Fill in the (2r + 1)^r part
255 for(i = r+2; i < r+2+r; i++) {
256 clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 1;
258 // Fill in the (4r + 3)^(2r + 1) part
259 for(i = r+2+r; i < r+2+r+2*r+1; i++) {
260 clDCs[r].samples[i] = clDCs[r].samples[i-1] + 4*r + 3;
262 // Fill in the (2r + 2)^(r + 1) part
263 for(i = r+2+r+2*r+1; i < r+2+r+2*r+1+r+1; i++) {
264 clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 2;
266 // Fill in the last 1^r part
267 for(i = r+2+r+2*r+1+r+1; i < r+2+r+2*r+1+r+1+r; i++) {
268 clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1;
270 assert_eq(i, numsamp);
273 // diffs[] records all the differences observed
274 bool *diffs = new bool[maxv];
275 for(T i = 0; i < numsamp; i++) {
276 for(T j = i+1; j < numsamp; j++) {
277 T d1 = (clDCs[r].samples[j] - clDCs[r].samples[i]);
278 T d2 = (clDCs[r].samples[i] + maxv - clDCs[r].samples[j]);
285 // Should have observed all possible differences (except 0)
286 for(T i = 1; i < maxv; i++) {
287 if(diffs[i] == false) cout << r << ", " << i << endl;
288 assert(diffs[i] == true);
297 * Entries 4-57 are transcribed from page 6 of Luk and Wong's paper
298 * "Two New Quorum Based Algorithms for Distributed Mutual Exclusion",
299 * which is also used and cited in the Burkhardt and Karkkainen's
300 * papers on difference covers for sorting. These samples are optimal
301 * according to Luk and Wong.
303 * All other entries are generated via the exhaustive algorithm in
304 * calcExhaustiveDC().
306 * The 0 is stored at the end of the sample as an end-of-list marker,
307 * but 0 is also an element of each.
309 * Note that every difference cover has a 0 and a 1. Intuitively,
310 * any optimal difference cover sample can be oriented (i.e. rotated)
311 * such that it includes 0 and 1 as elements.
313 * All samples in this list have been verified to be complete covers.
315 * A value of 0xffffffff in the first column indicates that there is no
316 * sample for that value of v. We do not keep samples for values of v
317 * less than 3, since they are trivial (and the caller probably didn't
318 * mean to ask for it).
320 static uint32_t dc0to64[65][10] = {
335 {1, 2, 3, 7, 0}, // 14
336 {1, 2, 3, 7, 0}, // 15
337 {1, 2, 5, 8, 0}, // 16
338 {1, 2, 4, 12, 0}, // 17
339 {1, 2, 5, 11, 0}, // 18
340 {1, 2, 6, 9, 0}, // 19
341 {1, 2, 3, 6, 10, 0}, // 20
342 {1, 4, 14, 16, 0}, // 21
343 {1, 2, 3, 7, 11, 0}, // 22
344 {1, 2, 3, 7, 11, 0}, // 23
345 {1, 2, 3, 7, 15, 0}, // 24
346 {1, 2, 3, 8, 12, 0}, // 25
347 {1, 2, 5, 9, 15, 0}, // 26
348 {1, 2, 5, 13, 22, 0}, // 27
349 {1, 4, 15, 20, 22, 0}, // 28
350 {1, 2, 3, 4, 9, 14, 0}, // 29
351 {1, 2, 3, 4, 9, 19, 0}, // 30
352 {1, 3, 8, 12, 18, 0}, // 31
353 {1, 2, 3, 7, 11, 19, 0}, // 32
354 {1, 2, 3, 6, 16, 27, 0}, // 33
355 {1, 2, 3, 7, 12, 20, 0}, // 34
356 {1, 2, 3, 8, 12, 21, 0}, // 35
357 {1, 2, 5, 12, 14, 20, 0}, // 36
358 {1, 2, 4, 10, 15, 22, 0}, // 37
359 {1, 2, 3, 4, 8, 14, 23, 0}, // 38
360 {1, 2, 4, 13, 18, 33, 0}, // 39
361 {1, 2, 3, 4, 9, 14, 24, 0}, // 40
362 {1, 2, 3, 4, 9, 15, 25, 0}, // 41
363 {1, 2, 3, 4, 9, 15, 25, 0}, // 42
364 {1, 2, 3, 4, 10, 15, 26, 0}, // 43
365 {1, 2, 3, 6, 16, 27, 38, 0}, // 44
366 {1, 2, 3, 5, 12, 18, 26, 0}, // 45
367 {1, 2, 3, 6, 18, 25, 38, 0}, // 46
368 {1, 2, 3, 5, 16, 22, 40, 0}, // 47
369 {1, 2, 5, 9, 20, 26, 36, 0}, // 48
370 {1, 2, 5, 24, 33, 36, 44, 0}, // 49
371 {1, 3, 8, 17, 28, 32, 38, 0}, // 50
372 {1, 2, 5, 11, 18, 30, 38, 0}, // 51
373 {1, 2, 3, 4, 6, 14, 21, 30, 0}, // 52
374 {1, 2, 3, 4, 7, 21, 29, 44, 0}, // 53
375 {1, 2, 3, 4, 9, 15, 21, 31, 0}, // 54
376 {1, 2, 3, 4, 6, 19, 26, 47, 0}, // 55
377 {1, 2, 3, 4, 11, 16, 33, 39, 0}, // 56
378 {1, 3, 13, 32, 36, 43, 52, 0}, // 57
380 // Generated by calcExhaustiveDC()
381 {1, 2, 3, 7, 21, 33, 37, 50, 0}, // 58
382 {1, 2, 3, 6, 13, 21, 35, 44, 0}, // 59
383 {1, 2, 4, 9, 15, 25, 30, 42, 0}, // 60
384 {1, 2, 3, 7, 15, 25, 36, 45, 0}, // 61
385 {1, 2, 4, 10, 32, 39, 46, 51, 0}, // 62
386 {1, 2, 6, 8, 20, 38, 41, 54, 0}, // 63
387 {1, 2, 5, 14, 16, 34, 42, 59, 0} // 64
391 * Get a difference cover for the requested periodicity v.
393 template <typename T>
394 static String<T> getDiffCover(T v,
395 bool verbose = false,
396 bool sanityCheck = false)
401 // Can we look it up in our hardcoded array?
402 if(v <= 64 && dc0to64[v][0] == 0xffffffff) {
403 if(verbose) cout << "v in hardcoded area, but hardcoded entry was all-fs" << endl;
407 for(size_t i = 0; i < 10; i++) {
408 if(dc0to64[v][i] == 0) break;
409 append(ret, dc0to64[v][i]);
411 if(sanityCheck) assert(dcRepOk(v, ret));
415 // Can we look it up in our calcColbournAndLingDCs array?
417 calcColbournAndLingDCs<uint32_t>(verbose, sanityCheck);
418 assert(clDCs_calced);
420 for(size_t i = 0; i < 16; i++) {
421 if(v <= clDCs[i].maxV) {
422 for(size_t j = 0; j < clDCs[i].numSamples; j++) {
423 T s = clDCs[i].samples[j];
426 for(size_t k = 0; k < length(ret); k++) {
427 if(s == ret[k]) break;
429 insertValue(ret, k, s);
437 if(sanityCheck) assert(dcRepOk(v, ret));
441 cerr << "Error: Could not find a difference cover sample for v=" << v << endl;
446 * Calculate and return a delta map based on the given difference cover
449 template <typename T>
450 static String<T> getDeltaMap(T v, const String<T>& dc) {
451 // Declare anchor-map-related items
454 fill(amap, v, 0xffffffff, Exact());
456 // Print out difference cover (and optionally calculate
458 for(size_t i = 0; i < length(dc); i++) {
459 for(size_t j = i+1; j < length(dc); j++) {
460 assert_gt(dc[j], dc[i]);
461 T diffLeft = dc[j] - dc[i];
462 T diffRight = dc[i] + v - dc[j];
463 assert_lt(diffLeft, v);
464 assert_lt(diffRight, v);
465 if(amap[diffLeft] == 0xffffffff) {
466 amap[diffLeft] = dc[i];
469 if(amap[diffRight] == 0xffffffff) {
470 amap[diffRight] = dc[j];
479 * Return population count (count of all bits set to 1) of i.
482 static unsigned int popCount(T i) {
483 unsigned int cnt = 0;
484 for(size_t j = 0; j < sizeof(T)*8; j++) {
492 * Calculate log-base-2 of i
495 static unsigned int myLog2(T i) {
496 assert_eq(1, popCount(i)); // must be power of 2
497 for(size_t j = 0; j < sizeof(T)*8; j++) {
508 template<typename TStr>
509 class DifferenceCoverSample {
512 DifferenceCoverSample(const TStr& __text,
514 bool __verbose = false,
515 bool __sanity = false,
516 ostream& __logger = cout) :
521 _ds(getDiffCover(_v, _verbose, _sanity)),
522 _dmap(getDeltaMap(_v, _ds)),
528 _vmask(0xffffffff << _log2v),
532 assert_eq(1, popCount(_v)); // must be power of 2
533 // Build map from d's to idx's
534 fill(_dInv, _v, 0xffffffff, Exact());
535 for(size_t i = 0; i < length(_ds); i++) _dInv[_ds[i]] = i;
539 * Allocate an amount of memory that simulates the peak memory
540 * usage of the DifferenceCoverSample with the given text and v.
541 * Throws bad_alloc if it's not going to fit in memory. Returns
542 * the approximate number of bytes the Cover takes at all times.
544 static size_t simulateAllocs(const TStr& text, uint32_t v) {
545 String<uint32_t> ds = getDiffCover(v, false /*verbose*/, false /*sanity*/);
546 size_t len = length(text);
547 size_t sPrimeSz = (len / v) * length(ds);
548 // sPrime, sPrimeOrder, _isaPrime all exist in memory at
549 // once and that's the peak
550 AutoArray<uint32_t> aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/));
551 return sPrimeSz * 4; // sPrime array
554 uint32_t v() const { return _v; }
555 uint32_t log2v() const { return _log2v; }
556 uint32_t vmask() const { return _vmask; }
557 uint32_t modv(uint32_t i) const { return i & ~_vmask; }
558 uint32_t divv(uint32_t i) const { return i >> _log2v; }
559 uint32_t d() const { return _d; }
560 bool verbose() const { return _verbose; }
561 bool sanityCheck() const { return _sanity; }
562 const TStr& text() const { return _text; }
563 const String<uint32_t>& ds() const { return _ds; }
564 const String<uint32_t>& dmap() const { return _dmap; }
565 ostream& log() const { return _logger; }
568 uint32_t tieBreakOff(uint32_t i, uint32_t j) const;
569 int64_t breakTie(uint32_t i, uint32_t j) const;
570 bool isCovered(uint32_t i) const;
571 uint32_t rank(uint32_t i) const;
574 * Print out the suffix array such that every sample offset has its
575 * rank filled in and every non-sample offset is shown as '-'.
577 void print(ostream& out) {
578 for(size_t i = 0; i < length(_text); i++) {
584 if(i < length(_text)-1) {
593 void doBuiltSanityCheck() const;
594 void buildSPrime(String<uint32_t>& sPrime);
597 return length(_isaPrime) > 0;
600 void verbose(const string& s) const {
601 if(this->verbose()) {
607 const TStr& _text; // text to sample
608 uint32_t _v; // periodicity of sample
611 String<uint32_t> _ds; // samples: idx -> d
612 String<uint32_t> _dmap; // delta map
613 uint32_t _d; // |D| - size of sample
614 String<uint32_t> _doffs; // offsets into sPrime/isaPrime for each d idx
615 String<uint32_t> _isaPrime; // ISA' array
616 String<uint32_t> _dInv; // Map from d -> idx
623 * Return true iff suffixes with offsets suf1 and suf2 out of host
624 * string 'host' are identical up to depth 'v'.
626 template <typename TStr>
627 static inline bool suffixLt(const TStr& host, uint32_t suf1, uint32_t suf2) {
628 uint32_t hlen = length(host);
629 assert_neq(suf1, suf2);
631 while(suf1 + i < hlen && suf2 + i < hlen) {
632 if(host[suf1+i] < host[suf2+i]) return true;
633 if(host[suf1+i] > host[suf2+i]) return false;
636 if(suf1 + i == hlen) {
637 assert_lt(suf2 + i, hlen);
640 assert_eq(suf2 + i, hlen);
645 * Sanity-check the difference cover by first inverting _isaPrime then
646 * checking that each successive suffix really is less than the next.
648 template <typename TStr>
649 void DifferenceCoverSample<TStr>::doBuiltSanityCheck() const {
650 uint32_t v = this->v();
652 VMSG_NL(" Doing sanity check");
654 String<uint32_t> sorted;
655 fill(sorted, length(_isaPrime), 0xffffffff, Exact());
656 for(size_t di = 0; di < this->d(); di++) {
657 uint32_t d = _ds[di];
659 for(size_t doi = _doffs[di]; doi < _doffs[di+1]; doi++, i++) {
660 assert_eq(0xffffffff, sorted[_isaPrime[doi]]);
661 // Maps the offset of the suffix to its rank
662 sorted[_isaPrime[doi]] = v*i + d;
666 assert_eq(added, length(_isaPrime));
667 for(size_t i = 0; i < length(sorted)-1; i++) {
668 assert(suffixLt(this->text(), sorted[i], sorted[i+1]));
673 * Build the s' array by sampling suffixes (suffix offsets, actually)
674 * from t according to the difference-cover sample and pack them into
675 * an array of machine words in the order dictated by the "mu" mapping
676 * described in Burkhardt.
678 * Also builds _doffs map.
680 template <typename TStr>
681 void DifferenceCoverSample<TStr>::buildSPrime(String<uint32_t>& sPrime) {
682 const TStr& t = this->text();
683 const String<uint32_t>& ds = this->ds();
684 uint32_t tlen = length(t);
685 uint32_t v = this->v();
686 uint32_t d = this->d();
689 // Record where each d section should begin in sPrime
690 uint32_t tlenDivV = this->divv(tlen);
691 uint32_t tlenModV = this->modv(tlen);
692 uint32_t sPrimeSz = 0;
693 assert(empty(_doffs));
694 reserve(_doffs, d+1, Exact());
695 assert_eq(capacity(_doffs), d+1);
696 for(uint32_t di = 0; di < d; di++) {
698 uint32_t sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0);
700 appendValue(_doffs, sPrimeSz);
703 appendValue(_doffs, sPrimeSz);
706 for(size_t i = 0; i < d; i++) {
707 assert_gt(_doffs[i+1], _doffs[i]);
708 uint32_t diff = _doffs[i+1] - _doffs[i];
709 assert(diff == tlenDivV || diff == tlenDivV+1);
713 assert_eq(length(_doffs), d+1);
714 // Size sPrime appropriately
715 reserve(sPrime, sPrimeSz+1, Exact()); // reserve extra slot for LS
716 fill(sPrime, sPrimeSz, 0xffffffff, Exact());
717 // Slot suffixes from text into sPrime according to the mu
718 // mapping; where the mapping would leave a blank, insert a 0
721 for(uint32_t ti = 0; ti <= tlen; ti += v) {
722 for(uint32_t di = 0; di < d; di++) {
723 uint32_t tti = ti + ds[di];
724 if(tti > tlen) break;
725 uint32_t spi = _doffs[di] + i;
726 assert_lt(spi, _doffs[di+1]);
727 assert_leq(tti, tlen);
728 assert_lt(spi, sPrimeSz);
729 assert_eq(0xffffffff, sPrime[spi]);
730 sPrime[spi] = tti; added++;
734 assert_eq(added, sPrimeSz);
738 * Return true iff suffixes with offsets suf1 and suf2 out of host
739 * string 'host' are identical up to depth 'v'.
741 template <typename TStr>
742 static inline bool suffixSameUpTo(const TStr& host,
747 for(uint32_t i = 0; i < v; i++) {
748 bool endSuf1 = suf1+i >= length(host);
749 bool endSuf2 = suf2+i >= length(host);
750 if((endSuf1 && !endSuf2) || (!endSuf1 && endSuf2)) return false;
751 if(endSuf1 && endSuf2) return true;
752 if(host[suf1+i] != host[suf2+i]) return false;
758 * Calculates a ranking of all suffixes in the sample and stores them,
759 * packed according to the mu mapping, in _isaPrime.
761 template <typename TStr>
762 void DifferenceCoverSample<TStr>::build() {
763 // Local names for relevant types
764 typedef typename Value<TStr>::Type TAlphabet;
765 VMSG_NL("Building DifferenceCoverSample");
766 // Local names for relevant data
767 const TStr& t = this->text();
768 uint32_t v = this->v();
771 String<uint32_t> sPrime;
772 VMSG_NL(" Building sPrime");
774 assert_gt(length(sPrime), 0);
775 assert_leq(length(sPrime), length(t)+1); // +1 is because of the end-cap
776 uint32_t nextRank = 0;
778 VMSG_NL(" Building sPrimeOrder");
779 String<uint32_t> sPrimeOrder;
780 reserve(sPrimeOrder, length(sPrime)+1, Exact()); // reserve extra slot for LS
781 resize(sPrimeOrder, length(sPrime), Exact());
782 for(size_t i = 0; i < length(sPrimeOrder); i++) {
785 // sPrime now holds suffix-offsets for DC samples.
787 Timer timer(cout, " V-Sorting samples time: ", this->verbose());
788 VMSG_NL(" V-Sorting samples");
789 // Extract backing-store array from sPrime and sPrimeOrder;
790 // the mkeyQSortSuf2 routine works on the array for maximum
792 uint32_t *sPrimeArr = (uint32_t*)begin(sPrime);
793 size_t slen = length(sPrime);
794 assert_eq(sPrimeArr[0], sPrime[0]);
795 assert_eq(sPrimeArr[slen-1], sPrime[slen-1]);
796 uint32_t *sPrimeOrderArr = (uint32_t*)begin(sPrimeOrder);
797 assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]);
798 assert_eq(sPrimeOrderArr[slen-1], sPrimeOrder[slen-1]);
799 // Sort sample suffixes up to the vth character using a
800 // multikey quicksort. Sort time is proportional to the
801 // number of samples times v. It isn't quadratic.
802 // sPrimeOrder is passed in as a swapping partner for
803 // sPrimeArr, i.e., every time the multikey qsort swaps
804 // elements in sPrime, it swaps the same elements in
805 // sPrimeOrder too. This allows us to easily reconstruct
806 // what the sort did.
807 mkeyQSortSuf2(t, sPrimeArr, slen, sPrimeOrderArr,
808 ValueSize<TAlphabet>::VALUE,
809 this->verbose(), this->sanityCheck(), v);
810 // Make sure sPrime and sPrimeOrder are consistent with
811 // their respective backing-store arrays
812 assert_eq(sPrimeArr[0], sPrime[0]);
813 assert_eq(sPrimeArr[slen-1], sPrime[slen-1]);
814 assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]);
815 assert_eq(sPrimeOrderArr[slen-1], sPrimeOrder[slen-1]);
817 // Now assign the ranking implied by the sorted sPrime/sPrimeOrder
818 // arrays back into sPrime.
819 VMSG_NL(" Allocating rank array");
820 reserve(_isaPrime, length(sPrime)+1, Exact());
821 fill(_isaPrime, length(sPrime), 0xffffffff, Exact());
822 assert_gt(length(_isaPrime), 0);
824 Timer timer(cout, " Ranking v-sort output time: ", this->verbose());
825 VMSG_NL(" Ranking v-sort output");
826 for(size_t i = 0; i < length(sPrime)-1; i++) {
827 // Place the appropriate ranking
828 _isaPrime[sPrimeOrder[i]] = nextRank;
829 // If sPrime[i] and sPrime[i+1] are identical up to v, then we
830 // should give the next suffix the same rank
831 if(!suffixSameUpTo(t, sPrime[i], sPrime[i+1], v)) nextRank++;
833 _isaPrime[sPrimeOrder[length(sPrime)-1]] = nextRank; // finish off
835 // sPrimeOrder is destroyed
836 // All the information we need is now in _isaPrime
839 // Check that all ranks are sane
840 for(size_t i = 0; i < length(_isaPrime); i++) {
841 assert_neq(_isaPrime[i], 0xffffffff);
842 assert_lt(_isaPrime[i], length(_isaPrime));
845 // Now pass the sPrimeRanks[] array to LarssonSadakane (in SeqAn).
846 append(_isaPrime, length(_isaPrime));
847 append(sPrime, length(sPrime));
849 Timer timer(cout, " Invoking Larsson-Sadakane on ranks time: ", this->verbose());
850 VMSG_NL(" Invoking Larsson-Sadakane on ranks");
851 createSuffixArray(sPrime, _isaPrime, LarssonSadakane(), length(_isaPrime));
853 // sPrime now contains the suffix array (which we ignore)
854 assert_eq(length(_isaPrime), length(sPrime));
855 assert_gt(length(_isaPrime), 0);
856 // chop off final character of _isaPrime
857 resize(_isaPrime, length(_isaPrime)-1);
858 // Subtract 1 from each isaPrime (to adjust for LarssonSadakane
859 // always ranking the final suffix as lexicographically first)
860 for(size_t i = 0; i < length(_isaPrime); i++) {
863 VMSG_NL(" Sanity-checking and returning");
866 //if(this->verbose()) print(cout);
867 if(this->sanityCheck()) doBuiltSanityCheck();
871 * Return true iff index i within the text is covered by the difference
872 * cover sample. Allow i to be off the end of the text; simplifies
875 template <typename TStr>
876 bool DifferenceCoverSample<TStr>::isCovered(uint32_t i) const {
878 uint32_t modi = this->modv(i);
879 assert_lt(modi, length(_dInv));
880 return _dInv[modi] != 0xffffffff;
884 * Given a text offset that's covered, return its lexicographical rank
885 * among the sample suffixes.
887 template <typename TStr>
888 uint32_t DifferenceCoverSample<TStr>::rank(uint32_t i) const {
890 assert_lt(i, length(this->text()));
891 uint32_t imodv = this->modv(i);
892 assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample
893 uint32_t ioff = this->divv(i);
894 assert_lt(ioff, _doffs[_dInv[imodv]+1] - _doffs[_dInv[imodv]]);
895 uint32_t isaIIdx = _doffs[_dInv[imodv]] + ioff;
896 assert_lt(isaIIdx, length(_isaPrime));
897 uint32_t isaPrimeI = _isaPrime[isaIIdx];
898 assert_leq(isaPrimeI, length(_isaPrime));
903 * Return: < 0 if suffix i is lexicographically less than suffix j; > 0
904 * if suffix j is lexicographically greater.
906 template <typename TStr>
907 int64_t DifferenceCoverSample<TStr>::breakTie(uint32_t i, uint32_t j) const {
910 assert_lt(i, length(this->text()));
911 assert_lt(j, length(this->text()));
912 uint32_t imodv = this->modv(i);
913 uint32_t jmodv = this->modv(j);
914 assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample
915 assert_neq(0xffffffff, _dInv[jmodv]); // must be in the sample
916 uint32_t dimodv = _dInv[imodv];
917 uint32_t djmodv = _dInv[jmodv];
918 uint32_t ioff = this->divv(i);
919 uint32_t joff = this->divv(j);
920 assert_lt(dimodv+1, length(_doffs));
921 assert_lt(djmodv+1, length(_doffs));
922 // assert_lt: expected (32024) < (0)
923 assert_lt(ioff, _doffs[dimodv+1] - _doffs[dimodv]);
924 assert_lt(joff, _doffs[djmodv+1] - _doffs[djmodv]);
925 uint32_t isaIIdx = _doffs[dimodv] + ioff;
926 uint32_t isaJIdx = _doffs[djmodv] + joff;
927 assert_lt(isaIIdx, length(_isaPrime));
928 assert_lt(isaJIdx, length(_isaPrime));
929 assert_neq(isaIIdx, isaJIdx); // ranks must be unique
930 uint32_t isaPrimeI = _isaPrime[isaIIdx];
931 uint32_t isaPrimeJ = _isaPrime[isaJIdx];
932 assert_neq(isaPrimeI, isaPrimeJ); // ranks must be unique
933 assert_leq(isaPrimeI, length(_isaPrime));
934 assert_leq(isaPrimeJ, length(_isaPrime));
935 return (int64_t)isaPrimeI - (int64_t)isaPrimeJ;
939 * Given i, j, return the number of additional characters that need to
940 * be compared before the difference cover can break the tie.
942 template <typename TStr>
943 uint32_t DifferenceCoverSample<TStr>::tieBreakOff(uint32_t i, uint32_t j) const {
944 const TStr& t = this->text();
945 const String<uint32_t>& dmap = this->dmap();
947 // It's actually convenient to allow this, but we're permitted to
948 // return nonsense in that case
949 if(t[i] != t[j]) return 0xffffffff;
950 //assert_eq(t[i], t[j]); // if they're unequal, there's no tie to break
951 uint32_t v = this->v();
953 assert_lt(i, length(t));
954 assert_lt(j, length(t));
955 uint32_t imod = this->modv(i);
956 uint32_t jmod = this->modv(j);
957 uint32_t diffLeft = (jmod >= imod)? (jmod - imod) : (jmod + v - imod);
958 uint32_t diffRight = (imod >= jmod)? (imod - jmod) : (imod + v - jmod);
959 assert_lt(diffLeft, length(dmap));
960 assert_lt(diffRight, length(dmap));
961 uint32_t destLeft = dmap[diffLeft]; // offset where i needs to be
962 uint32_t destRight = dmap[diffRight]; // offset where i needs to be
963 assert(isCovered(destLeft));
964 assert(isCovered(destLeft+diffLeft));
965 assert(isCovered(destRight));
966 assert(isCovered(destRight+diffRight));
967 assert_lt(destLeft, v);
968 assert_lt(destRight, v);
969 uint32_t deltaLeft = (destLeft >= imod)? (destLeft - imod) : (destLeft + v - imod);
970 if(deltaLeft == v) deltaLeft = 0;
971 uint32_t deltaRight = (destRight >= jmod)? (destRight - jmod) : (destRight + v - jmod);
972 if(deltaRight == v) deltaRight = 0;
973 assert_lt(deltaLeft, v);
974 assert_lt(deltaRight, v);
975 assert(isCovered(i+deltaLeft));
976 assert(isCovered(j+deltaLeft));
977 assert(isCovered(i+deltaRight));
978 assert(isCovered(j+deltaRight));
979 return min(deltaLeft, deltaRight);
982 #endif /*DIFF_SAMPLE_H_*/