1 #ifndef MULTIKEY_QSORT_H_
2 #define MULTIKEY_QSORT_H_
5 #include <seqan/basic.h>
6 #include <seqan/file.h>
7 #include <seqan/sequence.h>
8 #include "sequence_io.h"
10 #include "assert_helpers.h"
11 #include "diff_sample.h"
14 using namespace seqan;
17 * Swap elements a and b in seqan::String s
19 template <typename TStr, typename TPos>
20 static inline void swap(TStr& s, size_t slen, TPos a, TPos b) {
21 typedef typename Value<TStr>::Type TAlphabet;
30 * Swap elements a and b in array s
32 template <typename TVal, typename TPos>
33 static inline void swap(TVal* s, size_t slen, TPos a, TPos b) {
42 * Helper macro for swapping elements a and b in seqan::String s. Does
43 * some additional sainty checking w/r/t begin and end (which are
44 * parameters to the sorting routines below).
46 #define SWAP(s, a, b) { \
47 assert_geq(a, begin); \
48 assert_geq(b, begin); \
51 swap(s, slen, a, b); \
55 * Helper macro for swapping the same pair of elements a and b in
56 * two different seqan::Strings s and s2. This is a helpful variant
57 * if, for example, the caller would like to see how their input was
58 * permuted by the sort routine (in that case, the caller would let s2
59 * be an array s2[] where s2 is the same length as s and s2[i] = i).
61 #define SWAP2(s, s2, a, b) { \
63 swap(s2, slen, a, b); \
66 #define SWAP1(s, s2, a, b) { \
71 * Helper macro that swaps a range of elements [i, i+n) with another
72 * range [j, j+n) in seqan::String s.
74 #define VECSWAP(s, i, j, n) { \
75 if(n > 0) { vecswap(s, slen, i, j, n, begin, end); } \
79 * Helper macro that swaps a range of elements [i, i+n) with another
80 * range [j, j+n) both in seqan::String s and seqan::String s2.
82 #define VECSWAP2(s, s2, i, j, n) { \
83 if(n > 0) { vecswap2(s, slen, s2, i, j, n, begin, end); } \
87 * Helper function that swaps a range of elements [i, i+n) with another
88 * range [j, j+n) in seqan::String s. begin and end represent the
89 * current range under consideration by the caller (one of the
90 * recursive multikey_quicksort routines below).
92 template <typename TStr, typename TPos>
93 static inline void vecswap(TStr& s, size_t slen, TPos i, TPos j, TPos n, TPos begin, TPos end) {
102 assert_geq(a, begin);
103 assert_geq(b, begin);
110 template <typename TVal, typename TPos>
111 static inline void vecswap(TVal *s, size_t slen, TPos i, TPos j, TPos n, TPos begin, TPos end) {
112 assert_geq(i, begin);
113 assert_geq(j, begin);
120 assert_geq(a, begin);
121 assert_geq(b, begin);
129 * Helper function that swaps a range of elements [i, i+n) with another
130 * range [j, j+n) both in seqan::String s and seqan::String s2. begin
131 * and end represent the current range under consideration by the
132 * caller (one of the recursive multikey_quicksort routines below).
134 template <typename TStr, typename TPos>
135 static inline void vecswap2(TStr& s, size_t slen, TStr& s2, TPos i, TPos j, TPos n, TPos begin, TPos end) {
136 assert_geq(i, begin);
137 assert_geq(j, begin);
144 assert_geq(a, begin);
145 assert_geq(b, begin);
149 swap(s2, slen, a, b);
153 template <typename TVal, typename TPos>
154 static inline void vecswap2(TVal* s, size_t slen, TVal* s2, TPos i, TPos j, TPos n, TPos begin, TPos end) {
155 assert_geq(i, begin);
156 assert_geq(j, begin);
163 assert_geq(a, begin);
164 assert_geq(b, begin);
168 swap(s2, slen, a, b);
172 /// Retrieve an int-ized version of the ath character of string s, or,
173 /// if a goes off the end of s, return a (user-specified) int greater
174 /// than any TAlphabet character - 'hi'.
175 #define CHAR_AT(ss, aa) ((length(s[ss]) > aa) ? (int)(Dna)(s[ss][aa]) : hi)
177 /// Retrieve an int-ized version of the ath character of string s, or,
178 /// if a goes off the end of s, return a (user-specified) int greater
179 /// than any TAlphabet character - 'hi'.
180 #define CHAR_AT_SUF(si, off) (((off+s[si]) < hlen) ? ((int)(Dna)(host[off+s[si]])) : (hi))
182 /// Retrieve an int-ized version of the ath character of string s, or,
183 /// if a goes off the end of s, return a (user-specified) int greater
184 /// than any TAlphabet character - 'hi'.
186 #define CHAR_AT_SUF_U8(si, off) char_at_suf_u8(host, hlen, s, si, off, hi)
188 // Note that CHOOSE_AND_SWAP_RANDOM_PIVOT is unused
189 #define CHOOSE_AND_SWAP_RANDOM_PIVOT(sw, ch) { \
190 /* Note: rand() didn't really cut it here; it seemed to run out of */ \
191 /* randomness and, after a time, returned the same thing over and */ \
193 a = (rand() % n) + begin; /* choose pivot between begin and end */ \
194 assert_lt(a, end); assert_geq(a, begin); \
195 sw(s, s2, begin, a); /* move pivot to beginning */ \
199 * Ad-hoc DNA-centric way of choose a pretty good pivot without using
200 * the pseudo-random number generator. We try to get a 1 or 2 if
201 * possible, since they'll split things more evenly than a 0 or 4. We
202 * also avoid swapping in the event that we choose the first element.
204 #define CHOOSE_AND_SWAP_SMART_PIVOT(sw, ch) { \
205 a = begin; /* choose first elt */ \
206 /* now try to find a better elt */ \
207 if(n >= 5) { /* n is the difference between begin and end */ \
208 if (ch(begin+1, depth) == 1 || ch(begin+1, depth) == 2) a = begin+1; \
209 else if(ch(begin+2, depth) == 1 || ch(begin+2, depth) == 2) a = begin+2; \
210 else if(ch(begin+3, depth) == 1 || ch(begin+3, depth) == 2) a = begin+3; \
211 else if(ch(begin+4, depth) == 1 || ch(begin+4, depth) == 2) a = begin+4; \
212 if(a != begin) sw(s, s2, begin, a); /* move pivot to beginning */ \
214 /* the element at [begin] now holds the pivot value */ \
217 #define CHOOSE_AND_SWAP_PIVOT CHOOSE_AND_SWAP_SMART_PIVOT
222 * Assert that the range of chars at depth 'depth' in strings 'begin'
223 * to 'end' in string-of-suffix-offsets s is parititioned properly
224 * according to the ternary paritioning strategy of Bentley and McIlroy
225 * (*prior to* swapping the = regions to the center)
227 template<typename THost>
228 bool assertPartitionedSuf(const THost& host,
237 size_t hlen = length(host);
238 int state = 0; // 0 -> 1st = section, 1 -> < section, 2 -> > section, 3 -> 2nd = section
239 for(size_t i = begin; i < end; i++) {
242 if (CHAR_AT_SUF(i, depth) < pivot) { state = 1; break; }
243 else if (CHAR_AT_SUF(i, depth) > pivot) { state = 2; break; }
244 assert_eq(CHAR_AT_SUF(i, depth), pivot); break;
246 if (CHAR_AT_SUF(i, depth) > pivot) { state = 2; break; }
247 else if (CHAR_AT_SUF(i, depth) == pivot) { state = 3; break; }
248 assert_lt(CHAR_AT_SUF(i, depth), pivot); break;
250 if (CHAR_AT_SUF(i, depth) == pivot) { state = 3; break; }
251 assert_gt(CHAR_AT_SUF(i, depth), pivot); break;
253 assert_eq(CHAR_AT_SUF(i, depth), pivot); break;
260 * Assert that the range of chars at depth 'depth' in strings 'begin'
261 * to 'end' in string-of-suffix-offsets s is parititioned properly
262 * according to the ternary paritioning strategy of Bentley and McIlroy
263 * (*after* swapping the = regions to the center)
265 template<typename THost>
266 bool assertPartitionedSuf2(const THost& host,
275 size_t hlen = length(host);
276 int state = 0; // 0 -> < section, 1 -> = section, 2 -> > section
277 for(size_t i = begin; i < end; i++) {
280 if (CHAR_AT_SUF(i, depth) == pivot) { state = 1; break; }
281 else if (CHAR_AT_SUF(i, depth) > pivot) { state = 2; break; }
282 assert_lt(CHAR_AT_SUF(i, depth), pivot); break;
284 if (CHAR_AT_SUF(i, depth) > pivot) { state = 2; break; }
285 assert_eq(CHAR_AT_SUF(i, depth), pivot); break;
287 assert_gt(CHAR_AT_SUF(i, depth), pivot); break;
295 * Assert that the seqan::String s of suffix offsets into seqan::String
296 * 'host' is a seemingly legitimate suffix-offset list (at this time,
297 * we just check that it doesn't list any suffix twice).
299 static void sanityCheckInputSufs(uint32_t *s, size_t slen) {
301 for(size_t i = 0; i < slen; i++) {
302 // Actually, it's convenient to allow the caller to provide
303 // suffix offsets thare are off the end of the host string.
304 // See, e.g., build() in diff_sample.cpp.
305 //assert_lt(s[i], length(host));
306 for(size_t j = i+1; j < slen; j++) {
307 assert_neq(s[i], s[j]);
313 * Assert that the seqan::String s of suffix offsets into seqan::String
314 * 'host' really are in lexicographical order up to depth 'upto'.
316 template <typename T>
317 void sanityCheckOrderedSufs(const T& host,
323 uint32_t upper = 0xffffffff)
325 assert_lt(s[0], hlen);
326 upper = min<uint32_t>(upper, slen-1);
327 for(size_t i = lower; i < upper; i++) {
328 // Allow s[i+t] to point off the end of the string; this is
329 // convenient for some callers
330 if(s[i+1] >= hlen) continue;
331 if(upto == 0xffffffff) {
332 assert(dollarLt(suffix(host, s[i]), suffix(host, s[i+1])));
335 if(prefix(suffix(host, s[i]), upto) > prefix(suffix(host, s[i+1]), upto)) {
336 // operator > treats shorter strings as
337 // lexicographically smaller, but we want to opposite
338 assert(isPrefix(suffix(host, s[i+1]), suffix(host, s[i])));
346 * Main multikey quicksort function for suffixes. Based on Bentley &
347 * Sedgewick's algorithm on p.5 of their paper "Fast Algorithms for
348 * Sorting and Searching Strings". That algorithm has been extended in
351 * 1. Deal with keys of different lengths by checking bounds and
352 * considering off-the-end values to be 'hi' (b/c our goal is the
353 * BWT transform, we're biased toward considring prefixes as
354 * lexicographically *greater* than their extensions).
355 * 2. The multikey_qsort_suffixes version takes a single host string
356 * and a list of suffix offsets as input. This reduces memory
357 * footprint compared to an approach that treats its input
358 * generically as a set of strings (not necessarily suffixes), thus
359 * requiring that we store at least two integers worth of
360 * information for each string.
361 * 3. Sorting functions take an extra "upto" parameter that upper-
362 * bounds the depth to which the function sorts.
364 * TODO: Consult a tie-breaker (like a difference cover sample) if two
365 * keys share a long prefix.
368 void mkeyQSortSuf(const T& host,
376 size_t upto = 0xffffffff)
378 // Helper for making the recursive call; sanity-checks arguments to
379 // make sure that the problem actually got smaller.
380 #define MQS_RECURSE_SUF(nbegin, nend, ndepth) { \
381 assert(nbegin > begin || nend < end || ndepth > depth); \
382 if(ndepth < upto) { /* don't exceed depth of 'upto' */ \
383 mkeyQSortSuf(host, hlen, s, slen, hi, nbegin, nend, ndepth, upto); \
386 assert_leq(begin, slen);
387 assert_leq(end, slen);
388 size_t a, b, c, d, /*e,*/ r;
389 size_t n = end - begin;
390 if(n <= 1) return; // 1-element list already sorted
391 CHOOSE_AND_SWAP_PIVOT(SWAP1, CHAR_AT_SUF); // pick pivot, swap it into [begin]
392 int v = CHAR_AT_SUF(begin, depth); // v <- randomly-selected pivot value
395 bool stillInBounds = false;
396 for(size_t i = begin; i < end; i++) {
397 if(depth < (hlen-s[i])) {
398 stillInBounds = true;
400 } else { /* already fell off this suffix */ }
402 assert(stillInBounds); // >=1 suffix must still be in bounds
408 // Invariant: everything before a is = pivot, everything
409 // between a and b is <
410 int bc = 0; // shouldn't have to init but gcc on Mac complains
411 while(b <= c && v >= (bc = CHAR_AT_SUF(b, depth))) {
417 // Invariant: everything after d is = pivot, everything
418 // between c and d is >
419 int cc = 0; // shouldn't have to init but gcc on Mac complains
420 while(b <= c && v <= (cc = CHAR_AT_SUF(c, depth))) {
431 assert(a > begin || c < end-1); // there was at least one =s
432 assert_lt(d-c, n); // they can't all have been > pivot
433 assert_lt(b-a, n); // they can't all have been < pivot
434 assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth)); // check pre-=-swap invariant
435 r = min(a-begin, b-a); VECSWAP(s, begin, b-r, r); // swap left = to center
436 r = min(d-c, end-d-1); VECSWAP(s, b, end-r, r); // swap right = to center
437 assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
438 r = b-a; // r <- # of <'s
440 MQS_RECURSE_SUF(begin, begin + r, depth); // recurse on <'s
442 // Do not recurse on ='s if the pivot was the off-the-end value;
443 // they're already fully sorted
445 MQS_RECURSE_SUF(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
447 r = d-c; // r <- # of >'s excluding those exhausted
448 if(r > 0 && v < hi-1) {
449 MQS_RECURSE_SUF(end-r, end, depth); // recurse on >'s
454 * Toplevel function for multikey quicksort over suffixes.
457 void mkeyQSortSuf(const T& host,
461 bool verbose = false,
462 bool sanityCheck = false,
463 size_t upto = 0xffffffff)
465 size_t hlen = length(host);
467 if(sanityCheck) sanityCheckInputSufs(s, slen);
468 mkeyQSortSuf(host, hlen, s, slen, hi, (size_t)0, slen, (size_t)0, upto);
469 if(sanityCheck) sanityCheckOrderedSufs(host, hlen, s, slen, upto);
473 * Just like mkeyQSortSuf but all swaps are applied to s2 as well as s.
474 * This is a helpful variant if, for example, the caller would like to
475 * see how their input was permuted by the sort routine (in that case,
476 * the caller would let s2 be an array s2[] where s2 is the same length
477 * as s and s2[i] = i).
480 void mkeyQSortSuf2(const T& host,
489 size_t upto = 0xffffffff)
491 // Helper for making the recursive call; sanity-checks arguments to
492 // make sure that the problem actually got smaller.
493 #define MQS_RECURSE_SUF_DS(nbegin, nend, ndepth) { \
494 assert(nbegin > begin || nend < end || ndepth > depth); \
495 if(ndepth < upto) { /* don't exceed depth of 'upto' */ \
496 mkeyQSortSuf2(host, hlen, s, slen, s2, hi, nbegin, nend, ndepth, upto); \
499 assert_leq(begin, slen);
500 assert_leq(end, slen);
501 size_t a, b, c, d, /*e,*/ r;
502 size_t n = end - begin;
503 if(n <= 1) return; // 1-element list already sorted
504 CHOOSE_AND_SWAP_PIVOT(SWAP2, CHAR_AT_SUF); // pick pivot, swap it into [begin]
505 int v = CHAR_AT_SUF(begin, depth); // v <- randomly-selected pivot value
508 bool stillInBounds = false;
509 for(size_t i = begin; i < end; i++) {
510 if(depth < (hlen-s[i])) {
511 stillInBounds = true;
513 } else { /* already fell off this suffix */ }
515 assert(stillInBounds); // >=1 suffix must still be in bounds
519 c = d = /*e =*/ end-1;
521 // Invariant: everything before a is = pivot, everything
522 // between a and b is <
523 int bc = 0; // shouldn't have to init but gcc on Mac complains
524 while(b <= c && v >= (bc = CHAR_AT_SUF(b, depth))) {
526 SWAP2(s, s2, a, b); a++;
530 // Invariant: everything after d is = pivot, everything
531 // between c and d is >
532 int cc = 0; // shouldn't have to init but gcc on Mac complains
533 while(b <= c && v <= (cc = CHAR_AT_SUF(c, depth))) {
535 SWAP2(s, s2, c, d); d--; /*e--;*/
537 //else if(c == e && v == hi) e--;
545 assert(a > begin || c < end-1); // there was at least one =s
546 assert_lt(/*e*/d-c, n); // they can't all have been > pivot
547 assert_lt(b-a, n); // they can't all have been < pivot
548 assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth)); // check pre-=-swap invariant
549 r = min(a-begin, b-a); VECSWAP2(s, s2, begin, b-r, r); // swap left = to center
550 r = min(d-c, end-d-1); VECSWAP2(s, s2, b, end-r, r); // swap right = to center
551 assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
552 r = b-a; // r <- # of <'s
554 MQS_RECURSE_SUF_DS(begin, begin + r, depth); // recurse on <'s
556 // Do not recurse on ='s if the pivot was the off-the-end value;
557 // they're already fully sorted
559 MQS_RECURSE_SUF_DS(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
561 r = d-c; // r <- # of >'s excluding those exhausted
562 if(r > 0 && v < hi-1) {
563 MQS_RECURSE_SUF_DS(end-r, end, depth); // recurse on >'s
568 * Toplevel function for multikey quicksort over suffixes with double
572 void mkeyQSortSuf2(const T& host,
577 bool verbose = false,
578 bool sanityCheck = false,
579 size_t upto = 0xffffffff)
581 size_t hlen = length(host);
582 if(sanityCheck) sanityCheckInputSufs(s, slen);
583 uint32_t *sOrig = NULL;
585 sOrig = new uint32_t[slen];
586 memcpy(sOrig, s, 4 * slen);
588 mkeyQSortSuf2(host, hlen, s, slen, s2, hi, (size_t)0, slen, (size_t)0, upto);
590 sanityCheckOrderedSufs(host, hlen, s, slen, upto);
591 for(size_t i = 0; i < slen; i++) {
592 assert_eq(s[i], sOrig[s2[i]]);
597 // Ugly but necessary; otherwise the compiler chokes dramatically on
598 // the DifferenceCoverSample<> template args to the next few functions
599 template <typename T>
600 class DifferenceCoverSample;
605 template<typename T1, typename T2> inline
606 bool sufDcLt(const T1& host,
609 const DifferenceCoverSample<T1>& dc,
610 bool sanityCheck = false)
612 uint32_t diff = dc.tieBreakOff(s1, s2);
613 assert_lt(diff, dc.v());
614 assert_lt(diff, length(host)-s1);
615 assert_lt(diff, length(host)-s2);
617 for(uint32_t i = 0; i < diff; i++) {
618 assert_eq(host[s1+i], host[s2+i]);
621 bool ret = dc.breakTie(s1+diff, s2+diff) < 0;
623 if(sanityCheck && ret != dollarLt(suffix(host, s1), suffix(host, s2))) {
633 template<typename T> inline
634 void qsortSufDc(const T& host,
638 const DifferenceCoverSample<T>& dc,
641 bool sanityCheck = false)
643 assert_leq(end, slen);
644 assert_lt(begin, slen);
645 assert_gt(end, begin);
646 size_t n = end - begin;
647 if(n <= 1) return; // 1-element list already sorted
648 // Note: rand() didn't really cut it here; it seemed to run out of
649 // randomness and, after a time, returned the same thing over and
651 size_t a = (rand() % n) + begin; // choose pivot between begin and end
653 assert_geq(a, begin);
654 SWAP(s, end-1, a); // move pivot to end
656 for(size_t i = begin; i < end-1; i++) {
657 if(sufDcLt(host, s[i], s[end-1], dc, sanityCheck)) {
659 assert(dollarLt(suffix(host, s[i]), suffix(host, s[end-1])));
660 assert_lt(begin + cur, end-1);
661 SWAP(s, i, begin + cur);
665 // Put pivot into place
666 assert_lt(cur, end-begin);
667 SWAP(s, end-1, begin+cur);
668 if(begin+cur > begin) qsortSufDc(host, hlen, s, slen, dc, begin, begin+cur);
669 if(end > begin+cur+1) qsortSufDc(host, hlen, s, slen, dc, begin+cur+1, end);
673 * Toplevel function for multikey quicksort over suffixes.
675 template<typename T1, typename T2>
676 void mkeyQSortSufDcU8(const T1& seqanHost,
681 const DifferenceCoverSample<T1>& dc,
683 bool verbose = false,
684 bool sanityCheck = false)
686 if(sanityCheck) sanityCheckInputSufs(s, slen);
687 mkeyQSortSufDcU8(seqanHost, host, hlen, s, slen, dc, hi, 0, slen, 0, sanityCheck);
688 if(sanityCheck) sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff);
692 * Return a boolean indicating whether s1 < s2 using the difference
693 * cover to break the tie.
695 template<typename T1, typename T2> inline
696 bool sufDcLtU8(const T1& seqanHost,
701 const DifferenceCoverSample<T1>& dc,
702 bool sanityCheck = false)
705 uint32_t diff = dc.tieBreakOff(s1, s2);
706 assert_lt(diff, dc.v());
707 assert_lt(diff, hlen-s1);
708 assert_lt(diff, hlen-s2);
710 for(uint32_t i = 0; i < diff; i++) {
711 assert_eq(host[s1+i], host[s2+i]);
714 bool ret = dc.breakTie(s1+diff, s2+diff) < 0;
715 // Sanity-check return value using dollarLt
718 ret != dollarLt(suffix(seqanHost, s1), suffix(seqanHost, s2)))
729 template<typename T1, typename T2> inline
730 void qsortSufDcU8(const T1& seqanHost,
735 const DifferenceCoverSample<T1>& dc,
738 bool sanityCheck = false)
740 assert_leq(end, slen);
741 assert_lt(begin, slen);
742 assert_gt(end, begin);
743 size_t n = end - begin;
744 if(n <= 1) return; // 1-element list already sorted
745 // Note: rand() didn't really cut it here; it seemed to run out of
746 // randomness and, after a time, returned the same thing over and
748 size_t a = (rand() % n) + begin; // choose pivot between begin and end
750 assert_geq(a, begin);
751 SWAP(s, end-1, a); // move pivot to end
753 for(size_t i = begin; i < end-1; i++) {
754 if(sufDcLtU8(seqanHost, host, hlen, s[i], s[end-1], dc, sanityCheck)) {
757 assert(dollarLt(suffix(seqanHost, s[i]), suffix(seqanHost, s[end-1])));
758 assert_lt(begin + cur, end-1);
760 SWAP(s, i, begin + cur);
764 // Put pivot into place
765 assert_lt(cur, end-begin);
766 SWAP(s, end-1, begin+cur);
767 if(begin+cur > begin) qsortSufDcU8(seqanHost, host, hlen, s, slen, dc, begin, begin+cur);
768 if(end > begin+cur+1) qsortSufDcU8(seqanHost, host, hlen, s, slen, dc, begin+cur+1, end);
771 #define BUCKET_SORT_CUTOFF (4 * 1024 * 1024)
772 #define SELECTION_SORT_CUTOFF 6
774 // 5 64-element buckets for bucket-sorting A, C, G, T, $
775 static uint32_t bkts[4][4 * 1024 * 1024];
778 * Straightforwardly obtain a uint8_t-ized version of t[off]. This
779 * works fine as long as TStr is not packed.
781 template<typename TStr>
782 inline uint8_t get_uint8(const TStr& t, uint32_t off) {
787 * For incomprehensible generic-programming reasons, getting a uint8_t
788 * version of a character in a packed String<> requires casting first
789 * to Dna then to uint8_t.
792 inline uint8_t get_uint8(const String<Dna, Packed<> >& t, uint32_t off) {
793 return (uint8_t)(Dna)t[off];
797 * Return character at offset 'off' from the 'si'th suffix in the array
798 * 's' of suffixes. If the character is out-of-bounds, return hi.
800 template<typename TStr>
801 static inline int char_at_suf_u8(const TStr& host,
808 return ((off+s[si]) < hlen) ? get_uint8(host, off+s[si]) : (hi);
811 template<typename T1, typename T2>
812 static void selectionSortSufDcU8(
818 const DifferenceCoverSample<T1>& dc,
823 bool sanityCheck = false)
825 #define ASSERT_SUF_LT(l, r) \
827 !dollarLt(suffix(seqanHost, s[l]), \
828 suffix(seqanHost, s[r]))) { \
829 cout << "l: " << suffixStr(seqanHost, s[l]) << endl; \
830 cout << "r: " << suffixStr(seqanHost, s[r]) << endl; \
834 assert_gt(end, begin+1);
835 assert_leq(end-begin, SELECTION_SORT_CUTOFF);
839 uint32_t off = dc.tieBreakOff(s[begin], s[begin+1]);
840 if(off + s[begin] >= hlen ||
841 off + s[begin+1] >= hlen)
845 if(off != 0xffffffff) {
847 qsortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc,
848 begin, end, sanityCheck);
849 // It's helpful for debugging if we call this here
851 sanityCheckOrderedSufs(seqanHost, hlen, s, slen,
852 0xffffffff, begin, end);
859 assert_leq(v, dc.v());
862 for(size_t i = begin; i < end-1; i++) {
864 uint32_t targoff = depth + s[i];
865 for(size_t j = i+1; j < end; j++) {
867 uint32_t joff = depth + s[j];
869 for(k = 0; k <= lim; k++) {
871 uint8_t jc = (k + joff < hlen) ? get_uint8(host, k + joff) : hi;
872 uint8_t tc = (k + targoff < hlen) ? get_uint8(host, k + targoff) : hi;
873 assert(jc != hi || tc != hi);
875 // the jth suffix is greater than the current
877 ASSERT_SUF_LT(targ, j);
880 // the jth suffix is less than the current smallest
881 // suffix, so update smallest to be j
882 ASSERT_SUF_LT(j, targ);
886 } else if(k == lim) {
887 // Check whether either string ends immediately
888 // after this character
889 assert_leq(k + joff + 1, hlen);
890 assert_leq(k + targoff + 1, hlen);
891 if(k + joff + 1 == hlen) {
893 assert_neq(k + targoff + 1, hlen);
894 ASSERT_SUF_LT(targ, j);
896 } else if(k + targoff + 1 == hlen) {
898 ASSERT_SUF_LT(j, targ);
904 // They're equal so far, keep going
907 // The jth suffix was equal to the current smallest suffix
908 // up to the difference-cover period, so disambiguate with
912 if(sufDcLtU8(seqanHost, host, hlen, s[j], s[targ], dc, sanityCheck)) {
914 assert(!sufDcLtU8(seqanHost, host, hlen, s[targ], s[j], dc, sanityCheck));
915 ASSERT_SUF_LT(j, targ);
919 assert(sufDcLtU8(seqanHost, host, hlen, s[targ], s[j], dc, sanityCheck));
920 ASSERT_SUF_LT(targ, j); // !
925 ASSERT_SUF_LT(targ, i);
931 for(size_t j = i+1; j < end; j++) {
936 sanityCheckOrderedSufs(seqanHost, hlen, s, slen,
937 0xffffffff, begin, end);
941 template<typename T1, typename T2>
942 static void bucketSortSufDcU8(
948 const DifferenceCoverSample<T1>& dc,
953 bool sanityCheck = false)
955 uint32_t cnts[] = { 0, 0, 0, 0, 0 };
956 #define BKT_RECURSE_SUF_DC_U8(nbegin, nend) { \
957 bucketSortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc, hi, \
958 (nbegin), (nend), depth+1, sanityCheck); \
960 assert_gt(end, begin);
961 assert_leq(end-begin, BUCKET_SORT_CUTOFF);
963 if(end == begin+1) return; // 1-element list already sorted
965 // Quicksort the remaining suffixes using difference cover
966 // for constant-time comparisons; this is O(k*log(k)) where
968 qsortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc, begin, end, sanityCheck);
971 if(end-begin <= SELECTION_SORT_CUTOFF) {
972 // Bucket sort remaining items
973 selectionSortSufDcU8(seqanHost, host, hlen, s, slen, dc, hi,
974 begin, end, depth, sanityCheck);
976 sanityCheckOrderedSufs(seqanHost, hlen, s, slen,
977 0xffffffff, begin, end);
981 for(size_t i = begin; i < end; i++) {
982 uint32_t off = depth+s[i];
983 uint8_t c = (off < hlen) ? get_uint8(host, off) : hi;
986 s[begin + cnts[0]++] = s[i];
988 bkts[c-1][cnts[c]++] = s[i];
991 assert_eq(cnts[0] + cnts[1] + cnts[2] + cnts[3] + cnts[4], end - begin);
992 uint32_t cur = begin + cnts[0];
993 if(cnts[1] > 0) { memcpy(&s[cur], bkts[0], cnts[1] << 2); cur += cnts[1]; }
994 if(cnts[2] > 0) { memcpy(&s[cur], bkts[1], cnts[2] << 2); cur += cnts[2]; }
995 if(cnts[3] > 0) { memcpy(&s[cur], bkts[2], cnts[3] << 2); cur += cnts[3]; }
996 if(cnts[4] > 0) { memcpy(&s[cur], bkts[3], cnts[4] << 2); }
997 // This frame is now totally finished with bkts[][], so recursive
998 // callees can safely clobber it; we're not done with cnts[], but
999 // that's local to the stack frame.
1002 BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[0]); cur += cnts[0];
1005 BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[1]); cur += cnts[1];
1008 BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[2]); cur += cnts[2];
1011 BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[3]);
1017 * Main multikey quicksort function for suffixes. Based on Bentley &
1018 * Sedgewick's algorithm on p.5 of their paper "Fast Algorithms for
1019 * Sorting and Searching Strings". That algorithm has been extended in
1022 * 1. Deal with keys of different lengths by checking bounds and
1023 * considering off-the-end values to be 'hi' (b/c our goal is the
1024 * BWT transform, we're biased toward considring prefixes as
1025 * lexicographically *greater* than their extensions).
1026 * 2. The multikey_qsort_suffixes version takes a single host string
1027 * and a list of suffix offsets as input. This reduces memory
1028 * footprint compared to an approach that treats its input
1029 * generically as a set of strings (not necessarily suffixes), thus
1030 * requiring that we store at least two integers worth of
1031 * information for each string.
1032 * 3. Sorting functions take an extra "upto" parameter that upper-
1033 * bounds the depth to which the function sorts.
1035 template<typename T1, typename T2>
1036 void mkeyQSortSufDcU8(const T1& seqanHost,
1041 const DifferenceCoverSample<T1>& dc,
1046 bool sanityCheck = false)
1048 // Helper for making the recursive call; sanity-checks arguments to
1049 // make sure that the problem actually got smaller.
1050 #define MQS_RECURSE_SUF_DC_U8(nbegin, nend, ndepth) { \
1051 assert(nbegin > begin || nend < end || ndepth > depth); \
1052 mkeyQSortSufDcU8(seqanHost, host, hlen, s, slen, dc, hi, nbegin, nend, ndepth, sanityCheck); \
1054 assert_leq(begin, slen);
1055 assert_leq(end, slen);
1056 size_t n = end - begin;
1057 if(n <= 1) return; // 1-element list already sorted
1058 if(depth > dc.v()) {
1059 // Quicksort the remaining suffixes using difference cover
1060 // for constant-time comparisons; this is O(k*log(k)) where
1062 qsortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc, begin, end, sanityCheck);
1064 sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff, begin, end);
1068 if(n <= BUCKET_SORT_CUTOFF) {
1069 // Bucket sort remaining items
1070 bucketSortSufDcU8(seqanHost, host, hlen, s, slen, dc,
1071 (uint8_t)hi, begin, end, depth, sanityCheck);
1073 sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff, begin, end);
1077 size_t a, b, c, d, r;
1078 CHOOSE_AND_SWAP_PIVOT(SWAP1, CHAR_AT_SUF_U8); // choose pivot, swap to begin
1079 int v = CHAR_AT_SUF_U8(begin, depth); // v <- pivot value
1082 bool stillInBounds = false;
1083 for(size_t i = begin; i < end; i++) {
1084 if(depth < (hlen-s[i])) {
1085 stillInBounds = true;
1087 } else { /* already fell off this suffix */ }
1089 assert(stillInBounds); // >=1 suffix must still be in bounds
1095 // Invariant: everything before a is = pivot, everything
1096 // between a and b is <
1097 int bc = 0; // shouldn't have to init but gcc on Mac complains
1098 while(b <= c && v >= (bc = CHAR_AT_SUF_U8(b, depth))) {
1104 // Invariant: everything after d is = pivot, everything
1105 // between c and d is >
1106 int cc = 0; // shouldn't have to init but gcc on Mac complains
1107 bool hiLatch = true;
1108 while(b <= c && v <= (cc = CHAR_AT_SUF_U8(c, depth))) {
1112 else if(hiLatch && cc == hi) {
1122 assert(a > begin || c < end-1); // there was at least one =s
1123 assert_lt(d-c, n); // they can't all have been > pivot
1124 assert_lt(b-a, n); // they can't all have been < pivot
1125 //assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth)); // check pre-=-swap invariant
1126 r = min(a-begin, b-a); VECSWAP(s, begin, b-r, r); // swap left = to center
1127 r = min(d-c, end-d-1); VECSWAP(s, b, end-r, r); // swap right = to center
1128 //assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
1129 r = b-a; // r <- # of <'s
1131 MQS_RECURSE_SUF_DC_U8(begin, begin + r, depth); // recurse on <'s
1133 // Do not recurse on ='s if the pivot was the off-the-end value;
1134 // they're already fully sorted
1136 MQS_RECURSE_SUF_DC_U8(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
1138 r = d-c; // r <- # of >'s excluding those exhausted
1139 if(r > 0 && v < hi-1) {
1140 MQS_RECURSE_SUF_DC_U8(end-r, end, depth); // recurse on >'s
1145 #endif /*MULTIKEY_QSORT_H_*/