5 * Created by David Weese on 18.04.07.
7 * This file contains the suffix array algorithm implementation
8 * by Larsson and Sadakane with the following modifications.
11 * - a context is used for reentrance
12 * - a generic TValue is used instead of int
13 * - functions are surrounded by SeqAn's namespace to omit namespace pollution
20 Copyright 1999, N. Jesper Larsson, all rights reserved.
22 This file contains an implementation of the algorithm presented in "Faster
23 Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
24 Sadakane (sada@is.s.u-tokyo.ac.jp).
26 This software may be used freely for any purpose. However, when distributed,
27 the original source must be clearly stated, and, when the source code is
28 distributed, the copyright notice must be retained and any alterations in
29 the code must be clearly marked. No warranty is given regarding the quality
32 #ifndef SEQAN_HEADER_INDEX_SA_LSS_H
33 #define SEQAN_HEADER_INDEX_SA_LSS_H
35 namespace SEQAN_NAMESPACE_MAIN
38 template <typename TValue>
41 TValue *I, /* group array, ultimately suffix array.*/
42 *V, /* inverse array, ultimately inverse of I.*/
43 r, /* number of symbols aggregated by transform.*/
44 h; /* length of already-sorted prefixes.*/
46 // MODIFIED: renamed defines according to SeqAn's naming conventions
47 #define SEQAN_LSSKEY(p) (V[*(p)+(h)])
48 #define SEQAN_LSSSWAP(p, q) (tmp=*(p), *(p)=*(q), *(q)=tmp)
49 #define SEQAN_LSSMED3(a, b, c) (SEQAN_LSSKEY(a)<SEQAN_LSSKEY(b) ? \
50 (SEQAN_LSSKEY(b)<SEQAN_LSSKEY(c) ? (b) : SEQAN_LSSKEY(a)<SEQAN_LSSKEY(c) ? (c) : (a)) \
51 : (SEQAN_LSSKEY(b)>SEQAN_LSSKEY(c) ? (b) : SEQAN_LSSKEY(a)>SEQAN_LSSKEY(c) ? (c) : (a)))
53 /* Subroutine for select_sort_split and sort_split. Sets group numbers for a
54 group whose lowest position in I is pl and highest position is pm.*/
56 inline void update_group(TValue *pl, TValue *pm)
60 g=pm-I; /* group number.*/
61 V[*pl]=g; /* update group number of first position.*/
63 *pl=-1; /* one element, sorted group.*/
65 do /* more than one element, unsorted group.*/
66 V[*++pl]=g; /* update group numbers.*/
70 /* Quadratic sorting method to use for small subarrays. To be able to update
71 group numbers consistently, a variant of selection sorting is used.*/
73 inline void select_sort_split(TValue *p, TValue n) {
74 TValue *pa, *pb, *pi, *pn;
77 pa=p; /* pa is start of group being picked out.*/
78 pn=p+n-1; /* pn is last position of subarray.*/
80 for (pi=pb=pa+1, f=SEQAN_LSSKEY(pa); pi<=pn; ++pi)
81 if ((v=SEQAN_LSSKEY(pi))<f) {
82 f=v; /* f is smallest key found.*/
83 SEQAN_LSSSWAP(pi, pa); /* place smallest element at beginning.*/
84 pb=pa+1; /* pb is position for elements equal to f.*/
85 } else if (v==f) { /* if equal to smallest key.*/
86 SEQAN_LSSSWAP(pi, pb); /* place next to other smallest elements.*/
89 update_group(pa, pb-1); /* update group values for new group.*/
90 pa=pb; /* continue sorting rest of the subarray.*/
92 if (pa==pn) { /* check if last part is single element.*/
94 *pa=-1; /* sorted group.*/
98 /* Subroutine for sort_split, algorithm by Bentley & McIlroy.*/
100 inline TValue choose_pivot(TValue *p, TValue n) {
101 TValue *pl, *pm, *pn;
104 pm=p+(n>>1); /* small arrays, middle element.*/
108 if (n>40) { /* big arrays, pseudomedian of 9.*/
110 pl=SEQAN_LSSMED3(pl, pl+s, pl+s+s);
111 pm=SEQAN_LSSMED3(pm-s, pm, pm+s);
112 pn=SEQAN_LSSMED3(pn-s-s, pn-s, pn);
114 pm=SEQAN_LSSMED3(pl, pm, pn); /* midsize arrays, median of 3.*/
116 return SEQAN_LSSKEY(pm);
119 /* Sorting routine called for each unsorted group. Sorts the array of integers
120 (suffix numbers) of length n starting at p. The algorithm is a ternary-split
121 quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
122 Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
123 function is based on Program 7.*/
125 inline void sort_split(TValue *p, TValue n)
127 TValue *pa, *pb, *pc, *pd, *pl, *pm, *pn;
128 TValue f, v, s, t, tmp;
130 if (n<7) { /* multi-selection sort smallest arrays.*/
131 select_sort_split(p, n);
135 v=choose_pivot(p, n);
138 while (1) { /* split-end partition.*/
139 while (pb<=pc && (f=SEQAN_LSSKEY(pb))<=v) {
141 SEQAN_LSSSWAP(pa, pb);
146 while (pc>=pb && (f=SEQAN_LSSKEY(pc))>=v) {
148 SEQAN_LSSSWAP(pc, pd);
155 SEQAN_LSSSWAP(pb, pc);
160 if ((s=pa-p)>(t=pb-pa))
162 for (pl=p, pm=pb-s; s; --s, ++pl, ++pm)
163 SEQAN_LSSSWAP(pl, pm);
164 if ((s=pd-pc)>(t=pn-pd-1))
166 for (pl=pb, pm=pn-s; s; --s, ++pl, ++pm)
167 SEQAN_LSSSWAP(pl, pm);
173 update_group(p+s, p+n-t-1);
175 sort_split(p+n-t, t);
178 /* Bucketsort for first iteration.
180 Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
181 at least once. x[n] is 0. (This is the corresponding output of transform.) k
182 must be at most n+1. p is array of size n+1 whose contents are disregarded.
184 Output: x is V and p is I after the initial sorting stage of the refined
185 suffix sorting algorithm.*/
187 inline void bucketsort(TValue *x, TValue *p, TValue n, TValue k)
189 TValue *pi, i, c, d, g;
191 for (pi=p; pi<p+k; ++pi)
192 *pi=-1; /* mark linked lists empty.*/
193 for (i=0; i<=n; ++i) {
194 x[i]=p[c=x[i]]; /* insert in linked list.*/
197 for (pi=p+k-1, i=n; pi>=p; --pi) {
198 d=x[c=*pi]; /* c is position, d is next in list.*/
199 x[c]=g=i; /* last position equals group number.*/
200 if (d>=0) { /* if more than one element in group.*/
201 p[i--]=c; /* p is permutation for the sorted x.*/
203 d=x[c=d]; /* next in linked list.*/
204 x[c]=g; /* group number in x.*/
205 p[i--]=c; /* permutation in p.*/
208 p[i--]=-1; /* one element, sorted group.*/
212 /* Transforms the alphabet of x by attempting to aggregate several symbols into
213 one, while preserving the suffix order of x. The alphabet may also be
214 compacted, so that x on output comprises all integers of the new alphabet
215 with no skipped numbers.
217 Input: x is an array of size n+1 whose first n elements are positive
218 integers in the range l...k-1. p is array of size n+1, used for temporary
219 storage. q controls aggregation and compaction by defining the maximum value
220 for any symbol during transformation: q must be at least k-l; if q<=n,
221 compaction is guaranteed; if k-l>n, compaction is never done; if q is
222 INT_MAX, the maximum number of symbols are aggregated into one.
224 Output: Returns an integer j in the range 1...q representing the size of the
225 new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
226 set to the number of old symbols grouped into one. Only x[n] is 0.*/
228 inline TValue transform(TValue *x, TValue *p, TValue n, TValue k, TValue l, TValue q)
230 TValue b, c, d, e, i, j, m, s;
233 for (s=0, i=k-l; i; i>>=1)
234 ++s; /* s is number of bits in old symbol.*/
235 e=SupremumValue<TValue>::VALUE>>s; /* e is for overflow checking.*/
236 for (b=d=r=0; r<n && d<=e && (c=d<<s|(k-l))<=q; ++r) {
237 b=b<<s|(x[r]-l+1); /* b is start of x in chunk alphabet.*/
238 d=c; /* d is max symbol in chunk alphabet.*/
240 m=(1<<(r-1)*s)-1; /* m masks off top old symbol from chunk.*/
241 x[n]=l-1; /* emulate zero terminator.*/
242 if (d<=n) { /* if bucketing possible, compact alphabet.*/
243 for (pi=p; pi<=p+d; ++pi)
244 *pi=0; /* zero transformation table.*/
245 for (pi=x+r, c=b; pi<=x+n; ++pi) {
246 p[c]=1; /* mark used chunk symbol.*/
247 c=(c&m)<<s|(*pi-l+1); /* shift in next old symbol in chunk.*/
249 for (i=1; i<r; ++i) { /* handle last r-1 positions.*/
250 p[c]=1; /* mark used chunk symbol.*/
251 c=(c&m)<<s; /* shift in next old symbol in chunk.*/
253 for (pi=p, j=1; pi<=p+d; ++pi)
255 *pi=j++; /* j is new alphabet size.*/
256 for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
257 *pi=p[c]; /* transform to new alphabet.*/
258 c=(c&m)<<s|(*pj-l+1); /* shift in next old symbol in chunk.*/
260 while (pi<x+n) { /* handle last r-1 positions.*/
261 *pi++=p[c]; /* transform to new alphabet.*/
262 c=(c&m)<<s; /* shift right-end zero in chunk.*/
264 } else { /* bucketing not possible, don't compact.*/
265 for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
266 *pi=c; /* transform to new alphabet.*/
267 c=(c&m)<<s|(*pj-l+1); /* shift in next old symbol in chunk.*/
269 while (pi<x+n) { /* handle last r-1 positions.*/
270 *pi++=c; /* transform to new alphabet.*/
271 c=(c&m)<<s; /* shift right-end zero in chunk.*/
273 j=d+1; /* new alphabet size.*/
275 x[n]=0; /* end-of-string symbol is zero.*/
276 return j; /* return new alphabet size.*/
279 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
280 n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
281 contents of x[n] is disregarded, the n-th symbol being regarded as
282 end-of-string smaller than all other symbols.*/
284 void suffixsort(TValue *x, TValue *p, TValue n, TValue k, TValue l)
289 V=x; /* set global values.*/
292 if (n>=k-l) { /* if bucketing possible,*/
293 j=transform(V, I, n, k, l, n);
294 bucketsort(V, I, n, j); /* bucketsort on first r positions.*/
296 transform(V, I, n, k, l, SupremumValue<TValue>::VALUE);
298 I[i]=i; /* initialize I with suffix numbers.*/
300 sort_split(I, n+1); /* quicksort on first r positions.*/
302 h=r; /* number of symbols aggregated by transform.*/
305 pi=I; /* pi is first position of group.*/
306 sl=0; /* sl is negated length of sorted groups.*/
309 pi-=s; /* skip over sorted group.*/
310 sl+=s; /* add negated length to sl.*/
313 *(pi+sl)=sl; /* combine sorted groups before pi.*/
316 pk=I+V[s]+1; /* pk-1 is last position of unsorted group.*/
317 sort_split(pi, pk-pi);
318 pi=pk; /* next group.*/
321 if (sl) /* if the array ends with a sorted group.*/
322 *(pi+sl)=sl; /* combine sorted groups at end of I.*/
323 h=2*h; /* double sorted-depth.*/
326 for (i=0; i<=n; ++i) /* reconstruct suffix array from inverse.*/
333 //////////////////////////////////////////////////////////////////////////////
336 struct LarssonSadakane {};
339 // 1. the content of s will be destroyed
340 // 2. s and SA have to have size n+1
341 // 3. s[n] must be unique and less than any other character in s
343 // better use LarssonSadakane as a pipe (look down)
345 template < typename TSA,
347 void createSuffixArray(
350 LarssonSadakane const &,
353 typedef typename Value<TSA>::Type TValue;
354 typedef typename _MakeSigned<TValue>::Type TSValue; // LarssonSadakane expects signed values
355 _Context_LSS<TSValue> c;
357 (TSValue*)begin(s, Standard()), // text
358 (TSValue*)begin(SA, Standard()), // SA
364 //////////////////////////////////////////////////////////////////////////////
366 // template < typename TInput >
367 // struct Pipe< TInput, LarssonSadakane >
369 // typedef typename SAValue<TInput>::Type TValue;
370 // typedef String<TValue, Alloc<> > TText;
371 // typedef String<TValue, Alloc<> > TSA;
372 // typedef Pipe<TSA, Source<> > TSource;
377 // Pipe(TInput &_textIn):
380 // typedef typename Iterator<TText, Standard>::Type TIter;
382 // TValue len = length(_textIn);
384 // resize(text, len + 1, Exact());
386 // TIter it = begin(text);
387 // TIter itEnd = begin(text) + len;
388 // TValue maxChar = 0;
390 // beginRead(_textIn);
391 // for(; it != itEnd; ++it, ++_textIn) {
392 // TValue val = (*it = 1 + (TValue)*_textIn);
393 // if (val > maxChar)
398 // resize(sa, len + 1, Exact());
399 // createSuffixArray(sa, text, LarssonSadakane(), maxChar + 1);
402 // inline typename Value<TSource>::Type const & operator*() {
406 // inline Pipe& operator++() {
412 // template < typename TInput >
413 // inline bool control(Pipe< TInput, LarssonSadakane > &me, ControlBeginRead const &command) {
414 // control(me.in, command);
419 // template < typename TInput >
420 // inline typename Size< Pipe< TInput, LarssonSadakane > >::Type
421 // length(Pipe< TInput, LarssonSadakane > const &me) {
422 // return length(me.in) - 1;