-
Notifications
You must be signed in to change notification settings - Fork 0
/
suffix.h
234 lines (200 loc) · 9.78 KB
/
suffix.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// Copyright (c) 2016 Christoph Diegelmann
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// Linear Time Suffix Sorting By Depth Awareness
// Notes from the author:
// This is a reference implementation currently NOT running in O(n)
// It uses multi pivot introsort rather than a linear time sorting stage
// I would be happy if you file a pull request on github when you change
// something so we can improve the software for everyone. Of course it's
// your right not to do so.
// If you have questions on this feel free to report an issue.
// http://github.com/akamiru
// I'd really like to thank all authors who make their papers available online
// many related papers are cited in the description most of which can be
// found on the internet.
#ifndef SORT_SUFFIX_H
#define SORT_SUFFIX_H
#ifdef __INTEL_COMPILER
#pragma warning disable 3373
#endif
#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdint>
#include <tuple>
#include <type_traits>
#include <vector>
#include "detail/suffix.h"
#include "inplace.h"
#include "copy.h"
// define if additional space may be used
#ifndef NO_USE_COPY
#define USE_COPY
#endif
namespace sort {
namespace suffix {
// Sort the suffix array depth aware in (theoretical) linear time
// expects the SA and ISA to be grouped by a single char
// that implies ISA[n-1] == 0 && SA[0] == n-1
// moreover the name of each group should equal the position of
// the beginning in SA (e.g. generated by an EXclusive scan)
// [Sf Sl) is additional space available
#ifdef USE_COPY
template <class T, class U, class V> void daware(T SAf, T SAl, U ISAf, V Af, V Al) {
using X = std::remove_reference_t<decltype(*ISAf)>;
using Y = std::remove_reference_t<decltype(*SAf)>;
auto* Sf = reinterpret_cast<detail::misc::pair<X, Y>*>(&*Af);
auto* Sl = Sf + (Al - Af) / sizeof(decltype(*Sf)) * sizeof(decltype(*Af));
#else
template <class T, class U, class V> void daware(T SAf, T SAl, U ISAf) {
#endif
// This is a "pulling" or "lazy" rather than a "pushing" version of GSACA
// while GSACA sorts previous elements using info of the current group
// this implementation sorts groups using info of subsequent elements
// in a way related to prefix doubling. For GSACA see
// "Linear-time Suffix Sorting - A New Approach for Suffix Array Construction" - Baier
// More interesting papers on the topic include
// - "Faster suffix sorting" - Larsson, Sadakane (Prefix Doubling)
// - "A Taxonomy of Suffix Array Construction Algorithms" - PUGLISI, SMYTH, TURPIN (General Overview)
// - probably also see "An Efficient Algorithm for Suffix Sorting" - Peng, Wang, Xue, Wei
// because it incorporates depth info into prefix doubling but sadly I
// don't have access to it
// A "pulling" version allows us to drop all but the additional data
// for ISA and sorting depth (correspond to Previous Pointers in GSACA).
// see the comment at the naming stage in sort::detail::misc::name() (line 185).
// This is more cache friendly and results in less book keeping work.
// The depth array can be further emplaced into the ISA to achieve 8 * n
// memory usage
// This is enough to get a O(n) time, O(1) working space SACA using the
// Improved Two Stage (ITS) approach
// see "Short description of improved two-stage suffix sorting algorithm" - Mori
// http://homepage3.nifty.com/wpage/software/itssort.txt
// basically it's a single stage of SA-IS
// see "Linear Suffix Array Construction by Almost Pure Induced-Sorting" - Nong, Zhang, Chan
// [currently the actual worst case working space is O(log(n)) due to
// introsort]
// The current implementation has a worst and average case running time of
// O(n * log(n)) due to the fact that it uses multi pivot introsort rather
// than a linear time sort (which probably won't change)
// Additional ideas:
// "BlockQuicksort: How Branch Mispredictions don't affect Quicksort" - Edelkamp, Weiss
// for multi pivot quicksort. But without it the reduced branch mispredictions
// probably won't make up for the additional cache misses when not using three
// pivot quicksort.
// We use this when values are fast to access.
// "The Spreadsort High-performance General-case Sorting Algorithm" - Ross
// Spreadsort might improve performance over pure quicksort
// at least runtime garantee improves somewhat.
// "Improving multikey Quicksort for sorting strings with many equal elements" - Kim, Park
// Split-end partitioning for the three way quicksort might not be optimal
// because it is only used when we have a big amount of elements equal to
// the pivot which results in a lot of copying. "collect-center" might be
// faster. Tests showed that collect-center is slower.
// We might be able to further reduce the cache misses if we could
// classify the ISA values to S/L type while building the SA in the
// previous stage and only sort the L type in the first stage.
// Using sorting networks instead of insertion sort for small lists
// might increase performance by reducing the number of comparisons.
// Maybe compile time construction of Batcher's Odd-Even Mergesort
// to keep the code clean.
// Hard to add branchless therefore pretty slow.
// Prefetching ISA when sorting
// This should be tested on real data with valgrind - first tests are pretty
// mixed
// If we have additional memory available (with ITS we have an average of n left)
// we could first copy SA[i], ISA[SA[i]] together, then sort this list and
// then copy back the results. That should reduce the number of cache misses
// to near linear (quicksort itself has O(n * log(n)) cache misses with a very
// small const factor)
// This greatly improved the scaling of daware.
// Actual implementation:
// Start by iterating over all groups right to left
// sorting all suffixes until their next smaller suffix
// Basically this equals sorting the lyndon words starting at each SA[i]
// So this has something to do with Lyndon Trees
// which themselves have to do with maximal reptitions
// maybe this leads to an even faster approach in those areas
// "A new characterization of maximal repetitions by Lyndon trees" - Bannai, I, Inenaga
for (auto gl = SAl; gl > SAf + 1;) {
// Name of the group equals the start of the group
auto gf = SAf + ISAf[gl[-1]];
if (1 < std::distance(gf, gl)) {
// after this call everything left is type F everything right is type S
T gc = detail::suffix::partition(SAf, ISAf, gf, gl, 1);
// handle only the type S subgroups
for (T sgf = gc, sgl = gl; gc < sgl;) {
if (sgl[-1] < 0) { // is type F?
sgl = SAf + ISAf[~sgl[-1]]; // skip over
} else { // type S
sgf = SAf + ISAf[+sgl[-1]]; // get the start of the group
if (std::distance(sgf, sgl) < 2) {
*--sgl = ~*sgf; // we can directly skip over it
continue;
}
// calculate the sorting depth
auto n = ISAf[sgl[-1] + 1]; // Get the element following the current
auto depth = 1 + ((n >> (sizeof(decltype(n)) * CHAR_BIT - 1)) & ~n);
// get the index function
auto index = detail::misc::index(ISAf, depth);
// sort all type S
constexpr auto RL = detail::misc::RL;
#ifdef USE_COPY
sort::copy::quick<RL>(sgf, sgl, Sf, Sl, index, detail::suffix::name(SAf, ISAf, depth));
#else
sort::inplace::quick<RL>(sgf, sgl, index, detail::suffix::name(SAf, ISAf, depth));
#endif
}
}
} else
*gf = ~*gf; // Group is unique - just flag it as type F
gl = gf;
}
// Induce the order of all suffixes from left to right
for (auto gf = SAf + 1; gf < SAl;) {
auto gl = gf;
while (0 <= *gl++); // End of the group is flagged
gl[-1] = ~gl[-1]; // Flip the flag
auto n = ISAf[*gf + 1]; // Get the element following the current
auto depth = 1 + ((n >> (sizeof(decltype(n)) * CHAR_BIT - 1)) & ~n);
auto index = detail::misc::index(ISAf, depth);
// All elements of the left are already unique so we simply need to sort
constexpr auto NOCB = detail::misc::NOCB;
#ifdef USE_COPY
sort::copy::quick<NOCB>(gf, gl, Sf, Sl, index);
#else
sort::inplace::quick<NOCB>(gf, gl, index);
#endif
// And give each of them a unique name
auto castToIndex = detail::misc::castTo<decltype(*ISAf)>();
std::for_each(gf, gl, [ISAf, SAf, castToIndex](auto &a) { ISAf[a] = castToIndex(&a - &*SAf); });
gf = gl;
// Scan over all unique groups
for (; gf < SAl && *gf < 0; ++gf) {
*gf = ~*gf;
// Maybe we could somehow overwrite the embedded depth info in the
// sorting stage to make this unnecessary
ISAf[*gf] = castToIndex(gf - SAf);
}
}
// Now the SA is completly sorted and ISA is completly reconstructed
}
} // suffix
} // sort
#endif // SORT_SUFFIX_H