-
Notifications
You must be signed in to change notification settings - Fork 0
/
common.go
215 lines (187 loc) · 5.81 KB
/
common.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
package gosaca
func setAllToEmpty(SA []int) {
for i := range SA {
SA[i] = empty
}
}
// compute the length of the LMS substring at the front of the LMS suffix s[:]
// pre-condition: s[:] is an LMS suffix
// WARNING: if s[:] ends on the sentinel, the returned value will be len(s)+1!
func lmsSubstringLength0(s []byte) int {
n := len(s)
for i := 2; i < n; i++ {
if s[i] < s[i-1] {
// s[i-1] is L-type; move on to step 2
for j := i; j < n; j++ {
if s[j] > s[j-1] {
return i
} else if s[j] < s[j-1] {
i = j + 1
}
}
}
}
return n + 1 // add one to indicate substring ended with the sentinel
}
// for level 0, rename the LMS substrings sitting in SA1, and return the new alphabet size (k1)
func rename0(S []byte, SA1, work, S1 []int) int {
n := len(S)
n1 := len(SA1)
if n1 == 0 {
return 0
}
// currently work only holds positive values; we save the time of clearing
// it out by inserting bitwise-negated (negative) values so we know which
// ones we put in vs which ones just contain old data
// walk SA1 from left to right, creating Z1 (spread throughout work)
// first, record the first LMS suffix
k1 := 1
bktHead := 0 // renamed value == head of bucket in SA1 (part of property 4.1)
prev := SA1[0]
work[prev/2] = ^bktHead
SA1[0] = 1 // after we read SA[i], reuse it as a bucket size (needed for post-Z1 step)
// at each step, we need to see if the LMS substring starting at S[SA1[i]] (S[pos])
// is the same as the one we just saw starting at S[SA1[i-1]] (S[prev])
for i := 1; i < n1; i++ {
pos := SA1[i]
SA1[i] = 0 // reused as bucket size
diff := false
// quick first test - if initial character is different we're done
if S[prev] != S[pos] {
diff = true
} else {
// TODO - this walks both LMS substrings to calculate their lengths; can we combine this to short-circuit earlier if possible? tricky to do correctly!
prevLen := lmsSubstringLength0(S[prev:])
posLen := lmsSubstringLength0(S[pos:])
if prev+prevLen == n+1 || // S[prev:] ends with sentinel
pos+posLen == n+1 || // S[pos:] ends with sentinel
prevLen != posLen { // different lengths
diff = true
} else {
// if we get here:
// (a) first character is the same
// (b) both end before the sentinel
// (c) both have the same length
// so we need to check the rest of the characters one-by-one
for j := 1; j < prevLen; j++ {
if S[prev+j] != S[pos+j] {
diff = true
break
}
}
}
}
if diff {
bktHead = i
k1++
}
work[pos/2] = ^bktHead
SA1[bktHead]++ // increment bucket size
prev = pos
}
// Z1 is now sitting (sparsely) in work[]
buildS1FromZ1(S1, SA1, work)
return k1
}
// this function is almost the same as rename0; differences:
// (a) S is an []int
// (b) our LMS substring comparision is different (and easier)
func rename1(S, SA1, work, S1 []int) int {
n := len(S)
n1 := len(SA1)
if n1 == 0 {
return 0
}
// currently work only holds positive values; we save the time of clearing
// it out by inserting bitwise-negated (negative) values so we know which
// ones we put in vs which ones just contain old data
// walk SA1 from left to right, creating Z1 (spread throughout work)
// first, record the first LMS suffix
k1 := 1
bktHead := 0 // renamed value == head of bucket in SA1 (part of property 4.1)
prev := SA1[0]
work[prev/2] = ^bktHead
SA1[0] = 1 // after we read SA[i], reuse it as a bucket size (needed for post-Z1 step)
// at each step, we need to see if the LMS substring starting at S[SA1[i]] (S[pos])
// is the same as the one we just saw starting at S[SA1[i-1]] (S[prev])
for i := 1; i < n1; i++ {
pos := SA1[i]
SA1[i] = 0 // reused as bucket size
diff := false
// walk both strings character-by-character until (a) we get a
// difference or (b) we begin the L+ sequence
j := 0
for j = 0; j < n; j++ {
if S[prev+j] != S[pos+j] {
diff = true
break
} else if S[prev+j] >= 0 {
break
}
}
if !diff {
// both strings started L+ at the same place; now walk until:
// (a) either hits the end (=> different)
// (b) we get a different character (=> different)
// (c) both hit the same S-type value (=> same)
for j++; j < n; j++ {
if prev+j == n || pos+j == n || S[prev+j] != S[pos+j] {
diff = true
break
} else if S[prev+j] < 0 {
break
}
}
}
if diff {
bktHead = i
k1++
}
work[pos/2] = ^bktHead
SA1[bktHead]++ // increment bucket size
prev = pos
}
// Z1 is now sitting (sparsely) in work[]
buildS1FromZ1(S1, SA1, work)
return k1
}
// Build S1 from Z1 (which is sitting sparsely in work[] - all the negative
// values for work are the bitwise inversions of Z1).
func buildS1FromZ1(S1, SA1, work []int) {
n1 := len(S1)
// walk work[] from right-to-left and adjust any S-type characters to point to the end of their bucket instead of the beginning
Z1pos := len(work) - 1
for i := 0; i < n1; i++ {
// find next element of Z1
for work[Z1pos] >= 0 {
Z1pos--
}
// record character (head of bucket, only correct for L-type)
c := ^work[Z1pos]
S1[n1-1-i] = c
Z1pos--
// check and see if c is S-type
if i > 0 && // S1[n-1] is L-type by definition due to sentinel
((S1[n1-i] < 0 && c <= ^S1[n1-i]) || // S1[n1-i] was S-type and we are <= it
(S1[n1-i] >= 0 && c < S1[n1-i])) { // S1[n1-i] was L-type and we are < it
// Adjust c so it points to the end of its bucket instead of the
// head. Note that in the Z1 construction loop above, we stored
// the width of each bucket in SA1[c]. Also, bitwise negate it so
// the recursive computeSuffixArray1 doesn't have to.
S1[n1-1-i] = ^(S1[n1-1-i] + SA1[c] - 1)
}
}
}
func sortRecursively(S1, SA1 []int, k1 int) {
if k1 == len(S1) {
for i, s := range S1 {
if s < 0 {
SA1[^s] = i
} else {
SA1[s] = i
}
}
} else {
computeSuffixArray1(S1, SA1, k1)
}
}