-
Notifications
You must be signed in to change notification settings - Fork 8
/
sa.py
164 lines (139 loc) · 5.51 KB
/
sa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
########################################################################
# FastGeneralizedSuffixArrays v0.1 #
# (c) 2011 Mark Mazumder #
# markmaz.com #
########################################################################
class Triple(object):
"""Represent each sortable character in R with three integers"""
#todo: input validation, errors
def __init__(self, T, idx, length):
t_i = lambda i: T[i] if i < length else 0
self._triple = (t_i(idx), t_i(idx + 1), t_i(idx + 2))
self._index = idx
self._rank = None
self._rpos = None
@property
def triple(self):
"""Character for R_k strings"""
return self._triple
@property
def index(self):
"""Position of R_k character in source string"""
return self._index
@property
def rpos(self):
"""Sorted order of R_k charcter"""
return self._rpos
@rpos.setter
def rpos(self, pos):
self._rpos = pos
@property
def rank(self):
"""Sorted order of R_k charcter"""
return self._rank
@rank.setter
def rank(self, pos):
self._rank = pos
def __repr__(self):
return "Triple({0}, {1}, {2})".format(self.triple, self.index, self.rank)
class NonsamplePair(object):
#todo: property decorators for validation
def __init__(self, T, idx, S_i_ranks):
self.index = idx
self.pair = None
max_index = len(T)
if idx < max_index:
self.pair = (T[self.index], S_i_ranks[self.index + 1])
else:
self.pair = (0, S_i_ranks[self.index + 1]) #defined to be 0 by KS algorithm
# Recursive Karkkainen-Sanders implementation
# Input: list of integers (representing characters)
# Returns suffix array for list
def ksa(T):
length = len(T) # n
# B_k = { i \in [0,n] | i mod 3 = k }
B_0, B_1, B_2 = xrange(0, length+1, 3), xrange(1, length+1, 3), xrange(2, length+1, 3)
#karkkainen-sanders step 1: sort sample suffixes
R_0 = [ Triple(T, idx, length) for idx in B_0 ]
R_1 = [ Triple(T, idx, length) for idx in B_1 ]
R_2 = [ Triple(T, idx, length) for idx in B_2 ]
R = R_1 + R_2
#enable reverse-lookup of characters in R from a list of sorted characters from R
for i, r_char in enumerate(R):
r_char.rpos = i
sorted_suffixes_R = sorted(R, key=lambda suffix_char: suffix_char.triple)
#Enables 0 as unique terminating character by starting ranks at 1
def rank_suffixes(suffixes, rank=1):
for i, suffix in enumerate(suffixes):
if i > 0 and suffix.triple != suffixes[i-1].triple:
rank += 1
suffix.rank = rank
return rank
rank = rank_suffixes(sorted_suffixes_R)
R_prime = [suffix.rank for suffix in R]
#recursive call
if (rank < len(R)): #we had repeats of characters of R, make a recursive call to sort
R_prime_suffix_array = ksa(R_prime)
else:
#directly form suffix array
R_prime_suffix_array = [len(R)] + [suffix.rpos for suffix in sorted_suffixes_R]
rank_Si = [None] * (length + 3) #why plus 3? -> additiionally define rank(S_(n+1) = rank(S_(n+2)) = 0
rank_Si[-2] = rank_Si[-1] = 0
#build rank(S_i) lookup array
for i, SAi in enumerate(R_prime_suffix_array):
if SAi < len(R): #ignore the index pointing to the terminating character of R_prime
rank_Si[R[SAi].index] = i
sorted_suffixes_R = [R[i] for i in R_prime_suffix_array[1:]]
#karkkainen-sanders step 2: sort nonsample suffixes
nonsample_suffix_pairs = [NonsamplePair(T, idx, rank_Si) for idx in B_0]
sorted_nonsample_suffix_pairs = sorted(nonsample_suffix_pairs, key=lambda p: p.pair)
#karkkainen-sanders step 3: merge
cur_Sc, cur_Sb0 = 0, 0
objs_SA = []
def getT(idx):
if idx < len(T):
return T[idx]
return 0
while cur_Sc < len(sorted_suffixes_R) and cur_Sb0 < len(sorted_nonsample_suffix_pairs):
i = sorted_suffixes_R[cur_Sc].index
j = sorted_nonsample_suffix_pairs[cur_Sb0].index
if i % 3 == 1: #i in B_1
#S_i =< S_j iff (T[i], rank(S_t+1) =< (t_j, rank(s_j+1))
if (getT(i), rank_Si[i+1]) < (getT(j), rank_Si[j+1]):
objs_SA.append(sorted_suffixes_R[cur_Sc])
cur_Sc += 1
else:
objs_SA.append(sorted_nonsample_suffix_pairs[cur_Sb0])
cur_Sb0 += 1
else: #i in B_2
if (getT(i), getT(i+1), rank_Si[i+2]) < (getT(j), getT(j+1), rank_Si[j+2]):
objs_SA.append(sorted_suffixes_R[cur_Sc])
cur_Sc += 1
else:
objs_SA.append(sorted_nonsample_suffix_pairs[cur_Sb0])
cur_Sb0 += 1
objs_SA += sorted_suffixes_R[cur_Sc:]
objs_SA += sorted_nonsample_suffix_pairs[cur_Sb0:]
SA = [suffix_object.index for suffix_object in objs_SA]
return SA
import sys
def main():
def inputerr():
print 'usage: python sa.py input_alphanumeric_string'
exit()
if len(sys.argv) != 2:
inputerr()
T = sys.argv[1].strip()
#if not T.isalnum():
# inputerr()
myT = []
for chr in T:
myT.append(ord(chr))
sa = ksa(myT)
print T
print ["{0:^2}".format(chr) for chr in T]
print ["{0:^2}".format(i) for i, chr in enumerate(T)]
print sa
if __name__ == '__main__':
main()