forked from bojone/bytepiece
-
Notifications
You must be signed in to change notification settings - Fork 0
/
faster.pyx
44 lines (35 loc) · 1.15 KB
/
faster.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# cython: language_level=3
from libc.time cimport time
from libc.stdlib cimport RAND_MAX, rand, srand
from libc.math cimport INFINITY, exp, log
# Seed the C library PRNG from the current time when the module is loaded, so
# that rand()-based sampling differs between runs. Call set_seed() afterwards
# to get a reproducible sequence instead.
srand(time(NULL))
cpdef set_seed(unsigned int seed):
    """Re-seed the C library random number generator.

    The value is handed straight to ``srand``; every subsequent ``rand()``
    call made by this module then follows the sequence determined by
    ``seed``.
    """
    srand(seed)
cdef inline double logsumexp(double x, double y):
    # Return log(exp(x) + exp(y)).
    #
    # The larger argument is factored out so that exp() is only ever applied
    # to a non-positive number and cannot overflow.
    if x < y:
        x, y = y, x
    # x is now the larger value. If the smaller term contributes nothing
    # (y == -inf) or the larger one already dominates (x == +inf), the answer
    # is x itself. Returning early also avoids computing inf - inf, which is
    # NaN and would otherwise propagate into every later score.
    if y == -INFINITY or x == INFINITY:
        return x
    return x + log(1 + exp(y - x))
cdef inline bint choice(double x, double y):
    # Return True with probability exp(x - y), drawing one value from rand().
    #
    # rand() is uniform over the RAND_MAX + 1 integers 0..RAND_MAX, so the
    # threshold has to be scaled by RAND_MAX + 1. Scaling by RAND_MAX alone
    # makes a probability of exactly 1 (x == y) fail whenever rand() returns
    # RAND_MAX. The 1.0 keeps the addition in floating point so it cannot
    # overflow when RAND_MAX == INT_MAX.
    return rand() < exp(x - y) * (RAND_MAX + 1.0)
def _tokenize(self, bytes text, double alpha=-1):
    """Split ``text`` into a list of byte pieces.

    ``self._automaton.iter(text)`` must yield ``(end, (length, value))``
    for every piece found in ``text``, where ``end`` is the index of the
    piece's last byte and ``value`` is its score (presumably a log
    probability from an Aho-Corasick automaton -- confirm against the
    caller).

    Args:
        text: The bytes to segment.
        alpha: If negative (the default), the segmentation with the highest
            total score is returned. Otherwise a segmentation is sampled at
            random: each piece score is multiplied by ``alpha`` and, for
            every end position, an incoming piece replaces the current
            choice with probability ``exp(path_score - accumulated_score)``.

    Returns:
        The pieces in left-to-right order; joined together they equal
        ``text``. An empty ``text`` yields an empty list.
    """
    cdef Py_ssize_t n = len(text)
    cdef Py_ssize_t e, k, s
    cdef double v, score
    # scores[i]: best (or log-sum-exp accumulated) score of text[:i].
    cdef list scores = [0] + [-INFINITY] * n
    # routes[i]: start index of the piece chosen to end at i. The initial
    # value i means "no piece chosen yet".
    cdef list routes = list(range(n + 1))
    cdef list tokens = []
    for e, (k, v) in self._automaton.iter(text):
        # Convert (last byte index, length) into the half-open span [s, e).
        s, e = e - k + 1, e + 1
        if alpha < 0:
            score = scores[s] + v
            if score > scores[e]:
                scores[e], routes[e] = score, s
        else:
            score = scores[s] + alpha * v
            scores[e] = logsumexp(scores[e], score)
            if choice(score, scores[e]):
                routes[e] = s
    # Walk the chosen routes backwards from the end of the text. Starting at
    # n (not at wherever the last match happened to end) keeps trailing bytes
    # that no piece covers, and indexing instead of re-slicing ``text`` keeps
    # the walk linear.
    e = n
    while e > 0:
        s = routes[e]
        if s >= e:
            # No piece was selected for this position; emit a single byte so
            # the walk always advances instead of looping forever.
            s = e - 1
        tokens.append(text[s:e])
        e = s
    return tokens[::-1]