-
Notifications
You must be signed in to change notification settings - Fork 4
/
document.py
103 lines (73 loc) · 2.05 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
class LineParser:
    """Apply per-field parser callables to selected positions of a line.

    ``parsers`` maps a position (used to index into the line) to a callable.
    The (position, callable) pairs are kept sorted, so the output follows
    ascending position order regardless of the mapping's insertion order.
    """

    def __init__(self, parsers):
        pairs = list(parsers.items())
        pairs.sort()
        self.parsers = pairs

    def __call__(self, line):
        """Return the list of parsed values for the configured positions."""
        parsed = []
        for index, parse in self.parsers:
            parsed.append(parse(line[index]))
        return parsed
class DocumentParser:
    """Chain several callables into a single document transformation.

    The callables are applied left to right: the output of each one is
    passed as the input of the next.
    """

    def __init__(self, *func):
        self.func = func

    def __call__(self, doc):
        """Run ``doc`` through every stored callable and return the result."""
        current = doc
        for step in self.func:
            current = step(current)
        return current
def parse_document(sep1='#N#', sep2=' '):
    """Build a parser that turns a delimited string into nested int lists.

    Args:
        sep1: Separator between the outer groups of the document.
        sep2: Separator between the integer tokens inside one group.

    Returns:
        A function ``f(doc)`` returning one list of ints per group.

    Raises:
        ValueError: (from ``f``) if a non-empty token is not a valid integer.
    """
    def f(doc):
        # Empty tokens come from an empty document, a trailing separator or
        # doubled separators; int('') would raise, so they are skipped.
        # Groups themselves are kept (possibly empty) so positions survive.
        return [
            [int(token) for token in segment.split(sep2) if token]
            for segment in doc.split(sep1)
        ]
    return f
def filter_document(vocab):
    """Build a filter that drops every token greater than ``vocab``.

    The returned function takes a list of token lists and returns a new
    list of lists; groups are kept even when all their tokens are removed.
    """
    def f(doc):
        kept = []
        for group in doc:
            survivors = [token for token in group if token <= vocab]
            kept.append(survivors)
        return kept
    return f
def pad_document(size, length):
    """Build a function that packs token lists into a zero-padded 2-D array.

    Args:
        size: Number of rows of the output array.
        length: Number of columns of the output array.

    Returns:
        A function ``f(doc)`` returning an array of shape ``(size, length)``.
        Empty groups are skipped; each remaining group fills the next row,
        truncated to ``length`` items. Groups beyond ``size`` are ignored
        and unused cells stay zero.
    """
    def f(doc):
        result = np.zeros((size, length))
        non_empty = (d for d in doc if d)
        # range() goes first so the row bound is enforced *before* any write;
        # this keeps size == 0 from raising IndexError on the first group.
        for i, d in zip(range(size), non_empty):
            for j, x in zip(range(length), d):
                result[i, j] = x
        return result
    return f
def bow_document(vocab):
    """Build a function that counts token occurrences into a vector.

    The returned function produces an array of length ``vocab + 1`` where
    entry ``t`` is the number of times token ``t`` appears in the document.
    Tokens outside ``1..vocab`` are ignored, so entry 0 always stays zero.
    """
    def f(doc):
        counts = np.zeros((vocab + 1,))
        for group in doc:
            for token in group:
                if not (0 < token <= vocab):
                    continue
                counts[token] += 1
        return counts
    return f
def flat_document(size):
    """Build a function that flattens a document into a fixed-size vector.

    Args:
        size: Length of the output array.

    Returns:
        A function ``f(doc)`` returning an array of shape ``(size,)`` holding
        the positive tokens of ``doc`` in order. Non-positive tokens are
        dropped, extra tokens are ignored and unused slots stay zero.
    """
    def f(doc):
        result = np.zeros((size,))
        positives = (word for d in doc for word in d if word > 0)
        # range() goes first so the bound is checked before writing; the
        # original wrote first and raised IndexError when size == 0.
        for i, word in zip(range(size), positives):
            result[i] = word
        return result
    return f
def kv_parse_document(deliminator=':', sep=' '):
    """Build a parser for strings of integer ``key<deliminator>value`` pairs.

    Args:
        deliminator: Separator between the key and the value of one pair.
        sep: Separator between pairs.

    Returns:
        A function ``f(doc)`` returning ``(keys, values)`` as two tuples of
        ints of equal length; both are empty when ``doc`` holds no pairs.

    Raises:
        ValueError: (from ``f``) if a non-empty item does not split into
            exactly two parts or a part is not a valid integer.
    """
    def f(doc):
        keys = []
        values = []
        for item in doc.split(sep):
            # Empty items come from an empty document or stray separators;
            # they used to break the key/value unpacking.
            if not item:
                continue
            key, value = item.split(deliminator)
            keys.append(int(key))
            values.append(int(value))
        return tuple(keys), tuple(values)
    return f
def kv_pad_document(size):
    """Build a function that pads parallel key/value sequences to ``size``.

    Args:
        size: Length of each output array.

    Returns:
        A function ``f(doc)`` taking a ``(keys, values)`` pair and returning
        two arrays of shape ``(size,)``: the first ``size`` keys and the
        first ``size`` values, with unused slots left at zero.
    """
    def f(doc):
        k, v = doc
        rk = np.zeros((size,))
        rv = np.zeros((size,))
        for i, ik, iv in zip(range(size), k, v):
            rk[i] = ik
            # The original stored ``ik`` here, so values were never kept.
            rv[i] = iv
        return rk, rv
    return f