-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSet.py
228 lines (175 loc) · 7.8 KB
/
Set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import os
from parsexml.text import Text
from Persistence import Persistence
import sys
import multiprocessing
from helper.pickle_methods import activate
from helper.parallel_features import Parallel_features
from feature.exception import FailedProcessingFeature
from Feature import Feature
from helper.sparse import build_sparse_matrix
import pickle
from scipy.sparse import coo_matrix
# Needs to be done in order to use multiprocessing — presumably registers
# extra pickle support for objects multiprocessing must serialize (e.g. the
# bound method handed to Pool.map_async below); confirm in helper.pickle_methods.
activate()
class Set(object):
    """A set of corpora and the temporal relations extracted from them.

    On construction, all ``*.tml`` files of the given corpora are parsed in
    parallel into ``Text`` objects and their event-event / event-timex
    relations are collected. After ``pass_objects()`` has supplied the
    feature configuration and caches, the ``get_*_feature_vectors_and_targets``
    methods return sparse feature matrices plus target labels.
    """

    def __init__(self, inverse=False, closure=False, best_settings=False, *corpora):
        self.corpora = corpora
        self.inverse = inverse
        self.closure = closure
        self.best_settings = best_settings
        # There are no features yet
        self._processed = False
        # Module-level globals so the settings are visible in the
        # multiprocessing worker processes running _parse_from_file.
        global inverse_g
        inverse_g = self.inverse
        global closure_g
        closure_g = self.closure
        global best_settings_g
        best_settings_g = self.best_settings
        # Holds all textfile objects
        self.text_objects = []
        self._parse()
        self._event_event_rels = []
        self._event_timex_rels = []
        self._extract_relations()
        self.relations = self._event_event_rels + self._event_timex_rels
        # The following needs to be passed with self.pass_objects()
        self.features = None
        self.strings_cache = None
        self.nlp_persistence_obj = None
        self.duration_cache = None
        self.discourse_cache = None
        # Masks for feature selection. Context managers close the pickle
        # files again (the original open() calls leaked the handles).
        with open("selector_acc_object_ee.p", "rb") as f:
            self.feature_selection_mask_event_event = pickle.load(f).support_
        with open("selector_acc_object_et.p", "rb") as f:
            self.feature_selection_mask_event_timex = pickle.load(f).support_

    def pass_objects(self, features, strings_cache, nlp_persistence_obj, duration_cache, discourse_cache):
        """Needs to be called before self.get_event_{event,timex}_feature_vectors_and_target()."""
        self.features = features
        self.strings_cache = strings_cache
        self.nlp_persistence_obj = nlp_persistence_obj
        self.duration_cache = duration_cache
        self.discourse_cache = discourse_cache

    def get_event_event_feature_vectors_and_targets(self):
        """Return (sparse feature matrix, targets) for all event-event relations."""
        return self._feature_vectors_and_targets(event_event=True)

    def get_event_timex_feature_vectors_and_targets(self):
        """Return (sparse feature matrix, targets) for all event-timex relations."""
        return self._feature_vectors_and_targets(event_event=False)

    def _feature_vectors_and_targets(self, event_event):
        """Shared implementation of the two public getters above.

        Lazily computes the feature data on first use, then collects the
        feature vector and relation type of every matching relation.
        """
        if not self._processed:
            self._get_feature_data()
            self._processed = True
        X = []
        y = []
        # self.text_objects include feature data now
        for text_obj in self.text_objects:
            for relation in text_obj.relations:
                wanted = relation.is_event_event() if event_event else relation.is_event_timex()
                if not wanted:
                    continue
                if "all" in self.features and "feature_selection" in self.features:
                    feature = self._apply_feature_selection(relation.feature, relation.is_event_event())
                    X.append(feature)
                    # Cache the reduced vector on the relation object
                    relation.feature = feature
                else:
                    X.append(relation.feature)
                y.append(relation.relation_type)
        return (build_sparse_matrix(X), y)

    def _apply_feature_selection(self, feature_vector, is_event_event):
        """Apply the precomputed boolean column mask to one sparse feature vector."""
        feature_vector = feature_vector.toarray()
        if is_event_event:
            mask = self.feature_selection_mask_event_event
        else:
            mask = self.feature_selection_mask_event_timex
        return coo_matrix(feature_vector[:, mask], dtype="int32")

    def _extract_relations(self):
        """Partition all relations into event-event and event-timex lists."""
        for text_obj in self.text_objects:
            for relation in text_obj.relations:
                if relation.is_event_event():
                    self._event_event_rels.append(relation)
                elif relation.is_event_timex():
                    self._event_timex_rels.append(relation)

    def _print_progress(self, position, length):
        """Write an in-place percentage progress indicator to stdout."""
        # max(..., 1) guards against ZeroDivisionError when length == 1
        denominator = max(length - 1, 1)
        sys.stdout.write("\r%d%%" % int(position * 100 / denominator))
        sys.stdout.flush()

    def _parse(self):
        """Collect all corpus files and parse the *.tml ones on all cores."""
        # Holds all corpora files
        files = []
        for corpus in self.corpora:
            files = files + self._fetch_files(corpus)
        # Only parse *.tml files. Note: the original matched any name merely
        # ending in "tml", which also caught e.g. *.html files.
        tmls = [f for f in files if f.endswith('.tml')]
        # Parse from files on all cores
        pool = multiprocessing.Pool()
        pool.map_async(self._parse_from_file, tmls, callback=self._append_text_objs)
        pool.close()
        pool.join()

    def _append_text_objs(self, text_objs):
        """Pool callback: keep only successfully parsed Text objects.

        _parse_from_file returns None for files that failed to parse; letting
        those into self.text_objects would crash _extract_relations.
        """
        self.text_objects += [t for t in text_objs if t is not None]

    def _parse_from_file(self, file):
        """Parse one .tml file into a Text object; return None on failure."""
        try:
            # Mapping xml data to python objects
            return Text(file, inverse=inverse_g, closure=closure_g, best_settings=best_settings_g)
        except Exception as e:
            # Best-effort: report and skip unparseable files
            print(e)

    def _fetch_files(self, directory_or_file):
        """Return [path] if the argument is a file, else all entries of the directory."""
        if os.path.isfile(directory_or_file):
            return [directory_or_file]
        # It's a directory
        return [os.path.join(directory_or_file, name)
                for name in os.listdir(directory_or_file)]

    def _get_feature_data(self):
        """Compute feature data for all text objects in parallel."""
        features_event_event = self._remove_only_event_timex_features(self.features)
        features_event_timex = self._remove_only_event_event_features(self.features)
        parallel_processing = Parallel_features(self.text_objects, self.nlp_persistence_obj, self.strings_cache, self.duration_cache, self.discourse_cache, features_event_event, features_event_timex)
        self.text_objects = parallel_processing.processed_text_objs

    def _remove_only_event_event_features(self, features):
        """Return a copy of features without the event-event-only feature names."""
        features_event_timex = list(features)
        for name in ("same_tense", "same_aspect", "same_class",
                     "same_pos", "same_polarity", "temporal_discourse"):
            self._try_to_remove(features_event_timex, name)
        return features_event_timex

    def _remove_only_event_timex_features(self, features):
        """Return a copy of features without the event-timex-only feature names."""
        features_event_event = list(features)
        for name in ("dct", "type", "value"):
            self._try_to_remove(features_event_event, name)
        return features_event_event

    def _try_to_remove(self, l, value):
        """Remove value from list l in place if present; do nothing otherwise."""
        try:
            l.remove(value)
        except ValueError:
            pass
class WrongArguments(Exception):
    """Raised when a Set is constructed with invalid arguments."""

    def __str__(self):
        message = "Using wrong arguments."
        return repr(message)