# Pass in a sentence, pass out its features
import nltk
import pandas as pd
import sys
import hashlib
import re
import string
import itertools
from nltk import word_tokenize
from nltk.corpus import stopwords
import logging
import logger_config
log = logging.getLogger(__name__)
log.info("Entered module: %s", __name__)
lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer("english")
# Fallback sample row used by __main__ when no sentence is given on the CLI
line = [
    "xxx",
    "Oracle 12.2 will be released for on-premises users on 15 March 2017",
    0,
    "S",
]
pos = [] # list of PartsOfSpeech
output = "" # comma separated string
header = "" # string for describing features header
VerbCombos = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "MD"]
questionTriples = [
"CD-VB-VBN",
"MD-PRP-VB",
"MD-VB-CD",
"NN-IN-DT",
"PRP-VB-PRP",
"PRP-WP-NNP",
"VB-CD-VB",
"VB-PRP-WP",
"VBZ-DT-NN",
"WP-VBZ-DT",
"WP-VBZ-NNP",
"WRB-MD-VB",
]
statementTriples = [
"DT-JJ-NN",
"DT-NN-VBZ",
"DT-NNP-NNP",
"IN-DT-NN",
"IN-NN-NNS",
"MD-VB-VBN",
"NNP-IN-NNP",
"NNP-NNP-NNP",
"NNP-VBZ-DT",
"NNP-VBZ-NNP",
"NNS-IN-DT",
"VB-VBN-IN",
"VBZ-DT-JJ",
]
startTuples = ["NNS-DT", "WP-VBZ", "WRB-MD"]
endTuples = ["IN-NN", "VB-VBN", "VBZ-NNP"]
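# Illustrative note (assumed tagger output; exact tags depend on the
# installed NLTK model): "What is Oracle" tags roughly as WP-VBZ-NNP, so its
# opening pair "WP-VBZ" matches the second entry in startTuples above.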
"""Because python dict's return key-vals in random order, provide ordered
list to pass to ML models"""
feature_keys = [
"id",
"wordCount",
"stemmedCount",
"stemmedEndNN",
"CD",
"NN",
"NNP",
"NNPS",
"NNS",
"PRP",
"VBG",
"VBZ",
"startTuple0",
"endTuple0",
"endTuple1",
"endTuple2",
"verbBeforeNoun",
"qMark",
"qVerbCombo",
"qTripleScore",
"sTripleScore",
"class",
]
@logger_config.logger
def strip_sentence(sentence):
    sentence = sentence.strip(",")
    # strip out non-printable characters
    sentence = "".join(filter(lambda x: x in string.printable, sentence))
    # strip punctuation
    sentence = sentence.translate(str.maketrans("", "", string.punctuation))
    return sentence
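# Usage sketch (illustrative): punctuation is stripped wholesale, so decimal
# points disappear too:
#   >>> strip_sentence("Will Oracle 12.2 be released?")
#   'Will Oracle 122 be released'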
@logger_config.logger
def exists_pair_combos(comboCheckList, sentence):
    pos = get_pos(sentence)
    tag_string = "-".join([i[1] for i in pos])
    combo_list = []
    for pair in itertools.permutations(comboCheckList, 2):
        if pair[0] == "MD":
            # Skip pairs with a leading MD (a blanked "MD" pair would
            # reduce to "-" and match every multi-word sentence)
            continue
        combo_list.append("-".join(pair))
    if any(code in tag_string for code in combo_list):
        return 1
    else:
        return 0
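# Usage sketch (tags are assumed; they depend on the NLTK tagger): for
# "What is Oracle" the tag string is roughly "WP-VBZ-NNP", and ("WP", "VBZ")
# is an ordered pair drawn from VerbCombos, so the check fires:
#   >>> exists_pair_combos(VerbCombos, "What is Oracle")
#   1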
@logger_config.logger
# Parts Of Speech
def get_pos(sentence):
sentenceParsed = word_tokenize(sentence)
return nltk.pos_tag(sentenceParsed)
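# Usage sketch (exact tags depend on the installed NLTK tagger model):
#   >>> get_pos("Oracle is a company")
#   [('Oracle', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('company', 'NN')]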
@logger_config.logger
# Count Q-Marks
def count_qmark(sentence):
return sentence.count("?")
@logger_config.logger
# Count a specific POS-Type
# VBG = count_POSType(pos,'VBG')
def count_POSType(pos, ptype):
tags = [i[1] for i in pos]
return tags.count(ptype)
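# Usage sketch, reusing the assumed tagging of "Oracle is a company":
#   >>> count_POSType(get_pos("Oracle is a company"), "NN")
#   1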
@logger_config.logger
# Does a Verb occur before the first Noun?
def exists_vb_before_nn(pos):
    pos_tags = [i[1] for i in pos]
    # Normalize all verb tags to just "V"
    pos_tags = [re.sub(r"V.*", "V", tag) for tag in pos_tags]
    # Normalize all noun tags to just "NN"
    pos_tags = [re.sub(r"NN.*", "NN", tag) for tag in pos_tags]
    # "not found" sentinel that sorts after any real index
    vi = ni = mi = float("inf")
    # Get first NN index
    if "NN" in pos_tags:
        ni = pos_tags.index("NN")
    # Get first V index
    if "V" in pos_tags:
        vi = pos_tags.index("V")
    # Get first Modal index
    if "MD" in pos_tags:
        mi = pos_tags.index("MD")
    if vi < ni or mi < ni:
        return 1
    else:
        return 0
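# Usage sketch (assumed tags MD-NNP-VB-PRP for "Will Oracle release it"):
# the modal at index 0 precedes the first noun at index 1, so this returns 1:
#   >>> exists_vb_before_nn(get_pos("Will Oracle release it"))
#   1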
@logger_config.logger
# Stemmed sentence ends in "NN-NN"?
def exists_stemmed_end_NN(stemmed):
stemmedEndNN = 0
stemmed_end = get_first_last_tuples(" ".join(stemmed))[1]
if stemmed_end == "NN-NN":
stemmedEndNN = 1
return stemmedEndNN
@logger_config.logger
# Go through the predefined list of start-tuples, 1 / 0 if given startTuple occurs in the list
def exists_startTuple(startTuple):
exists_startTuples = []
for tstring in startTuples: # startTuples defined as global var
if startTuple in tstring:
exists_startTuples.append(1)
else:
exists_startTuples.append(0)
return exists_startTuples
@logger_config.logger
# Go through the predefined list of end-tuples, 1 / 0 if given Tuple occurs in the list
def exists_endTuple(endTuple):
exists_endTuples = []
for tstring in endTuples: # endTuples defined as global var
if endTuple in tstring:
exists_endTuples.append(1)
else:
exists_endTuples.append(0)
return exists_endTuples
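# Usage sketch for the two checks above, given the module-level tuple lists:
#   >>> exists_startTuple("WP-VBZ")   # matches only the 2nd startTuples entry
#   [0, 1, 0]
#   >>> exists_endTuple("VB-VBN")     # matches only the 2nd endTuples entry
#   [0, 1, 0]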
@logger_config.logger
# loop round list of triples and construct a list of binary 1/0 vals if triples occur in list
def exists_triples(triples, tripleSet):
exists = []
for tstring in tripleSet:
if tstring in triples:
exists.append(1)
else:
exists.append(0)
return exists
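# Usage sketch: one 1/0 per entry of the triple set. With triples
# ['NNP-VBZ-DT', 'VBZ-DT-NN'], only 'VBZ-DT-NN' is in questionTriples:
#   >>> sum(exists_triples(["NNP-VBZ-DT", "VBZ-DT-NN"], questionTriples))
#   1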
@logger_config.logger
# Get a sentence and spit out the POS triples
def get_triples(pos):
list_of_triple_strings = []
pos = [i[1] for i in pos] # extract the 2nd element of the POS tuples in list
n = len(pos)
if n > 2: # need to have three items
for i in range(0, n - 2):
t = "-".join(
pos[i : i + 3] # noqa: E203
) # pull out 3 list item from counter, convert to string
list_of_triple_strings.append(t)
return list_of_triple_strings
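# Usage sketch (assumed tagging NNP-VBZ-DT-NN for "Oracle is a company"):
#   >>> get_triples(get_pos("Oracle is a company"))
#   ['NNP-VBZ-DT', 'VBZ-DT-NN']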
@logger_config.logger
def get_first_last_tuples(sentence):
first_last_tuples = []
sentenceParsed = word_tokenize(sentence)
pos = nltk.pos_tag(sentenceParsed) # Parts Of Speech
pos = [i[1] for i in pos] # extract the 2nd element of the POS tuples in list
n = len(pos)
first = ""
last = ""
    if n > 1:  # need at least two items
first = "-".join(pos[0:2]) # pull out first 2 list items
last = "-".join(pos[-2:]) # pull out last 2 list items
first_last_tuples = [first, last]
return first_last_tuples
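# Usage sketch (same assumed tagging as above):
#   >>> get_first_last_tuples("Oracle is a company")
#   ['NNP-VBZ', 'DT-NN']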
@logger_config.logger
def lemmatize(sentence):
    """
    Pass in a sentence as a string; return just the core text, lemmatized.
    Stop words are removed, which could affect the ability to detect whether
    this is a question or an answer. Depends on:
    lemma = nltk.wordnet.WordNetLemmatizer()
    and from nltk.corpus import stopwords
    """
stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(sentence)
filtered_sentence = []
for w in word_tokens:
if w not in stop_words:
filtered_sentence.append(w.lower()) # also set lowercase
lem = []
for w in filtered_sentence:
lem.append(lemma.lemmatize(w))
return lem
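# Usage sketch (output depends on the installed WordNet data; the default
# lemmatizer treats words as nouns):
#   >>> lemmatize("Oracle releases new databases")
#   ['oracle', 'release', 'new', 'database']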
@logger_config.logger
def stematize(sentence):
    """
    Pass in a sentence as a string; return just the core text, stemmed.
    Stop words are removed, which could affect the ability to detect whether
    this is a question or an answer. Depends on:
    sno = nltk.stem.SnowballStemmer('english')
    and from nltk.corpus import stopwords
    """
stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(sentence)
filtered_sentence = []
for w in word_tokens:
if w not in stop_words:
filtered_sentence.append(w)
stemmed = []
for w in filtered_sentence:
stemmed.append(sno.stem(w))
return stemmed
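# Usage sketch (the Snowball stemmer lowercases and truncates to stems,
# more aggressively than the lemmatizer above):
#   >>> stematize("Oracle releases new databases")
#   ['oracl', 'releas', 'new', 'databas']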
#########################################################################
# A wrapper function to put it all together - build a csv line to return
# A header string is also returned for optional use
def get_string(id, sentence, c="X"):
header, output = "", ""
pos = get_pos(sentence)
qMark = count_qmark(sentence) # count Qmarks before stripping punctuation
sentence = strip_sentence(sentence)
# lemmed = lemmatize(sentence)
stemmed = stematize(sentence)
wordCount = len(sentence.split())
stemmedCount = len(stemmed)
qVerbCombo = exists_pair_combos(VerbCombos, sentence)
verbBeforeNoun = exists_vb_before_nn(pos)
output = (
id
+ ","
+ str(wordCount)
+ ","
+ str(stemmedCount)
+ ","
+ str(qVerbCombo)
+ ","
+ str(qMark)
+ ","
+ str(verbBeforeNoun)
)
header = header + "id,wordCount,stemmedCount,qVerbCombo,qMark,verbBeforeNoun"
# list of POS-TYPES to count , generate a list of counts in the CSV line
for ptype in ["VBG", "VBZ", "NNP", "NN", "NNS", "NNPS", "PRP", "CD"]:
output = output + "," + str(count_POSType(pos, ptype))
header = header + "," + ptype
    output = output + "," + str(exists_stemmed_end_NN(stemmed))
    header = header + ",stemmedEndNN,"
    # get Start Tuples and End Tuples Features ##
    startTuple, endTuple = get_first_last_tuples(sentence)
    list1 = exists_startTuple(startTuple)  # list [1/0] for exists / not exists
    output = output + "," + ",".join(str(i) for i in list1)
    for i in range(len(list1)):
        header = header + "startTuple" + str(i) + ","
    list1 = exists_endTuple(endTuple)  # list [1/0] for exists / not exists
    output = output + "," + ",".join(str(i) for i in list1)
    for i in range(len(list1)):
        header = header + "endTuple" + str(i) + ","
# look for special Triple Combinations ##
triples = get_triples(pos) # all the triple sequences in the sentence POS list
list1 = exists_triples(triples, questionTriples)
total = sum(list1)
output = output + "," + str(total)
header = header + "qTripleScore" + ","
list1 = exists_triples(triples, statementTriples)
total = sum(list1)
output = output + "," + str(total)
header = header + "sTripleScore" + ","
output = output + "," + c # Class Type on end
header = header + "class"
return output, header
# End of Get String wrapper
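# Usage sketch (values assume the taggings sketched above; "abc123" is a
# made-up id): returns a CSV value line plus a matching header line, e.g.
#   out, head = get_string("abc123", "What is Oracle", c="Q")
#   # out  -> "abc123,3,2,1,0,1,...,Q"
#   # head -> "id,wordCount,stemmedCount,qVerbCombo,qMark,verbBeforeNoun,..."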
@logger_config.logger
# Build a dictionary of features
def features_dict(id, sentence, c="X"):
features = {}
pos = get_pos(sentence)
features["id"] = id
features["qMark"] = count_qmark(
sentence
) # count Qmarks before stripping punctuation
sentence = strip_sentence(sentence)
stemmed = stematize(sentence)
startTuple, endTuple = get_first_last_tuples(sentence)
features["wordCount"] = len(sentence.split())
features["stemmedCount"] = len(stemmed)
features["qVerbCombo"] = exists_pair_combos(VerbCombos, sentence)
features["verbBeforeNoun"] = exists_vb_before_nn(pos)
for ptype in ["VBG", "VBZ", "NNP", "NN", "NNS", "NNPS", "PRP", "CD"]:
features[ptype] = count_POSType(pos, ptype)
features["stemmedEndNN"] = exists_stemmed_end_NN(stemmed)
list1 = exists_startTuple(startTuple) # list [1/0] for exists / not exists
for i in range(0, len(list1)):
features["startTuple" + str(i)] = list1[i]
list1 = exists_endTuple(endTuple) # list [1/0] for exists / not exists
for i in range(0, len(list1)):
features["endTuple" + str(i)] = list1[i]
# look for special Triple Combinations ##
triples = get_triples(pos) # all the triple sequences in the sentence POS list
list1 = exists_triples(
triples, questionTriples
) # a list of 1/0 for hits on this triple-set
features["qTripleScore"] = sum(
list1
) # add all the triple matches up to get a score
list1 = exists_triples(
triples, statementTriples
) # Do same check for the Statement t-set
features["sTripleScore"] = sum(
list1
) # add all the triple matches up to get a score
features["class"] = c # Class Type on end
return features
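# Usage sketch ("abc123" is a made-up id): returns one flat dict per
# sentence, with keys matching feature_keys (startTuple0/endTuple0... are
# zero-indexed):
#   f = features_dict("abc123", "What is Oracle", c="Q")
#   # f["wordCount"] -> 3, f["qMark"] -> 0, f["class"] -> "Q"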
@logger_config.logger
# pass in the dict produced by features_dict(), get back an ordered series
def features_series(feat_dict):
    values = []
    for key in feature_keys:
        values.append(feat_dict[key])
    return pd.Series(values)
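# Usage sketch: converts the feature dict into a pandas Series ordered by
# feature_keys, ready to collect into a DataFrame row:
#   row = features_series(features_dict("abc123", "What is Oracle"))
#   # pd.Series of len(feature_keys) values, with the class label last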
# MAIN ##
if __name__ == "__main__":
"""ID, WordCount, StemmedCount, Qmark, VBG, StemmedEnd, StartTuples,
EndTuples, QuestionTriples, StatementTriples, Class
[1/0] [NN-NN?] [3 x binary] [3 x binary] [10 x binary] [10 x binary]"""
logging.debug("Starting...")
c = "X" # Dummy class
header = ""
output = ""
if len(sys.argv) > 1:
sentence = sys.argv[1]
else:
sentence = line[1]
id = hashlib.md5(str(sentence).encode("utf-8")).hexdigest()[:16]
features = features_dict(id, sentence, c)
pos = get_pos(sentence) # NLTK Parts Of Speech, duplicated just for the printout
logging.debug(pos)
logging.debug(features)
    for key, value in features.items():
        logging.debug("%s: %s", key, value)
    # header string (dicts preserve insertion order on Python 3.7+;
    # use feature_keys when a guaranteed order is needed)
    for key, value in features.items():
        header = header + ", " + key
        output = output + ", " + str(value)
    header = header[2:]  # strip the leading ", " off
    output = output[2:]  # strip the leading ", " off
    logging.debug("HEADER: %s", header)
    logging.debug("VALUES: %s", output)