'''
Basic cleaning and pre-processing of EMS narratives
Copyright (C) 2017 University of Virginia, Homa Alemzadeh
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
import csv
import glob
import os
import re
import sys

import gensim
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from nltk.tag.stanford import StanfordPOSTagger
class NLPutils(object):
    def __init__(self):
        self.bigram_measures = nltk.collocations.BigramAssocMeasures()
        self.trigram_measures = nltk.collocations.TrigramAssocMeasures()
        #pos = StanfordPOSTagger('./stanford-postagger-2013-06-20/models/english-left3words-distsim.tagger',
        #                        './stanford-postagger-2013-06-20/stanford-postagger.jar')
        # N-gram POS patterns that characterize technical terms
        self.patterns = [['NN'], ['JJ'], ['JJ', 'JJ'], ['JJ', 'NN'], ['NN', 'NN'],
                         ['JJ', 'JJ', 'NN'], ['JJ', 'NN', 'NN'], ['NN', 'JJ', 'NN'],
                         ['NN', 'NN', 'NN'], ['NN', 'IN', 'NN'], ['JJ', 'JJ', 'JJ'],
                         ['JJ', 'JJ', 'JJ', 'NN'], ['JJ', 'JJ', 'NN', 'NN'],
                         ['JJ', 'NN', 'NN', 'NN'], ['JJ', 'NN', 'JJ', 'NN'],
                         ['NN', 'JJ', 'JJ', 'NN'], ['NN', 'NN', 'NN', 'NN'],
                         ['NN', 'JJ', 'NN', 'NN'], ['NN', 'NN', 'JJ', 'NN']]
        # For filtering junk
        self.non_tech_words = []
        # Pre-trained GoogleNews word2vec vectors (large binary file, not bundled here)
        self.googlenews_model = gensim.models.KeyedVectors.load_word2vec_format(
            './GoogleNews-vectors-negative300.bin', binary=True)
        # Skip the costly unit-norm pass by treating the raw vectors as normalized
        # (syn0/syn0norm are the pre-4.0 gensim attribute names)
        self.googlenews_model.syn0norm = self.googlenews_model.syn0
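    # Illustration of the patterns above (example phrases are hypothetical):
    # ['JJ', 'NN'] matches adjective+noun pairs such as 'severe pain', and
    # ['NN', 'IN', 'NN'] matches noun-preposition-noun phrases such as
    # 'loss of consciousness'.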
    # Tokenization
    def get_tokens(self, text):
        stops = set(stopwords.words('english'))
        regex = re.compile('[%s]' % re.escape('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'))
        # Strip punctuation, keeping word boundaries
        sentences = regex.sub(' ', text)
        # Get the unique words
        raw_tokens = list(set(word_tokenize(sentences)))
        # Filter numbers and single characters
        #tokens = [t for t in raw_tokens if t not in stops and not t.isdigit() and len(t) > 1]
        tokens = [t for t in raw_tokens if not t.isdigit() and len(t) > 1]
        tokens = [t for t in tokens if t not in self.non_tech_words and t.isalpha()]
        return tokens
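    # Example (hypothetical narrative; order is unstable because of set()):
    #   nlp.get_tokens('Pt complains of severe chest pain since 0800.')
    #   -> tokens like ['Pt', 'complains', 'of', 'severe', 'chest', 'pain', 'since']
    #   ('0800' is dropped as a number)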
    # Part of Speech Tagging
    def get_pos_tags(self, tokens):
        tags = []
        starti = 0
        endi = 0
        # Tag in chunks of at most ~5000 tokens
        no_chunks = len(tokens) // 5000 + 1
        chunk_size = -(-len(tokens) // no_chunks)  # ceiling division so no token is dropped
        print('Process ' + str(len(tokens)) + ' tokens in ' + str(no_chunks) + ' chunks..')
        for l in range(no_chunks):
            endi = min(starti + chunk_size, len(tokens))
            print('Tagging #' + str(l) + ': from ' + str(starti) + ' to ' + str(endi - 1))
            tags = tags + nltk.pos_tag(tokens[starti:endi])
            #tags = tags + pos.tag(tokens[starti:endi])[0]
            starti = endi
        print(str(len(tags)) + ' words tagged..')
        # Save all the Noun and Adjective unigrams in a hash table
        tag_set = {}
        for tag in tags:
            if self.cleanseNN([tag[1]]) in self.patterns[0:2]:
                tag_set[tag[0]] = tag[1]
        return tag_set
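    # Example (hypothetical tokens; nltk.pos_tag typically tags these as shown):
    #   nlp.get_pos_tags(['severe', 'chest', 'pain'])
    #   -> {'severe': 'JJ', 'chest': 'NN', 'pain': 'NN'}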
    # Clean part of speech tags: collapse all noun variants (NNS, NNP, NNPS, ...)
    # to plain 'NN' so they can be matched against the patterns above
    def cleanseNN(self, tags):
        return ['NN' if 'NN' in t else t for t in tags]
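    # Example: nlp.cleanseNN(['JJ', 'NNS']) -> ['JJ', 'NN']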
    # Look for the longest n-grams appearing in each sentence that match the patterns of technical terms
    def get_tech_ngrams(self, text, tag_set):
        # Normalization and punctuation filtering => keep sentence separators
        text = text.lower()
        # Split on periods that are not decimal points
        sentences = re.split('\.(?!\d)', text)
        results = {}
        n_gram = []
        tags = []
        n_gram_str = ''
        tag_str = ''
        regex = re.compile('[%s]' % re.escape('!"#$%&\'()*+/:<=>?@[\\]^_`{|}~-'))
        for s in sentences:
            clean_text = regex.sub(' ', s)
            # Tokenize the raw sentence: punctuation tokens are not in tag_set,
            # so they act as n-gram boundaries
            words = word_tokenize(s)
            w_i = -1
            s_result = []
            for w in words:
                w_i = w_i + 1
                if w in tag_set:
                    n_gram.append(w)
                    tags.append(tag_set[w])
                # If this is the last word in the sentence or an untagged word, finalize the n-gram
                if (w not in tag_set) or (w_i == len(words) - 1):
                    # Only if we found something
                    if len(n_gram) > 1:
                        # If the pattern of tags is of interest, save it
                        if self.cleanseNN(tags) in self.patterns:
                            n_gram_str = ' '.join(n_gram)
                            tag_str = ', '.join(tags)
                            if n_gram_str not in s_result:
                                s_result.append(n_gram_str)
                                # Verify against the punctuation-filtered sentence
                                if n_gram_str in clean_text:
                                    if n_gram_str not in results:
                                        results[n_gram_str] = tag_str
                                else:
                                    print('ngram not found in text: ' + n_gram_str)
                    # Restart searching for the next n-gram
                    n_gram = []
                    tags = []
                    n_gram_str = ''
                    tag_str = ''
        print(str(len(results)) + ' n-grams found..')
        return results
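    # Example (hypothetical): for the text 'pt has severe chest pain. given aspirin.'
    # and a tag_set covering those words, the result might look like
    #   {'severe chest pain': 'JJ, NN, NN'}
    # ('pt' alone is dropped because single-word n-grams are not saved here)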
    # Expand a seed lexicon with distributionally similar terms from word2vec
    def lexicon_expansion(self, seed_terms):
        print('\n Total seed terms = ' + str(len(seed_terms)) + '\n')
        terms_scores = []
        for c in seed_terms:
            terms_scores.append((str(c), str(1.0)))
        for c in seed_terms:
            if not c.isdigit() and len(c) > 3:
                print('\nSeed Term:' + c)
                # Get the word2vec representation of the n-gram by adding its word vectors
                # (model.vocab is the pre-4.0 gensim attribute name)
                c_array = [s for s in c.split(' ') if s in self.googlenews_model.vocab]
                if c_array:
                    vector = np.sum(self.googlenews_model[c_array], axis=0)
                    # Get the most similar vectors
                    most_similar = self.googlenews_model.similar_by_vector(vector, topn=20, restrict_vocab=None)
                    # Keep only highly similar terms
                    for (term, score) in most_similar:
                        term = term.lower().replace('_', ' ')
                        if score > 0.4:
                            if (term not in seed_terms) and not term.isdigit() and (len(term.split(' ')) < 3):
                                seed_terms.append(str(term))
                                terms_scores.append((str(term), round(score, 2)))
                                print(term)
        print(len(seed_terms))
        return seed_terms
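    # Example (hypothetical; actual neighbors depend on the GoogleNews vectors):
    #   nlp.lexicon_expansion(['chest pain'])
    #   might grow the list with near neighbors such as 'chest pains'
    #   or 'chest discomfort', each with similarity above the 0.4 cutoff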
'''
def main():
    os.chdir('./dataset')
    nlp = NLPutils()
    ngrams_set = set()
    with open('output.csv', 'w') as output:
        csvwriter = csv.writer(output)
        csvwriter.writerow(['Data File', 'Narrative', 'N-grams'])
        for fname in glob.glob('*.txt'):
            with open(fname, 'r') as reader:
                text = ''
                for line in reader:
                    text = text + line.rstrip('\n\r').lower()
            print('\nProcessing ' + fname)
            # Tokenization
            tokens = nlp.get_tokens(text)
            # Part of speech tagging
            tag_set = nlp.get_pos_tags(tokens)
            # Technical n-gram extraction
            ngrams = nlp.get_tech_ngrams(text, tag_set)
            # Write to output
            csvwriter.writerow([fname, text, list(ngrams.keys())])
            ngrams_set = ngrams_set.union(set(ngrams.keys()))
    ngrams_list = nlp.lexicon_expansion(list(ngrams_set))
    with open('ngrams.txt', 'w') as writer:
        for s in ngrams_list:
            writer.write(str(s) + '\n')

if __name__ == '__main__':
    sys.exit(main())
'''