# nltk_utils.py
################################################################################
# Imports
################################################################################
# Explicit imports instead of the original wildcard imports. This module
# targets Python 2 and NLTK 2.x (nltk.model.NgramModel was removed in NLTK 3).
from nltk.corpus import treebank, brown, shakespeare, PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, ELEProbDist
from nltk.model import NgramModel
from nltk.grammar import Nonterminal, induce_pcfg
from nltk import tag, word_tokenize
from nltk.tag import brill  # ensures tag.brill is available for the templates
################################################################################
# Returns the corpus from its string identifier
################################################################################
def get_corpus(corpus_):
    if corpus_ == "treebank":
        corpus = treebank
    elif corpus_ == "brown":
        corpus = brown
    elif corpus_ == "shakespeare":
        corpus = shakespeare # TODO Review this, not compatible
    elif corpus_ == "grimmstories":
        corpus = PlaintextCorpusReader("./custom_corpora/GrimmCorpus", r'.*\.txt')
    elif corpus_ == "spinoza":
        corpus = PlaintextCorpusReader("./custom_corpora/SpinozaCorpus", r'.*\.txt')
    elif corpus_ == "kant":
        corpus = PlaintextCorpusReader("./custom_corpora/KantCorpus", r'.*\.txt')
    else:
        print "--Falling back to treebank as training set"
        corpus = treebank
    return corpus
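# Usage sketch (not in the original file; assumes the Brown corpus data has
# been installed via nltk.download()), shown doctest-style:
#   >>> get_corpus("brown").words()[:3]
#   ['The', 'Fulton', 'County']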
################################################################################
# Returns the words list of a corpus given its string identifier
################################################################################
def get_corpus_words(corpus_, corpus_category_):
    if corpus_ == "brown":
        return get_corpus(corpus_).words(categories=corpus_category_)
    else:
        return get_corpus(corpus_).words()
################################################################################
# Sets the training set and extracts its contents as a list of tagged words
# NOT USED. Only here for research purposes.
################################################################################
def get_tagged_words_list_from_corpus(corpus_):
    # Set the training set.
    # Treebank produces unorthodox results in the context of a usual
    # conversation because of its economic content.
    # NLTK 2.x: simplify_tags is a boolean that collapses the full tagset into
    # a simplified one (the original passed the truthy string "universal",
    # which behaved the same way).
    if corpus_ == "treebank":
        sentences = treebank.tagged_sents(simplify_tags=True)
    elif corpus_ == "brown":
        sentences = brown.tagged_sents(simplify_tags=True)
    elif corpus_ == "shakespeare":
        sentences = shakespeare.tagged_sents(simplify_tags=True) # TODO Review this, not compatible
    else:
        print "--Falling back to treebank as training set"
        sentences = treebank.tagged_sents(simplify_tags=True)
    return sentences
################################################################################
# Sets the training set and extracts its contents as a list of words
################################################################################
def get_words_list_from_corpus(corpus_, corpus_category_):
    corpus_words = get_corpus_words(corpus_, corpus_category_)
    # The tokenized training set as a flat list. Each corpus token is run
    # through word_tokenize again, which may split it further
    # (e.g. "can't" -> "ca", "n't").
    words = []
    for corpus_word in corpus_words:
        for token in word_tokenize(corpus_word):
            words.append(token)
    return words
################################################################################
# Induce a context free grammar from the corpus used
# NOT USED. Only here for research purposes.
################################################################################
def induce_grammar_from_parsed_corpus(corpus_):
    corpus = get_corpus(corpus_)
    productions = []
    for fileid in corpus.fileids():  # .fileids() is the NLTK 2.x API (was .items)
        for tree in corpus.parsed_sents(fileid):
            # Perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    return grammar
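# Research sketch (hypothetical call, not wired into init): inspect the
# induced PCFG, e.g.
#   grammar = induce_grammar_from_parsed_corpus("treebank")
#   print grammar  # start symbol S plus weighted productions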
################################################################################
# Consider this the sub-main method of the program
################################################################################
def init(corpus_, corpus_category_, N_, est_):
    # Initialize the corpus contents (as a word list), the size N (for ngrams)
    # and the estimator if smoothing is selected
    words, N, estimator = init_base(corpus_, corpus_category_, N_, est_)
    # Build the language model on the selected base
    language_model = init_language_model(words, N, estimator)
    #tag_model = init_tagger_model(corpus_)
    #cfg_grammar = induce_grammar_from_parsed_corpus(corpus_)
    return language_model #, tag_model, cfg_grammar
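# Illustrative call (a sketch; "news" is one of the Brown corpus categories
# and est_ == 1 selects Lidstone smoothing in set_estimator below):
#   language_model = init("brown", "news", 3, 1)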
################################################################################
# Initializes corpus, n for ngrams and the smoothing technique
################################################################################
def init_base(corpus_, corpus_category_, N_, est_):
    # Sets the training set and extracts its contents as a list of words
    words = get_words_list_from_corpus(corpus_, corpus_category_)
    # Set the N-gram N factor
    N = N_
    # Set the smoothing estimator
    estimator = set_estimator(est_, words)
    return words, N, estimator
################################################################################
# Constructs the language model
################################################################################
def init_language_model(words, N, estimator):
    # Ngram language model based on the training set
    if estimator:
        langModel = NgramModel(N, words, estimator=estimator)
    else:
        langModel = NgramModel(N, words)
    return langModel
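# Usage sketch for the returned model, under the NLTK 2.x NgramModel API
# (assumes a trigram model, so contexts are pairs of words):
#   model = init_language_model(words, 3, None)
#   print model.generate(20)                # sample 20 words from the model
#   print model.prob("the", ("out", "of"))  # P(the | out, of)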
################################################################################
# Constructs the tagging model
# NOT USED. Only here for research purposes.
################################################################################
def init_tagger_model(corpus_):
    tagged_sents = get_tagged_words_list_from_corpus(corpus_)
    # Last-resort tagger
    regexp_backoff = tag.RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers (\. fixed from .)
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN')                      # nouns (default)
    ])
    # Taggers in backoff
    tagger_1 = tag.NgramTagger(1, tagged_sents, backoff=regexp_backoff)
    tagger_2 = tag.NgramTagger(2, tagged_sents, backoff=tagger_1)
    tagger_3 = tag.NgramTagger(3, tagged_sents, backoff=tagger_2)
    tagger_4 = tag.NgramTagger(4, tagged_sents, backoff=tagger_3)
    # Brill transformation templates
    templates = [
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateTagsRule, (1, 1)),
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateTagsRule, (2, 2)),
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateTagsRule, (1, 2)),
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateTagsRule, (1, 3)),
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateWordsRule, (1, 1)),
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateWordsRule, (2, 2)),
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateWordsRule, (1, 2)),
        tag.brill.SymmetricProximateTokensTemplate(tag.brill.ProximateWordsRule, (1, 3)),
        tag.brill.ProximateTokensTemplate(tag.brill.ProximateTagsRule, (-1, -1), (1, 1)),
        tag.brill.ProximateTokensTemplate(tag.brill.ProximateWordsRule, (-1, -1), (1, 1))
    ]
    trainer = tag.brill.FastBrillTaggerTrainer(tagger_4, templates)
    brill_tagger = trainer.train(tagged_sents, max_rules=100, min_score=3)
    return brill_tagger
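# Research sketch (hypothetical call; tagging a fresh sentence with the
# trained Brill tagger):
#   tagger = init_tagger_model("brown")
#   print tagger.tag(word_tokenize("The quick brown fox jumps"))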
################################################################################
# Sets the estimator for smoothing
################################################################################
def set_estimator(est_, words):
    # Smoother selection. Estimators follow NLTK's (fdist, bins) convention;
    # bins is unused by these smoothers, and words is accepted for signature
    # compatibility but not used here.
    if est_ == 0:
        estimator = None
    elif est_ == 1:
        print "--Using LidstoneProbDist as smoother"
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    elif est_ == 2:
        print "--Using ELEProbDist as smoother"
        estimator = lambda fdist, bins: ELEProbDist(fdist)
    else:
        print "--Falling back to Lidstone as smoother"
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
    return estimator
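################################################################################
# Smoke test (illustrative sketch, not part of the original module: assumes
# Python 2 with NLTK 2.x and the treebank corpus data installed)
################################################################################
if __name__ == "__main__":
    # Trigram model over the treebank corpus with ELE smoothing (est_ == 2);
    # the category argument is ignored for non-Brown corpora.
    model = init("treebank", None, 3, 2)
    # Sample a short word sequence from the model
    print " ".join(model.generate(15))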