utils.py
# Utilities for the DIConf Notebooks

import sys
import nltk
import string

from nltk.corpus import wordnet as wn


##########################################################################
## Data Loading Utility
##########################################################################
def documents(corpus, fileids=None, categories=None, norm=False):
    """
    Generator yields tokenized documents in the corpus.
    """
    for doc in corpus.docs(fileids=fileids, categories=categories):
        doc = [
            token
            for paragraph in doc
            for sentence in paragraph
            for token in sentence
        ]

        if norm:
            yield normalize(doc)
        else:
            yield doc

def utterances(corpus, fileids=None, categories=None, norm=False):
    """
    Generator yields tokenized sentences in the corpus.
    """
    for sent in corpus.sents(fileids=fileids, categories=categories):
        if norm:
            yield normalize(sent)
        else:
            yield sent

def labels(corpus, fileids=None, categories=None):
    """
    Generator yields labels for the given subset of docs.
    """
    for fileid in corpus._resolve(fileids, categories):
        yield corpus.categories(fileids=[fileid])[0]

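# Example usage (a hedged sketch, not part of the original file): any corpus
# reader exposing docs(), sents(), categories(), and _resolve() in the shapes
# assumed above would work; the PickledCorpusReader import and the corpus path
# are illustrative assumptions.
#
#   from reader import PickledCorpusReader
#   corpus = PickledCorpusReader('corpus/')
#   for label, doc in zip(labels(corpus), documents(corpus, norm=True)):
#       print(label, doc[:10])
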
##########################################################################
## Helper Functions
##########################################################################

def identity(arg):
    """
    Simple identity function that works as a passthrough.
    """
    return arg

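# Hedged note (an assumption, not from the original file): an identity function
# like this is typically handed to scikit-learn vectorizers when the tokens are
# already tokenized and normalized, for example:
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   vectorizer = CountVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
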
##########################################################################
## Preprocessing
##########################################################################

class Lemmatizer(object):
    """
    Wraps the nltk.WordNetLemmatizer to provide added functionality like the
    discovery of the part of speech of the word to lemmatize.
    """

    def __init__(self):
        self._wordnet = nltk.WordNetLemmatizer()
        self._cache = {}

    def tagwn(self, tag):
        """
        Returns the WordNet tag from the Penn Treebank tag.
        """
        return {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ,
        }.get(tag[0], wn.NOUN)

    def poswn(self, word):
        """
        Computes the part of speech for the given word.
        """
        return self.tagwn(nltk.pos_tag([word])[0][1])

    def lemmatize(self, word, tag=None):
        """
        Lemmatizes the word; if no tag is given, then computes the tag.
        """
        # Cache under the arguments as given, so that repeated lookups with
        # the same (word, tag) pair actually hit the cache.
        key = (word, tag)
        if key in self._cache:
            return self._cache[key]

        tag = self.tagwn(tag) if tag else self.poswn(word)
        lemma = self._wordnet.lemmatize(word, tag)

        self._cache[key] = lemma
        return lemma

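# Example usage (a hedged sketch, not in the original file): the outputs shown
# are illustrative and assume the NLTK "wordnet" and "averaged_perceptron_tagger"
# data packages have been downloaded.
#
#   lemmatizer = Lemmatizer()
#   lemmatizer.lemmatize("geese")           # -> "goose" (POS inferred)
#   lemmatizer.lemmatize("running", "VBG")  # -> "run"   (Penn Treebank tag supplied)
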
PUNCTUATION = string.punctuation + "“”’—"


class Normalizer(object):
    """
    Normalizes pre-tagged tokens by lemmatizing, lowercasing, and stripping
    stopwords and punctuation.
    """

    def __init__(self, stopwords=None, punct=None):
        self.stopwords = set(stopwords or nltk.corpus.stopwords.words('english'))
        self.punct = str.maketrans('', '', punct or PUNCTUATION)
        self.lemmatizer = Lemmatizer()

    def strip_punctuation(self, token):
        return token.translate(self.punct)

    def strip_stopwords(self, token):
        if token in self.stopwords:
            return ""
        return token

    def __call__(self, tokens):
        # Each token is expected to be a (word, tag) pair, as produced by the
        # corpus readers; the pair is unpacked into lemmatize().
        tokens = map(lambda s: self.lemmatizer.lemmatize(*s), tokens)
        tokens = map(lambda s: s.strip(), tokens)
        tokens = map(lambda s: s.lower(), tokens)
        tokens = map(lambda s: self.strip_stopwords(s), tokens)
        tokens = map(lambda s: self.strip_punctuation(s), tokens)
        return list(filter(None, tokens))


normalize = Normalizer()
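
# Hedged usage sketch (not part of the original module): Normalizer expects
# (word, tag) pairs, because __call__ unpacks each token into lemmatize().
# The sample sentence below is an illustrative assumption and requires the
# NLTK "stopwords" and "wordnet" data packages.
if __name__ == "__main__":
    sample = [
        ("The", "DT"), ("cats", "NNS"), ("were", "VBD"),
        ("running", "VBG"), ("home", "NN"), (".", "."),
    ]
    # Stopwords and punctuation are dropped and the rest is lemmatized,
    # yielding something like: ['cat', 'run', 'home']
    print(normalize(sample))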