kneserney.py
#!/usr/bin/env python
"""
Build a Kneser-Ney smoothed language model from data with (possibly
fractional) sentence weights.
author: David Chiang
version: 2013-05-07
For usage, run: python kneserney.py -h
Please note that this code is primarily for instructional purposes
and is not the same code as that used for the experiments in the
paper.
Copyright (c) 2013, University of Southern California
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
"""
from __future__ import division

import sys
from collections import defaultdict
import math
start = "<s>"
stop = "</s>"
unk = "<unk>"
logzero = -99  # stand-in for log10(0) in ARPA output
verbose = 0
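
# Example invocation (a sketch; file names are hypothetical):
#   python kneserney.py corpus.tok weights.txt -n 3 -s 3 -o model.arpa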

class CountDistribution(object):
    """A (partial) representation of a distribution over nonnegative
    integers."""

    def __init__(self, s=0):
        """We only store the first s values of the probability mass
        function. In other words, s-1 is the maximum value of r
        that we will want to know P(count=r) for."""
        self.mean = 0.
        self.p = [0.]*s
        if s > 0:
            self.p[0] = 1.

    def add(self, p=1., c=1):
        """Add c events occurring together with probability p."""
        self.mean += p * c
        # Convolve the stored pmf with a Bernoulli event: with
        # probability p the count increases by c, else it is unchanged.
        for r in xrange(len(self.p)-1, -1, -1):
            self.p[r] = self.p[r] * (1-p)
            if r-c >= 0:
                self.p[r] += self.p[r-c] * p

    def p_ge(self, r):
        """Probability that the count is greater than or equal to r."""
        # The max is a sanity check against rounding errors. More
        # sophisticated corrections don't have any effect in practice.
        return max(1 - sum(self.p[r1] for r1 in xrange(r)), self.p[r])
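
# Worked example (a sketch): two events, each present with probability 0.5,
# yield a Binomial(2, 0.5) count distribution.
#   cd = CountDistribution(3)
#   cd.add(0.5); cd.add(0.5)
#   cd.p    == [0.25, 0.5, 0.25]
#   cd.mean == 1.0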

class LanguageModel(object):
    def __init__(self, order):
        self.order = order
        self.base = None
        self.p = defaultdict(dict)
        self.b = {}

    def prob(self, context, word, interpolate=False):
        """Return P(word | context), backing off to the lower-order model
        with weight b[context] when the n-gram is not stored (or always,
        if interpolate is True)."""
        p = 0.
        found = context in self.p and word in self.p[context]
        if found:
            p += self.p[context][word]
        if self.base and (not found or interpolate):
            p += self.b.get(context, 1.) * self.base.prob(context[1:], word)
        return p
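
    # Backoff sketch: for a trigram model m, m.prob(('the', 'quick'), 'fox')
    # returns p[('the', 'quick')]['fox'] if stored; otherwise it falls back
    # to b[('the', 'quick')] (default 1) * m.base.prob(('quick',), 'fox').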

    def add(self, context, word, p):
        if len(context)+1 == self.order:
            self.p[context][word] = p
        else:
            self.base.add(context, word, p)

    def interpolate(self):
        """Interpolate this model with the lower-order model."""
        if not self.base:
            return
        self.base.interpolate()
        for context in self.p:
            for word in self.p[context]:
                self.p[context][word] += self.b[context] * self.base.prob(context[1:], word)

    def trim_start(self):
        """If an n-gram starts with two <s> symbols, trim the first one
        and transfer the (n-1)-gram to the lower-order model."""
        if not self.base:
            return
        to_del = []
        for context in self.p:
            if len(context) >= 2 and context[1] == start:
                self.base.p[context[1:]] = self.p[context]
                self.base.b[context[1:]] *= self.b[context]
                to_del.append(context)
        for context in to_del:
            del self.p[context]
            del self.b[context]
        self.base.trim_start()

    def _write_length(self, outfile):
        if self.base:
            self.base._write_length(outfile)
        if self.order >= 1:
            size = sum(len(self.p[context]) for context in self.p)
            outfile.write("ngram %d=%d\n" % (self.order, size))

    def _write_ngrams(self, outfile, parent=None):
        if self.base:
            self.base._write_ngrams(outfile, self)
        if self.order >= 1:
            outfile.write("\\%d-grams:\n" % self.order)
            for context in self.p:
                for word, prob in self.p[context].iteritems():
                    ngram = context+(word,)
                    prob = math.log10(prob) if prob > 0. else logzero
                    if parent and ngram in parent.b:
                        bow = math.log10(parent.b[ngram])
                        outfile.write("%f\t%s\t%f\n" % (prob, " ".join(ngram), bow))
                    else:
                        outfile.write("%f\t%s\n" % (prob, " ".join(ngram)))
            outfile.write("\n")

    def write_arpa(self, outfile):
        outfile.write("\\data\\\n")
        self._write_length(outfile)
        outfile.write("\n")
        self._write_ngrams(outfile)
        outfile.write("\\end\\\n")

class MaximumLikelihood(object):
    def __init__(self, order):
        self.order = order
        self.c = defaultdict(lambda: defaultdict(float))

    def add(self, context, word, p=1., c=1):
        self.c[context][word] += p * c

    def estimate(self):
        m = LanguageModel(self.order)
        m.p = defaultdict(dict)
        m.b = {}
        for context in self.c:
            n = sum(self.c[context][word] for word in self.c[context])
            for word in self.c[context]:
                m.p[context][word] = self.c[context][word] / n
            m.b[context] = 0.
        return m

class Uniform(object):
    def __init__(self, order=0):
        self.order = order
        self.c = defaultdict(set)

    def add(self, context, word, p=1., c=1):
        self.c[context].add(word)

    def estimate(self):
        m = LanguageModel(self.order)
        for context in self.c:
            for word in self.c[context]:
                m.p[context][word] = 1./len(self.c[context])
            m.b[context] = 0.
        return m

class KneserNey(object):
    def __init__(self, order, s=1, zerogram=None):
        """For types seen r<s times, discount by D_r (similar to Good-Turing);
        for types seen r>=s times, discount by D (absolute discounting).
        For Modified Kneser-Ney, use s=3."""
        self.order = order
        self.c = defaultdict(lambda: defaultdict(lambda: CountDistribution(s+2)))
        if order > 1:
            self.base = KneserNey(order-1, s, zerogram)
        else:
            self.base = zerogram or Uniform(order=0)
        self.s = s

    def add(self, context, word, p=1., c=1):
        if len(context)+1 == self.order:
            self.c[context][word].add(p, c)
        else:
            # delegate to base
            self.base.add(context, word, p, c)
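
    # Note: add() routes any context shorter than order-1 down the chain of
    # lower-order models, so a single KneserNey(order) instance absorbs
    # n-grams of every order from one pass over the data.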

    def estimate(self, recursion='uniform'):
        # Collect count-counts over all contexts
        n = []
        for r in xrange(self.s+2):
            n_r = 0.
            for context, c_context in self.c.iteritems():
                for word, c_word in c_context.iteritems():
                    n_r += c_word.p[r]
            n.append(n_r)
        if verbose:
            for r in xrange(1, self.s+2):
                sys.stderr.write("n_%s = %s\n" % (r, n[r]))
        # Compute discounts (as in Chen and Goodman):
        #   Y   = n_1 / (n_1 + 2*n_2)
        #   D_r = r - (r+1) * Y * n_{r+1} / n_r   for 1 <= r <= s
        y = n[1] / (n[1] + 2*n[2])
        d = [0.] + [r-y*(r+1)*n[r+1]/n[r] for r in xrange(1, self.s+1)]
        if verbose:
            for r in xrange(1, self.s+1):
                sys.stderr.write("D_%s = %s\n" % (r, d[r]))
                if d[r]-d[r-1] > 1.:
                    sys.stderr.write("warning: D_%d - D_%d > 1\n" % (r, r-1))

        # Estimate model
        m = LanguageModel(order=self.order)
        for context, c_context in self.c.iteritems():
            b = 0.
            total = sum(c_context[word].mean for word in c_context)
            p_context = m.p[context]
            for word, c_word in c_context.iteritems():
                # Calculate discount for this word
                discount = 0.
                for r in xrange(1, self.s):
                    discount += c_word.p[r] * d[r]
                discount += c_word.p_ge(self.s) * d[self.s]
                # Smoothed probability estimate and backoff weight
                p_context[word] = (c_word.mean - discount) / total
                b += discount / total
                # Send events to lower-order model
                if recursion == 'uniform':
                    # The lower-order model sees one of each type.
                    self.base.add(context[1:], word, p=c_word.p_ge(1))
                elif recursion == 'exact':
                    # The lower-order model sees the discounted part
                    # of each type. This way seems more correct but
                    # doesn't reduce to unweighted KN.
                    for r in xrange(1, self.s+1):
                        p = min(1., c_word.p_ge(r) * (d[r]-d[r-1]))
                        self.base.add(context[1:], word, p=p)
            m.b[context] = b
        if isinstance(self.base, KneserNey):  # maybe recursion should be a field instead?
            m.base = self.base.estimate(recursion=recursion)
        else:
            m.base = self.base.estimate()
        return m
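
# Library-style usage (a sketch, not part of the command-line flow):
#   est = KneserNey(order=3, s=3)
#   est.add(('<s>', 'a'), 'b', p=0.5)   # fractionally weighted trigram
#   lm = est.estimate(recursion='uniform')
#   lm.interpolate()
#   lm.write_arpa(sys.stdout)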

if __name__ == "__main__":
    verbose = 1

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        metavar='file', help='input file, tokenized')
    parser.add_argument('weightfile', nargs='?',
                        type=argparse.FileType('r'), default=None,
                        metavar='file', help='weights file, one weight per line')
    parser.add_argument('-o', dest='outfile',
                        type=argparse.FileType('w'), default=sys.stdout,
                        metavar='file', help='output file in ARPA format (default: stdout)')
    parser.add_argument('-n', dest='order',
                        type=int, default=3,
                        metavar='n', help='order of language model (default: 3)')
    parser.add_argument('-s', dest='discount_min',
                        type=int, default=1,
                        metavar='count', help='minimum count for absolute discounting: *1 = original Kneser-Ney, 3 = modified Kneser-Ney')
    parser.add_argument('--start-ngrams', dest='start',
                        choices=['short', 'long'], default='short',
                        metavar='*short|long', help='how to handle start-of-sentence n-grams')
    parser.add_argument('--recursion', dest='recursion',
                        choices=['uniform', 'exact'], default='uniform',
                        metavar='*uniform|exact', help='how to count n-grams in lower-order models')
    parser.add_argument('--zerogram', dest='zerogram',
                        choices=['uniform', 'fractional'], default='uniform',
                        metavar='*uniform|fractional', help='use uniform or fractional counts to estimate 0-gram model')
    parser.add_argument('--interpolate', dest='interpolate',
                        choices=['yes', 'no'], default='yes',
                        metavar='*yes|no', help='create an interpolated model')
    args = parser.parse_args()

    infile = args.infile
    weightfile = args.weightfile
    outfile = args.outfile
    order = args.order

    if args.zerogram == 'uniform':
        # The 0-gram model is the uniform distribution over types.
        zerogram = Uniform(order=0)
    elif args.zerogram == 'fractional':
        # The 0-gram probability is proportional to the probability of
        # a type being seen.
        zerogram = MaximumLikelihood(order=0)
    estimator = KneserNey(order, s=args.discount_min, zerogram=zerogram)
sys.stderr.write("Counting\n")
for line in infile:
if weightfile:
p = float(weightfile.readline())
else:
p = 1.
if args.start == 'long':
# Prepend n-1 start symbols so that first word is an
# n-gram.
words = [start]*(order-1) + line.split() + [stop]
i0 = order-1
elif args.start == 'short':
# Prepend 1 start symbol so that first word is a 2-gram.
# This means that modified and unmodified counts will get
# mixed together.
words = [start] + line.split() + [stop]
i0 = 1
for i in xrange(i0, len(words)):
context = tuple(words[max(0,i-order+1):i])
word = words[i]
estimator.add(context, word, p=p)
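
    # Context extraction example: with order=3, start='short', and
    # words = ['<s>', 'a', 'b', '</s>'] (so i0=1), the loop above adds:
    #   i=1: context=('<s>',),     word='a'
    #   i=2: context=('<s>', 'a'), word='b'
    #   i=3: context=('a', 'b'),   word='</s>'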

    # Add unknown word
    # Bug: not clear what p should be if 0-gram model is not uniform
    zerogram.add((), unk, p=1.)
    estimator.add((), unk, p=0.)

    # Estimate models
    sys.stderr.write("Estimating\n")
    model = estimator.estimate(recursion=args.recursion)
    if args.interpolate == 'yes':
        sys.stderr.write("Interpolating\n")
        model.interpolate()
    if args.start == 'long':
        sys.stderr.write("Trimming start-of-sentence events\n")
        model.trim_start()

    # The start symbol is the only symbol that has a bow but no prob.
    # Insert it with zero prob so it will get output
    model.add((), start, 0.)

    sys.stderr.write("Writing\n")
    model.write_arpa(outfile)