-
Notifications
You must be signed in to change notification settings - Fork 25
/
pronunciation.py
75 lines (59 loc) · 2.32 KB
/
pronunciation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
Get the arpabet pronunciation of a set of words, courtesy
of the CMU Sphinx pronunciation dictionary (and their
tools to determine the pronunciation of unknown words).
Usage: create a Pronounce object, add words to it with .add(),
then call .p() to get their pronunciations.
Command line: python pronunciation.py list of words to pronounce
Copyright 2013 - Steven Rubin - [email protected]
MIT License
"""
import requests
import sys
import re
import string
class Pronounce(object):
    """Look up ARPAbet pronunciations through the CMU lextool web service.

    Words are collected on the instance (via the constructor and ``add``)
    and resolved in one batch by ``p``.
    """

    # Endpoint that accepts the uploaded word file.
    url = "http://www.speech.cs.cmu.edu/cgi-bin/tools/logios/lextool.pl"
    # Locates the URL of the generated ``.dict`` file in the response page.
    dict_re = re.compile(r"http://.*\d+\.dict")
    # Matches alternate-pronunciation headwords such as ``WORD(2)``.
    other_pr = re.compile(r"(.*)\(\d+\)$")
    # Two-letter vowel symbols that receive the fake stress marker.
    vowel_re = re.compile(r"AA|AE|AH|AO|AW|AY|EH|ER|EY|IH|IY|OW|OY|UH|UW")

    # Translation table that deletes every ASCII punctuation character.
    _punc_map = dict((ord(c), None) for c in string.punctuation)

    # Text type used to coerce words: ``unicode`` on Python 2, ``str`` on 3.
    try:
        _text_type = unicode
    except NameError:
        _text_type = str

    def __init__(self, words=None):
        """Create a lookup object.

        Args:
            words: optional iterable of words to pronounce. It is copied, so
                later ``add`` calls never mutate the caller's sequence.
        """
        self.words = list(words) if words else []

    def add(self, word):
        """Queue one more word for the next ``p`` call."""
        self.words.append(word)

    def _normalized_words(self):
        """Return two lists parallel to ``self.words``.

        The first holds each word upper-cased; the second holds the same
        strings with punctuation removed (the form sent to the service).
        """
        upper = [self._text_type(w).upper() for w in self.words]
        stripped = [u.translate(self._punc_map) for u in upper]
        return upper, stripped

    def _parse_dict(self, text, add_fake_stress=False):
        """Turn the text of a ``.dict`` file into the result mapping.

        Each useful line is ``HEADWORD<TAB>PHONES``; a headword of the form
        ``WORD(2)`` is an additional pronunciation of ``WORD``.

        Args:
            text: contents of the dictionary file.
            add_fake_stress: when true, append ``0`` to every vowel symbol.

        Returns:
            dict mapping each original word to a list whose first item is
            the upper-cased word, followed by its pronunciation strings.
        """
        upper, stripped = self._normalized_words()

        # Index the originals by their normalized key so lookups are O(1)
        # and so that every original sharing a key gets the pronunciation.
        by_key = {}
        for orig, upword, key in zip(self.words, upper, stripped):
            bucket = by_key.setdefault(key, [])
            if (orig, upword) not in bucket:
                bucket.append((orig, upword))

        pronunciations = {}
        for line in text.splitlines():
            fields = line.split('\t')
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            head, phones = fields[0], fields[1]
            alternate = Pronounce.other_pr.match(head)
            if alternate:
                head = alternate.group(1)
            if add_fake_stress:
                phones = Pronounce.vowel_re.sub(r"\g<0>0", phones)
            # Headwords that were never requested are skipped.
            for orig, upword in by_key.get(head, ()):
                if orig in pronunciations:
                    pronunciations[orig].append(phones)
                else:
                    pronunciations[orig] = [upword, phones]
        return pronunciations

    def p(self, add_fake_stress=False):
        """Fetch pronunciations for all queued words.

        Args:
            add_fake_stress: when true, append ``0`` to every vowel symbol
                of each pronunciation.

        Returns:
            dict mapping each original word to
            ``[UPPERCASED_WORD, pronunciation, ...]``. An empty dict is
            returned, without any network traffic, when there is nothing
            to look up.

        Raises:
            ValueError: if the service response contains no ``.dict`` URL.
            requests.HTTPError: if either HTTP request returns an error
                status.
        """
        _, stripped = self._normalized_words()
        query = [w for w in stripped if w]
        if not query:
            return {}

        wordfile = {'wordfile': ('words.txt', " ".join(query))}
        res = requests.post(Pronounce.url,
                            data={"formtype": "simple"},
                            files=wordfile, allow_redirects=True)
        res.raise_for_status()

        found = Pronounce.dict_re.search(res.text)
        if found is None:
            raise ValueError(
                "lextool response did not contain a .dict URL")

        res = requests.get(found.group(0))
        res.raise_for_status()
        return self._parse_dict(res.text, add_fake_stress)
if __name__ == '__main__':
    # Command-line use: every argument is a word to pronounce.
    pr = Pronounce(sys.argv[1:])
    # Parenthesized form prints the same result on Python 2 and also
    # parses on Python 3.
    print(pr.p())