-
Notifications
You must be signed in to change notification settings - Fork 3
/
chunker.py
48 lines (39 loc) · 1.3 KB
/
chunker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# -*- coding: utf-8 -*-
import plac
import regex as re
import ftfy
from nltk import RegexpParser
from nltk.tag.util import str2tuple
from string import punctuation as punct
# Chunk grammar for NLTK's RegexpParser, applied to (word, tag) tokens:
#   NP - one or more tokens tagged PNOUN, NOUN, NUM or "." (the escaped
#        period matches the literal "." tag; main() later drops leaves
#        carrying that tag), optionally followed by a single ADJ.
#   VP - one or more consecutive VERB tokens.
grammar = r"""
NP:
{<PNOUN|NOUN|NUM|\.>+<ADJ>?}
VP:
{<VERB>+}
"""
# Module-level parser used by main(); built once when the module is loaded.
chunker = RegexpParser(grammar)
def is_ascii(s):
    """Return True if every character of ``s`` has a code point below 128.

    An empty string is considered ASCII (there is no offending character).
    """
    return all(ord(c) < 128 for c in s)
def main(corpus_file, output):
    """Extract noun and verb phrases from a POS-tagged corpus.

    Every line of ``corpus_file`` is split on whitespace, each token is
    turned into a ``(word, tag)`` pair with ``str2tuple`` and the line is
    chunked with the module-level ``chunker``. Each NP/VP chunk is written
    to ``output`` as one lower-cased phrase per line; phrases of two
    characters or fewer, purely numeric phrases and phrases containing
    non-ASCII characters are dropped.

    Errors raised while repairing, chunking or writing a line are printed
    and processing continues with the next item (best effort).

    Args:
        corpus_file: Path of the UTF-8 encoded input corpus.
        output: Path of the file the phrases are written to (overwritten).
    """
    # Local import: io.open offers the same decoding text API on Python 2
    # and Python 3, so the text is decoded once at the I/O boundary.
    import io

    wanted_labels = ('NP', 'VP')

    # Both files are managed by the `with` statement so neither handle leaks.
    # Undecodable bytes become U+FFFD instead of aborting the run; phrases
    # containing them are filtered out below by the is_ascii() check.
    with io.open(output, 'w', encoding='utf-8') as out, \
            io.open(corpus_file, encoding='utf-8', errors='replace') as corpus:
        for line in corpus:
            try:
                line = ftfy.fix_text(line)
            except Exception as e:
                # Best effort: report and carry on with the unrepaired line.
                print(e)

            # str.split() without arguments already collapses any run of
            # whitespace, so no regex normalisation is needed beforehand.
            tokens = [str2tuple(tok) for tok in line.split()]
            if not tokens:
                # Blank line: there is nothing to chunk.
                continue

            try:
                tree = chunker.parse(tokens)
            except Exception as e:
                print(e)
                # Skip this line. Falling through would either hit an
                # unbound `tree` or re-emit the previous line's phrases.
                continue

            for subtree in tree.subtrees(
                    filter=lambda t: t.label() in wanted_labels):
                try:
                    # Drop leaves tagged "." and trim punctuation from the
                    # edges of every remaining word.
                    words = [w.strip(punct)
                             for (w, t) in subtree.leaves() if t != '.']
                    text = ' '.join(words).strip().lower()
                    if len(text) > 2 and not text.isdigit() and is_ascii(text):
                        # u'' literal keeps the write unicode-only, which
                        # io text files require on Python 2.
                        out.write(text + u'\n')
                except Exception as e:
                    print(e)
if __name__ == '__main__':
    # Script entry point: plac derives the command-line interface from
    # main's signature (corpus_file, output) and invokes it.
    plac.call(main)