# -*- coding: utf-8 -*-
""" tokenizer for tweets! might be appropriate for other social media dialects too.
general philosophy is to throw as little out as possible.
development philosophy: every time you change a rule, do a diff of this
program's output on ~100k tweets. if you iterate through many possible rules
and only accept the ones that seem to result in good diffs, it's a sort of
statistical learning with in-the-loop human evaluation :)
"""
__author__="brendan o'connor (anyall.org)"
import re,sys
import emoticons
mycompile = lambda pat: re.compile(pat, re.UNICODE)
def regex_or(*items):
    r = '|'.join(items)
    r = '(' + r + ')'
    return r

def pos_lookahead(r):
    return '(?=' + r + ')'

def neg_lookahead(r):
    return '(?!' + r + ')'

def optional(r):
    return '(%s)?' % r
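# Illustrative behaviour of the helpers above (added for clarity, easy to
# verify interactively; not part of the original comments):
#   regex_or('foo', 'bar')  ->  '(foo|bar)'
#   optional('x')           ->  '(x)?'
#   pos_lookahead(r'\d')    ->  '(?=\d)'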
PunctChars = r'''['“".?!,:;]'''
Punct = '%s+' % PunctChars
Entity = '&(amp|lt|gt|quot);'
# one-liner URL recognition:
#Url = r'''https?://\S+'''
# more complex version:
UrlStart1 = regex_or('https?://', r'www\.')
CommonTLDs = regex_or('com','co\\.uk','org','net','info','ca')
UrlStart2 = r'[a-z0-9\.-]+?' + r'\.' + CommonTLDs + pos_lookahead(r'[/ \W\b]')
UrlBody = r'[^ \t\r\n<>]*?' # * not + for case of: "go to bla.com." -- don't want period
UrlExtraCrapBeforeEnd = '%s+?' % regex_or(PunctChars, Entity)
UrlEnd = regex_or( r'\.\.+', r'[<>]', r'\s', '$')
Url = (r'\b' +
       regex_or(UrlStart1, UrlStart2) +
       UrlBody +
       pos_lookahead( optional(UrlExtraCrapBeforeEnd) + UrlEnd))
Url_RE = re.compile("(%s)" % Url, re.U|re.I)
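# Illustrative examples (assumed, not from the original source): Url_RE should
# match spans such as "http://example.com" or "www.example.com/foo", while the
# trailing-punctuation lookahead keeps a final "." or "!" out of the match.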
Timelike = r'\d+:\d+'
NumNum = r'\d+\.\d+'
NumberWithCommas = r'(\d+,)+?\d{3}' + pos_lookahead(regex_or('[^,]','$'))
Abbrevs1 = ['am','pm','us','usa','ie','eg']
def regexify_abbrev(a):
    chars = list(a)
    icase = ["[%s%s]" % (c,c.upper()) for c in chars]
    dotted = [r'%s\.' % x for x in icase]
    return "".join(dotted)
Abbrevs = [regexify_abbrev(a) for a in Abbrevs1]
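# Illustrative example (not in the original): regexify_abbrev('am') yields
# r'[aA]\.[mM]\.', i.e. a letter-by-letter case-insensitive match for "a.m.".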
BoundaryNotDot = regex_or(r'\s', '[“"?!,:;]', Entity)
aa1 = r'''([A-Za-z]\.){2,}''' + pos_lookahead(BoundaryNotDot)
aa2 = r'''([A-Za-z]\.){1,}[A-Za-z]''' + pos_lookahead(BoundaryNotDot)
ArbitraryAbbrev = regex_or(aa1,aa2)
assert '-' != '―'
Separators = regex_or('--+', '―')
Decorations = r' [ ♫ ]+ '.replace(' ','')
EmbeddedApostrophe = r"\S+'\S+"
ProtectThese = [
    emoticons.Emoticon,
    Url,
    Entity,
    Timelike,
    NumNum,
    NumberWithCommas,
    Punct,
    ArbitraryAbbrev,
    Separators,
    Decorations,
    EmbeddedApostrophe,
]
Protect_RE = mycompile(regex_or(*ProtectThese))
class Tokenization(list):
    " list of tokens, plus extra info "
    def __init__(self):
        self.alignments = []
        self.text = ""
    def subset(self, tok_inds):
        new = Tokenization()
        new += [self[i] for i in tok_inds]
        new.alignments = [self.alignments[i] for i in tok_inds]
        new.text = self.text
        return new
    def assert_consistent(t):
        assert len(t) == len(t.alignments)
        assert [t.text[t.alignments[i] : (t.alignments[i]+len(t[i]))] for i in range(len(t))] == list(t)
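# Note (added for clarity): a Tokenization behaves like a plain list of token
# strings; alignments[i] is the character offset of token i within .text.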
def align(toks, orig):
    s_i = 0
    alignments = [None]*len(toks)
    for tok_i in range(len(toks)):
        while True:
            L = len(toks[tok_i])
            if orig[s_i:(s_i+L)] == toks[tok_i]:
                alignments[tok_i] = s_i
                s_i += L
                break
            s_i += 1
            if s_i >= len(orig): raise AlignmentFailed((orig,toks,alignments))
            #if orig[s_i] != ' ': raise AlignmentFailed("nonspace advance: %s" % ((s_i,orig),))
    if any(a is None for a in alignments): raise AlignmentFailed((orig,toks,alignments))
    return alignments
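# Illustrative example (not in the original): align(['i', 'luv', 'u'], "i luv u")
# returns [0, 2, 6] -- the character offset of each token in the original string.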
class AlignmentFailed(Exception): pass
def unicodify(s, encoding='utf8', *args):
    if isinstance(s,unicode): return s
    if isinstance(s,str): return s.decode(encoding, *args)
    return unicode(s)
def tokenize(tweet):
    text = unicodify(tweet)
    text = squeeze_whitespace(text)
    t = Tokenization()
    t += simple_tokenize(text)
    t.text = text
    t.alignments = align(t, text)
    return t
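# Illustrative usage (Python 2; the exact output also depends on the bundled
# emoticons module, so treat this as a sketch rather than a guaranteed result):
#   tokenize("Check out http://example.com !!!")
#   # -> [u'Check', u'out', u'http://example.com', u'!!!']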
def simple_tokenize(text):
    s = text
    s = edge_punct_munge(s)

    # strict alternating ordering through the string. first and last are goods.
    # good bad good bad good bad good
    goods = []
    bads = []
    i = 0
    if Protect_RE.search(s):
        for m in Protect_RE.finditer(s):
            goods.append( (i,m.start()) )
            bads.append(m.span())
            i = m.end()
        goods.append( (m.end(), len(s)) )
    else:
        goods = [ (0, len(s)) ]
    assert len(bads)+1 == len(goods)

    goods = [s[i:j] for i,j in goods]
    bads = [s[i:j] for i,j in bads]
    #print goods
    #print bads
    goods = [unprotected_tokenize(x) for x in goods]
    res = []
    for i in range(len(bads)):
        res += goods[i]
        res.append(bads[i])
    res += goods[-1]

    res = post_process(res)
    return res
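# Illustrative walk-through (assumes the emoticons module matches nothing here):
# for "omg 10:30 already", Protect_RE protects the Timelike span "10:30", so the
# good spans tokenize to ['omg'] and ['already'] and the interleaved result is
# ['omg', '10:30', 'already'].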
AposS = mycompile(r"(\S+)('s)$")
def post_process(pre_toks):
    # hacky: further splitting of certain tokens
    post_toks = []
    for tok in pre_toks:
        m = AposS.search(tok)
        if m:
            post_toks += m.groups()
        else:
            post_toks.append( tok )
    return post_toks
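# Illustrative example (not in the original): post_process(["brendan's", "cat"])
# returns ["brendan", "'s", "cat"].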
WS_RE = mycompile(r'\s+')
def squeeze_whitespace(s):
    new_string = WS_RE.sub(" ",s)
    return new_string.strip()
# fun: copy and paste outta http://en.wikipedia.org/wiki/Smart_quotes
EdgePunct = r"""[ ' " “ ” ‘ ’ < > « » { } ( \) [ \] ]""".replace(' ','')
#NotEdgePunct = r"""[^'"([\)\]]""" # alignment failures?
NotEdgePunct = r"""[a-zA-Z0-9]"""
EdgePunctLeft = r"""(\s|^)(%s+)(%s)""" % (EdgePunct, NotEdgePunct)
EdgePunctRight = r"""(%s)(%s+)(\s|$)""" % (NotEdgePunct, EdgePunct)
EdgePunctLeft_RE = mycompile(EdgePunctLeft)
EdgePunctRight_RE= mycompile(EdgePunctRight)
def edge_punct_munge(s):
    s = EdgePunctLeft_RE.sub( r"\1\2 \3", s)
    s = EdgePunctRight_RE.sub(r"\1 \2\3", s)
    return s
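# Illustrative example (not in the original): edge_punct_munge('(hi there)')
# returns '( hi there )', splitting brackets and quotes off word edges so they
# become separate whitespace-delimited tokens.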
def unprotected_tokenize(s):
    return s.split()
if __name__=='__main__':
    for line in sys.stdin:
        print u" ".join(tokenize(line[:-1])).encode('utf-8')
        #print "CUR\t" + " ".join(tokenize(line[:-1]))
        #print "WS\t" + " ".join(line[:-1].split())
        #print ansi.color(line.strip(),'red')
        #print ansi.color(" ".join(tokenize(line.strip())),'blue','bold')