#tokenizer borrows from nltk, emojipy, jie_ba_tokenizer, and TinySegmenter
#much borrowed from https://github.com/erikavaris/tokenizer/blob/master/tokenizer/tokenizer.py
import collections.abc
import html
import itertools
import pkg_resources
import re
import unicodedata

import nltk
from nltk.tokenize.casual import remove_handles, reduce_lengthening, _str_to_unicode, _replace_html_entities # EMOTICONS, EMOTICON_RE
import emojipy
from emojipy.ruleset import unicode_replace
import regex
import tinysegmenter
from chinese_tokenizer.tokenizer import Tokenizer

from .reg import Regularizer
#emojipy-style display settings, carried over from the borrowed source
ascii = False
unicode_alt = True
sprites = False
image_png_path = 'https://cdn.jsdelivr.net/emojione/assets/3.1/png/64/'
#markup spans to skip when scanning for emoji (raw string so \/ stays literal)
ignored_regexp = r'<object[^>]*>.*?<\/object>|<span[^>]*>.*?<\/span>|<(?:object|embed|svg|img|div|span|p|a)[^>]*>'
unicode_regexp = "(" + '|'.join([re.escape(x.decode("utf-8")) for x in sorted(unicode_replace.keys(), key=len, reverse=True)]) + ")"
shortcode_regexp = ':([-+\\w]+):'
ascii_regexp = '(\\<3|<3|\\<\\/3|<\\/3|\\:\'\\)|\\:\'\\-\\)|\\:D|\\:\\-D|\\=D|\\:\\)|\\:\\-\\)|\\=\\]|\\=\\)|\\:\\]|\'\\:\\)|\'\\:\\-\\)|\'\\=\\)|\'\\:D|\'\\:\\-D|\'\\=D|\\>\\:\\)|>\\:\\)|\\>;\\)|>;\\)|\\>\\:\\-\\)|>\\:\\-\\)|\\>\\=\\)|>\\=\\)|;\\)|;\\-\\)|\\*\\-\\)|\\*\\)|;\\-\\]|;\\]|;D|;\\^\\)|\'\\:\\(|\'\\:\\-\\(|\'\\=\\(|\\:\\*|\\:\\-\\*|\\=\\*|\\:\\^\\*|\\>\\:P|>\\:P|X\\-P|x\\-p|\\>\\:\\[|>\\:\\[|\\:\\-\\(|\\:\\(|\\:\\-\\[|\\:\\[|\\=\\(|\\>\\:\\(|>\\:\\(|\\>\\:\\-\\(|>\\:\\-\\(|\\:@|\\:\'\\(|\\:\'\\-\\(|;\\(|;\\-\\(|\\>\\.\\<|>\\.<|\\:\\$|\\=\\$|#\\-\\)|#\\)|%\\-\\)|%\\)|X\\)|X\\-\\)|\\*\\\\0\\/\\*|\\\\0\\/|\\*\\\\O\\/\\*|\\\\O\\/|O\\:\\-\\)|0\\:\\-3|0\\:3|0\\:\\-\\)|0\\:\\)|0;\\^\\)|O\\:\\-\\)|O\\:\\)|O;\\-\\)|O\\=\\)|0;\\-\\)|O\\:\\-3|O\\:3|B\\-\\)|B\\)|8\\)|8\\-\\)|B\\-D|8\\-D|\\-_\\-|\\-__\\-|\\-___\\-|\\>\\:\\\\|>\\:\\\\|\\>\\:\\/|>\\:\\/|\\:\\-\\/|\\:\\-\\.|\\:\\/|\\:\\\\|\\=\\/|\\=\\\\|\\:L|\\=L|\\:P|\\:\\-P|\\=P|\\:\\-p|\\:p|\\=p|\\:\\-Þ|\\:\\-Þ|\\:Þ|\\:Þ|\\:þ|\\:þ|\\:\\-þ|\\:\\-þ|\\:\\-b|\\:b|d\\:|\\:\\-O|\\:O|\\:\\-o|\\:o|O_O|\\>\\:O|>\\:O|\\:\\-X|\\:X|\\:\\-#|\\:#|\\=X|\\=x|\\:x|\\:\\-x|\\=#)'
shortcode_compiled = re.compile(ignored_regexp+"|("+shortcode_regexp+")",
re.IGNORECASE)
unicode_compiled = re.compile(ignored_regexp+"|("+unicode_regexp+")",
re.UNICODE)
ascii_compiled = re.compile(ignored_regexp+"|("+ascii_regexp+")",
re.IGNORECASE)
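# Quick sanity check (a sketch, not from the original file): each compiled
# pattern pairs the ignored-markup alternative with a capturing emoji group,
# so real emoji land in the groups, e.g.
#   shortcode_compiled.findall('I :heart: this')  ->  [(':heart:', 'heart')]
#   ascii_compiled.search('works :-)').group(1)   ->  ':-)'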
EMOTICONS_FILE = 'emoticons.txt' #put your emoticons file here
#urls - nltk version
URLS = r""" # Capture 1: entire matched URL
(?:
https?: # URL protocol and colon
(?:
/{1,3} # 1-3 slashes
| # or
[a-z0-9%] # Single letter or digit or '%'
# (Trying not to match e.g. "URI::Escape")
)
| # or
# looks like domain name followed by a slash:
[a-z0-9.\-]+[.]
(?:[a-z]{2,13})
/
)
(?: # One or more:
[^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
| # or
\([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
|
\([^\s]+?\) # balanced parens, non-recursive: (...)
)+
(?: # End with:
\([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
|
\([^\s]+?\) # balanced parens, non-recursive: (...)
| # or
[^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
)
| # OR, the following to match naked domains:
(?:
(?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
[a-z0-9]+
(?:[.\-][a-z0-9]+)*
[.]
(?:[a-z]{2,13})
\b
/?
(?!@) # not succeeded by a @,
# avoid matching "foo.na" in "[email protected]"
)
"""
#my emoticons, borrowed & expanded from https://github.com/g-c-k/idiml/blob/master/predict/src/main/resources/data/emoticons.txt
EMOTICONS = []
with open(EMOTICONS_FILE, 'r') as f:
for line in f:
item = line.rstrip('\n')
item = re.escape(item)
EMOTICONS.append(item)
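# Expected emoticons.txt layout (an assumption; the file is not shown here):
# one raw emoticon per line, e.g.
#   :-)
#   ^_^
# re.escape above makes each entry safe to join into the alternations below.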
# Twitter specific:
HASHTAG = r"""(?:\#\w+)"""
TWITTER_USER = r"""(?:@\w+)"""
REDDIT_USER = r"(?:\/?u\/\w+)"
#separately compiled regexps
TWITTER_USER_RE = re.compile(TWITTER_USER, re.UNICODE)
REDDIT_USER_RE = re.compile(REDDIT_USER, flags=re.UNICODE)
HASHTAG_RE = re.compile(HASHTAG, re.UNICODE)
HASH_RE = re.compile(r'#(?=\w+)', re.UNICODE)
#my url version, nltk's doesn't work for separate regexp
URL_RE = re.compile(r"""((https?:\/\/|www)|\w+\.(\w{2,3}))([\w\!#$&-;=\?\-\[\]~]|%[0-9a-fA-F]{2})+""", re.UNICODE) #note {2,3}, not the invalid {2-3}
EMOTICON_RE = re.compile(r"""(%s)""" % "|".join(EMOTICONS), re.UNICODE)
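# Sketch usage (not in the original): the standalone patterns can be probed
# directly, e.g.
#   URL_RE.search('see www.example.com/page').group(0)  ->  'www.example.com/page'
#   EMOTICON_RE.findall('fine :-) really ;)')
# EMOTICON_RE results depend on the emoticons.txt contents loaded above.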
# more regular expressions for word compilation, borrowed from nltk
#phone numbers
PHONE = r"""(?:(?:\+?[01][\-\s.]*)?(?:[\(]?\d{3}[\-\s.\)]*)?\d{3}[\-\s.]*\d{4})"""
# email addresses
EMAILS = r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]"""
# HTML tags:
HTML_TAGS = r"""<[^>\s]+>"""
# ASCII Arrows
ASCII_ARROWS = r"""[\-]+>|<[\-]+"""
#long non-word, non-numeric repeats
#HANGS = r"""([^a-zA-Z0-9])\1{3,}"""
# Remaining word types:
WORDS = r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
|
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
|
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
"""
#EMOJIS was never defined in the original file; building it from the emoji
#patterns above is an assumption about the intent. '#' must be escaped in the
#ascii set because WORD_RE below is compiled with re.VERBOSE, where a bare '#'
#starts a comment.
EMOJIS = [unicode_regexp, shortcode_regexp, ascii_regexp.replace('#', r'\#')]
TWITTER_REGEXPS = [URLS, PHONE] + EMOJIS + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, TWITTER_USER, HASHTAG, EMAILS, WORDS]
REDDIT_REGEXPS = [URLS, PHONE] + EMOJIS + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, REDDIT_USER, HASHTAG, EMAILS, WORDS]
WORD_RE = re.compile(r"""(%s)""" % "|".join(TWITTER_REGEXPS), re.VERBOSE | re.I | re.UNICODE) # add REDDIT_REGEXPS as necessary
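# Sketch (not in the original): WORD_RE drives the first tokenization pass, e.g.
#   [m.group() for m in WORD_RE.finditer('@user check https://t.co/x :-)')]
# should yield ['@user', 'check', 'https://t.co/x', ':-)'] given a matching
# emoticons.txt.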
def tokenize2(text):
    #the original referenced `words` before assigning it; materializing the
    #tokens from WORD_RE first is an assumed fix (finditer keeps whole matches
    #even though the emoji alternations contain capturing groups)
    words = [m.group() for m in WORD_RE.finditer(text)]
    #keep emoticons and unicode emoji as-is, lowercase everything else
    words = list(map((lambda x : x if EMOTICON_RE.search(x) or unicode_compiled.findall(x) else x.lower()), words))
    return words
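# Example (a sketch): tokenize2("LOL that's great :-D")
# -> ['lol', "that's", 'great', ':-D'], assuming ':-D' appears in emoticons.txt.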
#segmenters for Japanese and Chinese; chinese_tokenizer's Tokenizer is imported at the top
segmenter = tinysegmenter.TinySegmenter()
jie_ba_tokenizer = Tokenizer().jie_ba_tokenizer
def new_tokenize(row):
    line = row["text"]
    tokens = tokenize2(line)
    return tokens
def flatten(l):
    #collections.Iterable was removed in Python 3.10; use collections.abc
    for el in l:
        if isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el
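# e.g. list(flatten(['a', ['b', 'c'], 'd'])) -> ['a', 'b', 'c', 'd']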
def tknz1(row):
    line = row["text"]
    line = html.unescape(line)
    line = line.replace('\n','')
    line = line.replace('\t','')
    if row["lang"]=="ja":
        tokens = tokenize2(line)
        #re-segment tokens containing Japanese script with TinySegmenter
        for i,x in enumerate(tokens):
            if regex.findall(r"\p{Hiragana}|\p{Katakana}|\p{Han}",x):
                tokens[i] = segmenter.tokenize(x)
        return [x for x in flatten(tokens)]
        #tokens = segmenter.tokenize(line)
        #return tokens
    elif row["lang"]=="zh":
        tokens = tokenize2(line)
        #re-segment tokens containing Han characters; note that jie_ba_tokenizer
        #is defined above but unused, and TinySegmenter is applied to Chinese too
        for i,x in enumerate(tokens):
            if regex.findall(r"\p{Han}",x):
                tokens[i] = segmenter.tokenize(x)
        return [x for x in flatten(tokens)]
    else:
        #tokenize2 lowercases non-emoticon tokens itself
        tokens = tokenize2(line)
        return tokens
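# Sketch usage (not part of the original module): rows are dicts shaped like
#   tknz1({"text": "Check https://example.com :-) #nlp", "lang": "en"})
# which should return ['check', 'https://example.com', ':-)', '#nlp'] given a
# matching emoticons.txt; "ja"/"zh" rows additionally pass through TinySegmenter.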