-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_utils.py
40 lines (34 loc) · 1.08 KB
/
text_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import string
# Aliased because this module defines its own `word_tokenize` wrapper below.
from nltk import word_tokenize as lib_tokenizer
import nltk
# Fetch the Punkt tokenizer models required by lib_tokenizer.
# NOTE(review): this downloads at import time on every run — a module-level
# side effect; consider guarding it at the application entry point instead.
nltk.download('punkt')
# Memoization cache shared by word_tokenize(): raw whitespace-split token ->
# normalized token string. Grows without bound for the process lifetime.
dict_map = dict({})
def word_tokenize(text):
    """Tokenize *text* with NLTK, memoizing per whitespace-split word.

    Each whitespace-delimited word is run through the NLTK tokenizer once,
    its sub-tokens joined with single spaces (with LaTeX-style `` and ''
    quotes normalized to "), and the result cached in the module-level
    ``dict_map`` so repeated words are never re-tokenized.

    Returns a list of normalized token strings, one per input word.
    """
    normalized = []
    for token in text.split():
        cached = dict_map.get(token)
        if cached is None:
            cached = ' '.join(lib_tokenizer(token))
            cached = cached.replace('``', '"').replace("''", '"')
            dict_map[token] = cached
        normalized.append(cached)
    return normalized
def strip_context(text):
    """Collapse every whitespace run in *text* to a single space and trim both ends."""
    flattened = text.replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', flattened)
    return collapsed.strip()
def post_process(x):
    """Tokenize *x*, normalize its whitespace, and strip all ASCII punctuation.

    Behaviorally equivalent to the original, with two cleanups:
    - the first step duplicated ``nltk_tokenize`` verbatim, so it now calls
      that sibling helper instead of repeating the pipeline inline;
    - punctuation removal uses ``str.translate`` (one C-level pass) rather
      than rebuilding the string character by character.

    Returns the processed string.
    """
    x = nltk_tokenize(x)      # == " ".join(word_tokenize(strip_context(x))).strip()
    x = x.replace("\n", " ")  # defensive: tokens should already be newline-free
    # Delete every character listed in string.punctuation.
    return x.translate(str.maketrans("", "", string.punctuation))
def preprocess(x, max_length=-1, remove_puncts=False):
    """Normalize *x* via NLTK tokenization, with optional trimming.

    Parameters
    ----------
    x : str
        Raw input text.
    max_length : int
        When positive, keep only the first ``max_length`` whitespace tokens.
    remove_puncts : bool
        When True, drop every character found in ``string.punctuation``.

    Returns the normalized string.
    """
    text = nltk_tokenize(x)
    text = text.replace("\n", " ")
    if remove_puncts:
        text = "".join(ch for ch in text if ch not in string.punctuation)
    if max_length > 0:
        text = " ".join(text.split()[:max_length])
    return text
def nltk_tokenize(x):
    """Whitespace-normalize *x*, tokenize it, and return the tokens joined by single spaces."""
    tokens = word_tokenize(strip_context(x))
    return " ".join(tokens).strip()