utils.py
import html
import os
import re
from string import punctuation, whitespace

from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import reuters, stopwords
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words

# Union of three stop-word lists (gensim, stop-words, NLTK) for aggressive filtering.
# Requires the NLTK 'reuters', 'stopwords', and 'wordnet' corpora, e.g. via
# nltk.download('reuters'); nltk.download('stopwords'); nltk.download('wordnet').
LOT_OF_STOPWORDS = frozenset(list(STOPWORDS) + get_stop_words('en') + stopwords.words('english'))

# Reuters fileids are prefixed 'training/' or 'test/', giving the standard split.
TRAINING_SET = [f for f in reuters.fileids() if f.startswith('train')]
TESTING_SET = [f for f in reuters.fileids() if f.startswith('test')]

INPUTS_DIR = os.environ.get('INPUTS_DIR', 'inputs')
OUTPUTS_DIR = os.environ.get('OUTPUTS_DIR', 'outputs')

# Matches runs of whitespace and/or punctuation; used to tokenize documents.
WHITE_PUNC_REGEX = re.compile(r"[%s]+" % re.escape(whitespace + punctuation), re.UNICODE)

lemma = WordNetLemmatizer()
def preprocess_document(document_text):
    """
    Preprocess a raw document into a list of clean tokens:
    1.) Lowercase everything.
    2.) Unescape HTML entities.
    3.) Split on whitespace and punctuation, removing both.
    4.) Lemmatize each token.
    5.) Remove stop words.
    6.) Remove unit-length tokens.
    7.) Remove numbers.
    """
    def is_not_number(token):
        # Keep tokens that are not plain or negative integers. (The '-' branch is
        # retained from the original, though the splitter already strips hyphens.)
        return not (token.isdigit() or (token[0] == '-' and token[1:].isdigit()))

    tokens = WHITE_PUNC_REGEX.split(html.unescape(document_text.lower()))
    lemmatized = (lemma.lemmatize(token) for token in tokens)
    without_stopwords = (token for token in lemmatized if token not in LOT_OF_STOPWORDS)
    long_enough = (token for token in without_stopwords if len(token) > 1)
    return [token for token in long_enough if is_not_number(token)]
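

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: shows what the
    # pipeline keeps and drops. The sample string and the Reuters lookup are
    # illustrative and assume the NLTK corpora noted above are downloaded.
    sample = "The U.S. dollar &amp; the yen rose 3 pct in 1987, traders said."
    print(preprocess_document(sample))
    # Roughly: ['dollar', 'yen', 'rose', 'pct', 'trader', ...]; '3' and '1987'
    # are dropped as numbers, 'u'/'s' as unit-length tokens, 'the'/'in' as stop words.

    # First ten tokens of the first Reuters training document.
    print(preprocess_document(reuters.raw(TRAINING_SET[0]))[:10])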