-
Notifications
You must be signed in to change notification settings - Fork 32
/
preprocess.py
48 lines (36 loc) · 1.12 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re
import pandas as pd
import numpy as np
import constants
# Token -> replacement lookup taken from the project's `constants` module.
# preprocess_text swaps every token found in this lookup for its mapped value.
# NOTE(review): presumably apostrophe contractions, judging by the name --
# confirm against constants.appos.
appos = constants.appos
def construct_spacy_obj(df, nlp):
    """Add a ``spacyObj`` column holding the ``nlp`` output for each review.

    Args:
        df: DataFrame with a ``reviewText`` column. It is modified in place.
        nlp: Pipeline object providing ``disable_pipes`` and ``pipe``.

    Returns:
        The same ``df`` object, now carrying the ``spacyObj`` column.
    """
    # The parser and NER components are switched off while the reviews are
    # processed.
    with nlp.disable_pipes(['parser', 'ner']):
        reviews = df['reviewText']
        parsed = list(nlp.pipe(reviews))
        # Re-use the reviews' index so each result lines up with its row.
        df['spacyObj'] = pd.Series(parsed, index=reviews.index)
    return df
def reduce_lengthening(text):
    """Shorten any run of three or more identical ASCII letters to two.

    For example ``"sooooo"`` becomes ``"soo"``. Runs of digits, punctuation
    or whitespace are left untouched.
    """
    # The back-reference \1 matches the captured letter case-sensitively.
    return re.sub(r"([a-zA-Z])\1{2,}", r"\1\1", text)
def preprocess_text(txt, nlp):
    """Normalise one review string, or return NaN when it is too short.

    Steps: lowercase the text, squeeze elongated letter runs with
    ``reduce_lengthening``, tokenise with ``nlp``, replace tokens found in the
    module-level ``appos`` lookup, then tidy punctuation and whitespace.

    Args:
        txt: Raw review text (``str``).
        nlp: Pipeline object that is callable on a string, yields tokens with
            a ``.text`` attribute, and provides ``disable_pipes``.

    Returns:
        The cleaned string, or ``np.nan`` when the text has fewer than three
        tokens (``preprocess`` then removes such rows via ``dropna``).
    """
    txt = reduce_lengthening(txt.lower())

    # Only the token texts are used below, so the listed components are
    # switched off for the call.
    # NOTE(review): spaCy appears to raise if a listed component is missing
    # from the pipeline -- confirm that 'sentencizer' is actually present.
    with nlp.disable_pipes('tagger', 'parser', 'ner', 'sentencizer'):
        tokens = [token.text for token in nlp(txt)]

    if len(tokens) < 3:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        return np.nan

    # Swap every token that has an entry in the lookup; keep the others.
    tokens = [appos[token] if token in appos else token for token in tokens]
    txt = ' '.join(tokens)

    # Replace every character outside the allowed set with a space.
    txt = re.sub(r"[^a-zA-Z0-9.,:;\-'?!/\n]", " ", txt)
    # Newlines become full stops.
    txt = re.sub(r"\n", ".", txt)
    # Drop the space that joining tokens put in front of punctuation.
    # NOTE(review): '!' is not in this set, so "word !" keeps its space --
    # confirm whether that is intentional.
    txt = re.sub(r" ([.,:?;])", r"\1", txt)
    # Collapse repeated dots or repeated spaces into a single character.
    txt = re.sub(r"([. ])\1{1,}", r"\1", txt)
    return txt
def preprocess(df, nlp):
    """Clean the ``reviewText`` column of ``df`` in place and return ``df``.

    Rows containing missing values are dropped before cleaning. Rows that
    contain missing values after cleaning, including reviews for which
    ``preprocess_text`` returned NaN, are dropped as well.

    Args:
        df: DataFrame with a ``reviewText`` column. It is modified in place.
        nlp: Pipeline object forwarded to ``preprocess_text``.

    Returns:
        The same ``df`` object after cleaning and row removal.
    """
    def _clean(review):
        return preprocess_text(review, nlp)

    # First pass: remove rows that already contain missing values.
    df.dropna(inplace=True)
    df['reviewText'] = df['reviewText'].apply(_clean)
    # Second pass: remove rows whose review was rejected during cleaning.
    df.dropna(inplace=True)
    return df