text_processing_utils.py
"""
Text processing functions to:
- remove URLs from text;
- remove username patterns;
- remove specific tokens/emojis from text;
- define english stopwords using NLTK and spaCy;
- lemmatise text;
- tokenise text;
"""
import logging
import re
from typing import Union

import pandas as pd
import spacy
from nltk.corpus import stopwords

# Load the English language model
nlp = spacy.load("en_core_web_sm")

logger = logging.getLogger(__name__)


def remove_urls(text: str) -> str:
    """
    Removes URLs from text.

    Args:
        text (str): a string, typically one or multiple sentences long
    Returns:
        str: text without URLs
    """
    # Define a regular expression pattern for matching URLs
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    # Replace URLs with a space
    cleaned_text = url_pattern.sub(" ", text)
    return cleaned_text
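

# Illustrative usage (a sketch, not part of the original module): each URL is
# replaced by a single space, so the surrounding words stay separated and any
# extra whitespace is left for downstream tokenisation to discard.
#   remove_urls("Read https://example.com now")  ->  "Read   now"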


def remove_text_after_patterns(text: str) -> str:
    """
    Removes patterns of the form "xxx wrote: »" and "xxx said:".

    Args:
        text (str): text to be cleaned
    Returns:
        str: cleaned text
    """
    # We use re.sub() to replace each pattern with a space
    result = re.sub(r"\w+ wrote: »", " ", text)
    result = re.sub(r"\w+ said:", " ", result)
    return result
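

# Illustrative usage (a sketch, not part of the original module): note that
# only the quoting pattern itself is replaced with a space; the quoted text
# that follows it is kept.
#   remove_text_after_patterns("alice wrote: » totally agree")
#   ->  "  totally agree"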


def remove_tokens_in_list(
    data: Union[pd.Series, list], list_of_tokens_to_remove: list
) -> Union[pd.Series, list]:
    """
    Removes specific tokens from a pandas Series or list.

    Args:
        data (Union[pd.Series, list]): original data
        list_of_tokens_to_remove (list): list of tokens to be removed
    Returns:
        Union[pd.Series, list]: data without tokens in list
    """
    return [token for token in data if token not in list_of_tokens_to_remove]
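

# Illustrative usage (a sketch, not part of the original module): the filter
# works on any iterable of tokens, and a plain list comes back even when a
# pandas Series is passed in.
#   remove_tokens_in_list(["the", "cat", "sat"], ["the"])  ->  ["cat", "sat"]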


def remove_emojis(text: str) -> str:
    """
    Removes emojis from text.

    Args:
        text (str): original text
    Returns:
        str: text with emojis removed
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", text)
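

# Illustrative usage (a sketch, not part of the original module): characters
# inside the Unicode ranges above are removed outright; everything else,
# including ordinary punctuation, is left untouched.
#   remove_emojis("great job 😀🚀")  ->  "great job "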


def preprocess_text(text: str) -> str:
    """
    Preprocesses text by:
    - replacing &amp;/& with "and"
    - replacing ">" and "=" with space
    - removing URLs
    - transforming text to lower case
    - removing username patterns
    - removing emojis

    Args:
        text (str): text to be processed
    Returns:
        str: Preprocessed text.
    """
    # Replace ampersands with the word "and"
    # (the HTML entity "&amp;" is handled before the bare "&")
    text = re.sub("&amp;", " and ", text)
    text = re.sub("&", " and ", text)
    # Replacing certain symbols with space as multiple of them appear together
    # and don't disappear by deleting punctuation
    text = re.sub(">", " ", text)
    text = re.sub("=", " ", text)
    text = remove_urls(text)
    text = text.lower()
    text = remove_text_after_patterns(text)
    text = remove_emojis(text)
    return text
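

# Illustrative usage (a sketch, not part of the original module): the steps
# run in order, so the quoting pattern is matched against the already
# lowercased text.
#   preprocess_text("Bob wrote: » Check https://example.com & enjoy 😀")
#   ->  roughly "check and enjoy", with leftover whitespace that later
#       tokenisation discards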


def english_stopwords_definition() -> list:
    """
    Defines English stopwords by putting together NLTK and spaCy stopwords.

    Returns:
        list: a list of English stopwords.
    """
    spacy_stopwords = list(spacy.lang.en.STOP_WORDS)
    nltk_stopwords = stopwords.words("english")
    return list(set(spacy_stopwords + nltk_stopwords))
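

# Illustrative usage (a sketch, not part of the original module): the union is
# deduplicated via set(), so its exact length depends on the installed NLTK and
# spaCy versions. It assumes the NLTK "stopwords" corpus has been downloaded,
# e.g. via nltk.download("stopwords").
#   stop_words = english_stopwords_definition()
#   "the" in stop_words  ->  True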


def lemmatise(text: str) -> list:
    """
    Applies lemmatisation sentence by sentence.
    It then removes punctuation and tokens tagged as space (e.g. "\n").

    Args:
        text (str): text to be lemmatised, might include multiple sentences
    Returns:
        list: list of lemmatised tokens without punctuation tokens
    """
    doc = nlp(text)
    # apply sentence by sentence lemmatisation
    processed_sentence_tokens = []
    for sentence in doc.sents:
        lemmatised_tokens = [
            token.lemma_.lower()
            for token in sentence
            if not (token.is_punct or token.pos_ == "SPACE")
        ]
        processed_sentence_tokens.extend(lemmatised_tokens)
    return processed_sentence_tokens
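

# Illustrative usage (a sketch, not part of the original module): with
# en_core_web_sm, inflected forms are typically reduced to their lemmas and
# the trailing punctuation token is dropped.
#   lemmatise("The cats were running.")  ->  ["the", "cat", "be", "run"]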


def tokenise(text: str) -> list:
    """
    Applies tokenisation and removes punctuation and tokens tagged as space (e.g. "\n").

    Args:
        text (str): text to be tokenised, might include multiple sentences
    Returns:
        list: list of tokens without punctuation tokens
    """
    doc = nlp(text)
    return [
        token.text.lower()
        for token in doc
        if not (token.is_punct or token.pos_ == "SPACE")
    ]
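

# Illustrative usage (a sketch, not part of the original module):
#   tokenise("Hello, world!\n")  ->  ["hello", "world"]
#
# Minimal end-to-end sketch of how these helpers could be combined (an
# assumption, not part of the original module): clean raw text, lemmatise it,
# then drop stopwords. Assumes en_core_web_sm and the NLTK "stopwords" corpus
# are available in the environment.
if __name__ == "__main__":
    sample = "Anna wrote: » This recipe is great, see https://example.com & enjoy 😀"
    cleaned = preprocess_text(sample)
    lemmas = lemmatise(cleaned)
    content_tokens = remove_tokens_in_list(lemmas, english_stopwords_definition())
    print(content_tokens)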