-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessor.py
93 lines (85 loc) · 3.57 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import html
import re
import unicodedata
import lxml
chars = {
'\xc2\x82': ',', # High code comma
'\xc2\x84': ',,', # High code double comma
'\xc2\x85': '...', # Tripple dot
'\xc2\x88': '^', # High carat
'\xc2\x91': '\x27', # Forward single quote
'\xc2\x92': '\x27', # Reverse single quote
'\xc2\x93': '\x22', # Forward double quote
'\xc2\x94': '\x22', # Reverse double quote
'\xc2\x95': ' ',
'\xc2\x96': '-', # High hyphen
'\xc2\x97': '--', # Double hyphen
'\xc2\x99': ' ',
'\xc2\xa0': ' ',
'\xc2\xa6': '|', # Split vertical bar
'\xc2\xab': '<<', # Double less than
'\xc2\xbb': '>>', # Double greater than
'\xc2\xbc': '1/4', # one quarter
'\xc2\xbd': '1/2', # one half
'\xc2\xbe': '3/4', # three quarters
'\xca\xbf': '\x27', # c-single quote
'\xcc\xa8': '', # modifier - under curve
'\xcc\xb1': '' # modifier - under line
}
def replace_chars(match):
char = match.group(0)
return chars[char]
def clean_anomalies(text_str):
"""
Clean various anomalies resulting from formatting and unicode
Args:
text_str (string):
string to clean
Returns:
text_str (string):
cleaned string
"""
text_str = re.sub('(' + '|'.join(chars.keys()) + ')', replace_chars, text_str)
text_str = re.sub('(-[\n\r\t\v])', '-',
text_str) # replace dashes followed by a newline or tab with a dash
text_str = re.sub('(&[\S]{2,6};)', '', text_str) # remove all remaining html chars starting with &
text_str = re.sub('(#[a-f0-9]{6})', '', text_str) # remove all remaining html chars starting with #
text_str = re.sub('(\s-\s)', ' ', text_str)
text_str = re.sub('and/or', 'and or', text_str, flags=re.IGNORECASE)
text_str = re.sub('(([-.=]){2,}\s*)', ' ', text_str)
text_str = re.sub('_', '', text_str)
text_str = re.sub('(\s){3,}', ' ', text_str)
text_str = re.sub('(\n\s*?){3,}', '\n\n', text_str)
text_str = re.sub('(?<![\n])(\n)(?![\n\s])', ' ', text_str)
return text_str
def preprocess_html(text_str, plain_text=True, remove_newlines=True, remove_tabs=True, remove_tables=False):
"""
Pre-process html string by cleaning anomalies and optionally removing newlines, tabs, and html tables
Args:
text_str (string):
html string to preprocess
plain_text (bool, optional, default True):
if True, remove all html markup and return only the text
remove_newlines (bool, optional, default True):
if True, remove all newlines
remove_tabs (bool, optional, default True):
if True, remove all tabs
remove_tables (bool, optional, default False):
if True, remove all html tagged tables
Returns:
text_str (string):
pre-processed html string
"""
text_str = html.unescape(unicodedata.normalize('NFKD', text_str))
text_str = re.sub('[\r]', '\n', text_str)
if remove_tables:
text_str = re.sub('(<table>.+?</table>)', '\n', text_str, flags=re.I)
if plain_text:
text_str = clean_anomalies(lxml.html.fromstring(text_str).text_content())
else:
text_str = text_str = clean_anomalies(text_str)
if remove_newlines:
text_str = re.sub('[\n]+', ' ', text_str) # remove newlines
if remove_tabs:
text_str = re.sub('[\t]+', ' ', text_str) # remove tabs
return text_str