forked from lspitzley/edgar-10k-sa
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_text.py
95 lines (70 loc) · 2.51 KB
/
preprocess_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import codecs
from glob import glob
import os
import re
from pathos.pools import ProcessPool
from pathos.helpers import cpu_count
from tqdm import tqdm
import unicodedata
# Directory that is globbed for the '*.txt' input files to preprocess.
src_dir = './txt'
# Directory the preprocessed files are written to, under the same base
# names as the inputs; mlp() and main() create it when it does not exist.
tar_dir = './ptxt'
def process_text(text):
    """Normalize raw text so that ITEM headers sit on clean, single lines.

    The steps run in a fixed order, each relying on the previous ones:
    Unicode NFKD normalization, unification of every line-break flavour to
    '\n', upper-casing, whitespace/blank-line cleanup, repair of broken
    "ITEM" headers and stray punctuation lines, and finally doubling of
    every line break so each line becomes its own paragraph.

    Args:
        text: The unicode text to clean up.

    Returns:
        The upper-cased, normalized text with lines separated by '\n\n'.
    """
    # Fold compatibility characters (e.g. non-breaking spaces, ligatures)
    # into their plain equivalents.
    text = unicodedata.normalize("NFKD", text)
    # splitlines() recognises all unicode line boundaries; rejoin with '\n'.
    text = '\n'.join(text.splitlines())
    text = text.upper()

    # Strip spaces hugging a line break and collapse blank lines (the
    # original author attributes these artifacts to BeautifulSoup parsing).
    text = re.sub(r'[ ]+\n', '\n', text)
    text = re.sub(r'\n[ ]+', '\n', text)
    text = re.sub(r'\n+', '\n', text)

    # Reformat item headers so the MDA section can be located later.
    # A period alone on a line is attached to the end of the previous line.
    text = text.replace('\n.\n', '.\n')
    # Re-join a header split after its first letter ("I" / "TEM").
    text = text.replace('\nI\nTEM', '\nITEM')
    # Pull the item number up onto the same line as "ITEM".
    text = text.replace('\nITEM\n', '\nITEM ')
    # Collapse any run of spaces after a line-leading "ITEM" to one space.
    # (The previous str.replace here mapped a string to itself: a no-op.)
    text = re.sub(r'\nITEM[ ]+', '\nITEM ', text)
    # A colon ending a line is turned into a period.
    text = text.replace(':\n', '.\n')

    # Keep math symbols attached to their numbers for clearer looks.
    text = text.replace('$\n', '$')
    text = text.replace('\n%', '%')

    # Separate every line with an additional break line.
    return text.replace('\n', '\n\n')
def preprocess_job(txt_path):
    """Preprocess one text file into tar_dir unless its output already exists.

    Args:
        txt_path: Path of the UTF-8 text file to read.

    Returns:
        None. The cleaned text is written to tar_dir under the same base
        name; when that file is already present the job only reports the
        skip and does nothing else.
    """
    out_path = os.path.join(tar_dir, os.path.basename(txt_path))

    # Guard clause: never redo a file that already has an output.
    if os.path.exists(out_path):
        print("{} already exists, skipping".format(out_path))
        return

    print("Preprocessing {}".format(txt_path))

    with codecs.open(txt_path, 'r', encoding='utf-8') as reader:
        raw = reader.read()

    cleaned = process_text(raw)

    with codecs.open(out_path, 'w', encoding='utf-8') as writer:
        writer.write(cleaned)
def mlp():
    """Preprocess every '*.txt' file in src_dir using a pool of processes.

    Creates tar_dir when it is missing, then maps preprocess_job over the
    matching files with at most 8 worker processes.
    """
    if not os.path.exists(tar_dir):
        os.makedirs(tar_dir)

    txt_paths = glob(os.path.join(src_dir, '*.txt'))

    # Use one worker per CPU, capped at 8.
    worker_count = min(cpu_count(), 8)
    workers = ProcessPool(worker_count)
    workers.map(preprocess_job, txt_paths)
def main():
    """Preprocess every '*.txt' file in src_dir one after another.

    Creates tar_dir when it is missing, then reads each matching file,
    runs it through process_text and writes the result to tar_dir under
    the same base name, showing a tqdm progress bar over the files.
    """
    if not os.path.exists(tar_dir):
        os.makedirs(tar_dir)

    pattern = os.path.join(src_dir, '*.txt')
    for src_path in tqdm(glob(pattern)):
        # Load the original file.
        with codecs.open(src_path, 'r', encoding='utf-8') as reader:
            raw = reader.read()

        cleaned = process_text(raw)

        # Store the result next to its siblings in tar_dir.
        dst_path = os.path.join(tar_dir, os.path.basename(src_path))
        with codecs.open(dst_path, 'w', encoding='utf-8') as writer:
            writer.write(cleaned)
if __name__ == "__main__":
    # Script entry point: run the process-pool variant. Swap the two calls
    # below to use the sequential main() loop with its tqdm progress bar.
    #main()
    mlp()