-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcript_preprocessing.py
44 lines (40 loc) · 1.77 KB
/
transcript_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
import unicodedata
# Alphabet of the cleaned transcripts: only these characters are kept by the
# preprocessing function in this module.
ALLOWED_CHARS = set(
    'abcdefghijklmnopqrstuvwxyz'  # basic Latin letters (lower case only)
    'äöü'                         # German umlauts
    '0123456789'                  # digits
    ' '                           # word separator
    ',;:.?!'                      # sentence punctuation
)

# Matches a run of one or more spaces and/or tabs (line breaks are not matched).
WHITESPACE_REGEX = re.compile(r'[ \t]+')
# Per-character substitutions applied after lower-casing: accented letters that
# are not kept verbatim are folded to their base letter, 'ß' is expanded, and
# word-joining punctuation becomes a space. 'ä', 'ö' and 'ü' are deliberately
# absent so that they pass through unchanged.
_CHAR_REPLACEMENTS = str.maketrans({
    'á': 'a', 'à': 'a', 'â': 'a',
    'ç': 'c',
    'é': 'e', 'è': 'e', 'ê': 'e',
    'í': 'i', 'ì': 'i', 'î': 'i',
    'ñ': 'n',
    'ó': 'o', 'ò': 'o', 'ô': 'o',
    'ú': 'u', 'ù': 'u', 'û': 'u',
    'ș': 's', 'ş': 's',
    'ß': 'ss',
    '-': ' ',
    # Not used consistently, better to replace with space as well
    '–': ' ',
    '/': ' ',
})


def preprocess_transcript_for_corpus(transcript: str) -> str:
    """Reduce a transcript to the restricted corpus alphabet.

    The text is lower-cased, a fixed set of accented letters is folded to
    ASCII, 'ß' becomes 'ss', and hyphens, en dashes and slashes become
    spaces. Every character that is not in ``ALLOWED_CHARS`` is then dropped,
    whitespace runs are collapsed to a single space and the result is
    stripped.

    Args:
        transcript: Raw transcript text.

    Returns:
        The cleaned transcript; may be empty if nothing survives filtering.
    """
    # Compose decomposed sequences (e.g. 'a' + U+0308) first. Otherwise the
    # combining mark is filtered out below and an NFD-encoded 'ä'/'ö'/'ü'
    # silently degrades to 'a'/'o'/'u'.
    transcript = unicodedata.normalize('NFC', transcript.lower())

    # One pass over the string instead of a chain of str.replace() calls.
    transcript = transcript.translate(_CHAR_REPLACEMENTS)

    # Turn every whitespace run (tabs, line breaks, non-breaking spaces, ...)
    # into a single space *before* filtering: only ' ' is an allowed
    # character, so any other whitespace would be deleted and glue the
    # neighbouring words together.
    transcript = ' '.join(transcript.split())

    transcript = ''.join([char for char in transcript if char in ALLOWED_CHARS])

    # Dropped characters can leave doubled or leading/trailing spaces behind;
    # split/join collapses and strips them in one step.
    return ' '.join(transcript.split())