-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocessing.py
106 lines (80 loc) · 3.05 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import re
import random
from tqdm import tqdm
import glob
import pandas as pd
import pdb
import argparse
def load_label(filepath):
char2id = dict()
id2char = dict()
ch_labels = pd.read_csv(filepath, encoding="utf-8")
id_list = ch_labels["id"]
char_list = ch_labels["char"]
freq_list = ch_labels["freq"]
for (id_, char, freq) in zip(id_list, char_list, freq_list):
char2id[char] = id_
id2char[id_] = char
return char2id, id2char
def sentence_to_target(sentence, char2id):
target = str()
for ch in sentence:
try:
target += (str(char2id[ch]) + ' ')
except KeyError:
continue
return target[:-1]
def target_to_sentence(target, id2char):
sentence = ""
targets = target.split()
for n in targets:
sentence += id2char[int(n)]
return sentence
def generate_character_script(videos_paths, audios_paths, transcripts,mode):
print('create_script started..')
if mode == 'Train':
char2id, id2char = load_label("./dataset/labels.csv")
with open(os.path.join('./dataset/'+mode+".txt"), "w") as f:
tmp = list(zip(videos_paths, audios_paths, transcripts))
random.shuffle(tmp)
videos_paths,audios_paths, transcripts = zip(*tmp)
for video_path, audio_path,transcript in zip(videos_paths,audios_paths, transcripts):
char_id_transcript = sentence_to_target(transcript, char2id)
f.write(f'{video_path}\t{audio_path}\t{transcript}\t{char_id_transcript}\n')
else:
with open(os.path.join('./dataset/'+mode+'.txt'),'w') as f:
tmp = list(zip(videos_paths,audios_paths,transcripts))
for a,b,c in tmp:
f.write(f'{a}\t{b}\t{c}\t{str(0)}\n')
def preprocess(dataset_path,mode):
print('preprocess started..')
transcripts=[]
dataset_path_video = data_folder + '/Video/*'
videos_paths = glob.glob(dataset_path_video)
videos_paths = sorted(videos_paths)
dataset_path_audio = data_folder + '/Audio/*.wav'
audios_paths = glob.glob(dataset_path_audio)
audios_paths = sorted(audios_paths)
if mode == 'Train':
for file_ in tqdm(audios_paths):
txt_file_ = file_.replace('.wav','.txt')
txt_file_ = txt_file_.replace('Audio','Text')
with open(txt_file_, "r",encoding='utf-8') as f:
raw_sentence = f.read()
transcripts.append(raw_sentence)
else:
transcripts.extend(['-']*len(audios_paths))
return videos_paths, audios_paths, transcripts
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--data_folder', type=str, required=True)
parser.add_argument('--mode', type=str, required=True)
args = parser.parse_args()
return args
if __name__ == '__main__':
args = get_args()
data_folder = args.data_folder
mode = args.mode
videos_paths, audios_paths, transcripts = preprocess(data_folder,mode)
generate_character_script(videos_paths,audios_paths, transcripts, mode)