# preprocess.py
import argparse
import os
from multiprocessing import cpu_count

from tqdm import tqdm

from datasets import preprocessor
from hparams import hparams


def clean_metadata(metadata):
    """Keep only entries whose speaker has a precomputed embedding on disk."""
    speaker_ids = os.listdir(hparams.speaker_embedding_dir)
    # Strip the file extension (e.g. '.npy') to recover the raw speaker ids.
    speaker_ids = list(set([x[:-4] for x in speaker_ids]))
    metadata_ = []
    for meta in metadata:
        if meta is None:
            continue
        # The speaker id is parsed out of the filename stored in meta[1].
        speaker_id = meta[1].split('_')[0].split('-')[1]
        if speaker_id in speaker_ids:
            metadata_.append(meta)
    return metadata_
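
# Note on the filename convention clean_metadata() assumes (illustrative, not
# confirmed by this file alone): if meta[1] holds a basename like 'audio-A11_001.npy',
# then meta[1].split('_')[0].split('-')[1] yields the speaker id 'A11', which is
# matched against the embedding files listed in hparams.speaker_embedding_dir:
#
#     >>> 'audio-A11_001.npy'.split('_')[0].split('-')[1]
#     'A11'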


def preprocess(args, input_folders, out_dir, hparams):
    mel_dir = os.path.join(out_dir, 'mels')
    wav_dir = os.path.join(out_dir, 'audio')
    linear_dir = os.path.join(out_dir, 'linear')
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(linear_dir, exist_ok=True)
    if args.dataset.startswith('thchs30'):
        # THCHS-30 has its own builder; all other supported datasets share build_from_path().
        metadata = preprocessor.build_from_path_thchs30(hparams, input_folders, mel_dir, linear_dir, wav_dir,
                                                        args.n_jobs, tqdm=tqdm)
        # metadata = clean_metadata(metadata)
    else:
        metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs,
                                                tqdm=tqdm)
    write_metadata(metadata, out_dir)
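
# Output layout produced under <base_dir>/<output> (default 'training_data'):
#
#     mels/      per-utterance mel-spectrogram features
#     audio/     per-utterance preprocessed audio
#     linear/    per-utterance linear-spectrogram features
#     train.txt  pipe-delimited metadata index (written by write_metadata below)
#
# The actual file formats inside mels/, audio/ and linear/ are determined by
# datasets/preprocessor.py; this script only creates the directories.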


def write_metadata(metadata, out_dir):
    # One pipe-delimited line per utterance.
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sr = hparams.sample_rate
    hours = timesteps / sr / 3600
    print('Wrote {} utterances, {} mel frames, {} audio timesteps ({:.2f} hours)'.format(
        len(metadata), mel_frames, timesteps, hours))
    print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
    print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
    print('Max audio timesteps length: {}'.format(max(int(m[3]) for m in metadata)))
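
# Each train.txt line is the metadata tuple for one utterance joined with '|'.
# The full column order comes from datasets/preprocessor.py; from the indices used
# above we only know that field 3 is the audio timestep count, field 4 the mel frame
# count, and field 5 the text. A purely illustrative line might look like:
#
#     audio-A11_001.npy|mel-A11_001.npy|linear-A11_001.npy|52480|205|ni3 hao3 ...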


def norm_data(args):
    merge_books = (args.merge_books == 'True')

    print('Selecting data folders..')
    supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS', 'biaobei', 'thchs30',
                          'Female-Golden', 'decoder_pretrain']
    if args.dataset not in supported_datasets:
        raise ValueError('Dataset value entered ({}) does not belong to the supported datasets: {}'.format(
            args.dataset, supported_datasets))

    # Single-folder datasets: map the dataset name to its directory under base_dir.
    if args.dataset.startswith('LJSpeech'):
        return [os.path.join(args.base_dir, args.dataset)]
    if args.dataset.startswith('biaobei'):
        return [os.path.join(args.base_dir, 'biaobei_female')]
    if args.dataset.startswith('thchs30'):
        return [os.path.join(args.base_dir, 'data_thchs30')]
    if args.dataset.startswith('Female-Golden'):
        return [os.path.join(args.base_dir, 'female_golden_v2')]
    if args.dataset.startswith('decoder_pretrain'):
        return [os.path.join(args.base_dir, 'decoder_pretrain_wavs')]

    # M-AILABS is organized by language / voice / reader / book.
    if args.dataset == 'M-AILABS':
        supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU',
                               'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA']
        if args.language not in supported_languages:
            raise ValueError('Please enter a supported language for the M-AILABS dataset!\n{}'.format(
                supported_languages))

        supported_voices = ['female', 'male', 'mix']
        if args.voice not in supported_voices:
            raise ValueError('Please enter a supported voice option for the M-AILABS dataset!\n{}'.format(
                supported_voices))

        path = os.path.join(args.base_dir, args.language, 'by_book', args.voice)
        supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path, e))]
        if args.reader not in supported_readers:
            raise ValueError('Please enter a valid reader for your language and voice settings!\n{}'.format(
                supported_readers))

        path = os.path.join(path, args.reader)
        supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path, e))]
        if merge_books:
            # Use every book available for this reader.
            return [os.path.join(path, book) for book in supported_books]

        if args.book not in supported_books:
            raise ValueError('Please enter a valid book for your reader settings!\n{}'.format(
                supported_books))
        return [os.path.join(path, args.book)]
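
# Expected on-disk layouts implied by the path joins above (base_dir defaults to ''):
#
#     LJSpeech:         <base_dir>/LJSpeech-1.1/
#     biaobei:          <base_dir>/biaobei_female/
#     thchs30:          <base_dir>/data_thchs30/
#     Female-Golden:    <base_dir>/female_golden_v2/
#     decoder_pretrain: <base_dir>/decoder_pretrain_wavs/
#     M-AILABS:         <base_dir>/<language>/by_book/<voice>/<reader>/<book>/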


def run_preprocess(args, hparams):
    input_folders = norm_data(args)
    output_folder = os.path.join(args.base_dir, args.output)
    preprocess(args, input_folders, output_folder, hparams)


def main():
    print('initializing preprocessing..')
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='')
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    parser.add_argument('--dataset', default='thchs30')
    parser.add_argument('--language', default='en_US')
    parser.add_argument('--voice', default='female')
    parser.add_argument('--reader', default='mary_ann')
    parser.add_argument('--merge_books', default='False')
    parser.add_argument('--book', default='northandsouth')
    parser.add_argument('--output', default='training_data')
    parser.add_argument('--n_jobs', type=int, default=cpu_count())
    args = parser.parse_args()

    modified_hp = hparams.parse(args.hparams)

    # merge_books is kept as a string flag; only 'True' / 'False' are accepted.
    assert args.merge_books in ('False', 'True')

    run_preprocess(args, modified_hp)


if __name__ == '__main__':
    main()
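

# Example invocations (paths are placeholders; point --base_dir at the directory that
# contains the raw corpora):
#
#     # THCHS-30 (the default dataset); expects <base_dir>/data_thchs30/
#     python preprocess.py --base_dir=/data/corpora --dataset=thchs30 --output=training_data
#
#     # M-AILABS, a single book from one reader
#     python preprocess.py --base_dir=/data/corpora --dataset=M-AILABS --language=en_US \
#         --voice=female --reader=mary_ann --book=northandsouth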