This repository has been archived by the owner on Dec 7, 2023. It is now read-only.
forked from pytorch/audio
-
Notifications
You must be signed in to change notification settings - Fork 1
/
librilightlimited_test.py
107 lines (86 loc) · 4.35 KB
/
librilightlimited_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
from torchaudio.datasets import librilight_limited
from torchaudio_unittest.common_utils import get_whitenoise, save_wav, TempDirMixin, TorchaudioTestCase
# Spelled-out digits, indexed by their value. `_save_sample` looks up the
# speaker, chapter and utterance ids here to build a transcript for each dummy
# audio file, so every id passed to it must be a valid index into this list.
_NUMBERS = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"]
def _save_sample(file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed):
    """Write one white-noise utterance into ``file_path`` and return its expected sample.

    The file is named ``{speaker_id}-{chapter_id}-{utterance_id:04d}.flac`` and the
    transcript is the three ids looked up in ``_NUMBERS`` and joined by spaces.

    Args:
        file_path: directory the audio file is written into.
        speaker_id: speaker id; also used as an index into ``_NUMBERS``.
        chapter_id: chapter id; also used as an index into ``_NUMBERS``.
        utterance_id: utterance id; also used as an index into ``_NUMBERS``.
        sample_rate: sample rate passed to ``get_whitenoise`` and ``save_wav``.
        seed: seed passed to ``get_whitenoise``.

    Returns:
        tuple: ``(data, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``.
    """
    ids = (speaker_id, chapter_id, utterance_id)
    # NOTE(review): the file carries a ".flac" name but is written through
    # save_wav -- confirm the reader does not depend on the extension.
    target = os.path.join(file_path, f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac")
    waveform = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed)
    spoken = " ".join(_NUMBERS[value] for value in ids)
    save_wav(target, waveform, sample_rate)
    return (waveform, sample_rate, spoken, *ids)
def get_mock_dataset(dataset_dir: str):
    """Populate a sub directory with mocked utterances and transcript files.

    For each of the audio types ``clean`` and ``other``, 5 speakers with 3
    chapters of 3 utterances each are written via ``_save_sample``, and each
    chapter directory also receives a ``{speaker_id}-{chapter_id}.trans.txt``
    file with one ``<utterance name> <transcript>`` line per utterance.

    Args:
        dataset_dir (str): the path of the sub directory.
            The structure is: audio_type/speaker_id/chapter_id/filename.flac

    Returns:
        list: the sample tuples returned by ``_save_sample``, in creation order.
    """
    sample_rate = 16000  # 16kHz
    samples = []
    for audio_type in ("clean", "other"):
        for speaker_id in range(5):
            for chapter_id in range(3):
                chapter_dir = os.path.join(dataset_dir, audio_type, str(speaker_id), str(chapter_id))
                os.makedirs(chapter_dir, exist_ok=True)
                transcript_lines = []
                for utterance_id in range(3):
                    # The number of samples created so far is the seed, so each
                    # sample within one call gets its own seed, starting at 0.
                    sample = _save_sample(
                        chapter_dir, speaker_id, chapter_id, utterance_id, sample_rate, len(samples)
                    )
                    _, _, transcript, spk, chap, utt = sample
                    transcript_lines.append(f"{spk}-{chap}-{utt:04d} {transcript}")
                    samples.append(sample)
                trans_path = os.path.join(chapter_dir, f"{speaker_id}-{chapter_id}.trans.txt")
                with open(trans_path, "w") as f:
                    f.write("\n".join(transcript_lines))
    return samples
def get_mock_datasets(root_dir):
    """Create the mocked ``librispeech_finetuning`` tree under ``root_dir``.

    Sub directories ``1h/0`` .. ``1h/5`` and ``9h`` are each filled by
    ``get_mock_dataset``.

    Args:
        root_dir: directory to the mocked dataset

    Returns:
        tuple: three lists of samples ``(10min, 1h, 10h)``, where ``10min`` holds
        the samples of ``1h/0``, ``1h`` holds those of ``1h/0`` .. ``1h/5`` in
        order, and ``10h`` holds the ``1h`` samples followed by those of ``9h``.
    """
    base_dir = os.path.join(root_dir, "librispeech_finetuning")

    def _populate(*parts):
        # Create the sub directory and fill it with mocked samples.
        target = os.path.join(base_dir, *parts)
        os.makedirs(target, exist_ok=True)
        return get_mock_dataset(target)

    mocked_data_10min = _populate("1h", "0")

    # Copy so that growing the 1h list leaves the 10min list untouched.
    mocked_data_1h = list(mocked_data_10min)
    for split in range(1, 6):
        mocked_data_1h.extend(_populate("1h", str(split)))

    mocked_data_10h = mocked_data_1h + _populate("9h")

    return mocked_data_10min, mocked_data_1h, mocked_data_10h
class TestLibriLightLimited(TempDirMixin, TorchaudioTestCase):
    """Check ``LibriLightLimited`` against a mocked dataset tree for each subset."""

    backend = "default"

    # Placeholders; ``setUpClass`` replaces them once per test class.
    root_dir = None
    samples_10min = []
    samples_1h = []
    samples_10h = []

    @classmethod
    def setUpClass(cls):
        """Create the mocked dataset in the class temp directory and keep the expected samples."""
        cls.root_dir = cls.get_base_temp_dir()
        (cls.samples_10min, cls.samples_1h, cls.samples_10h) = get_mock_datasets(cls.root_dir)

    def _test_librilightlimited(self, dataset, samples):
        """Assert that iterating ``dataset`` yields exactly ``samples``, in order.

        Args:
            dataset: iterable of 6-tuples ``(data, sample_rate, transcript,
                speaker_id, chapter_id, utterance_id)``.
            samples: expected tuples with the same layout.
        """
        expected_count = len(samples)
        num_samples = 0
        for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
            # Report a surplus item as a test failure rather than an IndexError.
            self.assertLess(i, expected_count, msg=f"dataset yielded more than {expected_count} samples")
            expected = samples[i]
            self.assertEqual(data, expected[0], atol=5e-5, rtol=1e-8)
            # TestCase assertions instead of bare ``assert``: they are not
            # stripped under ``python -O`` and they say which field differed.
            self.assertEqual(sample_rate, expected[1], msg=f"sample_rate mismatch at index {i}")
            self.assertEqual(transcript, expected[2], msg=f"transcript mismatch at index {i}")
            self.assertEqual(speaker_id, expected[3], msg=f"speaker_id mismatch at index {i}")
            self.assertEqual(chapter_id, expected[4], msg=f"chapter_id mismatch at index {i}")
            self.assertEqual(utterance_id, expected[5], msg=f"utterance_id mismatch at index {i}")
            num_samples += 1

        self.assertEqual(num_samples, expected_count, msg="dataset yielded fewer samples than expected")

    def test_librilightlimited_10min(self):
        """The ``10min`` subset matches the expected 10min samples."""
        dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="10min")
        self._test_librilightlimited(dataset, self.samples_10min)

    def test_librilightlimited_1h(self):
        """The ``1h`` subset matches the expected 1h samples."""
        dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="1h")
        self._test_librilightlimited(dataset, self.samples_1h)

    def test_librilightlimited_10h(self):
        """The ``10h`` subset matches the expected 10h samples."""
        dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="10h")
        self._test_librilightlimited(dataset, self.samples_10h)