-
Notifications
You must be signed in to change notification settings - Fork 79
/
Copy pathextract_simuleval_data.py
48 lines (40 loc) · 1.38 KB
/
extract_simuleval_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import tqdm
import argparse
import pandas as pd
import os
import sys
# Put the parent of this script's directory at the front of sys.path.
# NOTE(review): presumably this is what lets the `examples.speech_to_text`
# import below resolve when the file is run directly as a script — confirm.
dir_path = os.path.dirname(os.path.realpath(__file__))
parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir))
sys.path.insert(0, parent_dir_path)
from pathlib import Path
from examples.speech_to_text.data_utils import (
load_df_from_tsv,
save_df_to_tsv,
gen_config_yaml,
)
from fairseq.data.audio.data_cfg import S2SDataConfig
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably the column names of a TSV manifest — confirm before removing.
MANIFEST_COLUMNS = ["audio", "tgt_text"]
def process(args):
    """Turn per-split two-column TSV manifests into line-aligned list files.

    For each of the ``train``, ``dev`` and ``test`` splits, reads
    ``<cvss_dir>/<split>.tsv`` (one tab-separated ``<audio file name>``,
    ``<target text>`` record per line) and writes two files under
    ``<out_dir>/<split>/`` whose lines correspond one-to-one:

    * ``wav_list.txt`` -- ``<covost2_dir>/clips/<audio file name>``
    * ``target.txt`` -- the target text

    Empty lines in a manifest are skipped. All files are read and written
    as UTF-8 so the result does not depend on the platform locale.

    Args:
        args: Object exposing ``cvss_dir``, ``covost2_dir`` and ``out_dir``
            attributes (strings or path-like), e.g. an
            ``argparse.Namespace``.

    Raises:
        FileNotFoundError: If a split's ``.tsv`` manifest does not exist.
        ValueError: If a non-empty manifest line does not consist of
            exactly two tab-separated fields.
    """
    cvss_dir = Path(args.cvss_dir)
    covost2_dir = Path(args.covost2_dir)
    out_dir = Path(args.out_dir)
    # parents=True: allow an output location whose ancestors do not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

    for split in ["train", "dev", "test"]:
        split_dir = out_dir / split
        split_dir.mkdir(parents=True, exist_ok=True)

        manifest_path = cvss_dir / f"{split}.tsv"
        wav_list_path = split_dir / "wav_list.txt"
        target_path = split_dir / "target.txt"

        # The manifest is opened first so that a missing input fails before
        # any output file is created for this split.
        with open(manifest_path, "r", encoding="utf-8") as f_in, \
                open(wav_list_path, "w", encoding="utf-8") as f_wav, \
                open(target_path, "w", encoding="utf-8") as f_tgt:
            for line_no, raw in enumerate(f_in, start=1):
                # Strip only the line terminator. str.splitlines() would also
                # break on Unicode separators (e.g. U+2028) that may occur
                # inside the target text and misalign the two outputs.
                row = raw.rstrip("\n")
                if not row:
                    continue
                fields = row.split("\t")
                if len(fields) != 2:
                    raise ValueError(
                        f"{manifest_path}:{line_no}: expected 2 tab-separated "
                        f"fields, got {len(fields)}"
                    )
                wav, tgt = fields
                f_wav.write(f"{covost2_dir}/clips/{wav}\n")
                f_tgt.write(f"{tgt}\n")
def main():
    """Parse the command-line options and run :func:`process`.

    All three options are required: ``process`` converts each of them to a
    ``Path``, so declaring them mandatory makes a missing option an
    argparse usage error instead of a ``TypeError`` later on.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Write <out-dir>/<split>/wav_list.txt and target.txt from the "
            "<cvss-dir>/<split>.tsv manifests (splits: train, dev, test)."
        )
    )
    parser.add_argument(
        "--cvss-dir",
        required=True,
        help="directory containing train.tsv, dev.tsv and test.tsv",
    )
    parser.add_argument(
        "--covost2-dir",
        required=True,
        help="root used to build the <covost2-dir>/clips/<file> audio paths",
    )
    parser.add_argument(
        "--out-dir",
        required=True,
        help="directory that receives one sub-directory per split",
    )
    args = parser.parse_args()
    process(args)


if __name__ == "__main__":
    main()