From 0b19aa09c1d1921affae37688115ebfdb090b188 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Mon, 19 Jul 2021 23:35:32 +0800
Subject: [PATCH] Compute features of librispeech and musan.

---
 .pre-commit-config.yaml                            |  3 +
 .../ASR/local/compute_fbank_librispeech.py         | 98 +++++++++++++++++++
 .../ASR/local/compute_fbank_musan.py               | 97 ++++++++++++++++++
 egs/librispeech/ASR/local/download_data.py         | 21 ++++
 egs/librispeech/ASR/local/download_lm.py           |  7 +-
 .../ASR/local/prepare_librispeech_manifest.py      | 29 ++++++
 .../ASR/local/prepare_musan_manifest.py            | 22 +++++
 egs/librispeech/ASR/prepare.sh                     | 53 +++++++++-
 8 files changed, 323 insertions(+), 7 deletions(-)
 create mode 100755 egs/librispeech/ASR/local/compute_fbank_librispeech.py
 create mode 100755 egs/librispeech/ASR/local/compute_fbank_musan.py
 create mode 100755 egs/librispeech/ASR/local/download_data.py
 create mode 100755 egs/librispeech/ASR/local/prepare_librispeech_manifest.py
 create mode 100755 egs/librispeech/ASR/local/prepare_musan_manifest.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 792e01f026..ac08ff6d7a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,16 +3,19 @@ repos:
     rev: 21.6b0
     hooks:
       - id: black
+        args: [--line-length=80]
 
   - repo: https://github.com/PyCQA/flake8
     rev: 3.9.2
     hooks:
       - id: flake8
+        args: [--max-line-length=80]
 
   - repo: https://github.com/pycqa/isort
     rev: 5.9.2
     hooks:
       - id: isort
+        args: [--profile=black]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.0.1

diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
new file mode 100755
index 0000000000..0c55f72415
--- /dev/null
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+
+"""
+This file computes fbank features of the librispeech dataset.
+It looks for manifests in the directory data/manifests
+and the generated fbank features are saved in data/fbank.
+"""
+
+import os
+import subprocess
+from contextlib import contextmanager
+from pathlib import Path
+
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse.recipes.utils import read_manifests_if_cached
+
+
+@contextmanager
+def get_executor():
+    # We'll either return a process pool or a distributed worker pool.
+    # Note that this has to be a context manager because we might use multiple
+    # context managers ("with" clauses) inside, and this way everything will
+    # free up the resources at the right time.
+    try:
+        # If this is executed on the CLSP grid, we will try to use the
+        # Grid Engine to distribute the tasks.
+        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
+        # (see https://github.com/pzelasko/plz for reference)
+        #
+        # The following must be installed:
+        # $ pip install dask distributed
+        # $ pip install git+https://github.com/pzelasko/plz
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+        if name.strip().endswith(".clsp.jhu.edu"):
+            import plz
+            from distributed import Client
+
+            with plz.setup_cluster() as cluster:
+                cluster.scale(80)
+                yield Client(cluster)
+            return
+    except Exception:
+        pass
+    # No need to return anything - compute_and_store_features
+    # will just instantiate the pool itself.
+    yield None
+
+
+def compute_fbank_librispeech():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    num_jobs = min(15, os.cpu_count())
+    num_mel_bins = 80
+
+    dataset_parts = (
+        "dev-clean",
+        "dev-other",
+        "test-clean",
+        "test-other",
+        "train-clean-100",
+        "train-clean-360",
+        "train-other-500",
+    )
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts, output_dir=src_dir
+    )
+    assert manifests is not None
+
+    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+
+    with get_executor() as ex:  # Initialize the executor only once.
+        for partition, m in manifests.items():
+            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+                print(f"{partition} already exists - skipping.")
+                continue
+            print("Processing", partition)
+            cut_set = CutSet.from_manifests(
+                recordings=m["recordings"], supervisions=m["supervisions"],
+            )
+            if "train" in partition:
+                cut_set = (
+                    cut_set
+                    + cut_set.perturb_speed(0.9)
+                    + cut_set.perturb_speed(1.1)
+                )
+            cut_set = cut_set.compute_and_store_features(
+                extractor=extractor,
+                storage_path=f"{output_dir}/feats_{partition}",
+                # when an executor is specified, make more partitions
+                num_jobs=num_jobs if ex is None else 80,
+                executor=ex,
+                storage_type=LilcomHdf5Writer,
+            )
+            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+
+
+if __name__ == "__main__":
+    compute_fbank_librispeech()

diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py
new file mode 100755
index 0000000000..41b19c6561
--- /dev/null
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+"""
+This file computes fbank features of the musan dataset.
+It looks for manifests in the directory data/manifests
+and the generated fbank features are saved in data/fbank.
+"""
+
+import os
+import subprocess
+from contextlib import contextmanager
+from pathlib import Path
+
+from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse.recipes.utils import read_manifests_if_cached
+
+
+@contextmanager
+def get_executor():
+    # We'll either return a process pool or a distributed worker pool.
+    # Note that this has to be a context manager because we might use multiple
+    # context managers ("with" clauses) inside, and this way everything will
+    # free up the resources at the right time.
+    try:
+        # If this is executed on the CLSP grid, we will try to use the
+        # Grid Engine to distribute the tasks.
+        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
+        # (see https://github.com/pzelasko/plz for reference)
+        #
+        # The following must be installed:
+        # $ pip install dask distributed
+        # $ pip install git+https://github.com/pzelasko/plz
+        name = subprocess.check_output("hostname -f", shell=True, text=True)
+        if name.strip().endswith(".clsp.jhu.edu"):
+            import plz
+            from distributed import Client
+
+            with plz.setup_cluster() as cluster:
+                cluster.scale(80)
+                yield Client(cluster)
+            return
+    except Exception:
+        pass
+    # No need to return anything - compute_and_store_features
+    # will just instantiate the pool itself.
+    yield None
+
+
+def compute_fbank_musan():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    num_jobs = min(15, os.cpu_count())
+    num_mel_bins = 80
+
+    dataset_parts = (
+        "music",
+        "speech",
+        "noise",
+    )
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts, output_dir=src_dir
+    )
+    assert manifests is not None
+
+    musan_cuts_path = output_dir / "cuts_musan.json.gz"
+
+    if musan_cuts_path.is_file():
+        print(f"{musan_cuts_path} already exists - skipping")
+        return
+
+    print("Extracting features for Musan")
+
+    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+
+    with get_executor() as ex:  # Initialize the executor only once.
+        # create chunks of Musan with duration 5 - 10 seconds
+        musan_cuts = (
+            CutSet.from_manifests(
+                recordings=combine(
+                    part["recordings"] for part in manifests.values()
+                )
+            )
+            .cut_into_windows(10.0)
+            .filter(lambda c: c.duration > 5)
+            .compute_and_store_features(
+                extractor=extractor,
+                storage_path=f"{output_dir}/feats_musan",
+                num_jobs=num_jobs if ex is None else 80,
+                executor=ex,
+                storage_type=LilcomHdf5Writer,
+            )
+        )
+        musan_cuts.to_json(musan_cuts_path)
+
+
+if __name__ == "__main__":
+    compute_fbank_musan()

diff --git a/egs/librispeech/ASR/local/download_data.py b/egs/librispeech/ASR/local/download_data.py
new file mode 100755
index 0000000000..b9e6232fe3
--- /dev/null
+++ b/egs/librispeech/ASR/local/download_data.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+"""
+This file downloads the librispeech dataset
+to the directory data/LibriSpeech.
+
+It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh .
+"""
+
+
+from lhotse.recipes import download_librispeech
+
+
+def download_data():
+    target_dir = "data"
+
+    download_librispeech(target_dir=target_dir, dataset_parts="librispeech")
+
+
+if __name__ == "__main__":
+    download_data()

diff --git a/egs/librispeech/ASR/local/download_lm.py b/egs/librispeech/ASR/local/download_lm.py
index fd6713ce89..7df8646803 100755
--- a/egs/librispeech/ASR/local/download_lm.py
+++ b/egs/librispeech/ASR/local/download_lm.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 
 # Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+"""
+This file downloads the librispeech LM files to data/lm.
+"""
 
 import gzip
 import os
@@ -26,9 +29,7 @@ def download_lm():
         filename = target_dir / f
         if filename.is_file() is False:
             urlretrieve_progress(
-                f"{url}/{f}",
-                filename=filename,
-                desc=f"Downloading {filename}",
+                f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
             )
 
         if ".gz" in str(filename):

diff --git a/egs/librispeech/ASR/local/prepare_librispeech_manifest.py b/egs/librispeech/ASR/local/prepare_librispeech_manifest.py
new file mode 100755
index 0000000000..357f6e6ea2
--- /dev/null
+++ b/egs/librispeech/ASR/local/prepare_librispeech_manifest.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+"""
+This file generates manifests for the librispeech dataset.
+It expects the dataset to be saved in data/LibriSpeech
+and the generated manifests are saved in data/manifests.
+"""
+
+import os
+from pathlib import Path
+
+from lhotse.recipes import prepare_librispeech
+
+
+def prepare_librispeech_manifest():
+    corpus_dir = Path("data/LibriSpeech")
+    output_dir = Path("data/manifests")
+    num_jobs = min(15, os.cpu_count())
+
+    prepare_librispeech(
+        corpus_dir=corpus_dir,
+        dataset_parts="auto",
+        output_dir=output_dir,
+        num_jobs=num_jobs,
+    )
+
+
+if __name__ == "__main__":
+    prepare_librispeech_manifest()

diff --git a/egs/librispeech/ASR/local/prepare_musan_manifest.py b/egs/librispeech/ASR/local/prepare_musan_manifest.py
new file mode 100755
index 0000000000..43b9839790
--- /dev/null
+++ b/egs/librispeech/ASR/local/prepare_musan_manifest.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+"""
+This file generates manifests for the musan dataset.
+It expects the dataset to be saved in data/musan
+and the generated manifests are saved in data/manifests.
+"""
+
+from pathlib import Path
+
+from lhotse.recipes import prepare_musan
+
+
+def prepare_musan_manifest():
+    corpus_dir = Path("data/musan")
+    output_dir = Path("data/manifests")
+
+    prepare_musan(corpus_dir=corpus_dir, output_dir=output_dir)
+
+
+if __name__ == "__main__":
+    prepare_musan_manifest()

diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 861ded0ac2..89ec476733 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -1,6 +1,5 @@
 #!/usr/bin/env bash
-
 set -eou pipefail
 
 stage=-1
 stop_stage=100
@@ -19,8 +18,54 @@ fi
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "stage 0: Download data"
-  # If you have pre-downloaded it in /path/to/LibriSpeech
-  # Just run: ln -sfv /path/to/LibriSpeech data/
+  # If you have pre-downloaded it to /path/to/LibriSpeech,
+  # you can create a symlink to avoid downloading it again:
+  #
+  #   ln -sfv /path/to/LibriSpeech data/
+  #
   mkdir -p data/LibriSpeech
-  # TODO
+
+  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
+    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
+    ./local/download_data.py
+  fi
+
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink to avoid downloading it again:
+  #
+  #   ln -s /path/to/musan data/
+  #
+  if [ ! -e data/musan ]; then
+    wget -P data https://www.openslr.org/resources/17/musan.tar.gz
+    # Untar it so that data/musan exists (it contains music, noise, speech)
+    tar xzf data/musan.tar.gz -C data
+  fi
 fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  echo "Stage 1: Prepare librispeech manifest"
+  # We assume that you have downloaded the librispeech corpus
+  # to data/LibriSpeech
+  mkdir -p data/manifests
+  ./local/prepare_librispeech_manifest.py
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  echo "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to data/musan
+  mkdir -p data/manifests
+  ./local/prepare_musan_manifest.py
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  echo "Stage 3: Compute fbank for librispeech"
+  mkdir -p data/fbank
+  ./local/compute_fbank_librispeech.py
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  echo "Stage 4: Compute fbank for musan"
+  mkdir -p data/fbank
+  ./local/compute_fbank_musan.py
+fi
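
With the scripts above in place, prepare.sh drives the whole pipeline. A
minimal sketch of a full run, assuming the defaults shown in the diff
(stage=-1 and stop_stage=100 are hard-coded variables, so narrowing the
stage range means editing them at the top of the script):

    cd egs/librispeech/ASR
    ./prepare.sh

Re-running is safe: stage 0 checks for
data/LibriSpeech/train-other-500/.completed and data/musan before
downloading anything, and stages 3 and 4 skip any partition whose
cuts_*.json.gz already exists in data/fbank.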
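Once stage 3 has finished, the cut manifests written above can be read back
to sanity-check the features. A minimal sketch, assuming lhotse is installed
and the default output paths used by compute_fbank_librispeech.py:

    #!/usr/bin/env python3
    from lhotse import CutSet

    # cuts_dev-clean.json.gz is one of the manifests written by
    # compute_fbank_librispeech.py above.
    cuts = CutSet.from_json("data/fbank/cuts_dev-clean.json.gz")
    cut = next(iter(cuts))
    feats = cut.load_features()  # numpy array, shape (num_frames, num_mel_bins)
    print(cut.id, cut.duration, feats.shape)  # expect feats.shape[1] == 80

Each row of the returned array is one fbank frame; the LilcomHdf5Writer used
above stores the matrices lilcom-compressed in HDF5 files under data/fbank.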