Skip to content

Commit

Permalink
Compute features of librispeech and musan.
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Jul 19, 2021
1 parent 40eed74 commit 0b19aa0
Show file tree
Hide file tree
Showing 8 changed files with 322 additions and 7 deletions.
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@ repos:
rev: 21.6b0
hooks:
- id: black
args: [--line-length=80]

- repo: https://github.com/PyCQA/flake8
rev: 3.9.2
hooks:
- id: flake8
args: [--max-line-length=80]

- repo: https://github.com/pycqa/isort
rev: 5.9.2
hooks:
- id: isort
args: [--profile=black]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
Expand Down
98 changes: 98 additions & 0 deletions egs/librispeech/ASR/local/compute_fbank_librispeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python3

"""
This file computes fbank features of the librispeech dataset.
It looks for manifests in the directory data/manifests
and saves the generated fbank features in data/fbank.
"""

import os
import subprocess
from contextlib import contextmanager
from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse.recipes.utils import read_manifests_if_cached


@contextmanager
def get_executor():
    """Yield an executor for feature extraction, or ``None``.

    If the fully-qualified hostname ends with ``.clsp.jhu.edu``, try to
    start a ``plz`` cluster scaled to 80 workers and yield a
    ``distributed.Client`` connected to it. In every other case, or when
    the cluster cannot be set up, yield ``None``; per the original note,
    ``compute_and_store_features`` then instantiates the pool itself.

    This is a context manager so that the client and the cluster are
    released when the caller's ``with`` block ends.

    The cluster path requires:
        $ pip install dask distributed
        $ pip install git+https://github.com/pzelasko/plz
    Other clusters can also benefit from that, provided a cluster-specific
    wrapper (see https://github.com/pzelasko/plz for reference).

    Exceptions raised inside the caller's ``with`` block always propagate;
    only failures while *setting up* the cluster trigger the fallback.
    """
    try:
        name = subprocess.check_output(["hostname", "-f"], text=True)
    except (OSError, subprocess.SubprocessError):
        # No usable `hostname` command: treat as "not on the grid".
        name = ""

    if name.strip().endswith(".clsp.jhu.edu"):
        # Tracks whether the client was already handed to the caller, so
        # that errors coming from the caller's body are never swallowed
        # (swallowing them would make this generator yield a second time).
        handed_out = False
        try:
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                with Client(cluster) as client:
                    handed_out = True
                    yield client
            return
        except Exception as e:
            if handed_out:
                raise
            print(
                f"Could not set up the distributed executor ({e!r}); "
                "falling back to a local pool."
            )

    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None


def compute_fbank_librispeech():
    """Compute 80-bin fbank features for the LibriSpeech partitions.

    Manifests are read from ``data/manifests``. For every partition the
    features are stored under ``data/fbank/feats_<partition>`` and the
    resulting cuts are written to ``data/fbank/cuts_<partition>.json.gz``.
    A partition whose cuts file already exists is skipped. Partitions with
    "train" in their name are extended with 0.9x and 1.1x speed-perturbed
    copies before feature extraction.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to one job instead of failing inside min().
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80

    dataset_parts = (
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_path = output_dir / f"cuts_{partition}.json.gz"
            if cuts_path.is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                # Speed perturbation triples the training data.
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            cut_set.to_json(cuts_path)


# Run the LibriSpeech feature extraction only when executed as a script.
if __name__ == "__main__":
    compute_fbank_librispeech()
97 changes: 97 additions & 0 deletions egs/librispeech/ASR/local/compute_fbank_musan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3

"""
This file computes fbank features of the musan dataset.
It looks for manifests in the directory data/manifests
and saves the generated fbank features in data/fbank.
"""

import os
import subprocess
from contextlib import contextmanager
from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse.recipes.utils import read_manifests_if_cached


@contextmanager
def get_executor():
    """Yield an executor for feature extraction, or ``None``.

    If the fully-qualified hostname ends with ``.clsp.jhu.edu``, try to
    start a ``plz`` cluster scaled to 80 workers and yield a
    ``distributed.Client`` connected to it. In every other case, or when
    the cluster cannot be set up, yield ``None``; per the original note,
    ``compute_and_store_features`` then instantiates the pool itself.

    This is a context manager so that the client and the cluster are
    released when the caller's ``with`` block ends.

    The cluster path requires:
        $ pip install dask distributed
        $ pip install git+https://github.com/pzelasko/plz
    Other clusters can also benefit from that, provided a cluster-specific
    wrapper (see https://github.com/pzelasko/plz for reference).

    Exceptions raised inside the caller's ``with`` block always propagate;
    only failures while *setting up* the cluster trigger the fallback.
    """
    try:
        name = subprocess.check_output(["hostname", "-f"], text=True)
    except (OSError, subprocess.SubprocessError):
        # No usable `hostname` command: treat as "not on the grid".
        name = ""

    if name.strip().endswith(".clsp.jhu.edu"):
        # Tracks whether the client was already handed to the caller, so
        # that errors coming from the caller's body are never swallowed
        # (swallowing them would make this generator yield a second time).
        handed_out = False
        try:
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                with Client(cluster) as client:
                    handed_out = True
                    yield client
            return
        except Exception as e:
            if handed_out:
                raise
            print(
                f"Could not set up the distributed executor ({e!r}); "
                "falling back to a local pool."
            )

    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None


def compute_fbank_musan():
    """Compute 80-bin fbank features for the MUSAN corpus.

    Manifests for the "music", "speech" and "noise" parts are read from
    ``data/manifests``. Their recordings are combined, cut into 10-second
    windows, and windows of 5 seconds or less are dropped. Features are
    stored under ``data/fbank/feats_musan`` and the cuts are written to
    ``data/fbank/cuts_musan.json.gz``. Nothing is computed if that cuts
    file already exists.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to one job instead of failing inside min().
    num_jobs = min(15, os.cpu_count() or 1)
    num_mel_bins = 80

    dataset_parts = (
        "music",
        "speech",
        "noise",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    musan_cuts_path = output_dir / "cuts_musan.json.gz"

    if musan_cuts_path.is_file():
        print(f"{musan_cuts_path} already exists - skipping")
        return

    print("Extracting features for Musan")

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        # create chunks of Musan with duration 5 - 10 seconds
        musan_cuts = (
            CutSet.from_manifests(
                recordings=combine(
                    part["recordings"] for part in manifests.values()
                )
            )
            .cut_into_windows(10.0)
            .filter(lambda c: c.duration > 5)
            .compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_musan",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
        )
        musan_cuts.to_json(musan_cuts_path)


# Run the MUSAN feature extraction only when executed as a script.
if __name__ == "__main__":
    compute_fbank_musan()
21 changes: 21 additions & 0 deletions egs/librispeech/ASR/local/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env python3

"""
This file downloads the librispeech dataset
to the directory data/LibriSpeech.
It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh .
"""


from lhotse.recipes import download_librispeech


def download_data():
    """Download the "librispeech" dataset parts with ``data`` as target dir."""
    download_librispeech(dataset_parts="librispeech", target_dir="data")


# Start the download only when executed as a script.
if __name__ == "__main__":
    download_data()
7 changes: 4 additions & 3 deletions egs/librispeech/ASR/local/download_lm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/usr/bin/env python3

# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
"""
This file downloads librispeech LM files to data/lm
"""

import gzip
import os
Expand All @@ -26,9 +29,7 @@ def download_lm():
filename = target_dir / f
if filename.is_file() is False:
urlretrieve_progress(
f"{url}/{f}",
filename=filename,
desc=f"Downloading {filename}",
f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
)

if ".gz" in str(filename):
Expand Down
29 changes: 29 additions & 0 deletions egs/librispeech/ASR/local/prepare_librispeech_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

"""
This file generates manifests for the librispeech dataset.
It expects the dataset is saved in data/LibriSpeech
and the generated manifests are saved in data/manifests.
"""

import os
from pathlib import Path

from lhotse.recipes import prepare_librispeech


def prepare_librispeech_mainfest():
    """Generate LibriSpeech manifests from ``data/LibriSpeech``.

    The manifests are written to ``data/manifests`` using at most 15
    parallel jobs. The dataset parts are selected with
    ``dataset_parts="auto"``.
    """
    corpus_dir = Path("data/LibriSpeech")
    output_dir = Path("data/manifests")
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to one job instead of failing inside min().
    num_jobs = min(15, os.cpu_count() or 1)

    # The returned manifests are not needed here; they are saved to
    # output_dir by prepare_librispeech.
    prepare_librispeech(
        corpus_dir=corpus_dir,
        dataset_parts="auto",
        output_dir=output_dir,
        num_jobs=num_jobs,
    )


# Generate the LibriSpeech manifests only when executed as a script.
if __name__ == "__main__":
    prepare_librispeech_mainfest()
22 changes: 22 additions & 0 deletions egs/librispeech/ASR/local/prepare_musan_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env python3

"""
This file generates manifests for the musan dataset.
It expects the dataset is saved in data/musan
and the generated manifests are saved in data/manifests.
"""

from pathlib import Path

from lhotse.recipes import prepare_musan


def prepare_musan_mainfest():
    """Generate MUSAN manifests from ``data/musan`` into ``data/manifests``."""
    prepare_musan(
        corpus_dir=Path("data/musan"),
        output_dir=Path("data/manifests"),
    )


# Generate the MUSAN manifests only when executed as a script.
if __name__ == "__main__":
    prepare_musan_mainfest()
52 changes: 48 additions & 4 deletions egs/librispeech/ASR/prepare.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env bash


set -eou pipefail

stage=-1
Expand All @@ -19,8 +18,53 @@ fi
# Stage 0: download LibriSpeech and MUSAN into data/ unless already present.
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  echo "stage 0: Download data"

  # If you have pre-downloaded it to /path/to/LibriSpeech,
  # you can create a symlink to avoid downloading it again:
  #
  #   ln -sfv /path/to/LibriSpeech data/
  #

  mkdir -p data/LibriSpeech

  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
    ./local/download_data.py
  fi

  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink to avoid downloading it again:
  #
  #   ln -s /path/to/musan data/
  #
  if [ ! -e data/musan ]; then
    wget https://www.openslr.org/resources/17/musan.tar.gz
    # The manifest stage reads the corpus from data/musan, so the archive
    # has to be unpacked there; downloading it alone is not enough.
    # NOTE(review): assumes the archive's top-level directory is "musan".
    tar -xzf musan.tar.gz -C data
  fi
fi

# Stage 1 runs when stage <= 1 <= stop_stage: it creates data/manifests
# and invokes the LibriSpeech manifest preparation script.
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  echo "Stage 1: Prepare librispeech manifest"
  # We assume that you have downloaded the librispeech corpus
  # to data/LibriSpeech
  mkdir -p data/manifests
  ./local/prepare_librispeech_manifest.py
fi

# Stage 2 runs when stage <= 2 <= stop_stage: it creates data/manifests
# and invokes the MUSAN manifest preparation script.
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  echo "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to data/musan
  mkdir -p data/manifests
  ./local/prepare_musan_manifest.py
fi

# Stage 3 runs when stage <= 3 <= stop_stage: it creates data/fbank
# and invokes the LibriSpeech fbank computation script.
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  echo "Stage 3: Compute fbank for librispeech"
  mkdir -p data/fbank
  ./local/compute_fbank_librispeech.py
fi

# Stage 4 runs when stage <= 4 <= stop_stage: it creates data/fbank
# and invokes the MUSAN fbank computation script.
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  # The message previously said "librispeech", copied from stage 3.
  echo "Stage 4: Compute fbank for musan"
  mkdir -p data/fbank
  ./local/compute_fbank_musan.py
fi

0 comments on commit 0b19aa0

Please sign in to comment.