Skip to content

Commit

Permalink
Download LM for LibriSpeech.
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Jul 15, 2021
1 parent d146a4e commit 40eed74
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data
4 changes: 4 additions & 0 deletions egs/librispeech/ASR/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

Run `./prepare.sh` to prepare the data.

Run `./xxx_train.py` (to be added) to train a model.
43 changes: 43 additions & 0 deletions egs/librispeech/ASR/local/download_lm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python3

# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)

import gzip
import os
import shutil
from pathlib import Path

from lhotse.utils import urlretrieve_progress
from tqdm.auto import tqdm


def download_lm():
    """Download the LibriSpeech language-model resources into ``data/lm``.

    The pruned 3-gram and the 4-gram ARPA archives, the vocabulary and the
    lexicon are fetched from OpenSLR resource 11. A file that already exists
    is not downloaded again, and every ``*.gz`` archive is decompressed next
    to itself unless the decompressed file already exists.

    Downloads and decompressed outputs are first written to a ``*.part``
    file and renamed only once complete, so an interrupted run does not
    leave a truncated file that a later run would mistake for a finished one.
    """
    url = "http://www.openslr.org/resources/11"
    target_dir = Path("data/lm")
    # Do not rely on the caller having created the directory beforehand.
    target_dir.mkdir(parents=True, exist_ok=True)

    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = target_dir / f
        if not filename.is_file():
            # NOTE(review): assumes urlretrieve_progress writes the payload
            # to the path given as ``filename`` - confirm against lhotse.
            partial = filename.with_name(filename.name + ".part")
            urlretrieve_progress(
                f"{url}/{f}",
                filename=partial,
                desc=f"Downloading {filename}",
            )
            os.replace(partial, filename)

        # Test the real suffix; a substring test would also match names that
        # merely contain ".gz" somewhere in the path.
        if filename.suffix == ".gz":
            unzip_file = filename.with_suffix("")
            if not unzip_file.is_file():
                partial = unzip_file.with_name(unzip_file.name + ".part")
                with gzip.open(filename, "rb") as f_in:
                    with open(partial, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
                os.replace(partial, unzip_file)


# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_lm()
97 changes: 97 additions & 0 deletions egs/librispeech/ASR/local/parse_options.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).


###
### Options from --config files have lower priority than command-line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order.
# The loop stops one short of $# because a trailing "--config" would have no
# file name following it.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Quote the path so that names containing spaces or glob characters are
    # tested and sourced as a single word, and so that an empty name is
    # reported as a missing config instead of slipping past the -r test.
    [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1
    . "$config"  # source the config file.
  fi
done


###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message.
    # ${help_message:-} keeps this branch usable when the enclosing script
    # runs under "set -u" and never defined help_message.
    --help|-h) if [ -z "${help_message:-}" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option takes exactly one value.  Without this check a trailing
      # "--name" would hit an unbound $2 under "set -u", or make "shift 2"
      # fail without shifting, so the loop would see the same $1 forever.
      [ $# -lt 2 ] && echo "$0: option $1 requires an argument" 1>&2 && exit 1;

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    # The first argument that is not an option ends option processing.
    *) break;
  esac
done


# Guard against the --cmd option having been given an empty value, which is
# an easy mistake to make when one script builds the command line of another.
# The "${cmd+xxx}" test is true only when cmd is defined at all.
if [ -n "${cmd+xxx}" ] && [ -z "$cmd" ]; then
  echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
fi


true; # so this script returns exit code 0.
26 changes: 26 additions & 0 deletions egs/librispeech/ASR/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

# Data preparation for the LibriSpeech ASR recipe.
# A stage runs when its number lies within [stage, stop_stage]; both bounds
# are shell variables that local/parse_options.sh lets the caller override
# from the command line (--stage N, --stop-stage N).

set -eou pipefail

stage=-1
stop_stage=100

. local/parse_options.sh || exit 1

# Succeed when the stage number given as $1 lies within [stage, stop_stage].
stage_enabled() {
  [ $stage -le $1 ] && [ $stop_stage -ge $1 ]
}

mkdir -p data

if stage_enabled -1; then
  echo "stage -1: Download LM"
  mkdir -p data/lm
  ./local/download_lm.py
fi

if stage_enabled 0; then
  echo "stage 0: Download data"

  # If you have pre-downloaded it in /path/to/LibriSpeech
  # Just run: ln -sfv /path/to/LibriSpeech data/
  mkdir -p data/LibriSpeech
  # TODO
fi

0 comments on commit 40eed74

Please sign in to comment.