-
Notifications
You must be signed in to change notification settings - Fork 300
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d146a4e
commit 40eed74
Showing
5 changed files
with
171 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
|
||
Run `./prepare.sh` to prepare the data. | ||
|
||
Run `./xxx_train.py` (to be added) to train a model. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) | ||
|
||
import gzip | ||
import os | ||
import shutil | ||
from pathlib import Path | ||
|
||
from lhotse.utils import urlretrieve_progress | ||
from tqdm.auto import tqdm | ||
|
||
|
||
def download_lm():
    """Download the LibriSpeech language-model resources into ``data/lm``.

    Each file listed below is fetched from the OpenSLR resource directory
    unless it is already present locally.  Files ending in ``.gz`` are also
    decompressed next to the archive unless the decompressed file already
    exists.
    """
    url = "http://www.openslr.org/resources/11"
    target_dir = Path("data/lm")
    # Create the destination here so the script also works when it is run
    # directly rather than through a wrapper that creates the directory.
    target_dir.mkdir(parents=True, exist_ok=True)

    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = target_dir / f
        if not filename.is_file():
            # Download to a temporary name and rename on success, so that an
            # interrupted transfer is not mistaken for a complete file on
            # the next run.
            partial = filename.with_name(filename.name + ".part")
            urlretrieve_progress(
                f"{url}/{f}",
                filename=partial,
                desc=f"Downloading {filename}",
            )
            partial.replace(filename)

        # Only genuine gzip archives (by suffix) need decompressing; a
        # substring test would also match names such as "foo.gz.txt".
        if filename.suffix != ".gz":
            continue

        unzip_file = filename.with_suffix("")
        if unzip_file.is_file():
            continue

        # Same write-then-rename approach for the decompressed output.
        partial = unzip_file.with_name(unzip_file.name + ".part")
        with gzip.open(filename, "rb") as f_in, open(partial, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        partial.replace(unzip_file)
|
||
|
||
if __name__ == "__main__":
    # Executed as a script: fetch the language-model files.
    download_lm()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); | ||
# Arnab Ghoshal, Karel Vesely | ||
|
||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED | ||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, | ||
# MERCHANTABLITY OR NON-INFRINGEMENT. | ||
# See the Apache 2 License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# Parse command-line options. | ||
# To be sourced by another script (as in ". parse_options.sh"). | ||
# Option format is: --option-name arg | ||
# and shell variable "option_name" gets set to value "arg." | ||
# The exception is --help, which takes no arguments, but prints the | ||
# $help_message variable (if defined). | ||
|
||
|
||
### | ||
### The --config file options have lower priority than command line | ||
### options, so we need to import them first... | ||
### | ||
|
||
# Import every config file named by a "--config <file>" pair on the command
# line, in left-to-right order.  The loop stops one short of $# because
# "--config" must be followed by the file name.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Quote the path so that names containing spaces (or an empty name) are
    # tested and sourced correctly; report the failure on stderr.
    if [ ! -r "$config" ]; then
      echo "$0: missing config '$config'" 1>&2
      exit 1
    fi
    . "$config"  # source the config file.
  fi
done
|
||
|
||
### | ||
### Now we process the command line options | ||
### | ||
# Consume leading "--name value" pairs from the positional parameters and
# assign each value to the shell variable "name" (dashes become underscores).
# Parsing stops at the first argument that is empty or does not start
# with "--".
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message.
    # The ":-" default keeps this working when the enclosing script runs
    # under "set -u" and never defined help_message.
    --help|-h)
      if [ -z "${help_message:-}" ]; then
        echo "No help found." 1>&2;
      else
        printf "$help_message\n" 1>&2;
      fi
      exit 0 ;;
    --*=*)
      echo "$0: options to scripts must be of the form --name value, got '$1'" 1>&2
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*)
      name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option takes a value.  Without this check a trailing option
      # would make "shift 2" fail without shifting, so the loop would never
      # advance (or "$2" would be unbound under "set -u").
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires an argument" 1>&2
        exit 1;
      fi

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      # NOTE(review): the value is passed through eval, so it must come from a
      # trusted caller.
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    *) break;
  esac
done
|
||
|
||
# Guard against "--cmd" having been given an empty value, a mistake that
# scripting errors produce easily: cmd is defined, yet its value is empty.
if [ -n "${cmd+xxx}" ] && [ -z "$cmd" ]; then
  echo "$0: empty argument to --cmd option" 1>&2
  exit 1
fi

true  # so this script returns exit code 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/usr/bin/env bash | ||
|
||
|
||
set -euo pipefail

# Default range of stages to run; both can be overridden on the command
# line as "--stage N" and "--stop-stage N".
stage=-1
stop_stage=100

. local/parse_options.sh || exit 1

# Succeeds when stage number $1 lies within [stage, stop_stage].
in_stage_range() {
  [ "$stage" -le "$1" ] && [ "$stop_stage" -ge "$1" ]
}

mkdir -p data

if in_stage_range -1; then
  echo "stage -1: Download LM"
  mkdir -p data/lm
  ./local/download_lm.py
fi

if in_stage_range 0; then
  echo "stage 0: Download data"

  # If LibriSpeech has already been downloaded to /path/to/LibriSpeech,
  # link it instead: ln -sfv /path/to/LibriSpeech data/
  mkdir -p data/LibriSpeech
  # TODO
fi