WIP [not-for-merge]: run aishell with latest recipe in Kaldi #3868

Draft · wants to merge 2 commits into pybind11
10 changes: 10 additions & 0 deletions egs/aishell/s10/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
# there might be some information at the low end.
--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
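# As a rough sketch of how this config is consumed elsewhere in this recipe
# (see local/nnet3/run_ivector_common.sh), the hires features are made with
# something like:
#   steps/make_mfcc.sh --nj 30 --mfcc-config conf/mfcc_hires.conf \
#     --cmd "$train_cmd" data/train_cleaned_sp_hires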
1 change: 1 addition & 0 deletions egs/aishell/s10/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
181 changes: 181 additions & 0 deletions egs/aishell/s10/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,181 @@
#!/bin/bash

set -e -o pipefail


# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually
# be called by more scripts). It contains the common feature preparation and iVector-related parts
# of the script. See those scripts for examples of usage.
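#
# A hypothetical invocation (the option names simply mirror the variables
# parsed below by utils/parse_options.sh):
#   local/nnet3/run_ivector_common.sh --stage 0 --nj 30 \
#     --train-set train_cleaned --gmm tri3_cleaned --nnet3-affix _cleaned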


stage=0
nj=30

train_set=train_cleaned # you might set this to e.g. train.
gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on;
# it should contain alignments for 'train_set'.
online_cmvn_iextractor=false

num_threads_ubm=8
nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it
# becomes exp/nnet3_cleaned or whatever.

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh


gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp

for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
exit 1
fi
done


# lowres features, alignments
if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 2 ]; then
echo "$0: data/${train_set}_sp/feats.scp already exists. Refusing to overwrite the features "
echo " to avoid wasting time. Please remove the file and continue if you really mean this."
exit 1;
fi

if [ $stage -le 1 ]; then
echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
utils/data/perturb_data_dir_speed_3way.sh \
data/${train_set} data/${train_set}_sp

for datadir in ${train_set}_sp dev test; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
done
fi

if [ $stage -le 2 ]; then
echo "$0: making MFCC features for low-resolution speed-perturbed data"
steps/make_mfcc.sh --nj $nj \
--cmd "$train_cmd" data/${train_set}_sp
steps/compute_cmvn_stats.sh data/${train_set}_sp
echo "$0: fixing input data-dir to remove nonexistent features, in case some "
echo ".. speed-perturbed segments were too short."
utils/fix_data_dir.sh data/${train_set}_sp
fi

if [ $stage -le 3 ]; then
if [ -f $ali_dir/ali.1.gz ]; then
echo "$0: alignments in $ali_dir appear to already exist. Please either remove them "
echo " ... or use a later --stage option."
exit 1
fi
echo "$0: aligning with the perturbed low-resolution data"
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
data/${train_set}_sp data/lang $gmm_dir $ali_dir
fi


if [ $stage -le 5 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
echo "$0: data/${train_set}_sp_hires/feats.scp already exists."
echo " ... Please either remove it, or rerun this script with stage > 2."
exit 1
fi

if [ $stage -le 5 ]; then
echo "$0: creating high-resolution MFCC features"

# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=data/${train_set}_sp_hires/data
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi

# do volume-perturbation on the training data prior to extracting hires
# features; this helps make trained nnets more invariant to test data volume.
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires

for datadir in ${train_set}_sp dev test; do
steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires
steps/compute_cmvn_stats.sh data/${datadir}_hires
utils/fix_data_dir.sh data/${datadir}_hires
done
fi

if [ $stage -le 6 ]; then
echo "$0: computing a subset of data to train the diagonal UBM."

mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

# train a diagonal UBM using a subset of about a quarter of the data
num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
num_utts=$[$num_utts_total/4]
utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
$num_utts ${temp_data_root}/${train_set}_sp_hires_subset
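# As a concrete illustration: if utt2spk had, say, 400000 lines, num_utts
# above would be 100000 (integer division), i.e. roughly a quarter of the
# perturbed utterances end up in the UBM-training subset.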

echo "$0: computing a PCA transform from the hires data."
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--max-utts 10000 --subsample 2 \
${temp_data_root}/${train_set}_sp_hires_subset \
exp/nnet3${nnet3_affix}/pca_transform

echo "$0: training the diagonal UBM."
# Use 512 Gaussians in the UBM.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
--num-frames 700000 \
--num-threads $num_threads_ubm \
${temp_data_root}/${train_set}_sp_hires_subset 512 \
exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi

if [ $stage -le 7 ]; then
# Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
# can be sensitive to the amount of data. The script defaults to an iVector dimension of 100.
echo "$0: training the iVector extractor"
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 15 \
--num-threads 4 --num-processes 2 \
--online-cmvn-iextractor $online_cmvn_iextractor \
data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm \
exp/nnet3${nnet3_affix}/extractor || exit 1;
fi

if [ $stage -le 8 ]; then
# note, we don't encode the 'max2' in the name of the ivectordir even though
# that's the data we extract the ivectors from, as it's still going to be
# valid for the non-'max2' data, the utterance list is the same.
ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/tedlium-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We now extract iVectors on the speed-perturbed training data. With
# --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
# each of these pairs as one speaker; this gives more diversity in iVectors.
# Note that these are extracted 'online' (they vary within the utterance).

# Having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (the iVector starts at zero at the beginning
# of each pseudo-speaker).
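# As an illustration with hypothetical utterance IDs: a speaker S1 with
# utterances S1_u1 S1_u2 S1_u3 would be split by the script below into two
# pseudo-speakers, one holding (S1_u1, S1_u2) and one holding (S1_u3).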
temp_data_root=${ivectordir}
utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2

steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
${temp_data_root}/${train_set}_sp_hires_max2 \
exp/nnet3${nnet3_affix}/extractor $ivectordir

# Also extract iVectors for the test data, but in this case we don't need the speed
# perturbation (sp) or small-segment concatenation (comb).
for data in dev test; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \
data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
exp/nnet3${nnet3_affix}/ivectors_${data}_hires
done
fi


exit 0;
68 changes: 68 additions & 0 deletions egs/aishell/s10/local/run_cleanup_segmentation.sh
@@ -0,0 +1,68 @@
#!/bin/bash

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script demonstrates how to re-segment training data selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain acoustic model, and a
# biased language model built from the reference, and then work out the
# segmentation from a ctm like file.

# For nnet3 and chain results after cleanup, see the scripts in
# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh

# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
# [will add these later].
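#
# For example, a typical invocation might look like the following (a sketch;
# the option names simply mirror the variables parsed below by
# utils/parse_options.sh):
#   local/run_cleanup_segmentation.sh --stage 0 --nj 100 \
#     --data data/train --srcdir exp/tri3 --cleanup-affix cleaned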

set -e
set -o pipefail
set -u

stage=0
cleanup_stage=0
data=data/train
cleanup_affix=cleaned
srcdir=exp/tri3
nj=100
decode_nj=16
decode_num_threads=4

. ./path.sh
. ./cmd.sh
. utils/parse_options.sh

cleaned_data=${data}_${cleanup_affix}

dir=${srcdir}_${cleanup_affix}_work
cleaned_dir=${srcdir}_${cleanup_affix}

if [ $stage -le 1 ]; then
# This does the actual data cleanup.
steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \
$data data/lang $srcdir $dir $cleaned_data
fi

if [ $stage -le 2 ]; then
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
$cleaned_data data/lang $srcdir ${srcdir}_ali_${cleanup_affix}
fi

if [ $stage -le 3 ]; then
steps/train_sat.sh --cmd "$train_cmd" \
5000 100000 $cleaned_data data/lang ${srcdir}_ali_${cleanup_affix} ${cleaned_dir}
fi

if [ $stage -le 4 ]; then
# Test with the models trained on cleaned-up data.
utils/mkgraph.sh data/lang ${cleaned_dir} ${cleaned_dir}/graph

for dset in dev test; do
steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \
--cmd "$decode_cmd" \
${cleaned_dir}/graph data/${dset} ${cleaned_dir}/decode_${dset}
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
data/${dset} ${cleaned_dir}/decode_${dset} ${cleaned_dir}/decode_${dset}_rescore
done
fi